diff options
author | Andrew Trick <atrick@apple.com> | 2011-04-13 00:38:32 +0000 |
---|---|---|
committer | Andrew Trick <atrick@apple.com> | 2011-04-13 00:38:32 +0000 |
commit | 87896d9368e08d93493427ce7bf8272d1e5cca35 (patch) | |
tree | d3183747b2917bf4e1254c7a331ff86fd3352a2f | |
parent | f93f7b2446bec3febc30b7136e18704664bd98cc (diff) | |
download | external_llvm-87896d9368e08d93493427ce7bf8272d1e5cca35.zip external_llvm-87896d9368e08d93493427ce7bf8272d1e5cca35.tar.gz external_llvm-87896d9368e08d93493427ce7bf8272d1e5cca35.tar.bz2 |
Recommit r129383. PreRA scheduler heuristic fixes: VRegCycle, TokenFactor latency.
Additional fixes:
Do something reasonable for subtargets with generic
itineraries by handle node latency the same as for an empty
itinerary. Now nodes default to unit latency unless an itinerary
explicitly specifies a zero cycle stage or it is a TokenFactor chain.
Original fixes:
UnitsSharePred was a source of randomness in the scheduler: node
priority depended on the queue data structure. I rewrote the recent
VRegCycle heuristics to completely replace the old heuristic without
any randomness. To make the ndoe latency adjustments work, I also
needed to do something a little more reasonable with TokenFactor. I
gave it zero latency to its consumers and always schedule it as low as
possible.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@129421 91177308-0d34-0410-b5e6-96231b3b80d8
-rw-r--r-- | include/llvm/Target/TargetInstrItineraries.h | 10 | ||||
-rw-r--r-- | lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp | 286 | ||||
-rw-r--r-- | lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp | 60 | ||||
-rw-r--r-- | test/CodeGen/ARM/memcpy-inline.ll | 8 | ||||
-rw-r--r-- | test/CodeGen/ARM/neon_div.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/ARM/va_arg.ll | 10 | ||||
-rw-r--r-- | test/CodeGen/ARM/vfp.ll | 2 | ||||
-rw-r--r-- | test/CodeGen/Mips/o32_cc_vararg.ll | 26 | ||||
-rw-r--r-- | test/CodeGen/Thumb2/thumb2-uxtb.ll | 4 |
9 files changed, 222 insertions, 186 deletions
diff --git a/include/llvm/Target/TargetInstrItineraries.h b/include/llvm/Target/TargetInstrItineraries.h index a95b70f..198d585 100644 --- a/include/llvm/Target/TargetInstrItineraries.h +++ b/include/llvm/Target/TargetInstrItineraries.h @@ -155,9 +155,13 @@ public: /// in the itinerary. /// unsigned getStageLatency(unsigned ItinClassIndx) const { - // If the target doesn't provide itinerary information, use a - // simple non-zero default value for all instructions. - if (isEmpty()) + // If the target doesn't provide itinerary information, use a simple + // non-zero default value for all instructions. Some target's provide a + // dummy (Generic) itinerary which should be handled as if it's itinerary is + // empty. We identify this by looking for a reference to stage zero (invalid + // stage). This is different from beginStage == endState != 0, which could + // be used for zero-latency pseudo ops. + if (isEmpty() || Itineraries[ItinClassIndx].FirstStage == 0) return 1; // Calculate the maximum completion time for any stage. diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index b2e9c15..ac2f3d5 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -102,11 +102,11 @@ static cl::opt<unsigned> AvgIPC( #ifndef NDEBUG namespace { // For sched=list-ilp, Count the number of times each factor comes into play. - enum { FactPressureDiff, FactRegUses, FactHeight, FactDepth, FactStatic, - FactOther, NumFactors }; + enum { FactPressureDiff, FactRegUses, FactStall, FactHeight, FactDepth, + FactStatic, FactOther, NumFactors }; } static const char *FactorName[NumFactors] = -{"PressureDiff", "RegUses", "Height", "Depth","Static", "Other"}; +{"PressureDiff", "RegUses", "Stall", "Height", "Depth","Static", "Other"}; static int FactorCount[NumFactors]; #endif //!NDEBUG @@ -463,6 +463,13 @@ void ScheduleDAGRRList::AdvancePastStalls(SUnit *SU) { if (DisableSchedCycles) return; + // FIXME: Nodes such as CopyFromReg probably should not advance the current + // cycle. Otherwise, we can wrongly mask real stalls. If the non-machine node + // has predecessors the cycle will be advanced when they are scheduled. + // But given the crude nature of modeling latency though such nodes, we + // currently need to treat these nodes like real instructions. + // if (!SU->getNode() || !SU->getNode()->isMachineOpcode()) return; + unsigned ReadyCycle = isBottomUp ? SU->getHeight() : SU->getDepth(); // Bump CurCycle to account for latency. We assume the latency of other @@ -533,6 +540,8 @@ void ScheduleDAGRRList::EmitNode(SUnit *SU) { } } +static void resetVRegCycle(SUnit *SU); + /// ScheduleNodeBottomUp - Add the node to the schedule. Decrement the pending /// count of its predecessors. If a predecessor pending count is zero, add it to /// the Available queue. @@ -542,7 +551,8 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { #ifndef NDEBUG if (CurCycle < SU->getHeight()) - DEBUG(dbgs() << " Height [" << SU->getHeight() << "] pipeline stall!\n"); + DEBUG(dbgs() << " Height [" << SU->getHeight() + << "] pipeline stall!\n"); #endif // FIXME: Do not modify node height. It may interfere with @@ -559,7 +569,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { AvailableQueue->ScheduledNode(SU); // If HazardRec is disabled, and each inst counts as one cycle, then - // advance CurCycle before ReleasePredecessors to avoid useles pushed to + // advance CurCycle before ReleasePredecessors to avoid useless pushes to // PendingQueue for schedulers that implement HasReadyFilter. if (!HazardRec->isEnabled() && AvgIPC < 2) AdvanceToCycle(CurCycle + 1); @@ -580,20 +590,25 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) { } } + resetVRegCycle(SU); + SU->isScheduled = true; // Conditions under which the scheduler should eagerly advance the cycle: // (1) No available instructions // (2) All pipelines full, so available instructions must have hazards. // - // If HazardRec is disabled, the cycle was advanced earlier. + // If HazardRec is disabled, the cycle was pre-advanced before calling + // ReleasePredecessors. In that case, IssueCount should remain 0. // // Check AvailableQueue after ReleasePredecessors in case of zero latency. - ++IssueCount; - if ((HazardRec->isEnabled() && HazardRec->atIssueLimit()) - || (!HazardRec->isEnabled() && AvgIPC > 1 && IssueCount == AvgIPC) - || AvailableQueue->empty()) - AdvanceToCycle(CurCycle + 1); + if (HazardRec->isEnabled() || AvgIPC > 1) { + if (SU->getNode() && SU->getNode()->isMachineOpcode()) + ++IssueCount; + if ((HazardRec->isEnabled() && HazardRec->atIssueLimit()) + || (!HazardRec->isEnabled() && IssueCount == AvgIPC)) + AdvanceToCycle(CurCycle + 1); + } } /// CapturePred - This does the opposite of ReleasePred. Since SU is being @@ -1220,7 +1235,7 @@ void ScheduleDAGRRList::ListScheduleBottomUp() { // priority. If it is not ready put it back. Schedule the node. Sequence.reserve(SUnits.size()); while (!AvailableQueue->empty()) { - DEBUG(dbgs() << "\n*** Examining Available\n"; + DEBUG(dbgs() << "\nExamining Available:\n"; AvailableQueue->dump(this)); // Pick the best node to schedule taking all constraints into @@ -1661,17 +1676,6 @@ void RegReductionPQBase::CalculateSethiUllmanNumbers() { CalcNodeSethiUllmanNumber(&(*SUnits)[i], SethiUllmanNumbers); } -void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) { - SUnits = &sunits; - // Add pseudo dependency edges for two-address nodes. - AddPseudoTwoAddrDeps(); - // Reroute edges to nodes with multiple uses. - if (!TracksRegPressure) - PrescheduleNodesWithMultipleUses(); - // Calculate node priorities. - CalculateSethiUllmanNumbers(); -} - void RegReductionPQBase::addNode(const SUnit *SU) { unsigned SUSize = SethiUllmanNumbers.size(); if (SUnits->size() > SUSize) @@ -2008,7 +2012,29 @@ static unsigned calcMaxScratches(const SUnit *SU) { return Scratches; } -/// hasOnlyLiveOutUse - Return true if SU has a single value successor that is a +/// hasOnlyLiveInOpers - Return true if SU has only value predecessors that are +/// CopyFromReg from a virtual register. +static bool hasOnlyLiveInOpers(const SUnit *SU) { + bool RetVal = false; + for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); + I != E; ++I) { + if (I->isCtrl()) continue; + const SUnit *PredSU = I->getSUnit(); + if (PredSU->getNode() && + PredSU->getNode()->getOpcode() == ISD::CopyFromReg) { + unsigned Reg = + cast<RegisterSDNode>(PredSU->getNode()->getOperand(1))->getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + RetVal = true; + continue; + } + } + return false; + } + return RetVal; +} + +/// hasOnlyLiveOutUses - Return true if SU has only value successors that are /// CopyToReg to a virtual register. This SU def is probably a liveout and /// it has no other use. It should be scheduled closer to the terminator. static bool hasOnlyLiveOutUses(const SUnit *SU) { @@ -2030,62 +2056,71 @@ static bool hasOnlyLiveOutUses(const SUnit *SU) { return RetVal; } -/// UnitsSharePred - Return true if the two scheduling units share a common -/// data predecessor. -static bool UnitsSharePred(const SUnit *left, const SUnit *right) { - SmallSet<const SUnit*, 4> Preds; - for (SUnit::const_pred_iterator I = left->Preds.begin(),E = left->Preds.end(); +// Set isVRegCycle for a node with only live in opers and live out uses. Also +// set isVRegCycle for its CopyFromReg operands. +// +// This is only relevant for single-block loops, in which case the VRegCycle +// node is likely an induction variable in which the operand and target virtual +// registers should be coalesced (e.g. pre/post increment values). Setting the +// isVRegCycle flag helps the scheduler prioritize other uses of the same +// CopyFromReg so that this node becomes the virtual register "kill". This +// avoids interference between the values live in and out of the block and +// eliminates a copy inside the loop. +static void initVRegCycle(SUnit *SU) { + if (DisableSchedVRegCycle) + return; + + if (!hasOnlyLiveInOpers(SU) || !hasOnlyLiveOutUses(SU)) + return; + + DEBUG(dbgs() << "VRegCycle: SU(" << SU->NodeNum << ")\n"); + + SU->isVRegCycle = true; + + for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end(); I != E; ++I) { - if (I->isCtrl()) continue; // ignore chain preds - Preds.insert(I->getSUnit()); + if (I->isCtrl()) continue; + I->getSUnit()->isVRegCycle = true; } - for (SUnit::const_pred_iterator I = right->Preds.begin(),E = right->Preds.end(); +} + +// After scheduling the definition of a VRegCycle, clear the isVRegCycle flag of +// CopyFromReg operands. We should no longer penalize other uses of this VReg. +static void resetVRegCycle(SUnit *SU) { + if (!SU->isVRegCycle) + return; + + for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end(); I != E; ++I) { if (I->isCtrl()) continue; // ignore chain preds - if (Preds.count(I->getSUnit())) - return true; + SUnit *PredSU = I->getSUnit(); + if (PredSU->isVRegCycle) { + assert(PredSU->getNode()->getOpcode() == ISD::CopyFromReg && + "VRegCycle def must be CopyFromReg"); + I->getSUnit()->isVRegCycle = 0; + } } - return false; } -// Return true if the virtual register defined by VRCycleSU may interfere with -// VRUseSU. -// -// Note: We may consider two SU's that use the same value live into a loop as -// interferng even though the value is not an induction variable. This is an -// unfortunate consequence of scheduling on the selection DAG. -static bool checkVRegCycleInterference(const SUnit *VRCycleSU, - const SUnit *VRUseSU) { - for (SUnit::const_pred_iterator I = VRCycleSU->Preds.begin(), - E = VRCycleSU->Preds.end(); I != E; ++I) { +// Return true if this SUnit uses a CopyFromReg node marked as a VRegCycle. This +// means a node that defines the VRegCycle has not been scheduled yet. +static bool hasVRegCycleUse(const SUnit *SU) { + // If this SU also defines the VReg, don't hoist it as a "use". + if (SU->isVRegCycle) + return false; + + for (SUnit::const_pred_iterator I = SU->Preds.begin(),E = SU->Preds.end(); + I != E; ++I) { if (I->isCtrl()) continue; // ignore chain preds - SDNode *InNode = I->getSUnit()->getNode(); - if (!InNode || InNode->getOpcode() != ISD::CopyFromReg) - continue; - for (SUnit::const_pred_iterator II = VRUseSU->Preds.begin(), - EE = VRUseSU->Preds.end(); II != EE; ++II) { - if (II->getSUnit() == I->getSUnit()) - return true; + if (I->getSUnit()->isVRegCycle && + I->getSUnit()->getNode()->getOpcode() == ISD::CopyFromReg) { + DEBUG(dbgs() << " VReg cycle use: SU (" << SU->NodeNum << ")\n"); + return true; } } return false; } -// Compare the VRegCycle properties of the nodes. -// Return -1 if left has higher priority, 1 if right has higher priority. -// Return 0 if priority is equivalent. -static int BUCompareVRegCycle(const SUnit *left, const SUnit *right) { - if (left->isVRegCycle && !right->isVRegCycle) { - if (checkVRegCycleInterference(left, right)) - return -1; - } - else if (!left->isVRegCycle && right->isVRegCycle) { - if (checkVRegCycleInterference(right, left)) - return 1; - } - return 0; -} - // Check for either a dependence (latency) or resource (hazard) stall. // // Note: The ScheduleHazardRecognizer interface requires a non-const SU. @@ -2101,23 +2136,12 @@ static bool BUHasStall(SUnit *SU, int Height, RegReductionPQBase *SPQ) { // Return 0 if latency-based priority is equivalent. static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref, RegReductionPQBase *SPQ) { - // If the two nodes share an operand and one of them has a single - // use that is a live out copy, favor the one that is live out. Otherwise - // it will be difficult to eliminate the copy if the instruction is a - // loop induction variable update. e.g. - // BB: - // sub r1, r3, #1 - // str r0, [r2, r3] - // mov r3, r1 - // cmp - // bne BB - bool SharePred = UnitsSharePred(left, right); - // FIXME: Only adjust if BB is a loop back edge. - // FIXME: What's the cost of a copy? - int LBonus = (SharePred && hasOnlyLiveOutUses(left)) ? 1 : 0; - int RBonus = (SharePred && hasOnlyLiveOutUses(right)) ? 1 : 0; - int LHeight = (int)left->getHeight() - LBonus; - int RHeight = (int)right->getHeight() - RBonus; + // Scheduling an instruction that uses a VReg whose postincrement has not yet + // been scheduled will induce a copy. Model this as an extra cycle of latency. + int LPenalty = hasVRegCycleUse(left) ? 1 : 0; + int RPenalty = hasVRegCycleUse(right) ? 1 : 0; + int LHeight = (int)left->getHeight() + LPenalty; + int RHeight = (int)right->getHeight() + RPenalty; bool LStall = (!checkPref || left->SchedulingPref == Sched::Latency) && BUHasStall(left, LHeight, SPQ); @@ -2128,36 +2152,47 @@ static int BUCompareLatency(SUnit *left, SUnit *right, bool checkPref, // If scheduling either one of the node will cause a pipeline stall, sort // them according to their height. if (LStall) { - if (!RStall) + if (!RStall) { + DEBUG(++FactorCount[FactStall]); return 1; - if (LHeight != RHeight) + } + if (LHeight != RHeight) { + DEBUG(++FactorCount[FactStall]); return LHeight > RHeight ? 1 : -1; - } else if (RStall) + } + } else if (RStall) { + DEBUG(++FactorCount[FactStall]); return -1; + } // If either node is scheduling for latency, sort them by height/depth // and latency. if (!checkPref || (left->SchedulingPref == Sched::Latency || right->SchedulingPref == Sched::Latency)) { if (DisableSchedCycles) { - if (LHeight != RHeight) + if (LHeight != RHeight) { + DEBUG(++FactorCount[FactHeight]); return LHeight > RHeight ? 1 : -1; + } } else { // If neither instruction stalls (!LStall && !RStall) then // its height is already covered so only its depth matters. We also reach // this if both stall but have the same height. - unsigned LDepth = left->getDepth(); - unsigned RDepth = right->getDepth(); + int LDepth = left->getDepth() - LPenalty; + int RDepth = right->getDepth() - RPenalty; if (LDepth != RDepth) { + DEBUG(++FactorCount[FactDepth]); DEBUG(dbgs() << " Comparing latency of SU (" << left->NodeNum << ") depth " << LDepth << " vs SU (" << right->NodeNum << ") depth " << RDepth << "\n"); return LDepth < RDepth ? 1 : -1; } } - if (left->Latency != right->Latency) + if (left->Latency != right->Latency) { + DEBUG(++FactorCount[FactOther]); return left->Latency > right->Latency ? 1 : -1; + } } return 0; } @@ -2169,7 +2204,19 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { DEBUG(++FactorCount[FactStatic]); return LPriority > RPriority; } - DEBUG(++FactorCount[FactOther]); + else if(LPriority == 0) { + // Schedule zero-latency TokenFactor below any other special + // nodes. The alternative may be to avoid artificially boosting the + // TokenFactor's height when it is scheduled, but we currently rely on an + // instruction's final height to equal the cycle in which it is scheduled, + // so heights are monotonically increasing. + unsigned LOpc = left->getNode() ? left->getNode()->getOpcode() : 0; + unsigned ROpc = right->getNode() ? right->getNode()->getOpcode() : 0; + if (LOpc == ISD::TokenFactor) + return false; + if (ROpc == ISD::TokenFactor) + return true; + } // Try schedule def + use closer when Sethi-Ullman numbers are the same. // e.g. @@ -2190,14 +2237,18 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { // This creates more short live intervals. unsigned LDist = closestSucc(left); unsigned RDist = closestSucc(right); - if (LDist != RDist) + if (LDist != RDist) { + DEBUG(++FactorCount[FactOther]); return LDist < RDist; + } // How many registers becomes live when the node is scheduled. unsigned LScratch = calcMaxScratches(left); unsigned RScratch = calcMaxScratches(right); - if (LScratch != RScratch) + if (LScratch != RScratch) { + DEBUG(++FactorCount[FactOther]); return LScratch > RScratch; + } if (!DisableSchedCycles) { int result = BUCompareLatency(left, right, false /*checkPref*/, SPQ); @@ -2205,15 +2256,20 @@ static bool BURRSort(SUnit *left, SUnit *right, RegReductionPQBase *SPQ) { return result > 0; } else { - if (left->getHeight() != right->getHeight()) + if (left->getHeight() != right->getHeight()) { + DEBUG(++FactorCount[FactHeight]); return left->getHeight() > right->getHeight(); + } - if (left->getDepth() != right->getDepth()) + if (left->getDepth() != right->getDepth()) { + DEBUG(++FactorCount[FactDepth]); return left->getDepth() < right->getDepth(); + } } assert(left->NodeQueueId && right->NodeQueueId && "NodeQueueId cannot be zero"); + DEBUG(++FactorCount[FactOther]); return (left->NodeQueueId > right->NodeQueueId); } @@ -2264,24 +2320,22 @@ bool hybrid_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { // Avoid causing spills. If register pressure is high, schedule for // register pressure reduction. if (LHigh && !RHigh) { + DEBUG(++FactorCount[FactPressureDiff]); DEBUG(dbgs() << " pressure SU(" << left->NodeNum << ") > SU(" << right->NodeNum << ")\n"); return true; } else if (!LHigh && RHigh) { + DEBUG(++FactorCount[FactPressureDiff]); DEBUG(dbgs() << " pressure SU(" << right->NodeNum << ") > SU(" << left->NodeNum << ")\n"); return false; } - int result = 0; - if (!DisableSchedVRegCycle) { - result = BUCompareVRegCycle(left, right); - } - if (result == 0 && !LHigh && !RHigh) { - result = BUCompareLatency(left, right, true /*checkPref*/, SPQ); + if (!LHigh && !RHigh) { + int result = BUCompareLatency(left, right, true /*checkPref*/, SPQ); + if (result != 0) + return result > 0; } - if (result != 0) - return result > 0; return BURRSort(left, right, SPQ); } @@ -2347,12 +2401,6 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { if (RReduce && !LReduce) return true; } - if (!DisableSchedVRegCycle) { - int result = BUCompareVRegCycle(left, right); - if (result != 0) - return result > 0; - } - if (!DisableSchedLiveUses && (LLiveUses != RLiveUses)) { DEBUG(dbgs() << "Live uses SU(" << left->NodeNum << "): " << LLiveUses << " != SU(" << right->NodeNum << "): " << RLiveUses << "\n"); @@ -2391,6 +2439,24 @@ bool ilp_ls_rr_sort::operator()(SUnit *left, SUnit *right) const { return BURRSort(left, right, SPQ); } +void RegReductionPQBase::initNodes(std::vector<SUnit> &sunits) { + SUnits = &sunits; + // Add pseudo dependency edges for two-address nodes. + AddPseudoTwoAddrDeps(); + // Reroute edges to nodes with multiple uses. + if (!TracksRegPressure) + PrescheduleNodesWithMultipleUses(); + // Calculate node priorities. + CalculateSethiUllmanNumbers(); + + // For single block loops, mark nodes that look like canonical IV increments. + if (scheduleDAG->BB->isSuccessor(scheduleDAG->BB)) { + for (unsigned i = 0, e = sunits.size(); i != e; ++i) { + initVRegCycle(&sunits[i]); + } + } +} + //===----------------------------------------------------------------------===// // Preschedule for Register Pressure //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp index 24a1937..078533b 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp @@ -342,10 +342,6 @@ void ScheduleDAGSDNodes::BuildSchedUnits() { assert(N->getNodeId() == -1 && "Node already inserted!"); N->setNodeId(NodeSUnit->NodeNum); - // Set isVRegCycle if the node operands are live into and value is live out - // of a single block loop. - InitVRegCycleFlag(NodeSUnit); - // Compute NumRegDefsLeft. This must be done before AddSchedEdges. InitNumRegDefsLeft(NodeSUnit); @@ -417,6 +413,10 @@ void ScheduleDAGSDNodes::AddSchedEdges() { // If this is a ctrl dep, latency is 1. unsigned OpLatency = isChain ? 1 : OpSU->Latency; + // Special-case TokenFactor chains as zero-latency. + if(isChain && OpN->getOpcode() == ISD::TokenFactor) + OpLatency = 0; + const SDep &dep = SDep(OpSU, isChain ? SDep::Order : SDep::Data, OpLatency, PhysReg); if (!isChain && !UnitLatencies) { @@ -512,47 +512,6 @@ void ScheduleDAGSDNodes::RegDefIter::Advance() { } } -// Set isVRegCycle if this node's single use is CopyToReg and its only active -// data operands are CopyFromReg. -// -// This is only relevant for single-block loops, in which case the VRegCycle -// node is likely an induction variable in which the operand and target virtual -// registers should be coalesced (e.g. pre/post increment values). Setting the -// isVRegCycle flag helps the scheduler prioritize other uses of the same -// CopyFromReg so that this node becomes the virtual register "kill". This -// avoids interference between the values live in and out of the block and -// eliminates a copy inside the loop. -void ScheduleDAGSDNodes::InitVRegCycleFlag(SUnit *SU) { - if (!BB->isSuccessor(BB)) - return; - - SDNode *N = SU->getNode(); - if (N->getGluedNode()) - return; - - if (!N->hasOneUse() || N->use_begin()->getOpcode() != ISD::CopyToReg) - return; - - bool FoundLiveIn = false; - for (SDNode::op_iterator OI = N->op_begin(), E = N->op_end(); OI != E; ++OI) { - EVT OpVT = OI->getValueType(); - assert(OpVT != MVT::Glue && "Glued nodes should be in same sunit!"); - - if (OpVT == MVT::Other) - continue; // ignore chain operands - - if (isPassiveNode(OI->getNode())) - continue; // ignore constants and such - - if (OI->getNode()->getOpcode() != ISD::CopyFromReg) - return; - - FoundLiveIn = true; - } - if (FoundLiveIn) - SU->isVRegCycle = true; -} - void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) { assert(SU->NumRegDefsLeft == 0 && "expect a new node"); for (RegDefIter I(SU, this); I.IsValid(); I.Advance()) { @@ -562,6 +521,16 @@ void ScheduleDAGSDNodes::InitNumRegDefsLeft(SUnit *SU) { } void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) { + SDNode *N = SU->getNode(); + + // TokenFactor operands are considered zero latency, and some schedulers + // (e.g. Top-Down list) may rely on the fact that operand latency is nonzero + // whenever node latency is nonzero. + if (N && N->getOpcode() == ISD::TokenFactor) { + SU->Latency = 0; + return; + } + // Check to see if the scheduler cares about latencies. if (ForceUnitLatencies()) { SU->Latency = 1; @@ -569,7 +538,6 @@ void ScheduleDAGSDNodes::ComputeLatency(SUnit *SU) { } if (!InstrItins || InstrItins->isEmpty()) { - SDNode *N = SU->getNode(); if (N && N->isMachineOpcode() && TII->isHighLatencyDef(N->getMachineOpcode())) SU->Latency = HighLatencyCycles; diff --git a/test/CodeGen/ARM/memcpy-inline.ll b/test/CodeGen/ARM/memcpy-inline.ll index e8a2a3b..5bae037 100644 --- a/test/CodeGen/ARM/memcpy-inline.ll +++ b/test/CodeGen/ARM/memcpy-inline.ll @@ -1,10 +1,8 @@ -; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=linearscan -disable-post-ra | FileCheck %s -; RUN: llc < %s -mtriple=arm-apple-darwin -regalloc=basic -disable-post-ra | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -regalloc=linearscan -disable-post-ra | FileCheck %s ; The ARM magic hinting works best with linear scan. -; CHECK: ldmia -; CHECK: stmia -; CHECK: ldrh +; CHECK: ldrd +; CHECK: strd ; CHECK: ldrb %struct.x = type { i8, i8, i8, i8, i8, i8, i8, i8, i8, i8, i8 } diff --git a/test/CodeGen/ARM/neon_div.ll b/test/CodeGen/ARM/neon_div.ll index e337970..de48fee 100644 --- a/test/CodeGen/ARM/neon_div.ll +++ b/test/CodeGen/ARM/neon_div.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=arm -mattr=+neon | FileCheck %s +; RUN: llc < %s -march=arm -mattr=+neon -pre-RA-sched=source | FileCheck %s define <8 x i8> @sdivi8(<8 x i8>* %A, <8 x i8>* %B) nounwind { ;CHECK: vrecpe.f32 diff --git a/test/CodeGen/ARM/va_arg.ll b/test/CodeGen/ARM/va_arg.ll index 7cb9762..bb40453 100644 --- a/test/CodeGen/ARM/va_arg.ll +++ b/test/CodeGen/ARM/va_arg.ll @@ -1,10 +1,10 @@ -; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi | FileCheck %s +; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -pre-RA-sched=source | FileCheck %s ; Test that we correctly align elements when using va_arg ; CHECK: test1: ; CHECK-NOT: bfc -; CHECK: add r0, r0, #7 -; CHECK: bfc r0, #0, #3 +; CHECK: add [[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7 +; CHECK: bfc [[REG]], #0, #3 ; CHECK-NOT: bfc define i64 @test1(i32 %i, ...) nounwind optsize { @@ -19,8 +19,8 @@ entry: ; CHECK: test2: ; CHECK-NOT: bfc -; CHECK: add r0, r0, #7 -; CHECK: bfc r0, #0, #3 +; CHECK: add [[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7 +; CHECK: bfc [[REG]], #0, #3 ; CHECK-NOT: bfc ; CHECK: bx lr diff --git a/test/CodeGen/ARM/vfp.ll b/test/CodeGen/ARM/vfp.ll index 390457f..49a6982 100644 --- a/test/CodeGen/ARM/vfp.ll +++ b/test/CodeGen/ARM/vfp.ll @@ -40,8 +40,8 @@ define void @test_add(float* %P, double* %D) { define void @test_ext_round(float* %P, double* %D) { ;CHECK: test_ext_round: %a = load float* %P ; <float> [#uses=1] -;CHECK: vcvt.f32.f64 ;CHECK: vcvt.f64.f32 +;CHECK: vcvt.f32.f64 %b = fpext float %a to double ; <double> [#uses=1] %A = load double* %D ; <double> [#uses=1] %B = fptrunc double %A to float ; <float> [#uses=1] diff --git a/test/CodeGen/Mips/o32_cc_vararg.ll b/test/CodeGen/Mips/o32_cc_vararg.ll index 6601d25..1f71ed2 100644 --- a/test/CodeGen/Mips/o32_cc_vararg.ll +++ b/test/CodeGen/Mips/o32_cc_vararg.ll @@ -1,12 +1,12 @@ -; RUN: llc -march=mipsel -mcpu=mips2 < %s | FileCheck %s -; RUN: llc -march=mipsel -mcpu=mips2 < %s -regalloc=basic | FileCheck %s +; RUN: llc -march=mipsel -mcpu=mips2 -pre-RA-sched=source < %s | FileCheck %s +; RUN: llc -march=mipsel -mcpu=mips2 -pre-RA-sched=source < %s -regalloc=basic | FileCheck %s ; All test functions do the same thing - they return the first variable ; argument. -; All CHECK's do the same thing - they check whether variable arguments from -; registers are placed on correct stack locations, and whether the first +; All CHECK's do the same thing - they check whether variable arguments from +; registers are placed on correct stack locations, and whether the first ; variable argument is returned from the correct stack location. @@ -31,14 +31,14 @@ entry: ; CHECK: va1: ; CHECK: addiu $sp, $sp, -32 -; CHECK: sw $5, 36($sp) -; CHECK: sw $6, 40($sp) ; CHECK: sw $7, 44($sp) +; CHECK: sw $6, 40($sp) +; CHECK: sw $5, 36($sp) ; CHECK: lw $2, 36($sp) } -; check whether the variable double argument will be accessed from the 8-byte -; aligned location (i.e. whether the address is computed by adding 7 and +; check whether the variable double argument will be accessed from the 8-byte +; aligned location (i.e. whether the address is computed by adding 7 and ; clearing lower 3 bits) define double @va2(i32 %a, ...) nounwind { entry: @@ -57,10 +57,10 @@ entry: ; CHECK: va2: ; CHECK: addiu $sp, $sp, -40 -; CHECK: addiu $[[R0:[0-9]+]], $sp, 44 -; CHECK: sw $5, 44($sp) -; CHECK: sw $6, 48($sp) ; CHECK: sw $7, 52($sp) +; CHECK: sw $6, 48($sp) +; CHECK: sw $5, 44($sp) +; CHECK: addiu $[[R0:[0-9]+]], $sp, 44 ; CHECK: addiu $[[R1:[0-9]+]], $[[R0]], 7 ; CHECK: addiu $[[R2:[0-9]+]], $zero, -8 ; CHECK: and $[[R3:[0-9]+]], $[[R1]], $[[R2]] @@ -85,8 +85,8 @@ entry: ; CHECK: va3: ; CHECK: addiu $sp, $sp, -40 -; CHECK: sw $6, 48($sp) ; CHECK: sw $7, 52($sp) +; CHECK: sw $6, 48($sp) ; CHECK: lw $2, 48($sp) } @@ -108,8 +108,8 @@ entry: ; CHECK: va4: ; CHECK: addiu $sp, $sp, -48 -; CHECK: sw $6, 56($sp) ; CHECK: sw $7, 60($sp) +; CHECK: sw $6, 56($sp) ; CHECK: addiu $[[R0:[0-9]+]], $sp, 56 ; CHECK: addiu $[[R1:[0-9]+]], $[[R0]], 7 ; CHECK: addiu $[[R2:[0-9]+]], $zero, -8 diff --git a/test/CodeGen/Thumb2/thumb2-uxtb.ll b/test/CodeGen/Thumb2/thumb2-uxtb.ll index 2074f98..35914b1 100644 --- a/test/CodeGen/Thumb2/thumb2-uxtb.ll +++ b/test/CodeGen/Thumb2/thumb2-uxtb.ll @@ -128,9 +128,9 @@ define i32 @test10(i32 %p0) { ; ARMv7M: test10 ; ARMv7M: mov.w r1, #16253176 +; ARMv7M: mov.w r2, #458759 ; ARMv7M: and.w r0, r1, r0, lsr #7 -; ARMv7M: mov.w r1, #458759 -; ARMv7M: and.w r1, r1, r0, lsr #5 +; ARMv7M: and.w r1, r2, r0, lsr #5 ; ARMv7M: orrs r0, r1 %tmp1 = lshr i32 %p0, 7 ; <i32> [#uses=1] %tmp2 = and i32 %tmp1, 16253176 ; <i32> [#uses=2] |