aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--lib/CodeGen/MachineBlockPlacement.cpp136
-rw-r--r--test/CodeGen/X86/block-placement.ll129
2 files changed, 196 insertions, 69 deletions
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 22d7212..6226237 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -102,13 +102,13 @@ public:
}
/// \brief Iterator over blocks within the chain.
- typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator iterator;
+ typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
/// \brief Beginning of blocks within the chain.
- iterator begin() const { return Blocks.begin(); }
+ iterator begin() { return Blocks.begin(); }
/// \brief End of blocks within the chain.
- iterator end() const { return Blocks.end(); }
+ iterator end() { return Blocks.end(); }
/// \brief Merge a block chain into this one.
///
@@ -211,12 +211,11 @@ class MachineBlockPlacement : public MachineFunctionPass {
void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
const BlockFilterSet *BlockFilter = 0);
- MachineBasicBlock *findBestLoopTop(MachineFunction &F,
- MachineLoop &L,
- const BlockFilterSet &LoopBlockSet);
+ MachineBasicBlock *findBestLoopExit(MachineFunction &F,
+ MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet);
void buildLoopChains(MachineFunction &F, MachineLoop &L);
void buildCFGChains(MachineFunction &F);
- void AlignLoops(MachineFunction &F);
public:
static char ID; // Pass identification, replacement for typeid
@@ -544,9 +543,9 @@ void MachineBlockPlacement::buildChain(
/// block to layout at the top of the loop. Typically this is done to maximize
/// fallthrough opportunities.
MachineBasicBlock *
-MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
- MachineLoop &L,
- const BlockFilterSet &LoopBlockSet) {
+MachineBlockPlacement::findBestLoopExit(MachineFunction &F,
+ MachineLoop &L,
+ const BlockFilterSet &LoopBlockSet) {
// We don't want to layout the loop linearly in all cases. If the loop header
// is just a normal basic block in the loop, we want to look for what block
// within the loop is the best one to layout at the top. However, if the loop
@@ -557,11 +556,11 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
// header and only rotate if safe.
BlockChain &HeaderChain = *BlockToChain[L.getHeader()];
if (!LoopBlockSet.count(*HeaderChain.begin()))
- return L.getHeader();
+ return 0;
BlockFrequency BestExitEdgeFreq;
+ unsigned BestExitLoopDepth = 0;
MachineBasicBlock *ExitingBB = 0;
- MachineBasicBlock *LoopingBB = 0;
// If there are exits to outer loops, loop rotation can severely limit
// fallthrough opportunites unless it selects such an exit. Keep a set of
// blocks where rotating to exit with that block will reach an outer loop.
@@ -584,15 +583,10 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
// successor isn't found.
MachineBasicBlock *OldExitingBB = ExitingBB;
BlockFrequency OldBestExitEdgeFreq = BestExitEdgeFreq;
- // We also compute and store the best looping successor for use in layout.
- MachineBasicBlock *BestLoopSucc = 0;
+ bool HasLoopingSucc = false;
// FIXME: Due to the performance of the probability and weight routines in
- // the MBPI analysis, we use the internal weights. This is only valid
- // because it is purely a ranking function, we don't care about anything
- // but the relative values.
- uint32_t BestLoopSuccWeight = 0;
- // FIXME: We also manually compute the probabilities to avoid quadratic
- // behavior.
+ // the MBPI analysis, we use the internal weights and manually compute the
+ // probabilities to avoid quadratic behavior.
uint32_t WeightScale = 0;
uint32_t SumWeight = MBPI->getSumForBlock(*I, WeightScale);
for (MachineBasicBlock::succ_iterator SI = (*I)->succ_begin(),
@@ -604,10 +598,8 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
continue;
BlockChain &SuccChain = *BlockToChain[*SI];
// Don't split chains, either this chain or the successor's chain.
- if (&Chain == &SuccChain || *SI != *SuccChain.begin()) {
- DEBUG(dbgs() << " " << (LoopBlockSet.count(*SI) ? "looping: "
- : "exiting: ")
- << getBlockName(*I) << " -> "
+ if (&Chain == &SuccChain) {
+ DEBUG(dbgs() << " exiting: " << getBlockName(*I) << " -> "
<< getBlockName(*SI) << " (chain conflict)\n");
continue;
}
@@ -616,60 +608,55 @@ MachineBlockPlacement::findBestLoopTop(MachineFunction &F,
if (LoopBlockSet.count(*SI)) {
DEBUG(dbgs() << " looping: " << getBlockName(*I) << " -> "
<< getBlockName(*SI) << " (" << SuccWeight << ")\n");
- if (BestLoopSucc && BestLoopSuccWeight >= SuccWeight)
- continue;
-
- BestLoopSucc = *SI;
- BestLoopSuccWeight = SuccWeight;
+ HasLoopingSucc = true;
continue;
}
+ unsigned SuccLoopDepth = 0;
+ if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI)) {
+ SuccLoopDepth = ExitLoop->getLoopDepth();
+ if (ExitLoop->contains(&L))
+ BlocksExitingToOuterLoop.insert(*I);
+ }
+
BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight);
BlockFrequency ExitEdgeFreq = MBFI->getBlockFreq(*I) * SuccProb;
DEBUG(dbgs() << " exiting: " << getBlockName(*I) << " -> "
- << getBlockName(*SI) << " (" << ExitEdgeFreq << ")\n");
+ << getBlockName(*SI) << " [L:" << SuccLoopDepth
+ << "] (" << ExitEdgeFreq << ")\n");
// Note that we slightly bias this toward an existing layout successor to
// retain incoming order in the absence of better information.
// FIXME: Should we bias this more strongly? It's pretty weak.
- if (!ExitingBB || ExitEdgeFreq > BestExitEdgeFreq ||
+ if (!ExitingBB || BestExitLoopDepth < SuccLoopDepth ||
+ ExitEdgeFreq > BestExitEdgeFreq ||
((*I)->isLayoutSuccessor(*SI) &&
!(ExitEdgeFreq < BestExitEdgeFreq))) {
BestExitEdgeFreq = ExitEdgeFreq;
ExitingBB = *I;
}
-
- if (MachineLoop *ExitLoop = MLI->getLoopFor(*SI))
- if (ExitLoop->contains(&L))
- BlocksExitingToOuterLoop.insert(*I);
}
// Restore the old exiting state, no viable looping successor was found.
- if (!BestLoopSucc) {
+ if (!HasLoopingSucc) {
ExitingBB = OldExitingBB;
BestExitEdgeFreq = OldBestExitEdgeFreq;
continue;
}
-
- // If this was best exiting block thus far, also record the looping block.
- if (ExitingBB == *I)
- LoopingBB = BestLoopSucc;
}
- // Without a candidate exitting block or with only a single block in the
+ // Without a candidate exiting block or with only a single block in the
// loop, just use the loop header to layout the loop.
if (!ExitingBB || L.getNumBlocks() == 1)
- return L.getHeader();
+ return 0;
// Also, if we have exit blocks which lead to outer loops but didn't select
// one of them as the exiting block we are rotating toward, disable loop
// rotation altogether.
if (!BlocksExitingToOuterLoop.empty() &&
!BlocksExitingToOuterLoop.count(ExitingBB))
- return L.getHeader();
+ return 0;
- assert(LoopingBB && "All successors of a loop block are exit blocks!");
DEBUG(dbgs() << " Best exiting block: " << getBlockName(ExitingBB) << "\n");
- DEBUG(dbgs() << " Best top block: " << getBlockName(LoopingBB) << "\n");
- return LoopingBB;
+ return ExitingBB;
}
/// \brief Forms basic block chains from the natural loop structures.
@@ -688,8 +675,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
SmallVector<MachineBasicBlock *, 16> BlockWorkList;
BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
- MachineBasicBlock *LayoutTop = findBestLoopTop(F, L, LoopBlockSet);
- BlockChain &LoopChain = *BlockToChain[LayoutTop];
+ MachineBasicBlock *ExitingBB = findBestLoopExit(F, L, LoopBlockSet);
+ BlockChain &LoopChain = *BlockToChain[L.getHeader()];
// FIXME: This is a really lame way of walking the chains in the loop: we
// walk the blocks, and use a set to prevent visiting a particular chain
@@ -721,7 +708,18 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
BlockWorkList.push_back(*Chain.begin());
}
- buildChain(LayoutTop, LoopChain, BlockWorkList, &LoopBlockSet);
+ buildChain(L.getHeader(), LoopChain, BlockWorkList, &LoopBlockSet);
+
+ // Once we have built a chain, try to rotate it to line up the hot exit block
+ // with fallthrough out of the loop (if we have a valid exit block for that).
+ if (ExitingBB) {
+ BlockChain::iterator ExitIt = std::find(LoopChain.begin(), LoopChain.end(),
+ ExitingBB);
+
+ if (ExitIt != LoopChain.end()) {
+ std::rotate(LoopChain.begin(), llvm::next(ExitIt), LoopChain.end());
+ }
+ }
DEBUG({
// Crash at the end so we get all of the debugging output first.
@@ -733,7 +731,8 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n";
}
for (BlockChain::iterator BCI = LoopChain.begin(), BCE = LoopChain.end();
- BCI != BCE; ++BCI)
+ BCI != BCE; ++BCI) {
+ dbgs() << " ... " << getBlockName(*BCI) << "\n";
if (!LoopBlockSet.erase(*BCI)) {
// We don't mark the loop as bad here because there are real situations
// where this can occur. For example, with an unanalyzable fallthrough
@@ -743,6 +742,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F,
<< " Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
<< " Bad block: " << getBlockName(*BCI) << "\n";
}
+ }
if (!LoopBlockSet.empty()) {
BadLoop = true;
@@ -882,28 +882,33 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) {
MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch.
if (!TII->AnalyzeBranch(F.back(), TBB, FBB, Cond))
F.back().updateTerminator();
-}
-
-/// \brief Recursive helper to align a loop and any nested loops.
-static void AlignLoop(MachineFunction &F, MachineLoop *L, unsigned Align) {
- // Recurse through nested loops.
- for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I)
- AlignLoop(F, *I, Align);
- L->getTopBlock()->setAlignment(Align);
-}
-
-/// \brief Align loop headers to target preferred alignments.
-void MachineBlockPlacement::AlignLoops(MachineFunction &F) {
+ // Walk through the backedges of the function now that we have fully laid out
+ // the basic blocks and align the destination of each backedge. We don't rely
+ // on the loop info here so that we can align backedges in unnatural CFGs and
+ // backedges that were introduced purely because of the loop rotations done
+ // during this layout pass.
+ // FIXME: This isn't quite right, we shouldn't align backedges that result
+ // from blocks being sunken below the exit block for the function.
if (F.getFunction()->hasFnAttr(Attribute::OptimizeForSize))
return;
-
unsigned Align = TLI->getPrefLoopAlignment();
if (!Align)
return; // Don't care about loop alignment.
- for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); I != E; ++I)
- AlignLoop(F, *I, Align);
+ SmallPtrSet<MachineBasicBlock *, 16> PreviousBlocks;
+ for (BlockChain::iterator BI = FunctionChain.begin(),
+ BE = FunctionChain.end();
+ BI != BE; ++BI) {
+ PreviousBlocks.insert(*BI);
+ // Set alignment on the destination of all the back edges in the new
+ // ordering.
+ for (MachineBasicBlock::succ_iterator SI = (*BI)->succ_begin(),
+ SE = (*BI)->succ_end();
+ SI != SE; ++SI)
+ if (PreviousBlocks.count(*SI))
+ (*SI)->setAlignment(Align);
+ }
}
bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {
@@ -919,7 +924,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &F) {
assert(BlockToChain.empty());
buildCFGChains(F);
- AlignLoops(F);
BlockToChain.clear();
ChainAllocator.DestroyAll();
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index 167d522..f3c9727 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -76,11 +76,11 @@ define i32 @test_loop_cold_blocks(i32 %i, i32* %a) {
; Check that we sink cold loop blocks after the hot loop body.
; CHECK: test_loop_cold_blocks:
; CHECK: %entry
+; CHECK: %unlikely1
+; CHECK: %unlikely2
; CHECK: %body1
; CHECK: %body2
; CHECK: %body3
-; CHECK: %unlikely1
-; CHECK: %unlikely2
; CHECK: %exit
entry:
@@ -348,7 +348,6 @@ define void @unnatural_cfg2() {
; CHECK: %entry
; CHECK: %loop.body1
; CHECK: %loop.body2
-; CHECK: %loop.header
; CHECK: %loop.body3
; CHECK: %loop.inner1.begin
; The end block is folded with %loop.body3...
@@ -356,6 +355,7 @@ define void @unnatural_cfg2() {
; CHECK: %loop.body4
; CHECK: %loop.inner2.begin
; The loop.inner2.end block is folded
+; CHECK: %loop.header
; CHECK: %bail
entry:
@@ -928,3 +928,126 @@ entry:
exit:
ret void
}
+
+define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
+; This test case comes from the heapsort benchmark, and exemplifies several
+; important aspects to block placement in the presence of loops:
+; 1) Loop rotation needs to *ensure* that the desired exiting edge can be
+; a fallthrough.
+; 2) The exiting edge from the loop which is rotated to be laid out at the
+; bottom of the loop needs to be exiting into the nearest enclosing loop (to
+; which there is an exit). Otherwise, we force that enclosing loop into
+; strange layouts that are siginificantly less efficient, often times maing
+; it discontiguous.
+;
+; CHECK: @benchmark_heapsort
+; CHECK: %entry
+; First rotated loop top.
+; CHECK: .align
+; CHECK: %if.end10
+; Second rotated loop top
+; CHECK: .align
+; CHECK: %while.cond.outer
+; Third rotated loop top
+; CHECK: .align
+; CHECK: %while.cond
+; CHECK: %while.body
+; CHECK: %land.lhs.true
+; CHECK: %if.then19
+; CHECK: %if.then19
+; CHECK: %if.then24
+; CHECK: %while.end
+; CHECK: %for.cond
+; CHECK: %if.then
+; CHECK: %if.else
+; CHECK: %if.then8
+; CHECK: ret
+
+entry:
+ %shr = ashr i32 %n, 1
+ %add = add nsw i32 %shr, 1
+ %arrayidx3 = getelementptr inbounds double* %ra, i64 1
+ br label %for.cond
+
+for.cond:
+ %ir.0 = phi i32 [ %n, %entry ], [ %ir.1, %while.end ]
+ %l.0 = phi i32 [ %add, %entry ], [ %l.1, %while.end ]
+ %cmp = icmp sgt i32 %l.0, 1
+ br i1 %cmp, label %if.then, label %if.else
+
+if.then:
+ %dec = add nsw i32 %l.0, -1
+ %idxprom = sext i32 %dec to i64
+ %arrayidx = getelementptr inbounds double* %ra, i64 %idxprom
+ %0 = load double* %arrayidx, align 8
+ br label %if.end10
+
+if.else:
+ %idxprom1 = sext i32 %ir.0 to i64
+ %arrayidx2 = getelementptr inbounds double* %ra, i64 %idxprom1
+ %1 = load double* %arrayidx2, align 8
+ %2 = load double* %arrayidx3, align 8
+ store double %2, double* %arrayidx2, align 8
+ %dec6 = add nsw i32 %ir.0, -1
+ %cmp7 = icmp eq i32 %dec6, 1
+ br i1 %cmp7, label %if.then8, label %if.end10
+
+if.then8:
+ store double %1, double* %arrayidx3, align 8
+ ret void
+
+if.end10:
+ %ir.1 = phi i32 [ %ir.0, %if.then ], [ %dec6, %if.else ]
+ %l.1 = phi i32 [ %dec, %if.then ], [ %l.0, %if.else ]
+ %rra.0 = phi double [ %0, %if.then ], [ %1, %if.else ]
+ %add31 = add nsw i32 %ir.1, 1
+ br label %while.cond.outer
+
+while.cond.outer:
+ %j.0.ph.in = phi i32 [ %l.1, %if.end10 ], [ %j.1, %if.then24 ]
+ %j.0.ph = shl i32 %j.0.ph.in, 1
+ br label %while.cond
+
+while.cond:
+ %j.0 = phi i32 [ %add31, %if.end20 ], [ %j.0.ph, %while.cond.outer ]
+ %cmp11 = icmp sgt i32 %j.0, %ir.1
+ br i1 %cmp11, label %while.end, label %while.body
+
+while.body:
+ %cmp12 = icmp slt i32 %j.0, %ir.1
+ br i1 %cmp12, label %land.lhs.true, label %if.end20
+
+land.lhs.true:
+ %idxprom13 = sext i32 %j.0 to i64
+ %arrayidx14 = getelementptr inbounds double* %ra, i64 %idxprom13
+ %3 = load double* %arrayidx14, align 8
+ %add15 = add nsw i32 %j.0, 1
+ %idxprom16 = sext i32 %add15 to i64
+ %arrayidx17 = getelementptr inbounds double* %ra, i64 %idxprom16
+ %4 = load double* %arrayidx17, align 8
+ %cmp18 = fcmp olt double %3, %4
+ br i1 %cmp18, label %if.then19, label %if.end20
+
+if.then19:
+ br label %if.end20
+
+if.end20:
+ %j.1 = phi i32 [ %add15, %if.then19 ], [ %j.0, %land.lhs.true ], [ %j.0, %while.body ]
+ %idxprom21 = sext i32 %j.1 to i64
+ %arrayidx22 = getelementptr inbounds double* %ra, i64 %idxprom21
+ %5 = load double* %arrayidx22, align 8
+ %cmp23 = fcmp olt double %rra.0, %5
+ br i1 %cmp23, label %if.then24, label %while.cond
+
+if.then24:
+ %idxprom27 = sext i32 %j.0.ph.in to i64
+ %arrayidx28 = getelementptr inbounds double* %ra, i64 %idxprom27
+ store double %5, double* %arrayidx28, align 8
+ br label %while.cond.outer
+
+while.end:
+ %idxprom33 = sext i32 %j.0.ph.in to i64
+ %arrayidx34 = getelementptr inbounds double* %ra, i64 %idxprom33
+ store double %rra.0, double* %arrayidx34, align 8
+ br label %for.cond
+}