From 57e6b2d1f3de0bf459e96f7038e692d624f7e580 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Sat, 27 Jul 2013 00:01:07 +0000 Subject: SimplifyCFG: Use parallel-and and parallel-or mode to consolidate branch conditions Merge consecutive if-regions if they contain identical statements. Both transformations reduce number of branches. The transformation is guarded by a target-hook, and is currently enabled only for +R600, but the correctness has been tested on X86 target using a variety of CPU benchmarks. Patch by: Mei Ye git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187278 91177308-0d34-0410-b5e6-96231b3b80d8 --- include/llvm/Analysis/TargetTransformInfo.h | 6 + include/llvm/InitializePasses.h | 3 +- include/llvm/Transforms/Scalar.h | 2 +- include/llvm/Transforms/Utils/Local.h | 3 +- lib/Analysis/TargetTransformInfo.cpp | 6 + lib/CodeGen/BasicTargetTransformInfo.cpp | 3 + lib/Target/R600/AMDGPU.h | 4 + lib/Target/R600/AMDGPUTargetMachine.cpp | 12 + lib/Target/R600/AMDGPUTargetMachine.h | 3 + lib/Target/R600/AMDGPUTargetTransformInfo.cpp | 90 ++++++ lib/Target/R600/CMakeLists.txt | 1 + lib/Transforms/IPO/PassManagerBuilder.cpp | 4 +- lib/Transforms/Scalar/Scalar.cpp | 3 +- lib/Transforms/Scalar/SimplifyCFGPass.cpp | 81 +++-- lib/Transforms/Utils/SimplifyCFG.cpp | 443 +++++++++++++++++++++++++- tools/lto/LTOCodeGenerator.cpp | 2 +- tools/opt/opt.cpp | 3 + 17 files changed, 629 insertions(+), 40 deletions(-) create mode 100644 lib/Target/R600/AMDGPUTargetTransformInfo.cpp diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index b8a44b5..21a3a12 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -171,6 +171,12 @@ public: /// comments for a detailed explanation of the cost values. virtual unsigned getUserCost(const User *U) const; + /// \brief hasBranchDivergence - Return true if branch divergence exists. + /// Branch divergence has a significantly negative impact on GPU performance + /// when threads in the same wavefront take different paths due to conditional + /// branches. + virtual bool hasBranchDivergence() const; + /// \brief Test whether calls to a function lower to actual program function /// calls. /// diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 86fd851..d49636d 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -86,7 +86,8 @@ void initializeCallGraphViewerPass(PassRegistry&); void initializeCFGOnlyPrinterPass(PassRegistry&); void initializeCFGOnlyViewerPass(PassRegistry&); void initializeCFGPrinterPass(PassRegistry&); -void initializeCFGSimplifyPassPass(PassRegistry&); +void initializeCFGOptimizePass(PassRegistry&); +void initializeCFGCanonicalizePass(PassRegistry&); void initializeStructurizeCFGPass(PassRegistry&); void initializeCFGViewerPass(PassRegistry&); void initializeCalculateSpillWeightsPass(PassRegistry&); diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h index a327dff..b52c327 100644 --- a/include/llvm/Transforms/Scalar.h +++ b/include/llvm/Transforms/Scalar.h @@ -196,7 +196,7 @@ FunctionPass *createJumpThreadingPass(); // CFGSimplification - Merge basic blocks, eliminate unreachable blocks, // simplify terminator instructions, etc... // -FunctionPass *createCFGSimplificationPass(); +FunctionPass *createCFGSimplificationPass(bool IsTargetAware = false); //===----------------------------------------------------------------------===// // diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h index 2678250..ec2ab2a 100644 --- a/include/llvm/Transforms/Utils/Local.h +++ b/include/llvm/Transforms/Utils/Local.h @@ -39,6 +39,7 @@ class DataLayout; class TargetLibraryInfo; class TargetTransformInfo; class DIBuilder; +class AliasAnalysis; template class SmallVectorImpl; @@ -136,7 +137,7 @@ bool EliminateDuplicatePHINodes(BasicBlock *BB); /// the basic block that was pointed to. /// bool SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, - const DataLayout *TD = 0); + const DataLayout *TD = 0, AliasAnalysis *AA = 0); /// FoldBranchToCommonDest - If this basic block is ONLY a setcc and a branch, /// and if a predecessor branches to us and one of our successors, fold the diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 4b2bf3c..4ad7162 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -88,6 +88,10 @@ unsigned TargetTransformInfo::getUserCost(const User *U) const { return PrevTTI->getUserCost(U); } +bool TargetTransformInfo::hasBranchDivergence() const { + return PrevTTI->hasBranchDivergence(); +} + bool TargetTransformInfo::isLoweredToCall(const Function *F) const { return PrevTTI->isLoweredToCall(F); } @@ -420,6 +424,8 @@ struct NoTTI : ImmutablePass, TargetTransformInfo { U->getOperand(0)->getType() : 0); } + bool hasBranchDivergence() const { return false; } + bool isLoweredToCall(const Function *F) const { // FIXME: These should almost certainly not be handled here, and instead // handled with the help of TLI or the target itself. This was largely diff --git a/lib/CodeGen/BasicTargetTransformInfo.cpp b/lib/CodeGen/BasicTargetTransformInfo.cpp index 3755320..cb12a40 100644 --- a/lib/CodeGen/BasicTargetTransformInfo.cpp +++ b/lib/CodeGen/BasicTargetTransformInfo.cpp @@ -65,6 +65,8 @@ public: return this; } + virtual bool hasBranchDivergence() const; + /// \name Scalar TTI Implementations /// @{ @@ -124,6 +126,7 @@ llvm::createBasicTargetTransformInfoPass(const TargetMachine *TM) { return new BasicTTI(TM); } +bool BasicTTI::hasBranchDivergence() const { return false; } bool BasicTTI::isLegalAddImmediate(int64_t imm) const { return getTLI()->isLegalAddImmediate(imm); diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h index fbf1fce..51d0d3c 100644 --- a/lib/Target/R600/AMDGPU.h +++ b/lib/Target/R600/AMDGPU.h @@ -45,6 +45,10 @@ FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm); FunctionPass *createAMDGPUIndirectAddressingPass(TargetMachine &tm); FunctionPass *createAMDGPUISelDag(TargetMachine &tm); +/// \brief Creates an AMDGPU-specific Target Transformation Info pass. +ImmutablePass * +createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM); + extern Target TheAMDGPUTarget; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp index 1dc1b6b..33e2dae 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.cpp +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp @@ -105,6 +105,18 @@ TargetPassConfig *AMDGPUTargetMachine::createPassConfig(PassManagerBase &PM) { return new AMDGPUPassConfig(this, PM); } +//===----------------------------------------------------------------------===// +// AMDGPU Analysis Pass Setup +//===----------------------------------------------------------------------===// + +void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) { + // Add first the target-independent BasicTTI pass, then our AMDGPU pass. This + // allows the AMDGPU pass to delegate to the target independent layer when + // appropriate. + PM.add(createBasicTargetTransformInfoPass(this)); + PM.add(createAMDGPUTargetTransformInfoPass(this)); +} + bool AMDGPUPassConfig::addPreISel() { const AMDGPUSubtarget &ST = TM->getSubtarget(); diff --git a/lib/Target/R600/AMDGPUTargetMachine.h b/lib/Target/R600/AMDGPUTargetMachine.h index 26e95d3..f942614 100644 --- a/lib/Target/R600/AMDGPUTargetMachine.h +++ b/lib/Target/R600/AMDGPUTargetMachine.h @@ -61,6 +61,9 @@ public: } virtual const DataLayout *getDataLayout() const { return &Layout; } virtual TargetPassConfig *createPassConfig(PassManagerBase &PM); + + /// \brief Register R600 analysis passes with a pass manager. + virtual void addAnalysisPasses(PassManagerBase &PM); }; } // End namespace llvm diff --git a/lib/Target/R600/AMDGPUTargetTransformInfo.cpp b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp new file mode 100644 index 0000000..8db319c --- /dev/null +++ b/lib/Target/R600/AMDGPUTargetTransformInfo.cpp @@ -0,0 +1,90 @@ +//===-- AMDGPUTargetTransformInfo.cpp - AMDGPU specific TTI pass ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// \file +// This file implements a TargetTransformInfo analysis pass specific to the +// AMDGPU target machine. It uses the target's detailed information to provide +// more precise answers to certain TTI queries, while letting the target +// independent and default TTI implementations handle the rest. +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "AMDGPUtti" +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/CostTable.h" +using namespace llvm; + +// Declare the pass initialization routine locally as target-specific passes +// don't have a target-wide initialization entry point, and so we rely on the +// pass constructor initialization. +namespace llvm { +void initializeAMDGPUTTIPass(PassRegistry &); +} + +namespace { + +class AMDGPUTTI : public ImmutablePass, public TargetTransformInfo { + const AMDGPUTargetMachine *TM; + const AMDGPUSubtarget *ST; + const AMDGPUTargetLowering *TLI; + + /// Estimate the overhead of scalarizing an instruction. Insert and Extract + /// are set if the result needs to be inserted and/or extracted from vectors. + unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const; + +public: + AMDGPUTTI() : ImmutablePass(ID), TM(0), ST(0), TLI(0) { + llvm_unreachable("This pass cannot be directly constructed"); + } + + AMDGPUTTI(const AMDGPUTargetMachine *TM) + : ImmutablePass(ID), TM(TM), ST(TM->getSubtargetImpl()), + TLI(TM->getTargetLowering()) { + initializeAMDGPUTTIPass(*PassRegistry::getPassRegistry()); + } + + virtual void initializePass() { pushTTIStack(this); } + + virtual void finalizePass() { popTTIStack(); } + + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + TargetTransformInfo::getAnalysisUsage(AU); + } + + /// Pass identification. + static char ID; + + /// Provide necessary pointer adjustments for the two base classes. + virtual void *getAdjustedAnalysisPointer(const void *ID) { + if (ID == &TargetTransformInfo::ID) + return (TargetTransformInfo *)this; + return this; + } + + virtual bool hasBranchDivergence() const; + + /// @} +}; + +} // end anonymous namespace + +INITIALIZE_AG_PASS(AMDGPUTTI, TargetTransformInfo, "AMDGPUtti", + "AMDGPU Target Transform Info", true, true, false) +char AMDGPUTTI::ID = 0; + +ImmutablePass * +llvm::createAMDGPUTargetTransformInfoPass(const AMDGPUTargetMachine *TM) { + return new AMDGPUTTI(TM); +} + +bool AMDGPUTTI::hasBranchDivergence() const { return true; } diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt index 4f8665b..40d255a 100644 --- a/lib/Target/R600/CMakeLists.txt +++ b/lib/Target/R600/CMakeLists.txt @@ -23,6 +23,7 @@ add_llvm_target(R600CodeGen AMDGPUMachineFunction.cpp AMDGPUSubtarget.cpp AMDGPUTargetMachine.cpp + AMDGPUTargetTransformInfo.cpp AMDGPUISelLowering.cpp AMDGPUConvertToISA.cpp AMDGPUInstrInfo.cpp diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp index 1917cc8..6355204 100644 --- a/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -235,7 +235,7 @@ void PassManagerBuilder::populateModulePassManager(PassManagerBase &MPM) { } MPM.add(createAggressiveDCEPass()); // Delete dead instructions - MPM.add(createCFGSimplificationPass()); // Merge & remove BBs + MPM.add(createCFGSimplificationPass(true)); // Merge & remove BBs MPM.add(createInstructionCombiningPass()); // Clean up after everything. // As an experimental mode, run any vectorization passes in a separate @@ -371,7 +371,7 @@ void PassManagerBuilder::populateLTOPassManager(PassManagerBase &PM, PM.add(createJumpThreadingPass()); // Delete basic blocks, which optimization passes may have killed. - PM.add(createCFGSimplificationPass()); + PM.add(createCFGSimplificationPass(true)); // Now that we have optimized the program, discard unreachable functions. PM.add(createGlobalDCEPass()); diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp index 758334d..18cdcfe 100644 --- a/lib/Transforms/Scalar/Scalar.cpp +++ b/lib/Transforms/Scalar/Scalar.cpp @@ -57,7 +57,8 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) { initializeSROAPass(Registry); initializeSROA_DTPass(Registry); initializeSROA_SSAUpPass(Registry); - initializeCFGSimplifyPassPass(Registry); + initializeCFGCanonicalizePass(Registry); + initializeCFGOptimizePass(Registry); initializeStructurizeCFGPass(Registry); initializeSinkingPass(Registry); initializeTailCallElimPass(Registry); diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp index c243d34..9ab5f45 100644 --- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp +++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp @@ -27,6 +27,7 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -41,30 +42,62 @@ using namespace llvm; STATISTIC(NumSimpl, "Number of blocks simplified"); namespace { - struct CFGSimplifyPass : public FunctionPass { - static char ID; // Pass identification, replacement for typeid - CFGSimplifyPass() : FunctionPass(ID) { - initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry()); - } - - virtual bool runOnFunction(Function &F); +struct CFGSimplifyPass : public FunctionPass { + CFGSimplifyPass(char &ID, bool isTargetAware) + : FunctionPass(ID), IsTargetAware(isTargetAware) {} + virtual bool runOnFunction(Function &F); - virtual void getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); - } - }; + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + } +private: + AliasAnalysis *AA; + bool IsTargetAware; // Should the pass be target-aware? +}; + +// CFGSimplifyPass that does optimizations. +struct CFGOptimize : public CFGSimplifyPass { + static char ID; // Pass identification, replacement for typeid +public: + CFGOptimize() : CFGSimplifyPass(ID, true) { + initializeCFGOptimizePass(*PassRegistry::getPassRegistry()); + } + virtual void getAnalysisUsage(AnalysisUsage &AU) const { + AU.addRequired(); + AU.addRequired(); + } +}; + +// CFGSimplifyPass that does canonicalizations. +struct CFGCanonicalize : public CFGSimplifyPass { + static char ID; // Pass identification, replacement for typeid +public: + CFGCanonicalize() : CFGSimplifyPass(ID, false) { + initializeCFGCanonicalizePass(*PassRegistry::getPassRegistry()); + } +}; } -char CFGSimplifyPass::ID = 0; -INITIALIZE_PASS_BEGIN(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", - false, false) +char CFGCanonicalize::ID = 0; +char CFGOptimize::ID = 0; +INITIALIZE_PASS_BEGIN(CFGCanonicalize, "simplifycfg", "Simplify the CFG", false, + false) +INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) +INITIALIZE_PASS_END(CFGCanonicalize, "simplifycfg", "Simplify the CFG", false, + false) +INITIALIZE_PASS_BEGIN(CFGOptimize, "optimizecfg", "optimize the CFG", false, + false) INITIALIZE_AG_DEPENDENCY(TargetTransformInfo) -INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", - false, false) +INITIALIZE_AG_DEPENDENCY(AliasAnalysis) +INITIALIZE_PASS_END(CFGOptimize, "optimizecfg", "Optimize the CFG", false, + false) // Public interface to the CFGSimplification pass -FunctionPass *llvm::createCFGSimplificationPass() { - return new CFGSimplifyPass(); +FunctionPass *llvm::createCFGSimplificationPass(bool IsTargetAware) { + if (IsTargetAware) + return new CFGOptimize(); + else + return new CFGCanonicalize(); } /// changeToUnreachable - Insert an unreachable instruction before the specified @@ -301,7 +334,7 @@ static bool mergeEmptyReturnBlocks(Function &F) { /// iterativelySimplifyCFG - Call SimplifyCFG on all the blocks in the function, /// iterating until no more changes are made. static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, - const DataLayout *TD) { + const DataLayout *TD, AliasAnalysis *AA) { bool Changed = false; bool LocalChange = true; while (LocalChange) { @@ -310,7 +343,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // Loop over all of the basic blocks and remove them if they are unneeded... // for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) { - if (SimplifyCFG(BBIt++, TTI, TD)) { + if (SimplifyCFG(BBIt++, TTI, TD, AA)) { LocalChange = true; ++NumSimpl; } @@ -324,11 +357,15 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI, // simplify the CFG. // bool CFGSimplifyPass::runOnFunction(Function &F) { + if (IsTargetAware) + AA = &getAnalysis(); + else + AA = NULL; const TargetTransformInfo &TTI = getAnalysis(); const DataLayout *TD = getAnalysisIfAvailable(); bool EverChanged = removeUnreachableBlocksFromFn(F); EverChanged |= mergeEmptyReturnBlocks(F); - EverChanged |= iterativelySimplifyCFG(F, TTI, TD); + EverChanged |= iterativelySimplifyCFG(F, TTI, TD, AA); // If neither pass changed anything, we're done. if (!EverChanged) return false; @@ -342,7 +379,7 @@ bool CFGSimplifyPass::runOnFunction(Function &F) { return true; do { - EverChanged = iterativelySimplifyCFG(F, TTI, TD); + EverChanged = iterativelySimplifyCFG(F, TTI, TD, AA); EverChanged |= removeUnreachableBlocksFromFn(F); } while (EverChanged); diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp index eaeb19f..084b74c 100644 --- a/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/lib/Transforms/Utils/SimplifyCFG.cpp @@ -19,6 +19,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" @@ -65,6 +66,10 @@ static cl::opt HoistCondStores("simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true), cl::desc("Hoist conditional stores if an unconditional store preceeds")); +static cl::opt +ParallelAndOr("simplifycfg-parallel-and-or", cl::Hidden, cl::init(true), + cl::desc("Use parallel-and-or mode for branch conditions")); + STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps"); STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables"); STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block"); @@ -90,6 +95,7 @@ namespace { class SimplifyCFGOpt { const TargetTransformInfo &TTI; const DataLayout *const TD; + AliasAnalysis *AA; Value *isValueEqualityComparison(TerminatorInst *TI); BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI, @@ -107,10 +113,25 @@ class SimplifyCFGOpt { bool SimplifyIndirectBr(IndirectBrInst *IBI); bool SimplifyUncondBranch(BranchInst *BI, IRBuilder <> &Builder); bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder); + /// \brief Use parallel-and or parallel-or to generate conditions for + /// conditional branches. + bool SimplifyParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0); + /// \brief If \param BB is the merge block of an if-region, attempt to merge + /// the if-region with an adjacent if-region upstream if two if-regions + /// contain identical instructions. + bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, Pass *P = 0); + /// \brief Compare a pair of blocks: \param Block1 and \param Block2, which + /// are from two if-regions whose entry blocks are \param Head1 and \param + /// Head2. \returns true if \param Block1 and \param Block2 contain identical + /// instructions, and have no memory reference alias with \param Head2. + /// This is used as a legality check for merging if-regions. + bool CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, + BasicBlock *Block1, BasicBlock *Block2); public: - SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout *TD) - : TTI(TTI), TD(TD) {} + SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout *TD, + AliasAnalysis *AA) + : TTI(TTI), TD(TD), AA(AA) {} bool run(BasicBlock *BB); }; } @@ -197,8 +218,8 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, } -/// GetIfCondition - Given a basic block (BB) with two predecessors (and at -/// least one PHI node in it), check to see if the merge at this block is due +/// GetIfCondition - Given a basic block (BB) with two predecessors, +/// check to see if the merge at this block is due /// to an "if condition". If so, return the boolean condition that determines /// which entry into BB will be taken. Also, return by references the block /// that will be entered from if the condition is true, and the block that will @@ -208,11 +229,26 @@ static void AddPredecessorToBlock(BasicBlock *Succ, BasicBlock *NewPred, /// instructions in them. static Value *GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue, BasicBlock *&IfFalse) { - PHINode *SomePHI = cast(BB->begin()); - assert(SomePHI->getNumIncomingValues() == 2 && - "Function can only handle blocks with 2 predecessors!"); - BasicBlock *Pred1 = SomePHI->getIncomingBlock(0); - BasicBlock *Pred2 = SomePHI->getIncomingBlock(1); + PHINode *SomePHI = dyn_cast(BB->begin()); + BasicBlock *Pred1 = NULL; + BasicBlock *Pred2 = NULL; + + if (SomePHI) { + if (SomePHI->getNumIncomingValues() != 2) + return NULL; + Pred1 = SomePHI->getIncomingBlock(0); + Pred2 = SomePHI->getIncomingBlock(1); + } else { + pred_iterator PI = pred_begin(BB), PE = pred_end(BB); + if (PI == PE) // No predecessor + return NULL; + Pred1 = *PI++; + if (PI == PE) // Only one predecessor + return NULL; + Pred2 = *PI++; + if (PI != PE) // More than two predecessors + return NULL; + } // We can only handle branches. Other control flow will be lowered to // branches if possible anyway. @@ -4066,6 +4102,383 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) { return false; } +/// If \param [in] BB has more than one predecessor that is a conditional +/// branch, attempt to use parallel and/or for the branch condition. \returns +/// true on success. +/// +/// Before: +/// ...... +/// %cmp10 = fcmp une float %tmp1, %tmp2 +/// br i1 %cmp1, label %if.then, label %lor.rhs +/// +/// lor.rhs: +/// ...... +/// %cmp11 = fcmp une float %tmp3, %tmp4 +/// br i1 %cmp11, label %if.then, label %ifend +/// +/// if.end: // the merge block +/// ...... +/// +/// if.then: // has two predecessors, both of them contains conditional branch. +/// ...... +/// br label %if.end; +/// +/// After: +/// ...... +/// %cmp10 = fcmp une float %tmp1, %tmp2 +/// ...... +/// %cmp11 = fcmp une float %tmp3, %tmp4 +/// %cmp12 = or i1 %cmp10, %cmp11 // parallel-or mode. +/// br i1 %cmp12, label %if.then, label %ifend +/// +/// if.end: +/// ...... +/// +/// if.then: +/// ...... +/// br label %if.end; +/// +/// Current implementation handles two cases. +/// Case 1: \param BB is on the else-path. +/// +/// BB1 +/// / | +/// BB2 | +/// / \ | +/// BB3 \ | where, BB1, BB2 contain conditional branches. +/// \ | / BB3 contains unconditional branch. +/// \ | / BB4 corresponds to \param BB which is also the merge. +/// BB => BB4 +/// +/// +/// Corresponding source code: +/// +/// if (a == b && c == d) +/// statement; // BB3 +/// +/// Case 2: \param BB BB is on the then-path. +/// +/// BB1 +/// / | +/// | BB2 +/// \ / | where BB1, BB2 contain conditional branches. +/// BB => BB3 | BB3 contains unconditiona branch and corresponds +/// \ / to \param BB. BB4 is the merge. +/// BB4 +/// +/// Corresponding source code: +/// +/// if (a == b || c == d) +/// statement; // BB3 +/// +/// In both cases, \param BB is the common successor of conditional branches. +/// In Case 1, \param BB (BB4) has an unconditional branch (BB3) as +/// its predecessor. In Case 2, \param BB (BB3) only has conditional branches +/// as its predecessors. +/// +bool SimplifyCFGOpt::SimplifyParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder, + Pass *P) { + PHINode *PHI = dyn_cast(BB->begin()); + if (PHI) + return false; // For simplicity, avoid cases containing PHI nodes. + + BasicBlock *LastCondBlock = NULL; + BasicBlock *FirstCondBlock = NULL; + BasicBlock *UnCondBlock = NULL; + int Idx = -1; + + // Check predecessors of \param BB. + SmallPtrSet Preds(pred_begin(BB), pred_end(BB)); + for (SmallPtrSetIterator PI = Preds.begin(), PE = Preds.end(); + PI != PE; ++PI) { + BasicBlock *Pred = *PI; + BranchInst *PBI = dyn_cast(Pred->getTerminator()); + + // All predecessors should terminate with a branch. + if (!PBI) + return false; + + BasicBlock *PP = Pred->getSinglePredecessor(); + + if (PBI->isUnconditional()) { + // Case 1: Pred (BB3) is an unconditional block, it should + // have a single predecessor (BB2) that is also a predecessor + // of \param BB (BB4) and should not have address-taken. + // There should exist only one such unconditional + // branch among the predecessors. + if (UnCondBlock || !PP || (Preds.count(PP) == 0) || + Pred->hasAddressTaken()) + return false; + + UnCondBlock = Pred; + continue; + } + + // Only conditional branches are allowed beyond this point. + assert(PBI->isConditional()); + + // Condition's unique use should be the branch instruction. + Value *PC = PBI->getCondition(); + if (!PC || !PC->hasOneUse()) + return false; + + if (PP && Preds.count(PP)) { + // These are internal condition blocks to be merged from, e.g., + // BB2 in both cases. + // Should not be address-taken. + if (Pred->hasAddressTaken()) + return false; + + // Instructions in the internal condition blocks should be safe + // to hoist up. + for (BasicBlock::iterator BI = Pred->begin(), BE = PBI; BI != BE;) { + Instruction *CI = BI++; + if (isa(CI) || + !isSafeToSpeculativelyExecute(CI)) + return false; + } + } else { + // This is the condition block to be merged into, e.g. BB1 in + // both cases. + if (FirstCondBlock) + return false; + FirstCondBlock = Pred; + } + + // Find whether BB is uniformly on the true (or false) path + // for all of its predecessors. + BasicBlock *PS1 = PBI->getSuccessor(0); + BasicBlock *PS2 = PBI->getSuccessor(1); + BasicBlock *PS = (PS1 == BB) ? PS2 : PS1; + int CIdx = (PS1 == BB) ? 0 : 1; + + if (Idx == -1) + Idx = CIdx; + else if (CIdx != Idx) + return false; + + // PS is the successor which is not BB. Check successors to identify + // the last conditional branch. + if (Preds.count(PS) == 0) { + // Case 2. + // BB must have an unique successor. + TerminatorInst *TBB = BB->getTerminator(); + if (TBB->getNumSuccessors() != 1) + return false; + + BasicBlock *SBB = TBB->getSuccessor(0); + PHI = dyn_cast(SBB->begin()); + if (PHI) + return false; + + // PS (BB4) should be BB's successor. + if (SBB != PS) + return false; + LastCondBlock = Pred; + } else { + BranchInst *BPS = dyn_cast(PS->getTerminator()); + if (BPS && BPS->isUnconditional()) { + // Case 1: PS(BB3) should be an unconditional branch. + LastCondBlock = Pred; + } + } + } + + if (!FirstCondBlock || !LastCondBlock || (FirstCondBlock == LastCondBlock)) + return false; + + // Do the transformation. + BasicBlock *CB; + bool Iteration = true; + BasicBlock::iterator ItOld = Builder.GetInsertPoint(); + BranchInst *PBI = dyn_cast(FirstCondBlock->getTerminator()); + Value *PC = PBI->getCondition(); + do { + CB = PBI->getSuccessor(1 - Idx); + // Delete the conditional branch. + FirstCondBlock->getInstList().pop_back(); + FirstCondBlock->getInstList().splice(FirstCondBlock->end(), CB->getInstList()); + PBI = cast(FirstCondBlock->getTerminator()); + Value *CC = PBI->getCondition(); + // Merge conditions. + Builder.SetInsertPoint(PBI); + Value *NC; + if (Idx == 0) + // Case 2, use parallel or. + NC = Builder.CreateOr(PC, CC); + else + // Case 1, use parallel and. + NC = Builder.CreateAnd(PC, CC); + + PBI->replaceUsesOfWith(CC, NC); + PC = NC; + if (CB == LastCondBlock) + Iteration = false; + // Remove internal conditional branches. + CB->dropAllReferences(); + // make CB unreachable and let downstream to delete the block. + new UnreachableInst(CB->getContext(), CB); + } while (Iteration); + + Builder.SetInsertPoint(ItOld); + DEBUG(dbgs() << "Use parallel and/or in:\n" << *FirstCondBlock); + return true; +} + +/// Compare blocks from two if-regions, where \param Head1 is the entry of the +/// 1st if-region. \param Head2 is the entry of the 2nd if-region. \param +/// Block1 is a block in the 1st if-region to compare. \param Block2 is a block +// in the 2nd if-region to compare. \returns true if \param Block1 and \param +/// Block2 have identical instructions and do not have memory reference alias +/// with \param Head2. +/// +bool SimplifyCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2, + BasicBlock *Block1, BasicBlock *Block2) { + TerminatorInst *PTI2 = Head2->getTerminator(); + Instruction *PBI2 = Head2->begin(); + + bool eq1 = (Block1 == Head1); + bool eq2 = (Block2 == Head2); + if (eq1 || eq2) { + // An empty then-path or else-path. + return (eq1 == eq2); + } + + // Check whether instructions in Block1 and Block2 are identical + // and do not alias with instructions in Head2. + BasicBlock::iterator iter1 = Block1->begin(); + BasicBlock::iterator end1 = Block1->getTerminator(); + BasicBlock::iterator iter2 = Block2->begin(); + BasicBlock::iterator end2 = Block2->getTerminator(); + + while (1) { + if (iter1 == end1) { + if (iter2 != end2) + return false; + break; + } + + if (!iter1->isIdenticalTo(iter2)) + return false; + + // Illegal to remove instructions with side effects except + // non-volatile stores. + if (iter1->mayHaveSideEffects()) { + Instruction *CurI = &*iter1; + StoreInst *SI = dyn_cast(CurI); + if (!SI || SI->isVolatile()) + return false; + } + + // For simplicity and speed, data dependency check can be + // avoided if read from memory doesn't exist. + if (iter1->mayReadFromMemory()) + return false; + + if (iter1->mayWriteToMemory()) { + for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { + if (BI->mayReadFromMemory() || BI->mayWriteToMemory()) { + // Check alias with Head2. + if (!AA || AA->alias(iter1, BI)) + return false; + } + } + } + ++iter1; + ++iter2; + } + + return true; +} + +/// Check whether \param BB is the merge block of a if-region. If yes, check +/// whether there exists an adjacent if-region upstream, the two if-regions +/// contain identical instuctions and can be legally merged. \returns true if +/// the two if-regions are merged. +/// +/// From: +/// if (a) +/// statement; +/// if (b) +/// statement; +/// +/// To: +/// if (a || b) +/// statement; +/// +bool SimplifyCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder, + Pass *P) { + BasicBlock *IfTrue2, *IfFalse2; + Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2); + Instruction *CInst2 = dyn_cast_or_null(IfCond2); + if (!CInst2) + return false; + + BasicBlock *SecondEntryBlock = CInst2->getParent(); + if (SecondEntryBlock->hasAddressTaken()) + return false; + + BasicBlock *IfTrue1, *IfFalse1; + Value *IfCond1 = GetIfCondition(SecondEntryBlock, IfTrue1, IfFalse1); + Instruction *CInst1 = dyn_cast_or_null(IfCond1); + if (!CInst1) + return false; + + BasicBlock *FirstEntryBlock = CInst1->getParent(); + + // Either then-path or else-path should be empty. + if ((IfTrue1 != FirstEntryBlock) && (IfFalse1 != FirstEntryBlock)) + return false; + if ((IfTrue2 != SecondEntryBlock) && (IfFalse2 != SecondEntryBlock)) + return false; + + TerminatorInst *PTI2 = SecondEntryBlock->getTerminator(); + Instruction *PBI2 = SecondEntryBlock->begin(); + + if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1, IfTrue2)) + return false; + + if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfFalse1, IfFalse2)) + return false; + + // Check whether \param SecondEntryBlock has side-effect and is safe to speculate. + for (BasicBlock::iterator BI = PBI2, BE = PTI2; BI != BE; ++BI) { + Instruction *CI = BI; + if (isa(CI) || CI->mayHaveSideEffects() || + !isSafeToSpeculativelyExecute(CI)) + return false; + } + + // Merge \param SecondEntryBlock into \param FirstEntryBlock. + FirstEntryBlock->getInstList().pop_back(); + FirstEntryBlock->getInstList().splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList()); + BranchInst *PBI = dyn_cast(FirstEntryBlock->getTerminator()); + Value *CC = PBI->getCondition(); + BasicBlock::iterator ItOld = Builder.GetInsertPoint(); + Builder.SetInsertPoint(PBI); + Value *NC = Builder.CreateOr(CInst1, CC); + PBI->replaceUsesOfWith(CC, NC); + Builder.SetInsertPoint(ItOld); + + // Remove IfTrue1 + if (IfTrue1 != FirstEntryBlock) { + IfTrue1->dropAllReferences(); + IfTrue1->eraseFromParent(); + } + + // Remove IfFalse1 + if (IfFalse1 != FirstEntryBlock) { + IfFalse1->dropAllReferences(); + IfFalse1->eraseFromParent(); + } + + // Remove \param SecondEntryBlock + SecondEntryBlock->dropAllReferences(); + SecondEntryBlock->eraseFromParent(); + DEBUG(dbgs() << "If conditions merged into:\n" << *FirstEntryBlock); + return true; +} + /// Check if passing a value to an instruction will cause undefined behavior. static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) { Constant *C = dyn_cast(V); @@ -4168,6 +4581,11 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { return true; IRBuilder<> Builder(BB); + // Whether to optimize conditional branches. + bool OptCB = (ParallelAndOr && AA && TTI.hasBranchDivergence()); + + if (OptCB && SimplifyParallelAndOr(BB, Builder)) + return true; // If there is a trivial two-entry PHI node in this basic block, and we can // eliminate it, do so now. @@ -4196,6 +4614,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { if (SimplifyIndirectBr(IBI)) return true; } + if (OptCB && MergeIfRegion(BB, Builder)) + return true; + return Changed; } @@ -4205,6 +4626,6 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) { /// of the CFG. It returns true if a modification was made. /// bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI, - const DataLayout *TD) { - return SimplifyCFGOpt(TTI, TD).run(BB); + const DataLayout *TD, AliasAnalysis *AA) { + return SimplifyCFGOpt(TTI, TD, AA).run(BB); } diff --git a/tools/lto/LTOCodeGenerator.cpp b/tools/lto/LTOCodeGenerator.cpp index 6139ade..776a5b9 100644 --- a/tools/lto/LTOCodeGenerator.cpp +++ b/tools/lto/LTOCodeGenerator.cpp @@ -118,7 +118,7 @@ void LTOCodeGenerator::initializeLTOPasses() { initializeGVNPass(R); initializeMemCpyOptPass(R); initializeDCEPass(R); - initializeCFGSimplifyPassPass(R); + initializeCFGCanonicalizePass(R); } bool LTOCodeGenerator::addModule(LTOModule* mod, std::string& errMsg) { diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index 68fca83..fa0a0ed 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -667,6 +667,9 @@ int main(int argc, char **argv) { FPasses.reset(new FunctionPassManager(M.get())); if (TD) FPasses->add(new DataLayout(*TD)); + if (TM.get()) + TM->addAnalysisPasses(*FPasses); + } if (PrintBreakpoints) { -- cgit v1.1