diff options
Diffstat (limited to 'lib/Transforms')
-rw-r--r-- | lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp | 285 |
1 files changed, 285 insertions, 0 deletions
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 88e16e9..0579c27 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -16,10 +16,20 @@ #include "llvm/Analysis/Loads.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" using namespace llvm; +/// Hidden option to stress test load slicing, i.e., when this option +/// is enabled, load slicing bypasses most of its profitability guards. +/// It will also generate, uncanonalized form of slicing. +static cl::opt<bool> +StressLoadSlicing("instcombine-stress-load-slicing", cl::Hidden, + cl::desc("Bypass the profitability model of load " + "slicing"), + cl::init(false)); + STATISTIC(NumDeadStore, "Number of dead stores eliminated"); STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global"); @@ -337,6 +347,274 @@ static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI, return 0; } +namespace { + /// \brief Helper structure used to slice a load in smaller loads. + struct LoadedSlice { + // The last instruction that represent the slice. This should be a + // truncate instruction. + Instruction *Inst; + // The original load instruction. + LoadInst *Origin; + // The right shift amount in bits from the original load. + unsigned Shift; + + LoadedSlice(Instruction *Inst = NULL, LoadInst *Origin = NULL, + unsigned Shift = 0) + : Inst(Inst), Origin(Origin), Shift(Shift) {} + + LoadedSlice(const LoadedSlice& LS) : Inst(LS.Inst), Origin(LS.Origin), + Shift(LS.Shift) {} + + /// \brief Get the bits used in a chunk of bits \p BitWidth large. + /// \return Result is \p BitWidth and has used bits set to 1 and + /// not used bits set to 0. + APInt getUsedBits() const { + // Reproduce the trunc(lshr) sequence: + // - Start from the truncated value. + // - Zero extend to the desired bit width. + // - Shift left. + assert(Origin && "No original load to compare against."); + unsigned BitWidth = Origin->getType()->getPrimitiveSizeInBits(); + assert(Inst && "This slice is not bound to an instruction"); + assert(Inst->getType()->getPrimitiveSizeInBits() <= BitWidth && + "Extracted slice is smaller than the whole type!"); + APInt UsedBits(Inst->getType()->getPrimitiveSizeInBits(), 0); + UsedBits.setAllBits(); + UsedBits = UsedBits.zext(BitWidth); + UsedBits <<= Shift; + return UsedBits; + } + + /// \brief Get the size of the slice to be loaded in bytes. + unsigned getLoadedSize() const { + unsigned SliceSize = getUsedBits().countPopulation(); + assert(!(SliceSize & 0x7) && "Size is not a multiple of a byte."); + return SliceSize / 8; + } + + /// \brief Get the offset in bytes of this slice in the original chunk of + /// bits, whose layout is defined by \p IsBigEndian. + uint64_t getOffsetFromBase(bool IsBigEndian) const { + assert(!(Shift & 0x7) && "Shifts not aligned on Bytes are not support."); + uint64_t Offset = Shift / 8; + unsigned TySizeInBytes = Origin->getType()->getPrimitiveSizeInBits() / 8; + assert(!(Origin->getType()->getPrimitiveSizeInBits() & 0x7) && + "The size of the original loaded type is not a multiple of a" + " byte."); + // If Offset is bigger than TySizeInBytes, it means we are loading all + // zeros. This should have been optimized before in the process. + assert(TySizeInBytes > Offset && + "Invalid shift amount for given loaded size"); + if (IsBigEndian) + Offset = TySizeInBytes - Offset - getLoadedSize(); + return Offset; + } + + /// \brief Generate the sequence of instructions to load the slice + /// represented by this object and redirect the uses of this slice to + /// this new sequence of instructions. + /// \pre this->Inst && this->Origin are valid Instructions. + /// \return The last instruction of the sequence used to load the slice. + Instruction *loadSlice(InstCombiner::BuilderTy &Builder, + bool IsBigEndian) const { + assert(Inst && Origin && "Unable to replace a non-existing slice."); + Value *BaseAddr = Origin->getOperand(0); + unsigned Alignment = Origin->getAlignment(); + Builder.SetInsertPoint(Origin); + // Assume we are looking at a chunk of bytes. + // BaseAddr = (i8*)BaseAddr. + BaseAddr = Builder.CreateBitCast(BaseAddr, Builder.getInt8PtrTy(), + "raw_cast"); + // Get the offset in that chunk of bytes w.r.t. the endianess. + uint64_t Offset = getOffsetFromBase(IsBigEndian); + if (Offset) { + APInt APOffset(64, Offset); + // BaseAddr = BaseAddr + Offset. + BaseAddr = Builder.CreateInBoundsGEP(BaseAddr, Builder.getInt(APOffset), + "raw_idx"); + } + + // Create the type of the loaded slice according to its size. + Type *SliceType = + Type::getIntNTy(Origin->getContext(), getLoadedSize() * 8); + + // Bit cast the raw pointer to the pointer type of the slice. + BaseAddr = Builder.CreateBitCast(BaseAddr, SliceType->getPointerTo(), + "cast"); + + // Compute the new alignment. + if (Offset != 0) + Alignment = MinAlign(Alignment, Alignment + Offset); + + // Create the load for the slice. + Instruction *LastInst = Builder.CreateAlignedLoad(BaseAddr, Alignment, + Inst->getName()+".val"); + // If the final type is not the same as the loaded type, this means that + // we have to pad with zero. Create a zero extend for that. + Type * FinalType = Inst->getType(); + if (SliceType != FinalType) + LastInst = cast<Instruction>(Builder.CreateZExt(LastInst, FinalType)); + + // Update the IR to reflect the new access to the slice. + Inst->replaceAllUsesWith(LastInst); + + return LastInst; + } + + /// \brief Check if it would be profitable to expand this slice as an + /// independant load. + bool isProfitable() const { + // Slicing is assumed to be profitable iff the chains leads to arithmetic + // operations. + SmallVector<const Instruction *, 8> Uses; + Uses.push_back(Inst); + do { + const Instruction *Use = Uses.pop_back_val(); + for (Value::const_use_iterator UseIt = Use->use_begin(), + UseItEnd = Use->use_end(); UseIt != UseItEnd; ++UseIt) { + const Instruction *UseOfUse = cast<Instruction>(*UseIt); + // Consider these instructions as arithmetic operations. + if (isa<BinaryOperator>(UseOfUse) || + isa<CastInst>(UseOfUse) || + isa<PHINode>(UseOfUse) || + isa<GetElementPtrInst>(UseOfUse)) + return true; + // No need to check if the Use has already been checked as we do not + // insert any PHINode. + Uses.push_back(UseOfUse); + } + } while (!Uses.empty()); + DEBUG(dbgs() << "IC: Not a profitable slice " << *Inst << '\n'); + return false; + } + }; +} + +/// \brief Check the profitability of all involved LoadedSlice. +/// Unless StressLoadSlicing is specified, this also returns false +/// when slicing is not in the canonical form. +/// The canonical form of sliced load is (1) two loads, +/// which are (2) next to each other in memory. +/// +/// FIXME: We may want to allow more slices to be created but +/// this means other passes should know how to deal with all those +/// slices. +/// FIXME: We may want to split loads to different types, e.g., +/// int vs. float. +static bool +isSlicingProfitable(const SmallVectorImpl<LoadedSlice> &LoadedSlices, + const APInt &UsedBits) { + unsigned NbOfSlices = LoadedSlices.size(); + // Check (1). + if (!StressLoadSlicing && NbOfSlices != 2) + return false; + + // Check (2). + if (!StressLoadSlicing && !UsedBits.isAllOnesValue()) { + // Get rid of the unused bits on the right. + APInt MemoryLayout = UsedBits.lshr(UsedBits.countTrailingZeros()); + // Get rid of the unused bits on the left. + if (MemoryLayout.countLeadingZeros()) + MemoryLayout = MemoryLayout.trunc(MemoryLayout.getActiveBits()); + // Check that the chunk of memory is completely used. + if (!MemoryLayout.isAllOnesValue()) + return false; + } + + unsigned NbOfProfitableSlices = 0; + for (unsigned CurrSlice = 0; CurrSlice < NbOfSlices; ++CurrSlice) { + if (LoadedSlices[CurrSlice].isProfitable()) + ++NbOfProfitableSlices; + else if (!StressLoadSlicing) + return false; + } + // In Stress mode, we may have 0 profitable slice. + // Check that here. + // In non-Stress mode, all the slices are profitable at this point. + return NbOfProfitableSlices > 0; +} + +/// \brief If the given load, \p LI, is used only by trunc or trunc(lshr) +/// operations, split it in the various pieces being extracted. +/// +/// This sort of thing is introduced by SROA. +/// This slicing takes care not to insert overlapping loads. +/// \pre LI is a simple load (i.e., not an atomic or volatile load). +static Instruction *sliceUpLoadInst(LoadInst &LI, + InstCombiner::BuilderTy &Builder, + DataLayout &TD) { + assert(LI.isSimple() && "We are trying to transform a non-simple load!"); + + // FIXME: If we want to support floating point and vector types, we should + // support bitcast and extract/insert element instructions. + Type *LITy = LI.getType(); + if (!LITy->isIntegerTy()) return 0; + + // Keep track of already used bits to detect overlapping values. + // In that case, we will just abort the transformation. + APInt UsedBits(LITy->getPrimitiveSizeInBits(), 0); + + SmallVector<LoadedSlice, 4> LoadedSlices; + + // Check if this load is used as several smaller chunks of bits. + // Basically, look for uses in trunc or trunc(lshr) and record a new chain + // of computation for each trunc. + for (Value::use_iterator UI = LI.use_begin(), UIEnd = LI.use_end(); + UI != UIEnd; ++UI) { + Instruction *User = cast<Instruction>(*UI); + unsigned Shift = 0; + + // Check if this is a trunc(lshr). + if (User->getOpcode() == Instruction::LShr && User->hasOneUse() && + isa<ConstantInt>(User->getOperand(1))) { + Shift = cast<ConstantInt>(User->getOperand(1))->getZExtValue(); + User = User->use_back(); + } + + // At this point, User is a TruncInst, iff we encountered, trunc or + // trunc(lshr). + if (!isa<TruncInst>(User)) + return 0; + + // The width of the type must be a power of 2 and greater than 8-bits. + // Otherwise the load cannot be represented in LLVM IR. + // Moreover, if we shifted with a non 8-bits multiple, the slice + // will be accross several bytes. We do not support that. + unsigned Width = User->getType()->getPrimitiveSizeInBits(); + if (Width < 8 || !isPowerOf2_32(Width) || (Shift & 0x7)) + return 0; + + // Build the slice for this chain of computations. + LoadedSlice LS(User, &LI, Shift); + APInt CurrentUsedBits = LS.getUsedBits(); + + // Check if this slice overlaps with another. + if ((CurrentUsedBits & UsedBits) != 0) + return 0; + // Update the bits used globally. + UsedBits |= CurrentUsedBits; + + // Record the slice. + LoadedSlices.push_back(LS); + } + + // Abort slicing if it does not seem to be profitable. + if (!isSlicingProfitable(LoadedSlices, UsedBits)) + return 0; + + // Rewrite each chain to use an independent load. + // By construction, each chain can be represented by a unique load. + bool IsBigEndian = TD.isBigEndian(); + for (SmallVectorImpl<LoadedSlice>::const_iterator LSIt = LoadedSlices.begin(), + LSItEnd = LoadedSlices.end(); LSIt != LSItEnd; ++LSIt) { + Instruction *SliceInst = LSIt->loadSlice(Builder, IsBigEndian); + (void)SliceInst; + DEBUG(dbgs() << "IC: Replacing " << *LSIt->Inst << "\n" + " with " << *SliceInst << '\n'); + } + return 0; // Don't do anything with LI. +} + Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { Value *Op = LI.getOperand(0); @@ -443,6 +721,13 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) { } } } + + // Try to split a load in smaller non-overlapping loads to expose independant + // chain of computations and get rid of trunc/lshr sequence of code. + // The data layout is required for that operation, as code generation will + // change with respect to endianess. + if (TD) + return sliceUpLoadInst(LI, *Builder, *TD); return 0; } |