diff options
author | Ahmed Bougacha <ahmed.bougacha@gmail.com> | 2013-05-24 01:07:04 +0000 |
---|---|---|
committer | Ahmed Bougacha <ahmed.bougacha@gmail.com> | 2013-05-24 01:07:04 +0000 |
commit | ef99356dfebb96f6f90efb912c2877214bad060e (patch) | |
tree | 76250a4be7eff5e9bec963f6ff0daef8cb8d84bf /include | |
parent | 2c94d0faa0e1c268893d5e04dc77e8a35889db00 (diff) | |
download | external_llvm-ef99356dfebb96f6f90efb912c2877214bad060e.zip external_llvm-ef99356dfebb96f6f90efb912c2877214bad060e.tar.gz external_llvm-ef99356dfebb96f6f90efb912c2877214bad060e.tar.bz2 |
MC: Disassembled CFG reconstruction.
This patch builds on some existing code to do CFG reconstruction from
a disassembled binary:
- MCModule represents the binary, and has a list of MCAtoms.
- MCAtom represents either disassembled instructions (MCTextAtom), or
contiguous data (MCDataAtom), and covers a specific range of addresses.
- MCBasicBlock and MCFunction form the reconstructed CFG. An MCBB is
backed by an MCTextAtom, and has the usual successors/predecessors.
- MCObjectDisassembler creates a module from an ObjectFile using a
disassembler. It first builds an atom for each section. It can also
construct the CFG, and this splits the text atoms into basic blocks.
MCModule and MCAtom were only sketched out; MCFunction and MCBB were
implemented under the experimental "-cfg" llvm-objdump -macho option.
This cleans them up for further use; llvm-objdump -d -cfg now generates
graphviz files for each function found in the binary.
In the future, MCObjectDisassembler may be the right place to do
"intelligent" disassembly: for example, handling constant islands is just
a matter of splitting the atom, using information that may be available
in the ObjectFile. Also, better initial atom formation than just using
sections is possible using symbols (and things like Mach-O's
function_starts load command).
This brings two minor regressions in llvm-objdump -macho -cfg:
- The printing of a relocation's referenced symbol.
- An annotation on loop BBs, i.e., which are their own successor.
Relocation printing is replaced by the MCSymbolizer; the basic CFG
annotation will be superseded by more related functionality.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@182628 91177308-0d34-0410-b5e6-96231b3b80d8
Diffstat (limited to 'include')
-rw-r--r-- | include/llvm/MC/MCAtom.h | 174 | ||||
-rw-r--r-- | include/llvm/MC/MCFunction.h | 122 | ||||
-rw-r--r-- | include/llvm/MC/MCInstrAnalysis.h | 11 | ||||
-rw-r--r-- | include/llvm/MC/MCModule.h | 93 | ||||
-rw-r--r-- | include/llvm/MC/MCObjectDisassembler.h | 69 | ||||
-rw-r--r-- | include/llvm/Support/StringRefMemoryObject.h | 42 |
6 files changed, 461 insertions, 50 deletions
diff --git a/include/llvm/MC/MCAtom.h b/include/llvm/MC/MCAtom.h index ae5bf0b..6a93798 100644 --- a/include/llvm/MC/MCAtom.h +++ b/include/llvm/MC/MCAtom.h @@ -1,4 +1,4 @@ -//===-- llvm/MC/MCAtom.h - MCAtom class ---------------------*- C++ -*-===// +//===-- llvm/MC/MCAtom.h ----------------------------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -9,7 +9,7 @@ // // This file contains the declaration of the MCAtom class, which is used to // represent a contiguous region in a decoded object that is uniformly data or -// instructions; +// instructions. // //===----------------------------------------------------------------------===// @@ -24,45 +24,169 @@ namespace llvm { class MCModule; -/// MCData - An entry in a data MCAtom. -// NOTE: This may change to a more complex type in the future. -typedef uint8_t MCData; +class MCAtom; +class MCTextAtom; +class MCDataAtom; /// MCAtom - Represents a contiguous range of either instructions (a TextAtom) /// or data (a DataAtom). Address ranges are expressed as _closed_ intervals. class MCAtom { - friend class MCModule; - typedef enum { TextAtom, DataAtom } AtomType; - - AtomType Type; +public: + virtual ~MCAtom() {} + + enum AtomKind { TextAtom, DataAtom }; + AtomKind getKind() const { return Kind; } + + /// \brief Get the start address of the atom. + uint64_t getBeginAddr() const { return Begin; } + /// \brief Get the end address, i.e. the last one inside the atom. + uint64_t getEndAddr() const { return End; } + + /// \name Atom modification methods: + /// When modifying a TextAtom, keep instruction boundaries in mind. + /// For instance, split must me given the start address of an instruction. + /// @{ + + /// \brief Splits the atom in two at a given address. + /// \param SplitPt Address at which to start a new atom, splitting this one. + /// \returns The newly created atom starting at \p SplitPt. + virtual MCAtom *split(uint64_t SplitPt) = 0; + + /// \brief Truncates an atom, discarding everything after \p TruncPt. + /// \param TruncPt Last byte address to be contained in this atom. + virtual void truncate(uint64_t TruncPt) = 0; + /// @} + + /// \name Naming: + /// + /// This is mostly for display purposes, and may contain anything that hints + /// at what the atom contains: section or symbol name, BB start address, .. + /// @{ + StringRef getName() const { return Name; } + void setName(StringRef NewName) { Name = NewName.str(); } + /// @} + +protected: + const AtomKind Kind; + std::string Name; MCModule *Parent; uint64_t Begin, End; - std::vector<std::pair<uint64_t, MCInst> > Text; - std::vector<MCData> Data; + friend class MCModule; + MCAtom(AtomKind K, MCModule *P, uint64_t B, uint64_t E) + : Kind(K), Name("(unknown)"), Parent(P), Begin(B), End(E) { } + + /// \name Atom remapping helpers + /// @{ + + /// \brief Remap the atom, using the given range, updating Begin/End. + /// One or both of the bounds can remain the same, but overlapping with other + /// atoms in the module is still forbidden. + void remap(uint64_t NewBegin, uint64_t NewEnd); + + /// \brief Remap the atom to prepare for a truncation at TruncPt. + /// Equivalent to: + /// \code + /// // Bound checks + /// remap(Begin, TruncPt); + /// \endcode + void remapForTruncate(uint64_t TruncPt); + + /// \brief Remap the atom to prepare for a split at SplitPt. + /// The bounds for the resulting atoms are returned in {L,R}{Begin,End}. + /// The current atom is truncated to \p LEnd. + void remapForSplit(uint64_t SplitPt, + uint64_t &LBegin, uint64_t &LEnd, + uint64_t &RBegin, uint64_t &REnd); + /// @} +}; - // Private constructor - only callable by MCModule - MCAtom(AtomType T, MCModule *P, uint64_t B, uint64_t E) - : Type(T), Parent(P), Begin(B), End(E) { } +/// \name Text atom +/// @{ + +/// \brief An entry in an MCTextAtom: a disassembled instruction. +/// NOTE: Both the Address and Size field are actually redundant when taken in +/// the context of the text atom, and may better be exposed in an iterator +/// instead of stored in the atom, which would replace this class. +class MCDecodedInst { +public: + MCInst Inst; + uint64_t Address; + uint64_t Size; + MCDecodedInst(const MCInst &Inst, uint64_t Address, uint64_t Size) + : Inst(Inst), Address(Address), Size(Size) {} +}; + +/// \brief An atom consisting of disassembled instructions. +class MCTextAtom : public MCAtom { +private: + typedef std::vector<MCDecodedInst> InstListTy; + InstListTy Insts; + /// \brief The address of the next appended instruction, i.e., the + /// address immediately after the last instruction in the atom. + uint64_t NextInstAddress; public: - bool isTextAtom() const { return Type == TextAtom; } - bool isDataAtom() const { return Type == DataAtom; } + /// Append an instruction, expanding the atom if necessary. + void addInst(const MCInst &Inst, uint64_t Size); + + /// \name Instruction list access + /// @{ + typedef InstListTy::const_iterator const_iterator; + const_iterator begin() const { return Insts.begin(); } + const_iterator end() const { return Insts.end(); } + + const MCDecodedInst &back() const { return Insts.back(); } + const MCDecodedInst &at(size_t n) const { return Insts.at(n); } + uint64_t size() const { return Insts.size(); } + /// @} + + /// \name Atom type specific split/truncate logic. + /// @{ + MCTextAtom *split(uint64_t SplitPt) LLVM_OVERRIDE; + void truncate(uint64_t TruncPt) LLVM_OVERRIDE; + /// @} + + // Class hierarchy. + static bool classof(const MCAtom *A) { return A->getKind() == TextAtom; } +private: + friend class MCModule; + // Private constructor - only callable by MCModule + MCTextAtom(MCModule *P, uint64_t Begin, uint64_t End) + : MCAtom(TextAtom, P, Begin, End), NextInstAddress(Begin) {} +}; +/// @} + +/// \name Data atom +/// @{ + +/// \brief An entry in an MCDataAtom. +// NOTE: This may change to a more complex type in the future. +typedef uint8_t MCData; - void addInst(const MCInst &I, uint64_t Address, unsigned Size); +/// \brief An atom consising of a sequence of bytes. +class MCDataAtom : public MCAtom { + std::vector<MCData> Data; + +public: + /// Append a data entry, expanding the atom if necessary. void addData(const MCData &D); - /// split - Splits the atom in two at a given address, which must align with - /// and instruction boundary if this is a TextAtom. Returns the newly created - /// atom representing the high part of the split. - MCAtom *split(uint64_t SplitPt); + /// \name Atom type specific split/truncate logic. + /// @{ + MCDataAtom *split(uint64_t SplitPt) LLVM_OVERRIDE; + void truncate(uint64_t TruncPt) LLVM_OVERRIDE; + /// @} - /// truncate - Truncates an atom so that TruncPt is the last byte address - /// contained in the atom. - void truncate(uint64_t TruncPt); + // Class hierarchy. + static bool classof(const MCAtom *A) { return A->getKind() == DataAtom; } +private: + friend class MCModule; + // Private constructor - only callable by MCModule + MCDataAtom(MCModule *P, uint64_t Begin, uint64_t End) + : MCAtom(DataAtom, P, Begin, End), Data(End - Begin) {} }; } #endif - diff --git a/include/llvm/MC/MCFunction.h b/include/llvm/MC/MCFunction.h new file mode 100644 index 0000000..b85011e --- /dev/null +++ b/include/llvm/MC/MCFunction.h @@ -0,0 +1,122 @@ +//===-- llvm/MC/MCFunction.h ------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the data structures to hold a CFG reconstructed from +// machine code. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCFUNCTION_H +#define LLVM_MC_MCFUNCTION_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/MC/MCInst.h" +#include <string> +#include <vector> + +namespace llvm { + +class MCFunction; +class MCModule; +class MCTextAtom; + +/// \brief Basic block containing a sequence of disassembled instructions. +/// The basic block is backed by an MCTextAtom, which holds the instructions, +/// and the address range it covers. +/// Create a basic block using MCFunction::createBlock. +class MCBasicBlock { + const MCTextAtom *Insts; + + // MCFunction owns the basic block. + MCFunction *Parent; + friend class MCFunction; + MCBasicBlock(const MCTextAtom &Insts, MCFunction *Parent); + + /// \name Predecessors/Successors, to represent the CFG. + /// @{ + typedef std::vector<const MCBasicBlock *> BasicBlockListTy; + BasicBlockListTy Successors; + BasicBlockListTy Predecessors; + /// @} +public: + + /// \brief Get the backing MCTextAtom, containing the instruction sequence. + const MCTextAtom *getInsts() const { return Insts; } + + /// \name Get the owning MCFunction. + /// @{ + const MCFunction *getParent() const { return Parent; } + MCFunction *getParent() { return Parent; } + /// @} + + /// MC CFG access: Predecessors/Successors. + /// @{ + typedef BasicBlockListTy::const_iterator succ_const_iterator; + succ_const_iterator succ_begin() const { return Successors.begin(); } + succ_const_iterator succ_end() const { return Successors.end(); } + + typedef BasicBlockListTy::const_iterator pred_const_iterator; + pred_const_iterator pred_begin() const { return Predecessors.begin(); } + pred_const_iterator pred_end() const { return Predecessors.end(); } + + void addSuccessor(const MCBasicBlock *MCBB); + bool isSuccessor(const MCBasicBlock *MCBB) const; + + void addPredecessor(const MCBasicBlock *MCBB); + bool isPredecessor(const MCBasicBlock *MCBB) const; + /// @} +}; + +/// \brief Represents a function in machine code, containing MCBasicBlocks. +/// MCFunctions are created using MCModule::createFunction. +class MCFunction { + MCFunction (const MCFunction&) LLVM_DELETED_FUNCTION; + MCFunction& operator=(const MCFunction&) LLVM_DELETED_FUNCTION; + + std::string Name; + typedef std::vector<MCBasicBlock*> BasicBlockListTy; + BasicBlockListTy Blocks; + + // MCModule owns the function. + friend class MCModule; + MCFunction(StringRef Name); +public: + ~MCFunction(); + + /// \brief Create an MCBasicBlock backed by Insts and add it to this function. + /// \param Insts Sequence of straight-line code backing the basic block. + /// \returns The newly created basic block. + MCBasicBlock &createBlock(const MCTextAtom &Insts); + + StringRef getName() const { return Name; } + + /// \name Access to the function's basic blocks. No ordering is enforced. + /// @{ + /// \brief Get the entry point basic block. + const MCBasicBlock *getEntryBlock() const { return front(); } + MCBasicBlock *getEntryBlock() { return front(); } + + // NOTE: Dereferencing iterators gives pointers, so maybe a list is best here. + typedef BasicBlockListTy::const_iterator const_iterator; + typedef BasicBlockListTy:: iterator iterator; + const_iterator begin() const { return Blocks.begin(); } + iterator begin() { return Blocks.begin(); } + const_iterator end() const { return Blocks.end(); } + iterator end() { return Blocks.end(); } + + const MCBasicBlock* front() const { return Blocks.front(); } + MCBasicBlock* front() { return Blocks.front(); } + const MCBasicBlock* back() const { return Blocks.back(); } + MCBasicBlock* back() { return Blocks.back(); } + /// @} +}; + +} + +#endif diff --git a/include/llvm/MC/MCInstrAnalysis.h b/include/llvm/MC/MCInstrAnalysis.h index acad633..17bfd15 100644 --- a/include/llvm/MC/MCInstrAnalysis.h +++ b/include/llvm/MC/MCInstrAnalysis.h @@ -52,10 +52,15 @@ public: return Info->get(Inst.getOpcode()).isReturn(); } + virtual bool isTerminator(const MCInst &Inst) const { + return Info->get(Inst.getOpcode()).isTerminator(); + } + /// evaluateBranch - Given a branch instruction try to get the address the - /// branch targets. Otherwise return -1. - virtual uint64_t - evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size) const; + /// branch targets. Return true on success, and the address in Target. + virtual bool + evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size, + uint64_t &Target) const; }; } diff --git a/include/llvm/MC/MCModule.h b/include/llvm/MC/MCModule.h index 755fa02..02f8ca0 100644 --- a/include/llvm/MC/MCModule.h +++ b/include/llvm/MC/MCModule.h @@ -15,44 +15,93 @@ #ifndef LLVM_MC_MCMODULE_H #define LLVM_MC_MCMODULE_H -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/IntervalMap.h" -#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/DataTypes.h" +#include <vector> namespace llvm { class MCAtom; +class MCDataAtom; +class MCFunction; +class MCObjectDisassembler; +class MCTextAtom; -/// MCModule - This class represent a completely disassembled object file or -/// executable. It comprises a list of MCAtom's, and a branch target table. -/// Each atom represents a contiguous range of either instructions or data. +/// \brief A completely disassembled object file or executable. +/// It comprises a list of MCAtom's, each representing a contiguous range of +/// either instructions or data. +/// An MCModule is created using MCObjectDisassembler::buildModule. class MCModule { - /// AtomAllocationTracker - An MCModule owns its component MCAtom's, so it - /// must track them in order to ensure they are properly freed as atoms are - /// merged or otherwise manipulated. - SmallPtrSet<MCAtom*, 8> AtomAllocationTracker; + /// \name Atom tracking + /// @{ - /// OffsetMap - Efficiently maps offset ranges to MCAtom's. - IntervalMap<uint64_t, MCAtom*> OffsetMap; - - /// BranchTargetMap - Maps offsets that are determined to be branches and - /// can be statically resolved to their target offsets. - DenseMap<uint64_t, MCAtom*> BranchTargetMap; + /// \brief Atoms in this module, sorted by begin address. + /// FIXME: This doesn't handle overlapping atoms (which happen when a basic + /// block starts in the middle of an instruction of another basic block.) + typedef std::vector<MCAtom*> AtomListTy; + AtomListTy Atoms; friend class MCAtom; - - /// remap - Update the interval mapping for an MCAtom. + /// \brief Remap \p Atom to the given range, and update its Begin/End fields. + /// \param Atom An atom belonging to this module. + /// An atom should always use this method to update its bounds, because this + /// enables the owning MCModule to keep track of its atoms. void remap(MCAtom *Atom, uint64_t NewBegin, uint64_t NewEnd); + /// \brief Insert an atom in the module, using its Begin and End addresses. + void map(MCAtom *NewAtom); + /// @} + + /// \name Function tracking + /// @{ + typedef std::vector<MCFunction*> FunctionListTy; + FunctionListTy Functions; + /// @} + + MCModule (const MCModule &) LLVM_DELETED_FUNCTION; + MCModule& operator=(const MCModule &) LLVM_DELETED_FUNCTION; + + // MCObjectDisassembler creates MCModules. + friend class MCObjectDisassembler; + MCModule() : Atoms() { } + public: - MCModule(IntervalMap<uint64_t, MCAtom*>::Allocator &A) : OffsetMap(A) { } + ~MCModule(); - /// createAtom - Creates a new MCAtom covering the specified offset range. - MCAtom *createAtom(MCAtom::AtomType Type, uint64_t Begin, uint64_t End); + /// \name Create a new MCAtom covering the specified offset range. + /// @{ + MCTextAtom *createTextAtom(uint64_t Begin, uint64_t End); + MCDataAtom *createDataAtom(uint64_t Begin, uint64_t End); + /// @} + + /// \name Access to the owned atom list, ordered by begin address. + /// @{ + const MCAtom *findAtomContaining(uint64_t Addr) const; + MCAtom *findAtomContaining(uint64_t Addr); + + typedef AtomListTy::const_iterator const_atom_iterator; + typedef AtomListTy:: iterator atom_iterator; + const_atom_iterator atom_begin() const { return Atoms.begin(); } + atom_iterator atom_begin() { return Atoms.begin(); } + const_atom_iterator atom_end() const { return Atoms.end(); } + atom_iterator atom_end() { return Atoms.end(); } + /// @} + + /// \name Create a new MCFunction. + MCFunction *createFunction(const StringRef &Name); + + /// \name Access to the owned function list. + /// @{ + typedef FunctionListTy::const_iterator const_func_iterator; + typedef FunctionListTy:: iterator func_iterator; + const_func_iterator func_begin() const { return Functions.begin(); } + func_iterator func_begin() { return Functions.begin(); } + const_func_iterator func_end() const { return Functions.end(); } + func_iterator func_end() { return Functions.end(); } + /// @} }; } #endif - diff --git a/include/llvm/MC/MCObjectDisassembler.h b/include/llvm/MC/MCObjectDisassembler.h new file mode 100644 index 0000000..749a54e --- /dev/null +++ b/include/llvm/MC/MCObjectDisassembler.h @@ -0,0 +1,69 @@ +//===-- llvm/MC/MCObjectDisassembler.h --------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the MCObjectDisassembler class, which +// can be used to construct an MCModule and an MC CFG from an ObjectFile. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_MC_MCOBJECTDISASSEMBLER_H +#define LLVM_MC_MCOBJECTDISASSEMBLER_H + +namespace llvm { + +namespace object { + class ObjectFile; +} + +class MCBasicBlock; +class MCDisassembler; +class MCFunction; +class MCInstrAnalysis; +class MCModule; + +/// \brief Disassemble an ObjectFile to an MCModule and MCFunctions. +/// This class builds on MCDisassembler to disassemble whole sections, creating +/// MCAtom (MCTextAtom for disassembled sections and MCDataAtom for raw data). +/// It can also be used to create a control flow graph consisting of MCFunctions +/// and MCBasicBlocks. +class MCObjectDisassembler { + const object::ObjectFile &Obj; + const MCDisassembler &Dis; + const MCInstrAnalysis &MIA; + +public: + MCObjectDisassembler(const object::ObjectFile &Obj, + const MCDisassembler &Dis, + const MCInstrAnalysis &MIA); + + /// \brief Build an MCModule, creating atoms and optionally functions. + /// \param withCFG Also build a CFG by adding MCFunctions to the Module. + /// If withCFG is false, the MCModule built only contains atoms, representing + /// what was found in the object file. If withCFG is true, MCFunctions are + /// created, containing MCBasicBlocks. All text atoms are split to form basic + /// block atoms, which then each back an MCBasicBlock. + MCModule *buildModule(bool withCFG = false); + +private: + /// \brief Fill \p Module by creating an atom for each section. + /// This could be made much smarter, using information like symbols, but also + /// format-specific features, like mach-o function_start or data_in_code LCs. + void buildSectionAtoms(MCModule *Module); + + /// \brief Enrich \p Module with a CFG consisting of MCFunctions. + /// \param Module An MCModule returned by buildModule, with no CFG. + /// NOTE: Each MCBasicBlock in a MCFunction is backed by a single MCTextAtom. + /// When the CFG is built, contiguous instructions that were previously in a + /// single MCTextAtom will be split in multiple basic block atoms. + void buildCFG(MCModule *Module); +}; + +} + +#endif diff --git a/include/llvm/Support/StringRefMemoryObject.h b/include/llvm/Support/StringRefMemoryObject.h new file mode 100644 index 0000000..a0ef35a --- /dev/null +++ b/include/llvm/Support/StringRefMemoryObject.h @@ -0,0 +1,42 @@ +//===- llvm/Support/StringRefMemoryObject.h ---------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the StringRefMemObject class, a simple +// wrapper around StringRef implementing the MemoryObject interface. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_STRINGREFMEMORYOBJECT_H +#define LLVM_SUPPORT_STRINGREFMEMORYOBJECT_H + +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/MemoryObject.h" + +namespace llvm { + +/// StringRefMemoryObject - Simple StringRef-backed MemoryObject +class StringRefMemoryObject : public MemoryObject { + StringRef Bytes; + uint64_t Base; +public: + StringRefMemoryObject(StringRef Bytes, uint64_t Base = 0) + : Bytes(Bytes), Base(Base) {} + + uint64_t getBase() const { return Base; } + uint64_t getExtent() const { return Bytes.size(); } + + int readByte(uint64_t Addr, uint8_t *Byte) const; + int readBytes(uint64_t Addr, uint64_t Size, + uint8_t *Buf, uint64_t *Copied) const; + +}; + +} + +#endif |