34 files changed, 3780 insertions, 1239 deletions
diff --git a/lib/Transforms/Utils/AddrModeMatcher.cpp b/lib/Transforms/Utils/AddrModeMatcher.cpp
deleted file mode 100644
index 1e6586b..0000000
--- a/lib/Transforms/Utils/AddrModeMatcher.cpp
+++ /dev/null
@@ -1,577 +0,0 @@
-//===- AddrModeMatcher.cpp - Addressing mode matching facility --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements target addressing mode matcher class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Transforms/Utils/AddrModeMatcher.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/GlobalValue.h"
-#include "llvm/Instruction.h"
-#include "llvm/Assembly/Writer.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/GetElementPtrTypeIterator.h"
-#include "llvm/Support/PatternMatch.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/CallSite.h"
-
-using namespace llvm;
-using namespace llvm::PatternMatch;
-
-void ExtAddrMode::print(raw_ostream &OS) const {
-  bool NeedPlus = false;
-  OS << "[";
-  if (BaseGV) {
-    OS << (NeedPlus ? " + " : "")
-       << "GV:";
-    WriteAsOperand(OS, BaseGV, /*PrintType=*/false);
-    NeedPlus = true;
-  }
-
-  if (BaseOffs)
-    OS << (NeedPlus ? " + " : "") << BaseOffs, NeedPlus = true;
-
-  if (BaseReg) {
-    OS << (NeedPlus ? " + " : "")
-       << "Base:";
-    WriteAsOperand(OS, BaseReg, /*PrintType=*/false);
-    NeedPlus = true;
-  }
-  if (Scale) {
-    OS << (NeedPlus ? " + " : "")
-       << Scale << "*";
-    WriteAsOperand(OS, ScaledReg, /*PrintType=*/false);
-    NeedPlus = true;
-  }
-
-  OS << ']';
-}
-
-#ifndef NDEBUG
-void ExtAddrMode::dump() const {
-  print(dbgs());
-  dbgs() << '\n';
-}
-#endif
-
-
-/// MatchScaledValue - Try adding ScaleReg*Scale to the current addressing mode.
-/// Return true and update AddrMode if this addr mode is legal for the target,
-/// false if not.
-bool AddressingModeMatcher::MatchScaledValue(Value *ScaleReg, int64_t Scale,
-                                             unsigned Depth) {
-  // If Scale is 1, then this is the same as adding ScaleReg to the addressing
-  // mode.  Just process that directly.
-  if (Scale == 1)
-    return MatchAddr(ScaleReg, Depth);
-  
-  // If the scale is 0, it takes nothing to add this.
-  if (Scale == 0)
-    return true;
-  
-  // If we already have a scale of this value, we can add to it, otherwise, we
-  // need an available scale field.
-  if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
-    return false;
-
-  ExtAddrMode TestAddrMode = AddrMode;
-
-  // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
-  // [A+B + A*7] -> [B+A*8].
-  TestAddrMode.Scale += Scale;
-  TestAddrMode.ScaledReg = ScaleReg;
-
-  // If the new address isn't legal, bail out.
-  if (!TLI.isLegalAddressingMode(TestAddrMode, AccessTy))
-    return false;
-
-  // It was legal, so commit it.
-  AddrMode = TestAddrMode;
-  
-  // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
-  // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
-  // X*Scale + C*Scale to addr mode.
-  ConstantInt *CI = 0; Value *AddLHS = 0;
-  if (isa<Instruction>(ScaleReg) &&  // not a constant expr.
-      match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
-    TestAddrMode.ScaledReg = AddLHS;
-    TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
-      
-    // If this addressing mode is legal, commit it and remember that we folded
-    // this instruction.
-    if (TLI.isLegalAddressingMode(TestAddrMode, AccessTy)) {
-      AddrModeInsts.push_back(cast<Instruction>(ScaleReg));
-      AddrMode = TestAddrMode;
-      return true;
-    }
-  }
-
-  // Otherwise, not (x+c)*scale, just return what we have.
-  return true;
-}
-
-/// MightBeFoldableInst - This is a little filter, which returns true if an
-/// addressing computation involving I might be folded into a load/store
-/// accessing it.  This doesn't need to be perfect, but needs to accept at least
-/// the set of instructions that MatchOperationAddr can.
-static bool MightBeFoldableInst(Instruction *I) {
-  switch (I->getOpcode()) {
-  case Instruction::BitCast:
-    // Don't touch identity bitcasts.
-    if (I->getType() == I->getOperand(0)->getType())
-      return false;
-    return I->getType()->isPointerTy() || I->getType()->isIntegerTy();
-  case Instruction::PtrToInt:
-    // PtrToInt is always a noop, as we know that the int type is pointer sized.
-    return true;
-  case Instruction::IntToPtr:
-    // We know the input is intptr_t, so this is foldable.
-    return true;
-  case Instruction::Add:
-    return true;
-  case Instruction::Mul:
-  case Instruction::Shl:
-    // Can only handle X*C and X << C.
-    return isa<ConstantInt>(I->getOperand(1));
-  case Instruction::GetElementPtr:
-    return true;
-  default:
-    return false;
-  }
-}
-
-
-/// MatchOperationAddr - Given an instruction or constant expr, see if we can
-/// fold the operation into the addressing mode.  If so, update the addressing
-/// mode and return true, otherwise return false without modifying AddrMode.
-bool AddressingModeMatcher::MatchOperationAddr(User *AddrInst, unsigned Opcode,
-                                               unsigned Depth) {
-  // Avoid exponential behavior on extremely deep expression trees.
-  if (Depth >= 5) return false;
-  
-  switch (Opcode) {
-  case Instruction::PtrToInt:
-    // PtrToInt is always a noop, as we know that the int type is pointer sized.
-    return MatchAddr(AddrInst->getOperand(0), Depth);
-  case Instruction::IntToPtr:
-    // This inttoptr is a no-op if the integer type is pointer sized.
-    if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
-        TLI.getPointerTy())
-      return MatchAddr(AddrInst->getOperand(0), Depth);
-    return false;
-  case Instruction::BitCast:
-    // BitCast is always a noop, and we can handle it as long as it is
-    // int->int or pointer->pointer (we don't want int<->fp or something).
-    if ((AddrInst->getOperand(0)->getType()->isPointerTy() ||
-         AddrInst->getOperand(0)->getType()->isIntegerTy()) &&
-        // Don't touch identity bitcasts.  These were probably put here by LSR,
-        // and we don't want to mess around with them.  Assume it knows what it
-        // is doing.
-        AddrInst->getOperand(0)->getType() != AddrInst->getType())
-      return MatchAddr(AddrInst->getOperand(0), Depth);
-    return false;
-  case Instruction::Add: {
-    // Check to see if we can merge in the RHS then the LHS.  If so, we win.
-    ExtAddrMode BackupAddrMode = AddrMode;
-    unsigned OldSize = AddrModeInsts.size();
-    if (MatchAddr(AddrInst->getOperand(1), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(0), Depth+1))
-      return true;
-    
-    // Restore the old addr mode info.
-    AddrMode = BackupAddrMode;
-    AddrModeInsts.resize(OldSize);
-    
-    // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
-    if (MatchAddr(AddrInst->getOperand(0), Depth+1) &&
-        MatchAddr(AddrInst->getOperand(1), Depth+1))
-      return true;
-    
-    // Otherwise we definitely can't merge the ADD in.
-    AddrMode = BackupAddrMode;
-    AddrModeInsts.resize(OldSize);
-    break;
-  }
-  //case Instruction::Or:
-  // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
-  //break;
-  case Instruction::Mul:
-  case Instruction::Shl: {
-    // Can only handle X*C and X << C.
-    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
-    if (!RHS) return false;
-    int64_t Scale = RHS->getSExtValue();
-    if (Opcode == Instruction::Shl)
-      Scale = 1LL << Scale;
-    
-    return MatchScaledValue(AddrInst->getOperand(0), Scale, Depth);
-  }
-  case Instruction::GetElementPtr: {
-    // Scan the GEP.  We check it if it contains constant offsets and at most
-    // one variable offset.
-    int VariableOperand = -1;
-    unsigned VariableScale = 0;
-    
-    int64_t ConstantOffset = 0;
-    const TargetData *TD = TLI.getTargetData();
-    gep_type_iterator GTI = gep_type_begin(AddrInst);
-    for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
-      if (StructType *STy = dyn_cast<StructType>(*GTI)) {
-        const StructLayout *SL = TD->getStructLayout(STy);
-        unsigned Idx =
-          cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
-        ConstantOffset += SL->getElementOffset(Idx);
-      } else {
-        uint64_t TypeSize = TD->getTypeAllocSize(GTI.getIndexedType());
-        if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
-          ConstantOffset += CI->getSExtValue()*TypeSize;
-        } else if (TypeSize) {  // Scales of zero don't do anything.
-          // We only allow one variable index at the moment.
-          if (VariableOperand != -1)
-            return false;
-          
-          // Remember the variable index.
-          VariableOperand = i;
-          VariableScale = TypeSize;
-        }
-      }
-    }
-    
-    // A common case is for the GEP to only do a constant offset.  In this case,
-    // just add it to the disp field and check validity.
-    if (VariableOperand == -1) {
-      AddrMode.BaseOffs += ConstantOffset;
-      if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
-        // Check to see if we can fold the base pointer in too.
-        if (MatchAddr(AddrInst->getOperand(0), Depth+1))
-          return true;
-      }
-      AddrMode.BaseOffs -= ConstantOffset;
-      return false;
-    }
-
-    // Save the valid addressing mode in case we can't match.
-    ExtAddrMode BackupAddrMode = AddrMode;
-    unsigned OldSize = AddrModeInsts.size();
-
-    // See if the scale and offset amount is valid for this target.
-    AddrMode.BaseOffs += ConstantOffset;
-
-    // Match the base operand of the GEP.
-    if (!MatchAddr(AddrInst->getOperand(0), Depth+1)) {
-      // If it couldn't be matched, just stuff the value in a register.
-      if (AddrMode.HasBaseReg) {
-        AddrMode = BackupAddrMode;
-        AddrModeInsts.resize(OldSize);
-        return false;
-      }
-      AddrMode.HasBaseReg = true;
-      AddrMode.BaseReg = AddrInst->getOperand(0);
-    }
-
-    // Match the remaining variable portion of the GEP.
-    if (!MatchScaledValue(AddrInst->getOperand(VariableOperand), VariableScale,
-                          Depth)) {
-      // If it couldn't be matched, try stuffing the base into a register
-      // instead of matching it, and retrying the match of the scale.
-      AddrMode = BackupAddrMode;
-      AddrModeInsts.resize(OldSize);
-      if (AddrMode.HasBaseReg)
-        return false;
-      AddrMode.HasBaseReg = true;
-      AddrMode.BaseReg = AddrInst->getOperand(0);
-      AddrMode.BaseOffs += ConstantOffset;
-      if (!MatchScaledValue(AddrInst->getOperand(VariableOperand),
-                            VariableScale, Depth)) {
-        // If even that didn't work, bail.
-        AddrMode = BackupAddrMode;
-        AddrModeInsts.resize(OldSize);
-        return false;
-      }
-    }
-
-    return true;
-  }
-  }
-  return false;
-}
-
-/// MatchAddr - If we can, try to add the value of 'Addr' into the current
-/// addressing mode.  If Addr can't be added to AddrMode this returns false and
-/// leaves AddrMode unmodified.  This assumes that Addr is either a pointer type
-/// or intptr_t for the target.
-///
-bool AddressingModeMatcher::MatchAddr(Value *Addr, unsigned Depth) {
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
-    // Fold in immediates if legal for the target.
-    AddrMode.BaseOffs += CI->getSExtValue();
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-      return true;
-    AddrMode.BaseOffs -= CI->getSExtValue();
-  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
-    // If this is a global variable, try to fold it into the addressing mode.
-    if (AddrMode.BaseGV == 0) {
-      AddrMode.BaseGV = GV;
-      if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-        return true;
-      AddrMode.BaseGV = 0;
-    }
-  } else if (Instruction *I = dyn_cast<Instruction>(Addr)) {
-    ExtAddrMode BackupAddrMode = AddrMode;
-    unsigned OldSize = AddrModeInsts.size();
-
-    // Check to see if it is possible to fold this operation.
-    if (MatchOperationAddr(I, I->getOpcode(), Depth)) {
-      // Okay, it's possible to fold this.  Check to see if it is actually
-      // *profitable* to do so.  We use a simple cost model to avoid increasing
-      // register pressure too much.
-      if (I->hasOneUse() ||
-          IsProfitableToFoldIntoAddressingMode(I, BackupAddrMode, AddrMode)) {
-        AddrModeInsts.push_back(I);
-        return true;
-      }
-      
-      // It isn't profitable to do this, roll back.
-      //cerr << "NOT FOLDING: " << *I;
-      AddrMode = BackupAddrMode;
-      AddrModeInsts.resize(OldSize);
-    }
-  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
-    if (MatchOperationAddr(CE, CE->getOpcode(), Depth))
-      return true;
-  } else if (isa<ConstantPointerNull>(Addr)) {
-    // Null pointer gets folded without affecting the addressing mode.
-    return true;
-  }
-
-  // Worse case, the target should support [reg] addressing modes. :)
-  if (!AddrMode.HasBaseReg) {
-    AddrMode.HasBaseReg = true;
-    AddrMode.BaseReg = Addr;
-    // Still check for legality in case the target supports [imm] but not [i+r].
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-      return true;
-    AddrMode.HasBaseReg = false;
-    AddrMode.BaseReg = 0;
-  }
-
-  // If the base register is already taken, see if we can do [r+r].
-  if (AddrMode.Scale == 0) {
-    AddrMode.Scale = 1;
-    AddrMode.ScaledReg = Addr;
-    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
-      return true;
-    AddrMode.Scale = 0;
-    AddrMode.ScaledReg = 0;
-  }
-  // Couldn't match.
-  return false;
-}
-
-
-/// IsOperandAMemoryOperand - Check to see if all uses of OpVal by the specified
-/// inline asm call are due to memory operands.  If so, return true, otherwise
-/// return false.
-static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
-                                    const TargetLowering &TLI) {
-  TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(ImmutableCallSite(CI));
-  for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
-    TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
-    
-    // Compute the constraint code and ConstraintType to use.
-    TLI.ComputeConstraintToUse(OpInfo, SDValue());
-
-    // If this asm operand is our Value*, and if it isn't an indirect memory
-    // operand, we can't fold it!
-    if (OpInfo.CallOperandVal == OpVal &&
-        (OpInfo.ConstraintType != TargetLowering::C_Memory ||
-         !OpInfo.isIndirect))
-      return false;
-  }
-
-  return true;
-}
-
-
-/// FindAllMemoryUses - Recursively walk all the uses of I until we find a
-/// memory use.  If we find an obviously non-foldable instruction, return true.
-/// Add the ultimately found memory instructions to MemoryUses.
-static bool FindAllMemoryUses(Instruction *I,
-                SmallVectorImpl<std::pair<Instruction*,unsigned> > &MemoryUses,
-                              SmallPtrSet<Instruction*, 16> &ConsideredInsts,
-                              const TargetLowering &TLI) {
-  // If we already considered this instruction, we're done.
-  if (!ConsideredInsts.insert(I))
-    return false;
-  
-  // If this is an obviously unfoldable instruction, bail out.
-  if (!MightBeFoldableInst(I))
-    return true;
-
-  // Loop over all the uses, recursively processing them.
-  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
-       UI != E; ++UI) {
-    User *U = *UI;
-
-    if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
-      MemoryUses.push_back(std::make_pair(LI, UI.getOperandNo()));
-      continue;
-    }
-    
-    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
-      unsigned opNo = UI.getOperandNo();
-      if (opNo == 0) return true; // Storing addr, not into addr.
-      MemoryUses.push_back(std::make_pair(SI, opNo));
-      continue;
-    }
-    
-    if (CallInst *CI = dyn_cast<CallInst>(U)) {
-      InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
-      if (!IA) return true;
-      
-      // If this is a memory operand, we're cool, otherwise bail out.
-      if (!IsOperandAMemoryOperand(CI, IA, I, TLI))
-        return true;
-      continue;
-    }
-    
-    if (FindAllMemoryUses(cast<Instruction>(U), MemoryUses, ConsideredInsts,
-                          TLI))
-      return true;
-  }
-
-  return false;
-}
-
-
-/// ValueAlreadyLiveAtInst - Retrn true if Val is already known to be live at
-/// the use site that we're folding it into.  If so, there is no cost to
-/// include it in the addressing mode.  KnownLive1 and KnownLive2 are two values
-/// that we know are live at the instruction already.
-bool AddressingModeMatcher::ValueAlreadyLiveAtInst(Value *Val,Value *KnownLive1,
-                                                   Value *KnownLive2) {
-  // If Val is either of the known-live values, we know it is live!
-  if (Val == 0 || Val == KnownLive1 || Val == KnownLive2)
-    return true;
-  
-  // All values other than instructions and arguments (e.g. constants) are live.
-  if (!isa<Instruction>(Val) && !isa<Argument>(Val)) return true;
-  
-  // If Val is a constant sized alloca in the entry block, it is live, this is
-  // true because it is just a reference to the stack/frame pointer, which is
-  // live for the whole function.
-  if (AllocaInst *AI = dyn_cast<AllocaInst>(Val))
-    if (AI->isStaticAlloca())
-      return true;
-  
-  // Check to see if this value is already used in the memory instruction's
-  // block.  If so, it's already live into the block at the very least, so we
-  // can reasonably fold it.
-  return Val->isUsedInBasicBlock(MemoryInst->getParent());
-}
-
-
-
-/// IsProfitableToFoldIntoAddressingMode - It is possible for the addressing
-/// mode of the machine to fold the specified instruction into a load or store
-/// that ultimately uses it.  However, the specified instruction has multiple
-/// uses.  Given this, it may actually increase register pressure to fold it
-/// into the load.  For example, consider this code:
-///
-///     X = ...
-///     Y = X+1
-///     use(Y)   -> nonload/store
-///     Z = Y+1
-///     load Z
-///
-/// In this case, Y has multiple uses, and can be folded into the load of Z
-/// (yielding load [X+2]).  However, doing this will cause both "X" and "X+1" to
-/// be live at the use(Y) line.  If we don't fold Y into load Z, we use one
-/// fewer register.  Since Y can't be folded into "use(Y)" we don't increase the
-/// number of computations either.
-///
-/// Note that this (like most of CodeGenPrepare) is just a rough heuristic.  If
-/// X was live across 'load Z' for other reasons, we actually *would* want to
-/// fold the addressing mode in the Z case.  This would make Y die earlier.
-bool AddressingModeMatcher::
-IsProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
-                                     ExtAddrMode &AMAfter) {
-  if (IgnoreProfitability) return true;
-  
-  // AMBefore is the addressing mode before this instruction was folded into it,
-  // and AMAfter is the addressing mode after the instruction was folded.  Get
-  // the set of registers referenced by AMAfter and subtract out those
-  // referenced by AMBefore: this is the set of values which folding in this
-  // address extends the lifetime of.
-  //
-  // Note that there are only two potential values being referenced here,
-  // BaseReg and ScaleReg (global addresses are always available, as are any
-  // folded immediates).
-  Value *BaseReg = AMAfter.BaseReg, *ScaledReg = AMAfter.ScaledReg;
-  
-  // If the BaseReg or ScaledReg was referenced by the previous addrmode, their
-  // lifetime wasn't extended by adding this instruction.
-  if (ValueAlreadyLiveAtInst(BaseReg, AMBefore.BaseReg, AMBefore.ScaledReg))
-    BaseReg = 0;
-  if (ValueAlreadyLiveAtInst(ScaledReg, AMBefore.BaseReg, AMBefore.ScaledReg))
-    ScaledReg = 0;
-
-  // If folding this instruction (and it's subexprs) didn't extend any live
-  // ranges, we're ok with it.
-  if (BaseReg == 0 && ScaledReg == 0)
-    return true;
-
-  // If all uses of this instruction are ultimately load/store/inlineasm's,
-  // check to see if their addressing modes will include this instruction.  If
-  // so, we can fold it into all uses, so it doesn't matter if it has multiple
-  // uses.
-  SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
-  SmallPtrSet<Instruction*, 16> ConsideredInsts;
-  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI))
-    return false;  // Has a non-memory, non-foldable use!
-  
-  // Now that we know that all uses of this instruction are part of a chain of
-  // computation involving only operations that could theoretically be folded
-  // into a memory use, loop over each of these uses and see if they could
-  // *actually* fold the instruction.
-  SmallVector<Instruction*, 32> MatchedAddrModeInsts;
-  for (unsigned i = 0, e = MemoryUses.size(); i != e; ++i) {
-    Instruction *User = MemoryUses[i].first;
-    unsigned OpNo = MemoryUses[i].second;
-    
-    // Get the access type of this use.  If the use isn't a pointer, we don't
-    // know what it accesses.
-    Value *Address = User->getOperand(OpNo);
-    if (!Address->getType()->isPointerTy())
-      return false;
-    Type *AddressAccessTy =
-      cast<PointerType>(Address->getType())->getElementType();
-    
-    // Do a match against the root of this address, ignoring profitability. This
-    // will tell us if the addressing mode for the memory operation will
-    // *actually* cover the shared instruction.
-    ExtAddrMode Result;
-    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, AddressAccessTy,
-                                  MemoryInst, Result);
-    Matcher.IgnoreProfitability = true;
-    bool Success = Matcher.MatchAddr(Address, 0);
-    (void)Success; assert(Success && "Couldn't select *anything*?");
-
-    // If the match didn't cover I, then it won't be shared by it.
-    if (std::find(MatchedAddrModeInsts.begin(), MatchedAddrModeInsts.end(),
-                  I) == MatchedAddrModeInsts.end())
-      return false;
-    
-    MatchedAddrModeInsts.clear();
-  }
-  
-  return true;
-}
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index 75a7817..8330e84 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -13,20 +13,20 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Constant.h"
-#include "llvm/Type.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Type.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ValueHandle.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -687,3 +687,42 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
   return cast<ReturnInst>(NewRet);
 }
 
+/// SplitBlockAndInsertIfThen - Split the containing block at the
+/// specified instruction - everything before and including Cmp stays
+/// in the old basic block, and everything after Cmp is moved to a
+/// new block. The two blocks are connected by a conditional branch
+/// (with value of Cmp being the condition).
+/// Before:
+///   Head
+///   Cmp
+///   Tail
+/// After:
+///   Head
+///   Cmp
+///   if (Cmp)
+///     ThenBlock
+///   Tail
+///
+/// If Unreachable is true, then ThenBlock ends with
+/// UnreachableInst, otherwise it branches to Tail.
+/// Returns the NewBasicBlock's terminator.
+
+TerminatorInst *llvm::SplitBlockAndInsertIfThen(Instruction *Cmp,
+    bool Unreachable, MDNode *BranchWeights) {
+  Instruction *SplitBefore = Cmp->getNextNode();
+  BasicBlock *Head = SplitBefore->getParent();
+  BasicBlock *Tail = Head->splitBasicBlock(SplitBefore);
+  TerminatorInst *HeadOldTerm = Head->getTerminator();
+  LLVMContext &C = Head->getContext();
+  BasicBlock *ThenBlock = BasicBlock::Create(C, "", Head->getParent(), Tail);
+  TerminatorInst *CheckTerm;
+  if (Unreachable)
+    CheckTerm = new UnreachableInst(C, ThenBlock);
+  else
+    CheckTerm = BranchInst::Create(Tail, ThenBlock);
+  BranchInst *HeadNewTerm =
+    BranchInst::Create(/*ifTrue*/ThenBlock, /*ifFalse*/Tail, Cmp);
+  HeadNewTerm->setMetadata(LLVMContext::MD_prof, BranchWeights);
+  ReplaceInstWithInst(HeadOldTerm, HeadNewTerm);
+  return CheckTerm;
+}
diff --git a/lib/Transforms/Utils/BreakCriticalEdges.cpp b/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 6b04e3d..8513772 100644
--- a/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -17,17 +17,17 @@
 
 #define DEBUG_TYPE "break-crit-edges"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ProfileInfo.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Type.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 using namespace llvm;
 
 STATISTIC(NumBroken, "Number of blocks inserted");
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index e13fd71..bf540b0 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -12,17 +12,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Type.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
 #include "llvm/Target/TargetLibraryInfo.h"
 
 using namespace llvm;
@@ -34,19 +32,22 @@ Value *llvm::CastToCStr(Value *V, IRBuilder<> &B) {
 
 /// EmitStrLen - Emit a call to the strlen function to the builder, for the
 /// specified pointer.  This always returns an integer value of size intptr_t.
-Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD,
+Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout *TD,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strlen))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
-  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
-                                   Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture);
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   ArrayRef<Attribute::AttrKind>(AVs, 2));
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Constant *StrLen = M->getOrInsertFunction("strlen", AttrListPtr::get(AWI),
+  Constant *StrLen = M->getOrInsertFunction("strlen",
+                                            AttributeSet::get(M->getContext(),
+                                                             AWI),
                                             TD->getIntPtrType(Context),
                                             B.getInt8PtrTy(),
                                             NULL);
@@ -61,18 +62,21 @@ Value *llvm::EmitStrLen(Value *Ptr, IRBuilder<> &B, const TargetData *TD,
 /// specified pointer.  Ptr is required to be some pointer type, MaxLen must
 /// be of size_t type, and the return value has 'intptr_t' type.
 Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
-                         const TargetData *TD, const TargetLibraryInfo *TLI) {
+                         const DataLayout *TD, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strnlen))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
-  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
-                                   Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture);
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   ArrayRef<Attribute::AttrKind>(AVs, 2));
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Constant *StrNLen = M->getOrInsertFunction("strnlen", AttrListPtr::get(AWI),
+  Constant *StrNLen = M->getOrInsertFunction("strnlen",
+                                             AttributeSet::get(M->getContext(),
+                                                              AWI),
                                              TD->getIntPtrType(Context),
                                              B.getInt8PtrTy(),
                                              TD->getIntPtrType(Context),
@@ -88,17 +92,21 @@ Value *llvm::EmitStrNLen(Value *Ptr, Value *MaxLen, IRBuilder<> &B,
 /// specified pointer and character.  Ptr is required to be some pointer type,
 /// and the return value has 'i8*' type.
 Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
-                        const TargetData *TD, const TargetLibraryInfo *TLI) {
+                        const DataLayout *TD, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strchr))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
   AttributeWithIndex AWI =
-    AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind);
+    AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                            ArrayRef<Attribute::AttrKind>(AVs, 2));
 
   Type *I8Ptr = B.getInt8PtrTy();
   Type *I32Ty = B.getInt32Ty();
-  Constant *StrChr = M->getOrInsertFunction("strchr", AttrListPtr::get(AWI),
+  Constant *StrChr = M->getOrInsertFunction("strchr",
+                                            AttributeSet::get(M->getContext(),
+                                                             AWI),
                                             I8Ptr, I8Ptr, I32Ty, NULL);
   CallInst *CI = B.CreateCall2(StrChr, CastToCStr(Ptr, B),
                                ConstantInt::get(I32Ty, C), "strchr");
@@ -109,20 +117,23 @@ Value *llvm::EmitStrChr(Value *Ptr, char C, IRBuilder<> &B,
 
 /// EmitStrNCmp - Emit a call to the strncmp function to the builder.
 Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
-                         IRBuilder<> &B, const TargetData *TD,
+                         IRBuilder<> &B, const DataLayout *TD,
                          const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::strncmp))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[3];
-  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
-  AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
-                                   Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture);
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   ArrayRef<Attribute::AttrKind>(AVs, 2));
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *StrNCmp = M->getOrInsertFunction("strncmp", AttrListPtr::get(AWI),
+  Value *StrNCmp = M->getOrInsertFunction("strncmp",
+                                          AttributeSet::get(M->getContext(),
+                                                           AWI),
                                           B.getInt32Ty(),
                                           B.getInt8PtrTy(),
                                           B.getInt8PtrTy(),
@@ -139,17 +150,19 @@ Value *llvm::EmitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len,
 /// EmitStrCpy - Emit a call to the strcpy function to the builder, for the
 /// specified pointer arguments.
 Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
-                        const TargetData *TD, const TargetLibraryInfo *TLI,
+                        const DataLayout *TD, const TargetLibraryInfo *TLI,
                         StringRef Name) {
   if (!TLI->has(LibFunc::strcpy))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
-  AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   Attribute::NoUnwind);
   Type *I8Ptr = B.getInt8PtrTy();
-  Value *StrCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI),
+  Value *StrCpy = M->getOrInsertFunction(Name,
+                                         AttributeSet::get(M->getContext(), AWI),
                                          I8Ptr, I8Ptr, I8Ptr, NULL);
   CallInst *CI = B.CreateCall2(StrCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
                                Name);
@@ -161,17 +174,20 @@ Value *llvm::EmitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
 /// EmitStrNCpy - Emit a call to the strncpy function to the builder, for the
 /// specified pointer arguments.
 Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
-                         IRBuilder<> &B, const TargetData *TD,
+                         IRBuilder<> &B, const DataLayout *TD,
                          const TargetLibraryInfo *TLI, StringRef Name) {
   if (!TLI->has(LibFunc::strncpy))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
-  AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   Attribute::NoUnwind);
   Type *I8Ptr = B.getInt8PtrTy();
-  Value *StrNCpy = M->getOrInsertFunction(Name, AttrListPtr::get(AWI),
+  Value *StrNCpy = M->getOrInsertFunction(Name,
+                                          AttributeSet::get(M->getContext(),
+                                                           AWI),
                                           I8Ptr, I8Ptr, I8Ptr,
                                           Len->getType(), NULL);
   CallInst *CI = B.CreateCall3(StrNCpy, CastToCStr(Dst, B), CastToCStr(Src, B),
@@ -185,17 +201,18 @@ Value *llvm::EmitStrNCpy(Value *Dst, Value *Src, Value *Len,
 /// This expects that the Len and ObjSize have type 'intptr_t' and Dst/Src
 /// are pointers.
 Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
-                           IRBuilder<> &B, const TargetData *TD,
+                           IRBuilder<> &B, const DataLayout *TD,
                            const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memcpy_chk))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI;
-  AWI = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                Attribute::NoUnwind);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *MemCpy = M->getOrInsertFunction("__memcpy_chk",
-                                         AttrListPtr::get(AWI),
+                                         AttributeSet::get(M->getContext(), AWI),
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
@@ -212,16 +229,19 @@ Value *llvm::EmitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
 /// EmitMemChr - Emit a call to the memchr function.  This assumes that Ptr is
 /// a pointer, Val is an i32 value, and Len is an 'intptr_t' value.
 Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
-                        Value *Len, IRBuilder<> &B, const TargetData *TD,
+                        Value *Len, IRBuilder<> &B, const DataLayout *TD,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memchr))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI;
-  AWI = AttributeWithIndex::get(~0u, Attribute::ReadOnly | Attribute::NoUnwind);
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AWI = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                ArrayRef<Attribute::AttrKind>(AVs, 2));
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *MemChr = M->getOrInsertFunction("memchr", AttrListPtr::get(AWI),
+  Value *MemChr = M->getOrInsertFunction("memchr",
+                                         AttributeSet::get(M->getContext(), AWI),
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
                                          B.getInt32Ty(),
@@ -237,20 +257,22 @@ Value *llvm::EmitMemChr(Value *Ptr, Value *Val,
 
 /// EmitMemCmp - Emit a call to the memcmp function.
 Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
-                        Value *Len, IRBuilder<> &B, const TargetData *TD,
+                        Value *Len, IRBuilder<> &B, const DataLayout *TD,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::memcmp))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[3];
-  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
-  AWI[2] = AttributeWithIndex::get(~0u, Attribute::ReadOnly |
-                                   Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture);
+  Attribute::AttrKind AVs[2] = { Attribute::ReadOnly, Attribute::NoUnwind };
+  AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   ArrayRef<Attribute::AttrKind>(AVs, 2));
 
   LLVMContext &Context = B.GetInsertBlock()->getContext();
-  Value *MemCmp = M->getOrInsertFunction("memcmp", AttrListPtr::get(AWI),
+  Value *MemCmp = M->getOrInsertFunction("memcmp",
+                                         AttributeSet::get(M->getContext(), AWI),
                                          B.getInt32Ty(),
                                          B.getInt8PtrTy(),
                                          B.getInt8PtrTy(),
@@ -269,7 +291,7 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2,
 /// returns one value with the same type.  If 'Op' is a long double, 'l' is
 /// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix.
 Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
-                                  const AttrListPtr &Attrs) {
+                                  const AttributeSet &Attrs) {
   SmallString<20> NameBuffer;
   if (!Op->getType()->isDoubleTy()) {
     // If we need to add a suffix, copy into NameBuffer.
@@ -294,7 +316,7 @@ Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
 
 /// EmitPutChar - Emit a call to the putchar function.  This assumes that Char
 /// is an integer.
-Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD,
+Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const DataLayout *TD,
                          const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::putchar))
     return 0;
@@ -316,17 +338,19 @@ Value *llvm::EmitPutChar(Value *Char, IRBuilder<> &B, const TargetData *TD,
 
 /// EmitPutS - Emit a call to the puts function.  This assumes that Str is
 /// some pointer.
-Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD,
+Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const DataLayout *TD,
                       const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::puts))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
-  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   Attribute::NoUnwind);
 
-  Value *PutS = M->getOrInsertFunction("puts", AttrListPtr::get(AWI),
+  Value *PutS = M->getOrInsertFunction("puts",
+                                       AttributeSet::get(M->getContext(), AWI),
                                        B.getInt32Ty(),
                                        B.getInt8PtrTy(),
                                        NULL);
@@ -339,17 +363,19 @@ Value *llvm::EmitPutS(Value *Str, IRBuilder<> &B, const TargetData *TD,
 /// EmitFPutC - Emit a call to the fputc function.  This assumes that Char is
 /// an integer and File is a pointer to FILE.
 Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
-                       const TargetData *TD, const TargetLibraryInfo *TLI) {
+                       const DataLayout *TD, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::fputc))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[2];
-  AWI[0] = AttributeWithIndex::get(2, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   Attribute::NoUnwind);
   Constant *F;
   if (File->getType()->isPointerTy())
-    F = M->getOrInsertFunction("fputc", AttrListPtr::get(AWI),
+    F = M->getOrInsertFunction("fputc",
+                               AttributeSet::get(M->getContext(), AWI),
                                B.getInt32Ty(),
                                B.getInt32Ty(), File->getType(),
                                NULL);
@@ -370,19 +396,21 @@ Value *llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B,
 /// EmitFPutS - Emit a call to the puts function.  Str is required to be a
 /// pointer and File is a pointer to FILE.
 Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
-                       const TargetData *TD, const TargetLibraryInfo *TLI) {
+                       const DataLayout *TD, const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::fputs))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[3];
-  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture);
-  AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(M->getContext(), 2, Attribute::NoCapture);
+  AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   Attribute::NoUnwind);
   StringRef FPutsName = TLI->getName(LibFunc::fputs);
   Constant *F;
   if (File->getType()->isPointerTy())
-    F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI),
+    F = M->getOrInsertFunction(FPutsName,
+                               AttributeSet::get(M->getContext(), AWI),
                                B.getInt32Ty(),
                                B.getInt8PtrTy(),
                                File->getType(), NULL);
@@ -400,21 +428,23 @@ Value *llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B,
 /// EmitFWrite - Emit a call to the fwrite function.  This assumes that Ptr is
 /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE.
 Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
-                        IRBuilder<> &B, const TargetData *TD,
+                        IRBuilder<> &B, const DataLayout *TD,
                         const TargetLibraryInfo *TLI) {
   if (!TLI->has(LibFunc::fwrite))
     return 0;
 
   Module *M = B.GetInsertBlock()->getParent()->getParent();
   AttributeWithIndex AWI[3];
-  AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture);
-  AWI[1] = AttributeWithIndex::get(4, Attribute::NoCapture);
-  AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind);
+  AWI[0] = AttributeWithIndex::get(M->getContext(), 1, Attribute::NoCapture);
+  AWI[1] = AttributeWithIndex::get(M->getContext(), 4, Attribute::NoCapture);
+  AWI[2] = AttributeWithIndex::get(M->getContext(), AttributeSet::FunctionIndex,
+                                   Attribute::NoUnwind);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   StringRef FWriteName = TLI->getName(LibFunc::fwrite);
   Constant *F;
   if (File->getType()->isPointerTy())
-    F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI),
+    F = M->getOrInsertFunction(FWriteName,
+                               AttributeSet::get(M->getContext(), AWI),
                                TD->getIntPtrType(Context),
                                B.getInt8PtrTy(),
                                TD->getIntPtrType(Context),
@@ -436,9 +466,9 @@ Value *llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File,
 
 SimplifyFortifiedLibCalls::~SimplifyFortifiedLibCalls() { }
 
-bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const TargetData *TD,
+bool SimplifyFortifiedLibCalls::fold(CallInst *CI, const DataLayout *TD,
                                      const TargetLibraryInfo *TLI) {
-  // We really need TargetData for later.
+  // We really need DataLayout for later.
   if (!TD) return false;
   
   this->CI = CI;
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index 30d60be..00cda8e 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -16,11 +16,11 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "bypass-slow-division"
-#include "llvm/Instructions.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/ADT/DenseMap.h"
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
 
 using namespace llvm;
 
@@ -221,7 +221,7 @@ static bool reuseOrInsertFastDiv(Function &F,
 // be profitably bypassed and carried out with a shorter, faster divide.
 bool llvm::bypassSlowDivision(Function &F,
                               Function::iterator &I,
-                              const DenseMap<Type *, Type *> &BypassTypeMap) {
+                              const DenseMap<unsigned int, unsigned int> &BypassWidths) {
   DivCacheTy DivCache;
 
   bool MadeChange = false;
@@ -238,14 +238,23 @@ bool llvm::bypassSlowDivision(Function &F,
     if (!UseDivOp && !UseRemOp)
       continue;
 
-    // Continue if div/rem type is not bypassed
-    DenseMap<Type *, Type *>::const_iterator BT =
-      BypassTypeMap.find(J->getType());
-    if (BT == BypassTypeMap.end())
+    // Skip division on vector types, only optimize integer instructions
+    if (!J->getType()->isIntegerTy())
+      continue;
+
+    // Get bitwidth of div/rem instruction
+    IntegerType *T = cast<IntegerType>(J->getType());
+    int bitwidth = T->getBitWidth();
+
+    // Continue if bitwidth is not bypassed
+    DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth);
+    if (BI == BypassWidths.end())
       continue;
 
-    IntegerType *BypassType = cast<IntegerType>(BT->second);
-    MadeChange |= reuseOrInsertFastDiv(F, I, J, BypassType, UseDivOp,
+    // Get type for div/rem instruction with bypass bitwidth
+    IntegerType *BT = IntegerType::get(J->getContext(), BI->second);
+
+    MadeChange |= reuseOrInsertFastDiv(F, I, J, BT, UseDivOp,
                                        UseSignedOp, DivCache);
   }
 
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index 215a16f..b71628b 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -1,5 +1,4 @@
 add_llvm_library(LLVMTransformUtils
-  AddrModeMatcher.cpp
   BasicBlockUtils.cpp
   BreakCriticalEdges.cpp
   BuildLibCalls.cpp
@@ -11,6 +10,7 @@ add_llvm_library(LLVMTransformUtils
   DemoteRegToStack.cpp
   InlineFunction.cpp
   InstructionNamer.cpp
+  IntegerDivision.cpp
   LCSSA.cpp
   Local.cpp
   LoopSimplify.cpp
@@ -20,12 +20,14 @@ add_llvm_library(LLVMTransformUtils
   LowerInvoke.cpp
   LowerSwitch.cpp
   Mem2Reg.cpp
+  MetaRenamer.cpp
   ModuleUtils.cpp
   PromoteMemoryToRegister.cpp
   SSAUpdater.cpp
   SimplifyCFG.cpp
   SimplifyIndVar.cpp
   SimplifyInstructions.cpp
+  SimplifyLibCalls.cpp
   UnifyFunctionExitNodes.cpp
   Utils.cpp
   ValueMapper.cpp
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 99237b8..ccc3eae 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -14,22 +14,22 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Constants.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Metadata.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/ADT/SmallVector.h"
 #include <map>
 using namespace llvm;
 
@@ -98,10 +98,14 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
         Anew->addAttr( OldFunc->getAttributes()
                        .getParamAttributes(I->getArgNo() + 1));
     NewFunc->setAttributes(NewFunc->getAttributes()
-                           .addAttr(0, OldFunc->getAttributes()
+                           .addAttr(NewFunc->getContext(),
+                                    AttributeSet::ReturnIndex,
+                                    OldFunc->getAttributes()
                                      .getRetAttributes()));
     NewFunc->setAttributes(NewFunc->getAttributes()
-                           .addAttr(~0, OldFunc->getAttributes()
+                           .addAttr(NewFunc->getContext(),
+                                    AttributeSet::FunctionIndex,
+                                    OldFunc->getAttributes()
                                      .getFnAttributes()));
 
   }
@@ -202,14 +206,14 @@ namespace {
     bool ModuleLevelChanges;
     const char *NameSuffix;
     ClonedCodeInfo *CodeInfo;
-    const TargetData *TD;
+    const DataLayout *TD;
   public:
     PruningFunctionCloner(Function *newFunc, const Function *oldFunc,
                           ValueToValueMapTy &valueMap,
                           bool moduleLevelChanges,
                           const char *nameSuffix, 
                           ClonedCodeInfo *codeInfo,
-                          const TargetData *td)
+                          const DataLayout *td)
     : NewFunc(newFunc), OldFunc(oldFunc),
       VMap(valueMap), ModuleLevelChanges(moduleLevelChanges),
       NameSuffix(nameSuffix), CodeInfo(codeInfo), TD(td) {
@@ -365,7 +369,7 @@ void llvm::CloneAndPruneFunctionInto(Function *NewFunc, const Function *OldFunc,
                                      SmallVectorImpl<ReturnInst*> &Returns,
                                      const char *NameSuffix, 
                                      ClonedCodeInfo *CodeInfo,
-                                     const TargetData *TD,
+                                     const DataLayout *TD,
                                      Instruction *TheCall) {
   assert(NameSuffix && "NameSuffix cannot be null!");
   
diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp
index 1dac6b5..64df089 100644
--- a/lib/Transforms/Utils/CloneModule.cpp
+++ b/lib/Transforms/Utils/CloneModule.cpp
@@ -13,9 +13,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Module.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Constant.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 using namespace llvm;
 
@@ -38,10 +38,6 @@ Module *llvm::CloneModule(const Module *M, ValueToValueMapTy &VMap) {
   New->setTargetTriple(M->getTargetTriple());
   New->setModuleInlineAsm(M->getModuleInlineAsm());
    
-  // Copy all of the dependent libraries over.
-  for (Module::lib_iterator I = M->lib_begin(), E = M->lib_end(); I != E; ++I)
-    New->addLibrary(*I);
-
   // Loop over all of the global variables, making corresponding globals in the
   // new module.  Here we add them to the VMap and to the new Module.  We
   // don't worry about attributes or initializers, they will come later.
diff --git a/lib/Transforms/Utils/CmpInstAnalysis.cpp b/lib/Transforms/Utils/CmpInstAnalysis.cpp
index 9b09915..8fa412a 100644
--- a/lib/Transforms/Utils/CmpInstAnalysis.cpp
+++ b/lib/Transforms/Utils/CmpInstAnalysis.cpp
@@ -13,8 +13,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/CmpInstAnalysis.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
 
 using namespace llvm;
 
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index c545cd6..3a21528 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -14,25 +14,25 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/CodeExtractor.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/RegionInfo.h"
 #include "llvm/Analysis/RegionIterator.h"
 #include "llvm/Analysis/Verifier.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <algorithm>
 #include <set>
 using namespace llvm;
@@ -346,7 +346,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
                                            header->getName(), M);
   // If the old function is no-throw, so is the new one.
   if (oldFunction->doesNotThrow())
-    newFunction->setDoesNotThrow(true);
+    newFunction->setDoesNotThrow();
   
   newFunction->getBasicBlockList().push_back(newRootNode);
 
diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp
index 99b5830..d5c41f5 100644
--- a/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -8,10 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Type.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
 using namespace llvm;
 
 /// DemoteRegToStack - This function takes a virtual register computed by an
@@ -124,7 +124,12 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
   }
 
   // Insert a load in place of the PHI and replace all uses.
-  Value *V = new LoadInst(Slot, P->getName()+".reload", P);
+  BasicBlock::iterator InsertPt = P;
+
+  for (; isa<PHINode>(InsertPt) || isa<LandingPadInst>(InsertPt); ++InsertPt)
+    /* empty */;   // Don't insert before PHI nodes or landingpad instrs.
+
+  Value *V = new LoadInst(Slot, P->getName()+".reload", InsertPt);
   P->replaceAllUsesWith(V);
 
   // Delete PHI.
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 89e89e7..0d2598a 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -13,21 +13,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Attributes.h"
-#include "llvm/Constants.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Module.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/CallSite.h"
-#include "llvm/Target/TargetData.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -357,7 +357,7 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
 
   Type *VoidPtrTy = Type::getInt8PtrTy(Context);
   
-  // Create the alloca.  If we have TargetData, use nice alignment.
+  // Create the alloca.  If we have DataLayout, use nice alignment.
   unsigned Align = 1;
   if (IFI.TD)
     Align = IFI.TD->getPrefTypeAlignment(AggTy);
@@ -668,10 +668,29 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
       if (hasLifetimeMarkers(AI))
         continue;
 
-      builder.CreateLifetimeStart(AI);
+      // Try to determine the size of the allocation.
+      ConstantInt *AllocaSize = 0;
+      if (ConstantInt *AIArraySize =
+          dyn_cast<ConstantInt>(AI->getArraySize())) {
+        if (IFI.TD) {
+          Type *AllocaType = AI->getAllocatedType();
+          uint64_t AllocaTypeSize = IFI.TD->getTypeAllocSize(AllocaType);
+          uint64_t AllocaArraySize = AIArraySize->getLimitedValue();
+          assert(AllocaArraySize > 0 && "array size of AllocaInst is zero");
+          // Check that array size doesn't saturate uint64_t and doesn't
+          // overflow when it's multiplied by type size.
+          if (AllocaArraySize != ~0ULL &&
+              UINT64_MAX / AllocaArraySize >= AllocaTypeSize) {
+            AllocaSize = ConstantInt::get(Type::getInt64Ty(AI->getContext()),
+                                          AllocaArraySize * AllocaTypeSize);
+          }
+        }
+      }
+
+      builder.CreateLifetimeStart(AI, AllocaSize);
       for (unsigned ri = 0, re = Returns.size(); ri != re; ++ri) {
         IRBuilder<> builder(Returns[ri]);
-        builder.CreateLifetimeEnd(AI);
+        builder.CreateLifetimeEnd(AI, AllocaSize);
       }
     }
   }
diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp
index 45c15de..a020bc7 100644
--- a/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/lib/Transforms/Utils/InstructionNamer.cpp
@@ -15,9 +15,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Function.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
 #include "llvm/Pass.h"
-#include "llvm/Type.h"
 using namespace llvm;
 
 namespace {
diff --git a/lib/Transforms/Utils/IntegerDivision.cpp b/lib/Transforms/Utils/IntegerDivision.cpp
new file mode 100644
index 0000000..5187d7c
--- /dev/null
+++ b/lib/Transforms/Utils/IntegerDivision.cpp
@@ -0,0 +1,420 @@
+//===-- IntegerDivision.cpp - Expand integer division ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains an implementation of 32bit scalar integer division for
+// targets that don't have native support. It's largely derived from
+// compiler-rt's implementation of __udivsi3, but hand-tuned to reduce the
+// amount of control flow
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "integer-division"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+
+using namespace llvm;
+
+/// Generate code to compute the remainder of two signed integers. Returns the
+/// remainder, which will have the sign of the dividend. Builder's insert point
+/// should be pointing where the caller wants code generated, e.g. at the srem
+/// instruction. This will generate a urem in the process, and Builder's insert
+/// point will be pointing at the uren (if present, i.e. not folded), ready to
+/// be expanded if the user wishes
+static Value *generateSignedRemainderCode(Value *Dividend, Value *Divisor,
+                                          IRBuilder<> &Builder) {
+  ConstantInt *ThirtyOne = Builder.getInt32(31);
+
+  // ;   %dividend_sgn = ashr i32 %dividend, 31
+  // ;   %divisor_sgn  = ashr i32 %divisor, 31
+  // ;   %dvd_xor      = xor i32 %dividend, %dividend_sgn
+  // ;   %dvs_xor      = xor i32 %divisor, %divisor_sgn
+  // ;   %u_dividend   = sub i32 %dvd_xor, %dividend_sgn
+  // ;   %u_divisor    = sub i32 %dvs_xor, %divisor_sgn
+  // ;   %urem         = urem i32 %dividend, %divisor
+  // ;   %xored        = xor i32 %urem, %dividend_sgn
+  // ;   %srem         = sub i32 %xored, %dividend_sgn
+  Value *DividendSign = Builder.CreateAShr(Dividend, ThirtyOne);
+  Value *DivisorSign  = Builder.CreateAShr(Divisor, ThirtyOne);
+  Value *DvdXor       = Builder.CreateXor(Dividend, DividendSign);
+  Value *DvsXor       = Builder.CreateXor(Divisor, DivisorSign);
+  Value *UDividend    = Builder.CreateSub(DvdXor, DividendSign);
+  Value *UDivisor     = Builder.CreateSub(DvsXor, DivisorSign);
+  Value *URem         = Builder.CreateURem(UDividend, UDivisor);
+  Value *Xored        = Builder.CreateXor(URem, DividendSign);
+  Value *SRem         = Builder.CreateSub(Xored, DividendSign);
+
+  if (Instruction *URemInst = dyn_cast<Instruction>(URem))
+    Builder.SetInsertPoint(URemInst);
+
+  return SRem;
+}
+
+
+/// Generate code to compute the remainder of two unsigned integers. Returns the
+/// remainder. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the urem instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes
+static Value *generatedUnsignedRemainderCode(Value *Dividend, Value *Divisor,
+                                             IRBuilder<> &Builder) {
+  // Remainder = Dividend - Quotient*Divisor
+
+  // ;   %quotient  = udiv i32 %dividend, %divisor
+  // ;   %product   = mul i32 %divisor, %quotient
+  // ;   %remainder = sub i32 %dividend, %product
+  Value *Quotient  = Builder.CreateUDiv(Dividend, Divisor);
+  Value *Product   = Builder.CreateMul(Divisor, Quotient);
+  Value *Remainder = Builder.CreateSub(Dividend, Product);
+
+  if (Instruction *UDiv = dyn_cast<Instruction>(Quotient))
+    Builder.SetInsertPoint(UDiv);
+
+  return Remainder;
+}
+
+/// Generate code to divide two signed integers. Returns the quotient, rounded
+/// towards 0. Builder's insert point should be pointing where the caller wants
+/// code generated, e.g. at the sdiv instruction. This will generate a udiv in
+/// the process, and Builder's insert point will be pointing at the udiv (if
+/// present, i.e. not folded), ready to be expanded if the user wishes.
+static Value *generateSignedDivisionCode(Value *Dividend, Value *Divisor,
+                                         IRBuilder<> &Builder) {
+  // Implementation taken from compiler-rt's __divsi3
+
+  ConstantInt *ThirtyOne = Builder.getInt32(31);
+
+  // ;   %tmp    = ashr i32 %dividend, 31
+  // ;   %tmp1   = ashr i32 %divisor, 31
+  // ;   %tmp2   = xor i32 %tmp, %dividend
+  // ;   %u_dvnd = sub nsw i32 %tmp2, %tmp
+  // ;   %tmp3   = xor i32 %tmp1, %divisor
+  // ;   %u_dvsr = sub nsw i32 %tmp3, %tmp1
+  // ;   %q_sgn  = xor i32 %tmp1, %tmp
+  // ;   %q_mag  = udiv i32 %u_dvnd, %u_dvsr
+  // ;   %tmp4   = xor i32 %q_mag, %q_sgn
+  // ;   %q      = sub i32 %tmp4, %q_sgn
+  Value *Tmp    = Builder.CreateAShr(Dividend, ThirtyOne);
+  Value *Tmp1   = Builder.CreateAShr(Divisor, ThirtyOne);
+  Value *Tmp2   = Builder.CreateXor(Tmp, Dividend);
+  Value *U_Dvnd = Builder.CreateSub(Tmp2, Tmp);
+  Value *Tmp3   = Builder.CreateXor(Tmp1, Divisor);
+  Value *U_Dvsr = Builder.CreateSub(Tmp3, Tmp1);
+  Value *Q_Sgn  = Builder.CreateXor(Tmp1, Tmp);
+  Value *Q_Mag  = Builder.CreateUDiv(U_Dvnd, U_Dvsr);
+  Value *Tmp4   = Builder.CreateXor(Q_Mag, Q_Sgn);
+  Value *Q      = Builder.CreateSub(Tmp4, Q_Sgn);
+
+  if (Instruction *UDiv = dyn_cast<Instruction>(Q_Mag))
+    Builder.SetInsertPoint(UDiv);
+
+  return Q;
+}
+
+/// Generates code to divide two unsigned scalar 32-bit integers. Returns the
+/// quotient, rounded towards 0. Builder's insert point should be pointing where
+/// the caller wants code generated, e.g. at the udiv instruction.
+static Value *generateUnsignedDivisionCode(Value *Dividend, Value *Divisor,
+                                           IRBuilder<> &Builder) {
+  // The basic algorithm can be found in the compiler-rt project's
+  // implementation of __udivsi3.c. Here, we do a lower-level IR based approach
+  // that's been hand-tuned to lessen the amount of control flow involved.
+
+  // Some helper values
+  IntegerType *I32Ty = Builder.getInt32Ty();
+
+  ConstantInt *Zero      = Builder.getInt32(0);
+  ConstantInt *One       = Builder.getInt32(1);
+  ConstantInt *ThirtyOne = Builder.getInt32(31);
+  ConstantInt *NegOne    = ConstantInt::getSigned(I32Ty, -1);
+  ConstantInt *True      = Builder.getTrue();
+
+  BasicBlock *IBB = Builder.GetInsertBlock();
+  Function *F = IBB->getParent();
+  Function *CTLZi32 = Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz,
+                                                I32Ty);
+
+  // Our CFG is going to look like:
+  // +---------------------+
+  // | special-cases       |
+  // |   ...               |
+  // +---------------------+
+  //  |       |
+  //  |   +----------+
+  //  |   |  bb1     |
+  //  |   |  ...     |
+  //  |   +----------+
+  //  |    |      |
+  //  |    |  +------------+
+  //  |    |  |  preheader |
+  //  |    |  |  ...       |
+  //  |    |  +------------+
+  //  |    |      |
+  //  |    |      |      +---+
+  //  |    |      |      |   |
+  //  |    |  +------------+ |
+  //  |    |  |  do-while  | |
+  //  |    |  |  ...       | |
+  //  |    |  +------------+ |
+  //  |    |      |      |   |
+  //  |   +-----------+  +---+
+  //  |   | loop-exit |
+  //  |   |  ...      |
+  //  |   +-----------+
+  //  |     |
+  // +-------+
+  // | ...   |
+  // | end   |
+  // +-------+
+  BasicBlock *SpecialCases = Builder.GetInsertBlock();
+  SpecialCases->setName(Twine(SpecialCases->getName(), "_udiv-special-cases"));
+  BasicBlock *End = SpecialCases->splitBasicBlock(Builder.GetInsertPoint(),
+                                                  "udiv-end");
+  BasicBlock *LoopExit  = BasicBlock::Create(Builder.getContext(),
+                                             "udiv-loop-exit", F, End);
+  BasicBlock *DoWhile   = BasicBlock::Create(Builder.getContext(),
+                                             "udiv-do-while", F, End);
+  BasicBlock *Preheader = BasicBlock::Create(Builder.getContext(),
+                                             "udiv-preheader", F, End);
+  BasicBlock *BB1       = BasicBlock::Create(Builder.getContext(),
+                                             "udiv-bb1", F, End);
+
+  // We'll be overwriting the terminator to insert our extra blocks
+  SpecialCases->getTerminator()->eraseFromParent();
+
+  // First off, check for special cases: dividend or divisor is zero, divisor
+  // is greater than dividend, and divisor is 1.
+  // ; special-cases:
+  // ;   %ret0_1      = icmp eq i32 %divisor, 0
+  // ;   %ret0_2      = icmp eq i32 %dividend, 0
+  // ;   %ret0_3      = or i1 %ret0_1, %ret0_2
+  // ;   %tmp0        = tail call i32 @llvm.ctlz.i32(i32 %divisor, i1 true)
+  // ;   %tmp1        = tail call i32 @llvm.ctlz.i32(i32 %dividend, i1 true)
+  // ;   %sr          = sub nsw i32 %tmp0, %tmp1
+  // ;   %ret0_4      = icmp ugt i32 %sr, 31
+  // ;   %ret0        = or i1 %ret0_3, %ret0_4
+  // ;   %retDividend = icmp eq i32 %sr, 31
+  // ;   %retVal      = select i1 %ret0, i32 0, i32 %dividend
+  // ;   %earlyRet    = or i1 %ret0, %retDividend
+  // ;   br i1 %earlyRet, label %end, label %bb1
+  Builder.SetInsertPoint(SpecialCases);
+  Value *Ret0_1      = Builder.CreateICmpEQ(Divisor, Zero);
+  Value *Ret0_2      = Builder.CreateICmpEQ(Dividend, Zero);
+  Value *Ret0_3      = Builder.CreateOr(Ret0_1, Ret0_2);
+  Value *Tmp0        = Builder.CreateCall2(CTLZi32, Divisor, True);
+  Value *Tmp1        = Builder.CreateCall2(CTLZi32, Dividend, True);
+  Value *SR          = Builder.CreateSub(Tmp0, Tmp1);
+  Value *Ret0_4      = Builder.CreateICmpUGT(SR, ThirtyOne);
+  Value *Ret0        = Builder.CreateOr(Ret0_3, Ret0_4);
+  Value *RetDividend = Builder.CreateICmpEQ(SR, ThirtyOne);
+  Value *RetVal      = Builder.CreateSelect(Ret0, Zero, Dividend);
+  Value *EarlyRet    = Builder.CreateOr(Ret0, RetDividend);
+  Builder.CreateCondBr(EarlyRet, End, BB1);
+
+  // ; bb1:                                             ; preds = %special-cases
+  // ;   %sr_1     = add i32 %sr, 1
+  // ;   %tmp2     = sub i32 31, %sr
+  // ;   %q        = shl i32 %dividend, %tmp2
+  // ;   %skipLoop = icmp eq i32 %sr_1, 0
+  // ;   br i1 %skipLoop, label %loop-exit, label %preheader
+  Builder.SetInsertPoint(BB1);
+  Value *SR_1     = Builder.CreateAdd(SR, One);
+  Value *Tmp2     = Builder.CreateSub(ThirtyOne, SR);
+  Value *Q        = Builder.CreateShl(Dividend, Tmp2);
+  Value *SkipLoop = Builder.CreateICmpEQ(SR_1, Zero);
+  Builder.CreateCondBr(SkipLoop, LoopExit, Preheader);
+
+  // ; preheader:                                           ; preds = %bb1
+  // ;   %tmp3 = lshr i32 %dividend, %sr_1
+  // ;   %tmp4 = add i32 %divisor, -1
+  // ;   br label %do-while
+  Builder.SetInsertPoint(Preheader);
+  Value *Tmp3 = Builder.CreateLShr(Dividend, SR_1);
+  Value *Tmp4 = Builder.CreateAdd(Divisor, NegOne);
+  Builder.CreateBr(DoWhile);
+
+  // ; do-while:                                 ; preds = %do-while, %preheader
+  // ;   %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+  // ;   %sr_3    = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+  // ;   %r_1     = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+  // ;   %q_2     = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+  // ;   %tmp5  = shl i32 %r_1, 1
+  // ;   %tmp6  = lshr i32 %q_2, 31
+  // ;   %tmp7  = or i32 %tmp5, %tmp6
+  // ;   %tmp8  = shl i32 %q_2, 1
+  // ;   %q_1   = or i32 %carry_1, %tmp8
+  // ;   %tmp9  = sub i32 %tmp4, %tmp7
+  // ;   %tmp10 = ashr i32 %tmp9, 31
+  // ;   %carry = and i32 %tmp10, 1
+  // ;   %tmp11 = and i32 %tmp10, %divisor
+  // ;   %r     = sub i32 %tmp7, %tmp11
+  // ;   %sr_2  = add i32 %sr_3, -1
+  // ;   %tmp12 = icmp eq i32 %sr_2, 0
+  // ;   br i1 %tmp12, label %loop-exit, label %do-while
+  Builder.SetInsertPoint(DoWhile);
+  PHINode *Carry_1 = Builder.CreatePHI(I32Ty, 2);
+  PHINode *SR_3    = Builder.CreatePHI(I32Ty, 2);
+  PHINode *R_1     = Builder.CreatePHI(I32Ty, 2);
+  PHINode *Q_2     = Builder.CreatePHI(I32Ty, 2);
+  Value *Tmp5  = Builder.CreateShl(R_1, One);
+  Value *Tmp6  = Builder.CreateLShr(Q_2, ThirtyOne);
+  Value *Tmp7  = Builder.CreateOr(Tmp5, Tmp6);
+  Value *Tmp8  = Builder.CreateShl(Q_2, One);
+  Value *Q_1   = Builder.CreateOr(Carry_1, Tmp8);
+  Value *Tmp9  = Builder.CreateSub(Tmp4, Tmp7);
+  Value *Tmp10 = Builder.CreateAShr(Tmp9, 31);
+  Value *Carry = Builder.CreateAnd(Tmp10, One);
+  Value *Tmp11 = Builder.CreateAnd(Tmp10, Divisor);
+  Value *R     = Builder.CreateSub(Tmp7, Tmp11);
+  Value *SR_2  = Builder.CreateAdd(SR_3, NegOne);
+  Value *Tmp12 = Builder.CreateICmpEQ(SR_2, Zero);
+  Builder.CreateCondBr(Tmp12, LoopExit, DoWhile);
+
+  // ; loop-exit:                                      ; preds = %do-while, %bb1
+  // ;   %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+  // ;   %q_3     = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+  // ;   %tmp13 = shl i32 %q_3, 1
+  // ;   %q_4   = or i32 %carry_2, %tmp13
+  // ;   br label %end
+  Builder.SetInsertPoint(LoopExit);
+  PHINode *Carry_2 = Builder.CreatePHI(I32Ty, 2);
+  PHINode *Q_3     = Builder.CreatePHI(I32Ty, 2);
+  Value *Tmp13 = Builder.CreateShl(Q_3, One);
+  Value *Q_4   = Builder.CreateOr(Carry_2, Tmp13);
+  Builder.CreateBr(End);
+
+  // ; end:                                 ; preds = %loop-exit, %special-cases
+  // ;   %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+  // ;   ret i32 %q_5
+  Builder.SetInsertPoint(End, End->begin());
+  PHINode *Q_5 = Builder.CreatePHI(I32Ty, 2);
+
+  // Populate the Phis, since all values have now been created. Our Phis were:
+  // ;   %carry_1 = phi i32 [ 0, %preheader ], [ %carry, %do-while ]
+  Carry_1->addIncoming(Zero, Preheader);
+  Carry_1->addIncoming(Carry, DoWhile);
+  // ;   %sr_3 = phi i32 [ %sr_1, %preheader ], [ %sr_2, %do-while ]
+  SR_3->addIncoming(SR_1, Preheader);
+  SR_3->addIncoming(SR_2, DoWhile);
+  // ;   %r_1 = phi i32 [ %tmp3, %preheader ], [ %r, %do-while ]
+  R_1->addIncoming(Tmp3, Preheader);
+  R_1->addIncoming(R, DoWhile);
+  // ;   %q_2 = phi i32 [ %q, %preheader ], [ %q_1, %do-while ]
+  Q_2->addIncoming(Q, Preheader);
+  Q_2->addIncoming(Q_1, DoWhile);
+  // ;   %carry_2 = phi i32 [ 0, %bb1 ], [ %carry, %do-while ]
+  Carry_2->addIncoming(Zero, BB1);
+  Carry_2->addIncoming(Carry, DoWhile);
+  // ;   %q_3 = phi i32 [ %q, %bb1 ], [ %q_1, %do-while ]
+  Q_3->addIncoming(Q, BB1);
+  Q_3->addIncoming(Q_1, DoWhile);
+  // ;   %q_5 = phi i32 [ %q_4, %loop-exit ], [ %retVal, %special-cases ]
+  Q_5->addIncoming(Q_4, LoopExit);
+  Q_5->addIncoming(RetVal, SpecialCases);
+
+  return Q_5;
+}
+
+/// Generate code to calculate the remainder of two integers, replacing Rem with
+/// the generated code. This currently generates code using the udiv expansion,
+/// but future work includes generating more specialized code, e.g. when more
+/// information about the operands are known. Currently only implements 32bit
+/// scalar division (due to udiv's limitation), but future work is removing this
+/// limitation.
+///
+/// @brief Replace Rem with generated code.
+bool llvm::expandRemainder(BinaryOperator *Rem) {
+  assert((Rem->getOpcode() == Instruction::SRem ||
+          Rem->getOpcode() == Instruction::URem) &&
+         "Trying to expand remainder from a non-remainder function");
+
+  IRBuilder<> Builder(Rem);
+
+  // First prepare the sign if it's a signed remainder
+  if (Rem->getOpcode() == Instruction::SRem) {
+    Value *Remainder = generateSignedRemainderCode(Rem->getOperand(0),
+                                                   Rem->getOperand(1), Builder);
+
+    Rem->replaceAllUsesWith(Remainder);
+    Rem->dropAllReferences();
+    Rem->eraseFromParent();
+
+    // If we didn't actually generate a udiv instruction, we're done
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+    if (!BO || BO->getOpcode() != Instruction::URem)
+      return true;
+
+    Rem = BO;
+  }
+
+  Value *Remainder = generatedUnsignedRemainderCode(Rem->getOperand(0),
+                                                    Rem->getOperand(1),
+                                                    Builder);
+
+  Rem->replaceAllUsesWith(Remainder);
+  Rem->dropAllReferences();
+  Rem->eraseFromParent();
+
+  // Expand the udiv
+  if (BinaryOperator *UDiv = dyn_cast<BinaryOperator>(Builder.GetInsertPoint())) {
+    assert(UDiv->getOpcode() == Instruction::UDiv && "Non-udiv in expansion?");
+    expandDivision(UDiv);
+  }
+
+  return true;
+}
+
+
+/// Generate code to divide two integers, replacing Div with the generated
+/// code. This currently generates code similarly to compiler-rt's
+/// implementations, but future work includes generating more specialized code
+/// when more information about the operands are known. Currently only
+/// implements 32bit scalar division, but future work is removing this
+/// limitation.
+///
+/// @brief Replace Div with generated code.
+bool llvm::expandDivision(BinaryOperator *Div) {
+  assert((Div->getOpcode() == Instruction::SDiv ||
+          Div->getOpcode() == Instruction::UDiv) &&
+         "Trying to expand division from a non-division function");
+
+  IRBuilder<> Builder(Div);
+
+  if (Div->getType()->isVectorTy())
+    llvm_unreachable("Div over vectors not supported");
+
+  // First prepare the sign if it's a signed division
+  if (Div->getOpcode() == Instruction::SDiv) {
+    // Lower the code to unsigned division, and reset Div to point to the udiv.
+    Value *Quotient = generateSignedDivisionCode(Div->getOperand(0),
+                                                 Div->getOperand(1), Builder);
+    Div->replaceAllUsesWith(Quotient);
+    Div->dropAllReferences();
+    Div->eraseFromParent();
+
+    // If we didn't actually generate a udiv instruction, we're done
+    BinaryOperator *BO = dyn_cast<BinaryOperator>(Builder.GetInsertPoint());
+    if (!BO || BO->getOpcode() != Instruction::UDiv)
+      return true;
+
+    Div = BO;
+  }
+
+  // Insert the unsigned division code
+  Value *Quotient = generateUnsignedDivisionCode(Div->getOperand(0),
+                                                 Div->getOperand(1),
+                                                 Builder);
+  Div->replaceAllUsesWith(Quotient);
+  Div->dropAllReferences();
+  Div->eraseFromParent();
+
+  return true;
+}
diff --git a/lib/Transforms/Utils/LCSSA.cpp b/lib/Transforms/Utils/LCSSA.cpp
index b654111..2d1b166 100644
--- a/lib/Transforms/Utils/LCSSA.cpp
+++ b/lib/Transforms/Utils/LCSSA.cpp
@@ -29,17 +29,17 @@
 
 #define DEBUG_TYPE "lcssa"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Pass.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/PredIteratorCache.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 using namespace llvm;
 
 STATISTIC(NumLCSSA, "Number of live out of a loop variables");
@@ -53,6 +53,8 @@ namespace {
 
     // Cached analysis information for the current function.
     DominatorTree *DT;
+    LoopInfo *LI;
+    ScalarEvolution *SE;
     std::vector<BasicBlock*> LoopBlocks;
     PredIteratorCache PredCache;
     Loop *L;
@@ -117,6 +119,8 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
   L = TheLoop;
   
   DT = &getAnalysis<DominatorTree>();
+  LI = &getAnalysis<LoopInfo>();
+  SE = getAnalysisIfAvailable<ScalarEvolution>();
 
   // Get the set of exiting blocks.
   SmallVector<BasicBlock*, 8> ExitBlocks;
@@ -156,6 +160,12 @@ bool LCSSA::runOnLoop(Loop *TheLoop, LPPassManager &LPM) {
       MadeChange |= ProcessInstruction(I, ExitBlocks);
     }
   }
+
+  // If we modified the code, remove any caches about the loop from SCEV to
+  // avoid dangling entries.
+  // FIXME: This is a big hammer, can we clear the cache more selectively?
+  if (SE && MadeChange)
+    SE->forgetLoop(L);
   
   assert(L->isLCSSAForm(*DT));
   PredCache.clear();
@@ -245,7 +255,7 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
     // Remember that this phi makes the value alive in this block.
     SSAUpdate.AddAvailableValue(ExitBB, PN);
   }
-  
+
   // Rewrite all uses outside the loop in terms of the new PHIs we just
   // inserted.
   for (unsigned i = 0, e = UsesToRewrite.size(); i != e; ++i) {
@@ -260,6 +270,9 @@ bool LCSSA::ProcessInstruction(Instruction *Inst,
 
     if (isa<PHINode>(UserBB->begin()) &&
         isExitBlock(UserBB, ExitBlocks)) {
+      // Tell the VHs that the uses changed. This updates SCEV's caches.
+      if (UsesToRewrite[i]->get()->hasValueHandle())
+        ValueHandleBase::ValueIsRAUWd(*UsesToRewrite[i], UserBB->begin());
       UsesToRewrite[i]->set(UserBB->begin());
       continue;
     }
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 0601433..a54ee08 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -13,32 +13,34 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Constants.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/GlobalAlias.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/Metadata.h"
-#include "llvm/Operator.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ProfileInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Operator.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GetElementPtrTypeIterator.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/ValueHandle.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -122,6 +124,27 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
       // Check to see if this branch is going to the same place as the default
       // dest.  If so, eliminate it as an explicit compare.
       if (i.getCaseSuccessor() == DefaultDest) {
+        MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+        // MD should have 2 + NumCases operands.
+        if (MD && MD->getNumOperands() == 2 + SI->getNumCases()) {
+          // Collect branch weights into a vector.
+          SmallVector<uint32_t, 8> Weights;
+          for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+               ++MD_i) {
+            ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
+            assert(CI);
+            Weights.push_back(CI->getValue().getZExtValue());
+          }
+          // Merge weight of this case to the default weight.
+          unsigned idx = i.getCaseIndex();
+          Weights[0] += Weights[idx+1];
+          // Remove weight for this case.
+          std::swap(Weights[idx+1], Weights.back());
+          Weights.pop_back();
+          SI->setMetadata(LLVMContext::MD_prof,
+                          MDBuilder(BB->getContext()).
+                          createBranchWeights(Weights));
+        }
         // Remove this entry.
         DefaultDest->removePredecessor(SI->getParent());
         SI->removeCase(i);
@@ -178,8 +201,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
             "cond");
 
         // Insert the new branch.
-        Builder.CreateCondBr(Cond, FirstCase.getCaseSuccessor(),
-                             SI->getDefaultDest());
+        BranchInst *NewBr = Builder.CreateCondBr(Cond,
+                                FirstCase.getCaseSuccessor(),
+                                SI->getDefaultDest());
+        MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+        if (MD && MD->getNumOperands() == 3) {
+          ConstantInt *SICase = dyn_cast<ConstantInt>(MD->getOperand(2));
+          ConstantInt *SIDef = dyn_cast<ConstantInt>(MD->getOperand(1));
+          assert(SICase && SIDef);
+          // The TrueWeight should be the weight for the single case of SI.
+          NewBr->setMetadata(LLVMContext::MD_prof,
+                 MDBuilder(BB->getContext()).
+                 createBranchWeights(SICase->getValue().getZExtValue(),
+                                     SIDef->getValue().getZExtValue()));
+        }
 
         // Delete the old switch.
         SI->eraseFromParent();
@@ -363,7 +398,7 @@ bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
 ///
 /// This returns true if it changed the code, note that it can delete
 /// instructions in other blocks as well in this block.
-bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD,
+bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const DataLayout *TD,
                                        const TargetLibraryInfo *TLI) {
   bool MadeChange = false;
 
@@ -411,7 +446,7 @@ bool llvm::SimplifyInstructionsInBlock(BasicBlock *BB, const TargetData *TD,
 /// .. and delete the predecessor corresponding to the '1', this will attempt to
 /// recursively fold the and to 0.
 void llvm::RemovePredecessorAndSimplify(BasicBlock *BB, BasicBlock *Pred,
-                                        TargetData *TD) {
+                                        DataLayout *TD) {
   // This only adjusts blocks with PHI nodes.
   if (!isa<PHINode>(BB->begin()))
     return;
@@ -570,7 +605,7 @@ bool llvm::TryToSimplifyUncondBranchFromEmptyBlock(BasicBlock *BB) {
   // possible to handle such cases, but difficult: it requires checking whether
   // BB dominates Succ, which is non-trivial to calculate in the case where
   // Succ has multiple predecessors.  Also, it requires checking whether
-  // constructing the necessary self-referential PHI node doesn't intoduce any
+  // constructing the necessary self-referential PHI node doesn't introduce any
   // conflicts; this isn't too difficult, but the previous code for doing this
   // was incorrect.
   //
@@ -726,7 +761,7 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
 /// their preferred alignment from the beginning.
 ///
 static unsigned enforceKnownAlignment(Value *V, unsigned Align,
-                                      unsigned PrefAlign, const TargetData *TD) {
+                                      unsigned PrefAlign, const DataLayout *TD) {
   V = V->stripPointerCasts();
 
   if (AllocaInst *AI = dyn_cast<AllocaInst>(V)) {
@@ -769,7 +804,7 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Align,
 /// and it is more than the alignment of the ultimate object, see if we can
 /// increase the alignment of the ultimate object, making this check succeed.
 unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
-                                          const TargetData *TD) {
+                                          const DataLayout *TD) {
   assert(V->getType()->isPointerTy() &&
          "getOrEnforceKnownAlignment expects a pointer!");
   unsigned BitWidth = TD ? TD->getPointerSizeInBits() : 64;
@@ -894,3 +929,78 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
 
   return 0;
 }
+
+bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
+                                      DIBuilder &Builder) {
+  DbgDeclareInst *DDI = FindAllocaDbgDeclare(AI);
+  if (!DDI)
+    return false;
+  DIVariable DIVar(DDI->getVariable());
+  if (!DIVar.Verify())
+    return false;
+
+  // Create a copy of the original DIDescriptor for user variable, appending
+  // "deref" operation to a list of address elements, as new llvm.dbg.declare
+  // will take a value storing address of the memory for variable, not
+  // alloca itself.
+  Type *Int64Ty = Type::getInt64Ty(AI->getContext());
+  SmallVector<Value*, 4> NewDIVarAddress;
+  if (DIVar.hasComplexAddress()) {
+    for (unsigned i = 0, n = DIVar.getNumAddrElements(); i < n; ++i) {
+      NewDIVarAddress.push_back(
+          ConstantInt::get(Int64Ty, DIVar.getAddrElement(i)));
+    }
+  }
+  NewDIVarAddress.push_back(ConstantInt::get(Int64Ty, DIBuilder::OpDeref));
+  DIVariable NewDIVar = Builder.createComplexVariable(
+      DIVar.getTag(), DIVar.getContext(), DIVar.getName(),
+      DIVar.getFile(), DIVar.getLineNumber(), DIVar.getType(),
+      NewDIVarAddress, DIVar.getArgNumber());
+
+  // Insert llvm.dbg.declare in the same basic block as the original alloca,
+  // and remove old llvm.dbg.declare.
+  BasicBlock *BB = AI->getParent();
+  Builder.insertDeclare(NewAllocaAddress, NewDIVar, BB);
+  DDI->eraseFromParent();
+  return true;
+}
+
+bool llvm::removeUnreachableBlocks(Function &F) {
+  SmallPtrSet<BasicBlock*, 16> Reachable;
+  SmallVector<BasicBlock*, 128> Worklist;
+  Worklist.push_back(&F.getEntryBlock());
+  Reachable.insert(&F.getEntryBlock());
+  do {
+    BasicBlock *BB = Worklist.pop_back_val();
+    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
+      if (Reachable.insert(*SI))
+        Worklist.push_back(*SI);
+  } while (!Worklist.empty());
+
+  if (Reachable.size() == F.size())
+    return false;
+
+  assert(Reachable.size() < F.size());
+  for (Function::iterator I = llvm::next(F.begin()), E = F.end(); I != E; ++I) {
+    if (Reachable.count(I))
+      continue;
+
+    // Remove the block as predecessor of all its reachable successors.
+    // Unreachable successors don't matter as they'll soon be removed, too.
+    for (succ_iterator SI = succ_begin(I), SE = succ_end(I); SI != SE; ++SI)
+      if (Reachable.count(*SI))
+        (*SI)->removePredecessor(I);
+
+    // Zap all instructions in this basic block.
+    while (!I->empty()) {
+      Instruction &Inst = I->back();
+      if (!Inst.use_empty())
+        Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
+      I->getInstList().pop_back();
+    }
+
+    --I;
+    llvm::next(I)->eraseFromParent();
+  }
+  return true;
+}
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index 0bc185d..37819cc 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -39,25 +39,26 @@
 
 #define DEBUG_TYPE "loop-simplify"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Function.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Type.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Type.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/ADT/SetOperations.h"
-#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
 STATISTIC(NumInserted, "Number of pre-header or exit blocks inserted");
@@ -89,6 +90,7 @@ namespace {
 
       AU.addPreserved<AliasAnalysis>();
       AU.addPreserved<ScalarEvolution>();
+      AU.addPreserved<DependenceAnalysis>();
       AU.addPreservedID(BreakCriticalEdgesID);  // No critical edges added.
     }
 
@@ -194,6 +196,11 @@ ReprocessLoop:
 
           BI->setCondition(ConstantInt::get(Cond->getType(),
                                             !L->contains(BI->getSuccessor(0))));
+
+          // This may make the loop analyzable, force SCEV recomputation.
+          if (SE)
+            SE->forgetLoop(L);
+
           Changed = true;
         }
       }
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index 2023750..cb581b3 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -18,12 +18,12 @@
 
 #define DEBUG_TYPE "loop-unroll"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/BasicBlock.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 67e17f4..d801d5f 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -23,12 +23,12 @@
 
 #define DEBUG_TYPE "loop-unroll"
 #include "llvm/Transforms/Utils/UnrollLoop.h"
-#include "llvm/BasicBlock.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/LoopIterator.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
diff --git a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
index 02bdcda..4aee8ff 100644
--- a/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Utils/LowerExpectIntrinsic.cpp
@@ -12,17 +12,17 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "lower-expect-intrinsic"
-#include "llvm/BasicBlock.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/MDBuilder.h"
-#include "llvm/Metadata.h"
-#include "llvm/Pass.h"
-#include "llvm/ADT/Statistic.h"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include <vector>
diff --git a/lib/Transforms/Utils/LowerInvoke.cpp b/lib/Transforms/Utils/LowerInvoke.cpp
index 9305554..9ec84d7 100644
--- a/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/lib/Transforms/Utils/LowerInvoke.cpp
@@ -36,19 +36,19 @@
 
 #define DEBUG_TYPE "lowerinvoke"
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Instructions.h"
-#include "llvm/Intrinsics.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Module.h"
-#include "llvm/Pass.h"
-#include "llvm/Transforms/Utils/BasicBlockUtils.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <csetjmp>
 #include <set>
 using namespace llvm;
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 1547439..955b853 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -14,16 +14,16 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/Pass.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
 #include <algorithm>
 using namespace llvm;
 
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index f4ca81a..61b3965 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -14,12 +14,12 @@
 
 #define DEBUG_TYPE "mem2reg"
 #include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Instructions.h"
-#include "llvm/Function.h"
-#include "llvm/ADT/Statistic.h"
 using namespace llvm;
 
 STATISTIC(NumPromoted, "Number of alloca's promoted");
diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp
new file mode 100644
index 0000000..d519fb7
--- /dev/null
+++ b/lib/Transforms/Utils/MetaRenamer.cpp
@@ -0,0 +1,131 @@
+//===- MetaRenamer.cpp - Rename everything with metasyntatic names --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass renames everything with metasyntatic names. The intent is to use
+// this pass after bugpoint reduction to conceal the nature of the original
+// program.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/Pass.h"
+using namespace llvm;
+
+namespace {
+
+  // This PRNG is from the ISO C spec. It is intentionally simple and
+  // unsuitable for cryptographic use. We're just looking for enough
+  // variety to surprise and delight users.
+  struct PRNG {
+    unsigned long next;
+
+    void srand(unsigned int seed) {
+      next = seed;
+    }
+
+    int rand() {
+      next = next * 1103515245 + 12345;
+      return (unsigned int)(next / 65536) % 32768;
+    }
+  };
+
+  struct MetaRenamer : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    MetaRenamer() : ModulePass(ID) {
+      initializeMetaRenamerPass(*PassRegistry::getPassRegistry());
+    }
+
+    void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesAll();
+    }
+
+    bool runOnModule(Module &M) {
+      static const char *metaNames[] = {
+        // See http://en.wikipedia.org/wiki/Metasyntactic_variable
+        "foo", "bar", "baz", "quux", "barney", "snork", "zot", "blam", "hoge",
+        "wibble", "wobble", "widget", "wombat", "ham", "eggs", "pluto", "spam"
+      };
+
+      // Seed our PRNG with simple additive sum of ModuleID. We're looking to
+      // simply avoid always having the same function names, and we need to
+      // remain deterministic.
+      unsigned int randSeed = 0;
+      for (std::string::const_iterator I = M.getModuleIdentifier().begin(),
+           E = M.getModuleIdentifier().end(); I != E; ++I)
+        randSeed += *I;
+
+      PRNG prng;
+      prng.srand(randSeed);
+
+      // Rename all aliases
+      for (Module::alias_iterator AI = M.alias_begin(), AE = M.alias_end();
+           AI != AE; ++AI)
+        AI->setName("alias");
+
+      // Rename all global variables
+      for (Module::global_iterator GI = M.global_begin(), GE = M.global_end();
+           GI != GE; ++GI)
+        GI->setName("global");
+
+      // Rename all struct types
+      TypeFinder StructTypes;
+      StructTypes.run(M, true);
+      for (unsigned i = 0, e = StructTypes.size(); i != e; ++i) {
+        StructType *STy = StructTypes[i];
+        if (STy->isLiteral() || STy->getName().empty()) continue;
+
+        SmallString<128> NameStorage;
+        STy->setName((Twine("struct.") + metaNames[prng.rand() %
+                     array_lengthof(metaNames)]).toStringRef(NameStorage));
+      }
+
+      // Rename all functions
+      for (Module::iterator FI = M.begin(), FE = M.end();
+           FI != FE; ++FI) {
+        FI->setName(metaNames[prng.rand() % array_lengthof(metaNames)]);
+        runOnFunction(*FI);
+      }
+      return true;
+    }
+
+    bool runOnFunction(Function &F) {
+      for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end();
+           AI != AE; ++AI)
+        if (!AI->getType()->isVoidTy())
+          AI->setName("arg");
+
+      for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+        BB->setName("bb");
+
+        for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+          if (!I->getType()->isVoidTy())
+            I->setName("tmp");
+      }
+      return true;
+    }
+  };
+}
+
+char MetaRenamer::ID = 0;
+INITIALIZE_PASS(MetaRenamer, "metarenamer", 
+                "Assign new names to everything", false, false)
+//===----------------------------------------------------------------------===//
+//
+// MetaRenamer - Rename everything with metasyntactic names.
+//
+ModulePass *llvm::createMetaRenamerPass() {
+  return new MetaRenamer();
+}
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index dbcf3b2..d090b48 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -12,10 +12,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/ModuleUtils.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/Function.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Module.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Module.h"
 
 using namespace llvm;
 
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index dd5e20e..de335ec 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -27,26 +27,26 @@
 
 #define DEBUG_TYPE "mem2reg"
 #include "llvm/Transforms/Utils/PromoteMemToReg.h"
-#include "llvm/Constants.h"
-#include "llvm/DebugInfo.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/DIBuilder.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/Metadata.h"
-#include "llvm/Analysis/AliasSetTracker.h"
-#include "llvm/Analysis/Dominators.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/DIBuilder.h"
+#include "llvm/DebugInfo.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include <algorithm>
 #include <queue>
 using namespace llvm;
@@ -212,9 +212,13 @@ namespace {
     ///
     DenseMap<AllocaInst*, unsigned>  AllocaLookup;
 
-    /// NewPhiNodes - The PhiNodes we're adding.
+    /// NewPhiNodes - The PhiNodes we're adding.  That map is used to simplify
+    /// some Phi nodes as we iterate over it, so it should have deterministic
+    /// iterators.  We could use a MapVector, but since we already maintain a
+    /// map from BasicBlock* to a stable numbering (BBNumbers), the DenseMap is
+    /// more efficient (also supports removal).
     ///
-    DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*> NewPhiNodes;
+    DenseMap<std::pair<unsigned, unsigned>, PHINode*> NewPhiNodes;
     
     /// PhiToAllocaMap - For each PHI node, keep track of which entry in Allocas
     /// it corresponds to.
@@ -588,7 +592,11 @@ void PromoteMem2Reg::run() {
   while (EliminatedAPHI) {
     EliminatedAPHI = false;
     
-    for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I =
+    // Iterating over NewPhiNodes is deterministic, so it is safe to try to
+    // simplify and RAUW them as we go.  If it was not, we could add uses to
+    // the values we replace with in a non deterministic order, thus creating
+    // non deterministic def->use chains.
+    for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I =
            NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E;) {
       PHINode *PN = I->second;
 
@@ -612,7 +620,7 @@ void PromoteMem2Reg::run() {
   // have incoming values for all predecessors.  Loop over all PHI nodes we have
   // created, inserting undef values if they are missing any incoming values.
   //
-  for (DenseMap<std::pair<BasicBlock*, unsigned>, PHINode*>::iterator I =
+  for (DenseMap<std::pair<unsigned, unsigned>, PHINode*>::iterator I =
          NewPhiNodes.begin(), E = NewPhiNodes.end(); I != E; ++I) {
     // We want to do this once per basic block.  As such, only process a block
     // when we find the PHI that is the first entry in the block.
@@ -992,7 +1000,7 @@ void PromoteMem2Reg::PromoteSingleBlockAlloca(AllocaInst *AI, AllocaInfo &Info,
 bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
                                   unsigned &Version) {
   // Look up the basic-block in question.
-  PHINode *&PN = NewPhiNodes[std::make_pair(BB, AllocaNo)];
+  PHINode *&PN = NewPhiNodes[std::make_pair(BBNumbers[BB], AllocaNo)];
 
   // If the BB already has a phi node added for the i'th alloca then we're done!
   if (PN) return false;
diff --git a/lib/Transforms/Utils/SSAUpdater.cpp b/lib/Transforms/Utils/SSAUpdater.cpp
index 72d4199..9d90fbe 100644
--- a/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/lib/Transforms/Utils/SSAUpdater.cpp
@@ -12,12 +12,13 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "ssaupdater"
-#include "llvm/Constants.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
+#include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Support/AlignOf.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CFG.h"
@@ -25,7 +26,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/SSAUpdaterImpl.h"
 
 using namespace llvm;
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 32d7fa1..f10c35f 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -13,18 +13,6 @@
 
 #define DEBUG_TYPE "simplifycfg"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Constants.h"
-#include "llvm/DerivedTypes.h"
-#include "llvm/GlobalVariable.h"
-#include "llvm/IRBuilder.h"
-#include "llvm/Instructions.h"
-#include "llvm/IntrinsicInst.h"
-#include "llvm/LLVMContext.h"
-#include "llvm/MDBuilder.h"
-#include "llvm/Metadata.h"
-#include "llvm/Module.h"
-#include "llvm/Operator.h"
-#include "llvm/Type.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetVector.h"
@@ -32,18 +20,31 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
 #include "llvm/Support/CFG.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ConstantRange.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/NoFolder.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetData.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <algorithm>
-#include <set>
 #include <map>
+#include <set>
 using namespace llvm;
 
 static cl::opt<unsigned>
@@ -54,8 +55,14 @@ static cl::opt<bool>
 DupRet("simplifycfg-dup-ret", cl::Hidden, cl::init(false),
        cl::desc("Duplicate return instructions into unconditional branches"));
 
-STATISTIC(NumSpeculations, "Number of speculative executed instructions");
+static cl::opt<bool>
+SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
+       cl::desc("Sink common instructions down to the end block"));
+
+STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
 STATISTIC(NumLookupTables, "Number of switch instructions turned into lookup tables");
+STATISTIC(NumSinkCommons, "Number of common instructions sunk down to the end block");
+STATISTIC(NumSpeculations, "Number of speculative executed instructions");
 
 namespace {
   /// ValueEqualityComparisonCase - Represents a case of a switch.
@@ -70,10 +77,13 @@ namespace {
       // Comparing pointers is ok as we only rely on the order for uniquing.
       return Value < RHS.Value;
     }
+
+    bool operator==(BasicBlock *RHSDest) const { return Dest == RHSDest; }
   };
 
 class SimplifyCFGOpt {
-  const TargetData *const TD;
+  const TargetTransformInfo &TTI;
+  const DataLayout *const TD;
 
   Value *isValueEqualityComparison(TerminatorInst *TI);
   BasicBlock *GetValueEqualityComparisonCases(TerminatorInst *TI,
@@ -93,7 +103,8 @@ class SimplifyCFGOpt {
   bool SimplifyCondBranch(BranchInst *BI, IRBuilder <>&Builder);
 
 public:
-  explicit SimplifyCFGOpt(const TargetData *td) : TD(td) {}
+  SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout *TD)
+      : TTI(TTI), TD(TD) {}
   bool run(BasicBlock *BB);
 };
 }
@@ -376,7 +387,7 @@ static bool DominatesMergePoint(Value *V, BasicBlock *BB,
 
 /// GetConstantInt - Extract ConstantInt from value, looking through IntToPtr
 /// and PointerNullValue. Return NULL if value is not a constant int.
-static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) {
+static ConstantInt *GetConstantInt(Value *V, const DataLayout *TD) {
   // Normal constant int.
   ConstantInt *CI = dyn_cast<ConstantInt>(V);
   if (CI || !TD || !isa<Constant>(V) || !V->getType()->isPointerTy())
@@ -384,7 +395,7 @@ static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) {
 
   // This is some kind of pointer constant. Turn it into a pointer-sized
   // ConstantInt if possible.
-  IntegerType *PtrTy = TD->getIntPtrType(V->getContext());
+  IntegerType *PtrTy = cast<IntegerType>(TD->getIntPtrType(V->getType()));
 
   // Null pointer means 0, see SelectionDAGBuilder::getValue(const Value*).
   if (isa<ConstantPointerNull>(V))
@@ -410,7 +421,7 @@ static ConstantInt *GetConstantInt(Value *V, const TargetData *TD) {
 /// Values vector.
 static Value *
 GatherConstantCompares(Value *V, std::vector<ConstantInt*> &Vals, Value *&Extra,
-                       const TargetData *TD, bool isEQ, unsigned &UsedICmps) {
+                       const DataLayout *TD, bool isEQ, unsigned &UsedICmps) {
   Instruction *I = dyn_cast<Instruction>(V);
   if (I == 0) return 0;
 
@@ -558,11 +569,7 @@ GetValueEqualityComparisonCases(TerminatorInst *TI,
 /// in the list that match the specified block.
 static void EliminateBlockCases(BasicBlock *BB,
                               std::vector<ValueEqualityComparisonCase> &Cases) {
-  for (unsigned i = 0, e = Cases.size(); i != e; ++i)
-    if (Cases[i].Dest == BB) {
-      Cases.erase(Cases.begin()+i);
-      --i; --e;
-    }
+  Cases.erase(std::remove(Cases.begin(), Cases.end(), BB), Cases.end());
 }
 
 /// ValuesOverlap - Return true if there are any keys in C1 that exist in C2 as
@@ -667,13 +674,32 @@ SimplifyEqualityComparisonWithOnlyPredecessor(TerminatorInst *TI,
     DEBUG(dbgs() << "Threading pred instr: " << *Pred->getTerminator()
                  << "Through successor TI: " << *TI);
 
+    // Collect branch weights into a vector.
+    SmallVector<uint32_t, 8> Weights;
+    MDNode* MD = SI->getMetadata(LLVMContext::MD_prof);
+    bool HasWeight = MD && (MD->getNumOperands() == 2 + SI->getNumCases());
+    if (HasWeight)
+      for (unsigned MD_i = 1, MD_e = MD->getNumOperands(); MD_i < MD_e;
+           ++MD_i) {
+        ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(MD_i));
+        assert(CI);
+        Weights.push_back(CI->getValue().getZExtValue());
+      }
     for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
       --i;
       if (DeadCases.count(i.getCaseValue())) {
+        if (HasWeight) {
+          std::swap(Weights[i.getCaseIndex()+1], Weights.back());
+          Weights.pop_back();
+        }
         i.getCaseSuccessor()->removePredecessor(TI->getParent());
         SI->removeCase(i);
       }
     }
+    if (HasWeight && Weights.size() >= 2)
+      SI->setMetadata(LLVMContext::MD_prof,
+                      MDBuilder(SI->getParent()->getContext()).
+                      createBranchWeights(Weights));
 
     DEBUG(dbgs() << "Leaving: " << *TI << "\n");
     return true;
@@ -752,38 +778,27 @@ static inline bool HasBranchWeights(const Instruction* I) {
   return false;
 }
 
-/// Tries to get a branch weight for the given instruction, returns NULL if it
-/// can't. Pos starts at 0.
-static ConstantInt* GetWeight(Instruction* I, int Pos) {
-  MDNode* ProfMD = I->getMetadata(LLVMContext::MD_prof);
-  if (ProfMD && ProfMD->getOperand(0)) {
-    if (MDString* MDS = dyn_cast<MDString>(ProfMD->getOperand(0))) {
-      if (MDS->getString().equals("branch_weights")) {
-        assert(ProfMD->getNumOperands() >= 3);
-        return dyn_cast<ConstantInt>(ProfMD->getOperand(1 + Pos));
-      }
-    }
-  }
-
-  return 0;
-}
-
-/// Scale the given weights based on the successor TI's metadata. Scaling is
-/// done by multiplying every weight by the sum of the successor's weights.
-static void ScaleWeights(Instruction* STI, MutableArrayRef<uint64_t> Weights) {
-  // Sum the successor's weights
-  assert(HasBranchWeights(STI));
-  unsigned Scale = 0;
-  MDNode* ProfMD = STI->getMetadata(LLVMContext::MD_prof);
-  for (unsigned i = 1; i < ProfMD->getNumOperands(); ++i) {
-    ConstantInt* CI = dyn_cast<ConstantInt>(ProfMD->getOperand(i));
+/// Get Weights of a given TerminatorInst, the default weight is at the front
+/// of the vector. If TI is a conditional eq, we need to swap the branch-weight
+/// metadata.
+static void GetBranchWeights(TerminatorInst *TI,
+                             SmallVectorImpl<uint64_t> &Weights) {
+  MDNode* MD = TI->getMetadata(LLVMContext::MD_prof);
+  assert(MD);
+  for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
+    ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(i));
     assert(CI);
-    Scale += CI->getValue().getZExtValue();
+    Weights.push_back(CI->getValue().getZExtValue());
   }
 
-  // Skip default, as it's replaced during the folding
-  for (unsigned i = 1; i < Weights.size(); ++i) {
-    Weights[i] *= Scale;
+  // If TI is a conditional eq, the default case is the false case,
+  // and the corresponding branch-weight data is at index 2. We swap the
+  // default weight to be the first entry.
+  if (BranchInst* BI = dyn_cast<BranchInst>(TI)) {
+    assert(Weights.size() == 2);
+    ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
+    if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+      std::swap(Weights.front(), Weights.back());
   }
 }
 
@@ -838,52 +853,28 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
 
       // Update the branch weight metadata along the way
       SmallVector<uint64_t, 8> Weights;
-      uint64_t PredDefaultWeight = 0;
       bool PredHasWeights = HasBranchWeights(PTI);
       bool SuccHasWeights = HasBranchWeights(TI);
 
       if (PredHasWeights) {
-        MDNode* MD = PTI->getMetadata(LLVMContext::MD_prof);
-        assert(MD);
-        for (unsigned i = 1, e = MD->getNumOperands(); i < e; ++i) {
-          ConstantInt* CI = dyn_cast<ConstantInt>(MD->getOperand(i));
-          assert(CI);
-          Weights.push_back(CI->getValue().getZExtValue());
-        }
-
-        // If the predecessor is a conditional eq, then swap the default weight
-        // to be the first entry.
-        if (BranchInst* BI = dyn_cast<BranchInst>(PTI)) {
-          assert(Weights.size() == 2);
-          ICmpInst *ICI = cast<ICmpInst>(BI->getCondition());
-
-          if (ICI->getPredicate() == ICmpInst::ICMP_EQ) {
-            std::swap(Weights.front(), Weights.back());
-          }
-        }
-
-        PredDefaultWeight = Weights.front();
-      } else if (SuccHasWeights) {
+        GetBranchWeights(PTI, Weights);
+        // branch-weight metadata is inconsistent here.
+        if (Weights.size() != 1 + PredCases.size())
+          PredHasWeights = SuccHasWeights = false;
+      } else if (SuccHasWeights)
         // If there are no predecessor weights but there are successor weights,
         // populate Weights with 1, which will later be scaled to the sum of
         // successor's weights
         Weights.assign(1 + PredCases.size(), 1);
-        PredDefaultWeight = 1;
-      }
 
-      uint64_t SuccDefaultWeight = 0;
+      SmallVector<uint64_t, 8> SuccWeights;
       if (SuccHasWeights) {
-        int Index = 0;
-        if (BranchInst* BI = dyn_cast<BranchInst>(TI)) {
-          ICmpInst* ICI = dyn_cast<ICmpInst>(BI->getCondition());
-          assert(ICI);
-
-          if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
-            Index = 1;
-        }
-
-        SuccDefaultWeight = GetWeight(TI, Index)->getValue().getZExtValue();
-      }
+        GetBranchWeights(TI, SuccWeights);
+        // branch-weight metadata is inconsistent here.
+        if (SuccWeights.size() != 1 + BBCases.size())
+          PredHasWeights = SuccHasWeights = false;
+      } else if (PredHasWeights)
+        SuccWeights.assign(1 + BBCases.size(), 1);
 
       if (PredDefault == BB) {
         // If this is the default destination from PTI, only the edges in TI
@@ -896,7 +887,9 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
             // The default destination is BB, we don't need explicit targets.
             std::swap(PredCases[i], PredCases.back());
 
-            if (PredHasWeights) {
+            if (PredHasWeights || SuccHasWeights) {
+              // Increase weight for the default case.
+              Weights[0] += Weights[i+1];
               std::swap(Weights[i+1], Weights.back());
               Weights.pop_back();
             }
@@ -912,40 +905,46 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
           NewSuccessors.push_back(BBDefault);
         }
 
-        if (SuccHasWeights) {
-          ScaleWeights(TI, Weights);
-          Weights.front() *= SuccDefaultWeight;
-        } else if (PredHasWeights) {
-          Weights.front() /= (1 + BBCases.size());
-        }
-
+        unsigned CasesFromPred = Weights.size();
+        uint64_t ValidTotalSuccWeight = 0;
         for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
           if (!PTIHandled.count(BBCases[i].Value) &&
               BBCases[i].Dest != BBDefault) {
             PredCases.push_back(BBCases[i]);
             NewSuccessors.push_back(BBCases[i].Dest);
-            if (SuccHasWeights) {
-              Weights.push_back(PredDefaultWeight *
-                                GetWeight(TI, i)->getValue().getZExtValue());
-            } else if (PredHasWeights) {
-              // Split the old default's weight amongst the children
-              Weights.push_back(PredDefaultWeight / (1 + BBCases.size()));
+            if (SuccHasWeights || PredHasWeights) {
+              // The default weight is at index 0, so weight for the ith case
+              // should be at index i+1. Scale the cases from successor by
+              // PredDefaultWeight (Weights[0]).
+              Weights.push_back(Weights[0] * SuccWeights[i+1]);
+              ValidTotalSuccWeight += SuccWeights[i+1];
             }
           }
 
+        if (SuccHasWeights || PredHasWeights) {
+          ValidTotalSuccWeight += SuccWeights[0];
+          // Scale the cases from predecessor by ValidTotalSuccWeight.
+          for (unsigned i = 1; i < CasesFromPred; ++i)
+            Weights[i] *= ValidTotalSuccWeight;
+          // Scale the default weight by SuccDefaultWeight (SuccWeights[0]).
+          Weights[0] *= SuccWeights[0];
+        }
       } else {
-        // FIXME: preserve branch weight metadata, similarly to the 'then'
-        // above. For now, drop it.
-        PredHasWeights = false;
-        SuccHasWeights = false;
-
         // If this is not the default destination from PSI, only the edges
         // in SI that occur in PSI with a destination of BB will be
         // activated.
         std::set<ConstantInt*, ConstantIntOrdering> PTIHandled;
+        std::map<ConstantInt*, uint64_t> WeightsForHandled;
         for (unsigned i = 0, e = PredCases.size(); i != e; ++i)
           if (PredCases[i].Dest == BB) {
             PTIHandled.insert(PredCases[i].Value);
+
+            if (PredHasWeights || SuccHasWeights) {
+              WeightsForHandled[PredCases[i].Value] = Weights[i+1];
+              std::swap(Weights[i+1], Weights.back());
+              Weights.pop_back();
+            }
+
             std::swap(PredCases[i], PredCases.back());
             PredCases.pop_back();
             --i; --e;
@@ -956,6 +955,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
         for (unsigned i = 0, e = BBCases.size(); i != e; ++i)
           if (PTIHandled.count(BBCases[i].Value)) {
             // If this is one we are capable of getting...
+            if (PredHasWeights || SuccHasWeights)
+              Weights.push_back(WeightsForHandled[BBCases[i].Value]);
             PredCases.push_back(BBCases[i]);
             NewSuccessors.push_back(BBCases[i].Dest);
             PTIHandled.erase(BBCases[i].Value);// This constant is taken care of
@@ -966,6 +967,8 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
         for (std::set<ConstantInt*, ConstantIntOrdering>::iterator I =
                                     PTIHandled.begin(),
                E = PTIHandled.end(); I != E; ++I) {
+          if (PredHasWeights || SuccHasWeights)
+            Weights.push_back(WeightsForHandled[*I]);
           PredCases.push_back(ValueEqualityComparisonCase(*I, BBDefault));
           NewSuccessors.push_back(BBDefault);
         }
@@ -980,7 +983,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
       Builder.SetInsertPoint(PTI);
       // Convert pointer to int before we switch.
       if (CV->getType()->isPointerTy()) {
-        assert(TD && "Cannot switch on pointer without TargetData");
+        assert(TD && "Cannot switch on pointer without DataLayout");
         CV = Builder.CreatePtrToInt(CV, TD->getIntPtrType(CV->getContext()),
                                     "magicptr");
       }
@@ -1160,6 +1163,175 @@ HoistTerminator:
   return true;
 }
 
+/// SinkThenElseCodeToEnd - Given an unconditional branch that goes to BBEnd,
+/// check whether BBEnd has only two predecessors and the other predecessor
+/// ends with an unconditional branch. If it is true, sink any common code
+/// in the two predecessors to BBEnd.
+static bool SinkThenElseCodeToEnd(BranchInst *BI1) {
+  assert(BI1->isUnconditional());
+  BasicBlock *BB1 = BI1->getParent();
+  BasicBlock *BBEnd = BI1->getSuccessor(0);
+
+  // Check that BBEnd has two predecessors and the other predecessor ends with
+  // an unconditional branch.
+  pred_iterator PI = pred_begin(BBEnd), PE = pred_end(BBEnd);
+  BasicBlock *Pred0 = *PI++;
+  if (PI == PE) // Only one predecessor.
+    return false;
+  BasicBlock *Pred1 = *PI++;
+  if (PI != PE) // More than two predecessors.
+    return false;
+  BasicBlock *BB2 = (Pred0 == BB1) ? Pred1 : Pred0;
+  BranchInst *BI2 = dyn_cast<BranchInst>(BB2->getTerminator());
+  if (!BI2 || !BI2->isUnconditional())
+    return false;
+
+  // Gather the PHI nodes in BBEnd.
+  std::map<Value*, std::pair<Value*, PHINode*> > MapValueFromBB1ToBB2;
+  Instruction *FirstNonPhiInBBEnd = 0;
+  for (BasicBlock::iterator I = BBEnd->begin(), E = BBEnd->end();
+       I != E; ++I) {
+    if (PHINode *PN = dyn_cast<PHINode>(I)) {
+      Value *BB1V = PN->getIncomingValueForBlock(BB1);
+      Value *BB2V = PN->getIncomingValueForBlock(BB2);
+      MapValueFromBB1ToBB2[BB1V] = std::make_pair(BB2V, PN);
+    } else {
+      FirstNonPhiInBBEnd = &*I;
+      break;
+    }
+  }
+  if (!FirstNonPhiInBBEnd)
+    return false;
+
+
+  // This does very trivial matching, with limited scanning, to find identical
+  // instructions in the two blocks.  We scan backward for obviously identical
+  // instructions in an identical order.
+  BasicBlock::InstListType::reverse_iterator RI1 = BB1->getInstList().rbegin(),
+      RE1 = BB1->getInstList().rend(), RI2 = BB2->getInstList().rbegin(),
+      RE2 = BB2->getInstList().rend();
+  // Skip debug info.
+  while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+  if (RI1 == RE1)
+    return false;
+  while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+  if (RI2 == RE2)
+    return false;
+  // Skip the unconditional branches.
+  ++RI1;
+  ++RI2;
+
+  bool Changed = false;
+  while (RI1 != RE1 && RI2 != RE2) {
+    // Skip debug info.
+    while (RI1 != RE1 && isa<DbgInfoIntrinsic>(&*RI1)) ++RI1;
+    if (RI1 == RE1)
+      return Changed;
+    while (RI2 != RE2 && isa<DbgInfoIntrinsic>(&*RI2)) ++RI2;
+    if (RI2 == RE2)
+      return Changed;
+
+    Instruction *I1 = &*RI1, *I2 = &*RI2;
+    // I1 and I2 should have a single use in the same PHI node, and they
+    // perform the same operation.
+    // Cannot move control-flow-involving, volatile loads, vaarg, etc.
+    if (isa<PHINode>(I1) || isa<PHINode>(I2) ||
+        isa<TerminatorInst>(I1) || isa<TerminatorInst>(I2) ||
+        isa<LandingPadInst>(I1) || isa<LandingPadInst>(I2) ||
+        isa<AllocaInst>(I1) || isa<AllocaInst>(I2) ||
+        I1->mayHaveSideEffects() || I2->mayHaveSideEffects() ||
+        I1->mayReadOrWriteMemory() || I2->mayReadOrWriteMemory() ||
+        !I1->hasOneUse() || !I2->hasOneUse() ||
+        MapValueFromBB1ToBB2.find(I1) == MapValueFromBB1ToBB2.end() ||
+        MapValueFromBB1ToBB2[I1].first != I2)
+      return Changed;
+
+    // Check whether we should swap the operands of ICmpInst.
+    ICmpInst *ICmp1 = dyn_cast<ICmpInst>(I1), *ICmp2 = dyn_cast<ICmpInst>(I2);
+    bool SwapOpnds = false;
+    if (ICmp1 && ICmp2 &&
+        ICmp1->getOperand(0) != ICmp2->getOperand(0) &&
+        ICmp1->getOperand(1) != ICmp2->getOperand(1) &&
+        (ICmp1->getOperand(0) == ICmp2->getOperand(1) ||
+         ICmp1->getOperand(1) == ICmp2->getOperand(0))) {
+      ICmp2->swapOperands();
+      SwapOpnds = true;
+    }
+    if (!I1->isSameOperationAs(I2)) {
+      if (SwapOpnds)
+        ICmp2->swapOperands();
+      return Changed;
+    }
+
+    // The operands should be either the same or they need to be generated
+    // with a PHI node after sinking. We only handle the case where there is
+    // a single pair of different operands.
+    Value *DifferentOp1 = 0, *DifferentOp2 = 0;
+    unsigned Op1Idx = 0;
+    for (unsigned I = 0, E = I1->getNumOperands(); I != E; ++I) {
+      if (I1->getOperand(I) == I2->getOperand(I))
+        continue;
+      // Early exit if we have more-than one pair of different operands or
+      // the different operand is already in MapValueFromBB1ToBB2.
+      // Early exit if we need a PHI node to replace a constant.
+      if (DifferentOp1 ||
+          MapValueFromBB1ToBB2.find(I1->getOperand(I)) !=
+          MapValueFromBB1ToBB2.end() ||
+          isa<Constant>(I1->getOperand(I)) ||
+          isa<Constant>(I2->getOperand(I))) {
+        // If we can't sink the instructions, undo the swapping.
+        if (SwapOpnds)
+          ICmp2->swapOperands();
+        return Changed;
+      }
+      DifferentOp1 = I1->getOperand(I);
+      Op1Idx = I;
+      DifferentOp2 = I2->getOperand(I);
+    }
+
+    // We insert the pair of different operands to MapValueFromBB1ToBB2 and
+    // remove (I1, I2) from MapValueFromBB1ToBB2.
+    if (DifferentOp1) {
+      PHINode *NewPN = PHINode::Create(DifferentOp1->getType(), 2,
+                                       DifferentOp1->getName() + ".sink",
+                                       BBEnd->begin());
+      MapValueFromBB1ToBB2[DifferentOp1] = std::make_pair(DifferentOp2, NewPN);
+      // I1 should use NewPN instead of DifferentOp1.
+      I1->setOperand(Op1Idx, NewPN);
+      NewPN->addIncoming(DifferentOp1, BB1);
+      NewPN->addIncoming(DifferentOp2, BB2);
+      DEBUG(dbgs() << "Create PHI node " << *NewPN << "\n";);
+    }
+    PHINode *OldPN = MapValueFromBB1ToBB2[I1].second;
+    MapValueFromBB1ToBB2.erase(I1);
+
+    DEBUG(dbgs() << "SINK common instructions " << *I1 << "\n";);
+    DEBUG(dbgs() << "                         " << *I2 << "\n";);
+    // We need to update RE1 and RE2 if we are going to sink the first
+    // instruction in the basic block down.
+    bool UpdateRE1 = (I1 == BB1->begin()), UpdateRE2 = (I2 == BB2->begin());
+    // Sink the instruction.
+    BBEnd->getInstList().splice(FirstNonPhiInBBEnd, BB1->getInstList(), I1);
+    if (!OldPN->use_empty())
+      OldPN->replaceAllUsesWith(I1);
+    OldPN->eraseFromParent();
+
+    if (!I2->use_empty())
+      I2->replaceAllUsesWith(I1);
+    I1->intersectOptionalDataWith(I2);
+    I2->eraseFromParent();
+
+    if (UpdateRE1)
+      RE1 = BB1->getInstList().rend();
+    if (UpdateRE2)
+      RE2 = BB2->getInstList().rend();
+    FirstNonPhiInBBEnd = I1;
+    NumSinkCommons++;
+    Changed = true;
+  }
+  return Changed;
+}
+
 /// SpeculativelyExecuteBB - Given a conditional branch that goes to BB1
 /// and an BB2 and the only successor of BB1 is BB2, hoist simple code
 /// (for now, restricted to a single instruction that's side effect free) from
@@ -1243,7 +1415,7 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *BB1) {
     if (BB1V == BIParentV)
       continue;
 
-    // Check for saftey.
+    // Check for safety.
     if (ConstantExpr *CE = dyn_cast<ConstantExpr>(BB1V)) {
       // An unfolded ConstantExpr could end up getting expanded into
       // Instructions. Don't speculate this and another instruction at
@@ -1339,7 +1511,7 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
 /// that is defined in the same block as the branch and if any PHI entries are
 /// constants, thread edges corresponding to that entry to be branches to their
 /// ultimate destination.
-static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
+static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout *TD) {
   BasicBlock *BB = BI->getParent();
   PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
   // NOTE: we currently cannot transform this case if the PHI node is used
@@ -1435,7 +1607,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const TargetData *TD) {
 
 /// FoldTwoEntryPHINode - Given a BB that starts with the specified two-entry
 /// PHI node, see if we can eliminate it.
-static bool FoldTwoEntryPHINode(PHINode *PN, const TargetData *TD) {
+static bool FoldTwoEntryPHINode(PHINode *PN, const DataLayout *TD) {
   // Ok, this is a two entry PHI node.  Check to see if this is a simple "if
   // statement", which has a very simple dominance structure.  Basically, we
   // are trying to find the condition that is being branched on, which
@@ -1662,7 +1834,7 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
 /// parameters and return true, or returns false if no or invalid metadata was
 /// found.
 static bool ExtractBranchMetadata(BranchInst *BI,
-                                  APInt &ProbTrue, APInt &ProbFalse) {
+                                  uint64_t &ProbTrue, uint64_t &ProbFalse) {
   assert(BI->isConditional() &&
          "Looking for probabilities on unconditional branch?");
   MDNode *ProfileData = BI->getMetadata(LLVMContext::MD_prof);
@@ -1670,35 +1842,11 @@ static bool ExtractBranchMetadata(BranchInst *BI,
   ConstantInt *CITrue = dyn_cast<ConstantInt>(ProfileData->getOperand(1));
   ConstantInt *CIFalse = dyn_cast<ConstantInt>(ProfileData->getOperand(2));
   if (!CITrue || !CIFalse) return false;
-  ProbTrue = CITrue->getValue();
-  ProbFalse = CIFalse->getValue();
-  assert(ProbTrue.getBitWidth() == 32 && ProbFalse.getBitWidth() == 32 &&
-         "Branch probability metadata must be 32-bit integers");
+  ProbTrue = CITrue->getValue().getZExtValue();
+  ProbFalse = CIFalse->getValue().getZExtValue();
   return true;
 }
 
-/// MultiplyAndLosePrecision - Multiplies A and B, then returns the result. In
-/// the event of overflow, logically-shifts all four inputs right until the
-/// multiply fits.
-static APInt MultiplyAndLosePrecision(APInt &A, APInt &B, APInt &C, APInt &D,
-                                      unsigned &BitsLost) {
-  BitsLost = 0;
-  bool Overflow = false;
-  APInt Result = A.umul_ov(B, Overflow);
-  if (Overflow) {
-    APInt MaxB = APInt::getMaxValue(A.getBitWidth()).udiv(A);
-    do {
-      B = B.lshr(1);
-      ++BitsLost;
-    } while (B.ugt(MaxB));
-    A = A.lshr(BitsLost);
-    C = C.lshr(BitsLost);
-    D = D.lshr(BitsLost);
-    Result = A * B;
-  }
-  return Result;
-}
-
 /// checkCSEInPredecessor - Return true if the given instruction is available
 /// in its predecessor block. If yes, the instruction will be removed.
 ///
@@ -1824,7 +1972,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
       continue;
 
     // Determine if the two branches share a common destination.
-    Instruction::BinaryOps Opc;
+    Instruction::BinaryOps Opc = Instruction::BinaryOpsEnd;
     bool InvertPredCond = false;
 
     if (BI->isConditional()) {
@@ -1923,14 +2071,53 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
                                             New, "or.cond"));
       PBI->setCondition(NewCond);
 
+      uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+      bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
+                                                  PredFalseWeight);
+      bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
+                                                  SuccFalseWeight);
+      SmallVector<uint64_t, 8> NewWeights;
+
       if (PBI->getSuccessor(0) == BB) {
+        if (PredHasWeights && SuccHasWeights) {
+          // PBI: br i1 %x, BB, FalseDest
+          // BI:  br i1 %y, TrueDest, FalseDest
+          //TrueWeight is TrueWeight for PBI * TrueWeight for BI.
+          NewWeights.push_back(PredTrueWeight * SuccTrueWeight);
+          //FalseWeight is FalseWeight for PBI * TotalWeight for BI +
+          //               TrueWeight for PBI * FalseWeight for BI.
+          // We assume that total weights of a BranchInst can fit into 32 bits.
+          // Therefore, we will not have overflow using 64-bit arithmetic.
+          NewWeights.push_back(PredFalseWeight * (SuccFalseWeight +
+               SuccTrueWeight) + PredTrueWeight * SuccFalseWeight);
+        }
         AddPredecessorToBlock(TrueDest, PredBlock, BB);
         PBI->setSuccessor(0, TrueDest);
       }
       if (PBI->getSuccessor(1) == BB) {
+        if (PredHasWeights && SuccHasWeights) {
+          // PBI: br i1 %x, TrueDest, BB
+          // BI:  br i1 %y, TrueDest, FalseDest
+          //TrueWeight is TrueWeight for PBI * TotalWeight for BI +
+          //              FalseWeight for PBI * TrueWeight for BI.
+          NewWeights.push_back(PredTrueWeight * (SuccFalseWeight +
+              SuccTrueWeight) + PredFalseWeight * SuccTrueWeight);
+          //FalseWeight is FalseWeight for PBI * FalseWeight for BI.
+          NewWeights.push_back(PredFalseWeight * SuccFalseWeight);
+        }
         AddPredecessorToBlock(FalseDest, PredBlock, BB);
         PBI->setSuccessor(1, FalseDest);
       }
+      if (NewWeights.size() == 2) {
+        // Halve the weights if any of them cannot fit in an uint32_t
+        FitWeights(NewWeights);
+
+        SmallVector<uint32_t, 8> MDWeights(NewWeights.begin(),NewWeights.end());
+        PBI->setMetadata(LLVMContext::MD_prof,
+                         MDBuilder(BI->getContext()).
+                         createBranchWeights(MDWeights));
+      } else
+        PBI->setMetadata(LLVMContext::MD_prof, NULL);
     } else {
       // Update PHI nodes in the common successors.
       for (unsigned i = 0, e = PHIs.size(); i != e; ++i) {
@@ -1985,90 +2172,6 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI) {
     // TODO: If BB is reachable from all paths through PredBlock, then we
     // could replace PBI's branch probabilities with BI's.
 
-    // Merge probability data into PredBlock's branch.
-    APInt A, B, C, D;
-    if (PBI->isConditional() && BI->isConditional() &&
-        ExtractBranchMetadata(PBI, C, D) && ExtractBranchMetadata(BI, A, B)) {
-      // Given IR which does:
-      //   bbA:
-      //     br i1 %x, label %bbB, label %bbC
-      //   bbB:
-      //     br i1 %y, label %bbD, label %bbC
-      // Let's call the probability that we take the edge from %bbA to %bbB
-      // 'a', from %bbA to %bbC, 'b', from %bbB to %bbD 'c' and from %bbB to
-      // %bbC probability 'd'.
-      //
-      // We transform the IR into:
-      //   bbA:
-      //     br i1 %z, label %bbD, label %bbC
-      // where the probability of going to %bbD is (a*c) and going to bbC is
-      // (b+a*d).
-      //
-      // Probabilities aren't stored as ratios directly. Using branch weights,
-      // we get:
-      // (a*c)% = A*C, (b+(a*d))% = A*D+B*C+B*D.
-
-      // In the event of overflow, we want to drop the LSB of the input
-      // probabilities.
-      unsigned BitsLost;
-
-      // Ignore overflow result on ProbTrue.
-      APInt ProbTrue = MultiplyAndLosePrecision(A, C, B, D, BitsLost);
-
-      APInt Tmp1 = MultiplyAndLosePrecision(B, D, A, C, BitsLost);
-      if (BitsLost) {
-        ProbTrue = ProbTrue.lshr(BitsLost*2);
-      }
-
-      APInt Tmp2 = MultiplyAndLosePrecision(A, D, C, B, BitsLost);
-      if (BitsLost) {
-        ProbTrue = ProbTrue.lshr(BitsLost*2);
-        Tmp1 = Tmp1.lshr(BitsLost*2);
-      }
-
-      APInt Tmp3 = MultiplyAndLosePrecision(B, C, A, D, BitsLost);
-      if (BitsLost) {
-        ProbTrue = ProbTrue.lshr(BitsLost*2);
-        Tmp1 = Tmp1.lshr(BitsLost*2);
-        Tmp2 = Tmp2.lshr(BitsLost*2);
-      }
-
-      bool Overflow1 = false, Overflow2 = false;
-      APInt Tmp4 = Tmp2.uadd_ov(Tmp3, Overflow1);
-      APInt ProbFalse = Tmp4.uadd_ov(Tmp1, Overflow2);
-
-      if (Overflow1 || Overflow2) {
-        ProbTrue = ProbTrue.lshr(1);
-        Tmp1 = Tmp1.lshr(1);
-        Tmp2 = Tmp2.lshr(1);
-        Tmp3 = Tmp3.lshr(1);
-        Tmp4 = Tmp2 + Tmp3;
-        ProbFalse = Tmp4 + Tmp1;
-      }
-
-      // The sum of branch weights must fit in 32-bits.
-      if (ProbTrue.isNegative() && ProbFalse.isNegative()) {
-        ProbTrue = ProbTrue.lshr(1);
-        ProbFalse = ProbFalse.lshr(1);
-      }
-
-      if (ProbTrue != ProbFalse) {
-        // Normalize the result.
-        APInt GCD = APIntOps::GreatestCommonDivisor(ProbTrue, ProbFalse);
-        ProbTrue = ProbTrue.udiv(GCD);
-        ProbFalse = ProbFalse.udiv(GCD);
-
-        MDBuilder MDB(BI->getContext());
-        MDNode *N = MDB.createBranchWeights(ProbTrue.getZExtValue(),
-                                            ProbFalse.getZExtValue());
-        PBI->setMetadata(LLVMContext::MD_prof, N);
-      } else {
-        PBI->setMetadata(LLVMContext::MD_prof, NULL);
-      }
-    } else {
-      PBI->setMetadata(LLVMContext::MD_prof, NULL);
-    }
-
     // Copy any debug value intrinsics into the end of PredBlock.
     for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
       if (isa<DbgInfoIntrinsic>(*I))
@@ -2223,6 +2326,33 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
   PBI->setSuccessor(0, CommonDest);
   PBI->setSuccessor(1, OtherDest);
 
+  // Update branch weight for PBI.
+  uint64_t PredTrueWeight, PredFalseWeight, SuccTrueWeight, SuccFalseWeight;
+  bool PredHasWeights = ExtractBranchMetadata(PBI, PredTrueWeight,
+                                              PredFalseWeight);
+  bool SuccHasWeights = ExtractBranchMetadata(BI, SuccTrueWeight,
+                                              SuccFalseWeight);
+  if (PredHasWeights && SuccHasWeights) {
+    uint64_t PredCommon = PBIOp ? PredFalseWeight : PredTrueWeight;
+    uint64_t PredOther = PBIOp ?PredTrueWeight : PredFalseWeight;
+    uint64_t SuccCommon = BIOp ? SuccFalseWeight : SuccTrueWeight;
+    uint64_t SuccOther = BIOp ? SuccTrueWeight : SuccFalseWeight;
+    // The weight to CommonDest should be PredCommon * SuccTotal +
+    //                                    PredOther * SuccCommon.
+    // The weight to OtherDest should be PredOther * SuccOther.
+    SmallVector<uint64_t, 2> NewWeights;
+    NewWeights.push_back(PredCommon * (SuccCommon + SuccOther) +
+                         PredOther * SuccCommon);
+    NewWeights.push_back(PredOther * SuccOther);
+    // Halve the weights if any of them cannot fit in an uint32_t
+    FitWeights(NewWeights);
+
+    SmallVector<uint32_t, 2> MDWeights(NewWeights.begin(),NewWeights.end());
+    PBI->setMetadata(LLVMContext::MD_prof,
+                     MDBuilder(BI->getContext()).
+                     createBranchWeights(MDWeights));
+  }
+
   // OtherDest may have phi nodes.  If so, add an entry from PBI's
   // block that are identical to the entries for BI's block.
   AddPredecessorToBlock(OtherDest, PBI->getParent(), BB);
@@ -2259,7 +2389,9 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI) {
 // Also makes sure not to introduce new successors by assuming that edges to
 // non-successor TrueBBs and FalseBBs aren't reachable.
 static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
-                                       BasicBlock *TrueBB, BasicBlock *FalseBB){
+                                       BasicBlock *TrueBB, BasicBlock *FalseBB,
+                                       uint32_t TrueWeight,
+                                       uint32_t FalseWeight){
   // Remove any superfluous successor edges from the CFG.
   // First, figure out which successors to preserve.
   // If TrueBB and FalseBB are equal, only try to preserve one copy of that
@@ -2288,10 +2420,15 @@ static bool SimplifyTerminatorOnSelect(TerminatorInst *OldTerm, Value *Cond,
       // We were only looking for one successor, and it was present.
       // Create an unconditional branch to it.
       Builder.CreateBr(TrueBB);
-    else
+    else {
       // We found both of the successors we were looking for.
       // Create a conditional branch sharing the condition of the select.
-      Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+      BranchInst *NewBI = Builder.CreateCondBr(Cond, TrueBB, FalseBB);
+      if (TrueWeight != FalseWeight)
+        NewBI->setMetadata(LLVMContext::MD_prof,
+                           MDBuilder(OldTerm->getContext()).
+                           createBranchWeights(TrueWeight, FalseWeight));
+    }
   } else if (KeepEdge1 && (KeepEdge2 || TrueBB == FalseBB)) {
     // Neither of the selected blocks were successors, so this
     // terminator must be unreachable.
@@ -2328,8 +2465,23 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
   BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor();
   BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor();
 
+  // Get weight for TrueBB and FalseBB.
+  uint32_t TrueWeight = 0, FalseWeight = 0;
+  SmallVector<uint64_t, 8> Weights;
+  bool HasWeights = HasBranchWeights(SI);
+  if (HasWeights) {
+    GetBranchWeights(SI, Weights);
+    if (Weights.size() == 1 + SI->getNumCases()) {
+      TrueWeight = (uint32_t)Weights[SI->findCaseValue(TrueVal).
+                                     getSuccessorIndex()];
+      FalseWeight = (uint32_t)Weights[SI->findCaseValue(FalseVal).
+                                      getSuccessorIndex()];
+    }
+  }
+
   // Perform the actual simplification.
-  return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB);
+  return SimplifyTerminatorOnSelect(SI, Condition, TrueBB, FalseBB,
+                                    TrueWeight, FalseWeight);
 }
 
 // SimplifyIndirectBrOnSelect - Replaces
@@ -2349,7 +2501,8 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
   BasicBlock *FalseBB = FBA->getBasicBlock();
 
   // Perform the actual simplification.
-  return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB);
+  return SimplifyTerminatorOnSelect(IBI, SI->getCondition(), TrueBB, FalseBB,
+                                    0, 0);
 }
 
 /// TryToSimplifyUncondBranchWithICmpInIt - This is called when we find an icmp
@@ -2369,9 +2522,9 @@ static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
 ///
 /// We prefer to split the edge to 'end' so that there is a true/false entry to
 /// the PHI, merging the third icmp into the switch.
-static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
-                                                  const TargetData *TD,
-                                                  IRBuilder<> &Builder) {
+static bool TryToSimplifyUncondBranchWithICmpInIt(
+    ICmpInst *ICI, IRBuilder<> &Builder, const TargetTransformInfo &TTI,
+    const DataLayout *TD) {
   BasicBlock *BB = ICI->getParent();
 
   // If the block has any PHIs in it or the icmp has multiple uses, it is too
@@ -2404,7 +2557,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
       ICI->eraseFromParent();
     }
     // BB is now empty, so it is likely to simplify away.
-    return SimplifyCFG(BB) | true;
+    return SimplifyCFG(BB, TTI, TD) | true;
   }
 
   // Ok, the block is reachable from the default dest.  If the constant we're
@@ -2420,7 +2573,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
     ICI->replaceAllUsesWith(V);
     ICI->eraseFromParent();
     // BB is now empty, so it is likely to simplify away.
-    return SimplifyCFG(BB) | true;
+    return SimplifyCFG(BB, TTI, TD) | true;
   }
 
   // The use of the icmp has to be in the 'end' block, by the only PHI node in
@@ -2448,6 +2601,21 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
   // the switch to the merge point on the compared value.
   BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "switch.edge",
                                          BB->getParent(), BB);
+  SmallVector<uint64_t, 8> Weights;
+  bool HasWeights = HasBranchWeights(SI);
+  if (HasWeights) {
+    GetBranchWeights(SI, Weights);
+    if (Weights.size() == 1 + SI->getNumCases()) {
+      // Split weight for default case to case for "Cst".
+      Weights[0] = (Weights[0]+1) >> 1;
+      Weights.push_back(Weights[0]);
+
+      SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+      SI->setMetadata(LLVMContext::MD_prof,
+                      MDBuilder(SI->getContext()).
+                      createBranchWeights(MDWeights));
+    }
+  }
   SI->addCase(Cst, NewBB);
 
   // NewBB branches to the phi block, add the uncond branch and the phi entry.
@@ -2461,7 +2629,7 @@ static bool TryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
 /// SimplifyBranchOnICmpChain - The specified branch is a conditional branch.
 /// Check to see if it is branching on an or/and chain of icmp instructions, and
 /// fold it into a switch instruction if so.
-static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
+static bool SimplifyBranchOnICmpChain(BranchInst *BI, const DataLayout *TD,
                                       IRBuilder<> &Builder) {
   Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
   if (Cond == 0) return false;
@@ -2542,7 +2710,7 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, const TargetData *TD,
   Builder.SetInsertPoint(BI);
   // Convert pointer to int before we switch.
   if (CompVal->getType()->isPointerTy()) {
-    assert(TD && "Cannot switch on pointer without TargetData");
+    assert(TD && "Cannot switch on pointer without DataLayout");
     CompVal = Builder.CreatePtrToInt(CompVal,
                                      TD->getIntPtrType(CompVal->getContext()),
                                      "magicptr");
@@ -2861,9 +3029,28 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
   if (!Offset->isNullValue())
     Sub = Builder.CreateAdd(Sub, Offset, Sub->getName()+".off");
   Value *Cmp = Builder.CreateICmpULT(Sub, NumCases, "switch");
-  Builder.CreateCondBr(
+  BranchInst *NewBI = Builder.CreateCondBr(
       Cmp, SI->case_begin().getCaseSuccessor(), SI->getDefaultDest());
 
+  // Update weight for the newly-created conditional branch.
+  SmallVector<uint64_t, 8> Weights;
+  bool HasWeights = HasBranchWeights(SI);
+  if (HasWeights) {
+    GetBranchWeights(SI, Weights);
+    if (Weights.size() == 1 + SI->getNumCases()) {
+      // Combine all weights for the cases to be the true weight of NewBI.
+      // We assume that the sum of all weights for a Terminator can fit into 32
+      // bits.
+      uint32_t NewTrueWeight = 0;
+      for (unsigned I = 1, E = Weights.size(); I != E; ++I)
+        NewTrueWeight += (uint32_t)Weights[I];
+      NewBI->setMetadata(LLVMContext::MD_prof,
+                         MDBuilder(SI->getContext()).
+                         createBranchWeights(NewTrueWeight,
+                                             (uint32_t)Weights[0]));
+    }
+  }
+
   // Prune obsolete incoming values off the successor's PHI nodes.
   for (BasicBlock::iterator BBI = SI->case_begin().getCaseSuccessor()->begin();
        isa<PHINode>(BBI); ++BBI) {
@@ -2894,15 +3081,33 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI) {
     }
   }
 
+  SmallVector<uint64_t, 8> Weights;
+  bool HasWeight = HasBranchWeights(SI);
+  if (HasWeight) {
+    GetBranchWeights(SI, Weights);
+    HasWeight = (Weights.size() == 1 + SI->getNumCases());
+  }
+
   // Remove dead cases from the switch.
   for (unsigned I = 0, E = DeadCases.size(); I != E; ++I) {
     SwitchInst::CaseIt Case = SI->findCaseValue(DeadCases[I]);
     assert(Case != SI->case_default() &&
            "Case was not found. Probably mistake in DeadCases forming.");
+    if (HasWeight) {
+      std::swap(Weights[Case.getCaseIndex()+1], Weights.back());
+      Weights.pop_back();
+    }
+
     // Prune unused values from PHI nodes.
     Case.getCaseSuccessor()->removePredecessor(SI->getParent());
     SI->removeCase(Case);
   }
+  if (HasWeight) {
+    SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
+    SI->setMetadata(LLVMContext::MD_prof,
+                    MDBuilder(SI->getParent()->getContext()).
+                    createBranchWeights(MDWeights));
+  }
 
   return !DeadCases.empty();
 }
@@ -2991,26 +3196,95 @@ static bool ValidLookupTableConstant(Constant *C) {
       isa<UndefValue>(C);
 }
 
-/// GetCaseResulsts - Try to determine the resulting constant values in phi
-/// nodes at the common destination basic block for one of the case
-/// destinations of a switch instruction.
+/// LookupConstant - If V is a Constant, return it. Otherwise, try to look up
+/// its constant value in ConstantPool, returning 0 if it's not there.
+static Constant *LookupConstant(Value *V,
+                         const SmallDenseMap<Value*, Constant*>& ConstantPool) {
+  if (Constant *C = dyn_cast<Constant>(V))
+    return C;
+  return ConstantPool.lookup(V);
+}
+
+/// ConstantFold - Try to fold instruction I into a constant. This works for
+/// simple instructions such as binary operations where both operands are
+/// constant or can be replaced by constants from the ConstantPool. Returns the
+/// resulting constant on success, 0 otherwise.
+static Constant *ConstantFold(Instruction *I,
+                         const SmallDenseMap<Value*, Constant*>& ConstantPool) {
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+    Constant *A = LookupConstant(BO->getOperand(0), ConstantPool);
+    if (!A)
+      return 0;
+    Constant *B = LookupConstant(BO->getOperand(1), ConstantPool);
+    if (!B)
+      return 0;
+    return ConstantExpr::get(BO->getOpcode(), A, B);
+  }
+
+  if (CmpInst *Cmp = dyn_cast<CmpInst>(I)) {
+    Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
+    if (!A)
+      return 0;
+    Constant *B = LookupConstant(I->getOperand(1), ConstantPool);
+    if (!B)
+      return 0;
+    return ConstantExpr::getCompare(Cmp->getPredicate(), A, B);
+  }
+
+  if (SelectInst *Select = dyn_cast<SelectInst>(I)) {
+    Constant *A = LookupConstant(Select->getCondition(), ConstantPool);
+    if (!A)
+      return 0;
+    if (A->isAllOnesValue())
+      return LookupConstant(Select->getTrueValue(), ConstantPool);
+    if (A->isNullValue())
+      return LookupConstant(Select->getFalseValue(), ConstantPool);
+    return 0;
+  }
+
+  if (CastInst *Cast = dyn_cast<CastInst>(I)) {
+    Constant *A = LookupConstant(I->getOperand(0), ConstantPool);
+    if (!A)
+      return 0;
+    return ConstantExpr::getCast(Cast->getOpcode(), A, Cast->getDestTy());
+  }
+
+  return 0;
+}
+
+/// GetCaseResults - Try to determine the resulting constant values in phi nodes
+/// at the common destination basic block, *CommonDest, for one of the case
+/// destionations CaseDest corresponding to value CaseVal (0 for the default
+/// case), of a switch instruction SI.
 static bool GetCaseResults(SwitchInst *SI,
+                           ConstantInt *CaseVal,
                            BasicBlock *CaseDest,
                            BasicBlock **CommonDest,
                            SmallVector<std::pair<PHINode*,Constant*>, 4> &Res) {
   // The block from which we enter the common destination.
   BasicBlock *Pred = SI->getParent();
 
-  // If CaseDest is empty, continue to its successor.
-  if (CaseDest->getFirstNonPHIOrDbg() == CaseDest->getTerminator() &&
-      !isa<PHINode>(CaseDest->begin())) {
-
-    TerminatorInst *Terminator = CaseDest->getTerminator();
-    if (Terminator->getNumSuccessors() != 1)
-      return false;
-
-    Pred = CaseDest;
-    CaseDest = Terminator->getSuccessor(0);
+  // If CaseDest is empty except for some side-effect free instructions through
+  // which we can constant-propagate the CaseVal, continue to its successor.
+  SmallDenseMap<Value*, Constant*> ConstantPool;
+  ConstantPool.insert(std::make_pair(SI->getCondition(), CaseVal));
+  for (BasicBlock::iterator I = CaseDest->begin(), E = CaseDest->end(); I != E;
+       ++I) {
+    if (TerminatorInst *T = dyn_cast<TerminatorInst>(I)) {
+      // If the terminator is a simple branch, continue to the next block.
+      if (T->getNumSuccessors() != 1)
+        return false;
+      Pred = CaseDest;
+      CaseDest = T->getSuccessor(0);
+    } else if (isa<DbgInfoIntrinsic>(I)) {
+      // Skip debug intrinsic.
+      continue;
+    } else if (Constant *C = ConstantFold(I, ConstantPool)) {
+      // Instruction is side-effect free and constant.
+      ConstantPool.insert(std::make_pair(I, C));
+    } else {
+      break;
+    }
   }
 
   // If we did not have a CommonDest before, use the current one.
@@ -3027,10 +3301,17 @@ static bool GetCaseResults(SwitchInst *SI,
     if (Idx == -1)
       continue;
 
-    Constant *ConstVal = dyn_cast<Constant>(PHI->getIncomingValue(Idx));
+    Constant *ConstVal = LookupConstant(PHI->getIncomingValue(Idx),
+                                        ConstantPool);
     if (!ConstVal)
       return false;
 
+    // Note: If the constant comes from constant-propagating the case value
+    // through the CaseDest basic block, it will be safe to remove the
+    // instructions in that block. They cannot be used (except in the phi nodes
+    // we visit) outside CaseDest, because that block does not dominate its
+    // successor. If it did, we would not be in this phi node.
+
     // Be conservative about which kinds of constants we support.
     if (!ValidLookupTableConstant(ConstVal))
       return false;
@@ -3041,83 +3322,255 @@ static bool GetCaseResults(SwitchInst *SI,
   return true;
 }
 
-/// BuildLookupTable - Build a lookup table with the contents of Results, using
-/// DefaultResult to fill the holes in the table. If the table ends up
-/// containing the same result in each element, set *SingleResult to that value
-/// and return NULL.
-static GlobalVariable *BuildLookupTable(Module &M,
-                                        uint64_t TableSize,
-                                        ConstantInt *Offset,
-              const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Results,
-                                        Constant *DefaultResult,
-                                        Constant **SingleResult) {
-  assert(Results.size() && "Need values to build lookup table");
-  assert(TableSize >= Results.size() && "Table needs to hold all values");
+namespace {
+  /// SwitchLookupTable - This class represents a lookup table that can be used
+  /// to replace a switch.
+  class SwitchLookupTable {
+  public:
+    /// SwitchLookupTable - Create a lookup table to use as a switch replacement
+    /// with the contents of Values, using DefaultValue to fill any holes in the
+    /// table.
+    SwitchLookupTable(Module &M,
+                      uint64_t TableSize,
+                      ConstantInt *Offset,
+               const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values,
+                      Constant *DefaultValue,
+                      const DataLayout *TD);
+
+    /// BuildLookup - Build instructions with Builder to retrieve the value at
+    /// the position given by Index in the lookup table.
+    Value *BuildLookup(Value *Index, IRBuilder<> &Builder);
+
+    /// WouldFitInRegister - Return true if a table with TableSize elements of
+    /// type ElementType would fit in a target-legal register.
+    static bool WouldFitInRegister(const DataLayout *TD,
+                                   uint64_t TableSize,
+                                   const Type *ElementType);
+
+  private:
+    // Depending on the contents of the table, it can be represented in
+    // different ways.
+    enum {
+      // For tables where each element contains the same value, we just have to
+      // store that single value and return it for each lookup.
+      SingleValueKind,
+
+      // For small tables with integer elements, we can pack them into a bitmap
+      // that fits into a target-legal register. Values are retrieved by
+      // shift and mask operations.
+      BitMapKind,
+
+      // The table is stored as an array of values. Values are retrieved by load
+      // instructions from the table.
+      ArrayKind
+    } Kind;
+
+    // For SingleValueKind, this is the single value.
+    Constant *SingleValue;
+
+    // For BitMapKind, this is the bitmap.
+    ConstantInt *BitMap;
+    IntegerType *BitMapElementTy;
+
+    // For ArrayKind, this is the array.
+    GlobalVariable *Array;
+  };
+}
+
+SwitchLookupTable::SwitchLookupTable(Module &M,
+                                     uint64_t TableSize,
+                                     ConstantInt *Offset,
+               const SmallVector<std::pair<ConstantInt*, Constant*>, 4>& Values,
+                                     Constant *DefaultValue,
+                                     const DataLayout *TD) {
+  assert(Values.size() && "Can't build lookup table without values!");
+  assert(TableSize >= Values.size() && "Can't fit values in table!");
 
   // If all values in the table are equal, this is that value.
-  Constant *SameResult = Results.begin()->second;
+  SingleValue = Values.begin()->second;
 
   // Build up the table contents.
-  std::vector<Constant*> TableContents(TableSize);
-  for (size_t I = 0, E = Results.size(); I != E; ++I) {
-    ConstantInt *CaseVal = Results[I].first;
-    Constant *CaseRes = Results[I].second;
-
-    uint64_t Idx = (CaseVal->getValue() - Offset->getValue()).getLimitedValue();
+  SmallVector<Constant*, 64> TableContents(TableSize);
+  for (size_t I = 0, E = Values.size(); I != E; ++I) {
+    ConstantInt *CaseVal = Values[I].first;
+    Constant *CaseRes = Values[I].second;
+    assert(CaseRes->getType() == DefaultValue->getType());
+
+    uint64_t Idx = (CaseVal->getValue() - Offset->getValue())
+                   .getLimitedValue();
     TableContents[Idx] = CaseRes;
 
-    if (CaseRes != SameResult)
-      SameResult = NULL;
+    if (CaseRes != SingleValue)
+      SingleValue = 0;
   }
 
   // Fill in any holes in the table with the default result.
-  if (Results.size() < TableSize) {
-    for (unsigned i = 0; i < TableSize; ++i) {
-      if (!TableContents[i])
-        TableContents[i] = DefaultResult;
+  if (Values.size() < TableSize) {
+    for (uint64_t I = 0; I < TableSize; ++I) {
+      if (!TableContents[I])
+        TableContents[I] = DefaultValue;
     }
 
-    if (DefaultResult != SameResult)
-      SameResult = NULL;
+    if (DefaultValue != SingleValue)
+      SingleValue = 0;
   }
 
-  // Same result was used in the entire table; just return that.
-  if (SameResult) {
-    *SingleResult = SameResult;
-    return NULL;
+  // If each element in the table contains the same value, we only need to store
+  // that single value.
+  if (SingleValue) {
+    Kind = SingleValueKind;
+    return;
   }
 
-  ArrayType *ArrayTy = ArrayType::get(DefaultResult->getType(), TableSize);
+  // If the type is integer and the table fits in a register, build a bitmap.
+  if (WouldFitInRegister(TD, TableSize, DefaultValue->getType())) {
+    IntegerType *IT = cast<IntegerType>(DefaultValue->getType());
+    APInt TableInt(TableSize * IT->getBitWidth(), 0);
+    for (uint64_t I = TableSize; I > 0; --I) {
+      TableInt <<= IT->getBitWidth();
+      // Insert values into the bitmap. Undef values are set to zero.
+      if (!isa<UndefValue>(TableContents[I - 1])) {
+        ConstantInt *Val = cast<ConstantInt>(TableContents[I - 1]);
+        TableInt |= Val->getValue().zext(TableInt.getBitWidth());
+      }
+    }
+    BitMap = ConstantInt::get(M.getContext(), TableInt);
+    BitMapElementTy = IT;
+    Kind = BitMapKind;
+    ++NumBitMaps;
+    return;
+  }
+
+  // Store the table in an array.
+  ArrayType *ArrayTy = ArrayType::get(DefaultValue->getType(), TableSize);
   Constant *Initializer = ConstantArray::get(ArrayTy, TableContents);
 
-  GlobalVariable *GV = new GlobalVariable(M, ArrayTy, /*constant=*/ true,
-                                          GlobalVariable::PrivateLinkage,
-                                          Initializer,
-                                          "switch.table");
-  GV->setUnnamedAddr(true);
-  return GV;
+  Array = new GlobalVariable(M, ArrayTy, /*constant=*/ true,
+                             GlobalVariable::PrivateLinkage,
+                             Initializer,
+                             "switch.table");
+  Array->setUnnamedAddr(true);
+  Kind = ArrayKind;
+}
+
+Value *SwitchLookupTable::BuildLookup(Value *Index, IRBuilder<> &Builder) {
+  switch (Kind) {
+    case SingleValueKind:
+      return SingleValue;
+    case BitMapKind: {
+      // Type of the bitmap (e.g. i59).
+      IntegerType *MapTy = BitMap->getType();
+
+      // Cast Index to the same type as the bitmap.
+      // Note: The Index is <= the number of elements in the table, so
+      // truncating it to the width of the bitmask is safe.
+      Value *ShiftAmt = Builder.CreateZExtOrTrunc(Index, MapTy, "switch.cast");
+
+      // Multiply the shift amount by the element width.
+      ShiftAmt = Builder.CreateMul(ShiftAmt,
+                      ConstantInt::get(MapTy, BitMapElementTy->getBitWidth()),
+                                   "switch.shiftamt");
+
+      // Shift down.
+      Value *DownShifted = Builder.CreateLShr(BitMap, ShiftAmt,
+                                              "switch.downshift");
+      // Mask off.
+      return Builder.CreateTrunc(DownShifted, BitMapElementTy,
+                                 "switch.masked");
+    }
+    case ArrayKind: {
+      Value *GEPIndices[] = { Builder.getInt32(0), Index };
+      Value *GEP = Builder.CreateInBoundsGEP(Array, GEPIndices,
+                                             "switch.gep");
+      return Builder.CreateLoad(GEP, "switch.load");
+    }
+  }
+  llvm_unreachable("Unknown lookup table kind!");
+}
+
+bool SwitchLookupTable::WouldFitInRegister(const DataLayout *TD,
+                                           uint64_t TableSize,
+                                           const Type *ElementType) {
+  if (!TD)
+    return false;
+  const IntegerType *IT = dyn_cast<IntegerType>(ElementType);
+  if (!IT)
+    return false;
+  // FIXME: If the type is wider than it needs to be, e.g. i8 but all values
+  // are <= 15, we could try to narrow the type.
+
+  // Avoid overflow, fitsInLegalInteger uses unsigned int for the width.
+  if (TableSize >= UINT_MAX/IT->getBitWidth())
+    return false;
+  return TD->fitsInLegalInteger(TableSize * IT->getBitWidth());
+}
+
+/// ShouldBuildLookupTable - Determine whether a lookup table should be built
+/// for this switch, based on the number of caes, size of the table and the
+/// types of the results.
+static bool ShouldBuildLookupTable(SwitchInst *SI,
+                                   uint64_t TableSize,
+                                   const TargetTransformInfo &TTI,
+                                   const DataLayout *TD,
+                            const SmallDenseMap<PHINode*, Type*>& ResultTypes) {
+  if (SI->getNumCases() > TableSize || TableSize >= UINT64_MAX / 10)
+    return false; // TableSize overflowed, or mul below might overflow.
+
+  bool AllTablesFitInRegister = true;
+  bool HasIllegalType = false;
+  for (SmallDenseMap<PHINode*, Type*>::const_iterator I = ResultTypes.begin(),
+       E = ResultTypes.end(); I != E; ++I) {
+    Type *Ty = I->second;
+
+    // Saturate this flag to true.
+    HasIllegalType = HasIllegalType || !TTI.isTypeLegal(Ty);
+
+    // Saturate this flag to false.
+    AllTablesFitInRegister = AllTablesFitInRegister &&
+      SwitchLookupTable::WouldFitInRegister(TD, TableSize, Ty);
+
+    // If both flags saturate, we're done. NOTE: This *only* works with
+    // saturating flags, and all flags have to saturate first due to the
+    // non-deterministic behavior of iterating over a dense map.
+    if (HasIllegalType && !AllTablesFitInRegister)
+      break;
+  }
+
+  // If each table would fit in a register, we should build it anyway.
+  if (AllTablesFitInRegister)
+    return true;
+
+  // Don't build a table that doesn't fit in-register if it has illegal types.
+  if (HasIllegalType)
+    return false;
+
+  // The table density should be at least 40%. This is the same criterion as for
+  // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
+  // FIXME: Find the best cut-off.
+  return SI->getNumCases() * 10 >= TableSize * 4;
 }
 
 /// SwitchToLookupTable - If the switch is only used to initialize one or more
 /// phi nodes in a common successor block with different constant values,
 /// replace the switch with lookup tables.
 static bool SwitchToLookupTable(SwitchInst *SI,
-                                IRBuilder<> &Builder) {
+                                IRBuilder<> &Builder,
+                                const TargetTransformInfo &TTI,
+                                const DataLayout* TD) {
   assert(SI->getNumCases() > 1 && "Degenerate switch?");
-  // FIXME: Handle unreachable cases.
+
+  // Only build lookup table when we have a target that supports it.
+  if (!TTI.shouldBuildLookupTables())
+    return false;
 
   // FIXME: If the switch is too sparse for a lookup table, perhaps we could
   // split off a dense part and build a lookup table for that.
 
-  // FIXME: If the results are all integers and the lookup table would fit in a
-  // target-legal register, we should store them as a bitmap and use shift/mask
-  // to look up the result.
-
   // FIXME: This creates arrays of GEPs to constant strings, which means each
   // GEP needs a runtime relocation in PIC code. We should just build one big
   // string and lookup indices into that.
 
-  // Ignore the switch if the number of cases are too small.
+  // Ignore the switch if the number of cases is too small.
   // This is similar to the check when building jump tables in
   // SelectionDAGBuilder::handleJTSwitchCase.
   // FIXME: Determine the best cut-off.
@@ -3131,7 +3584,7 @@ static bool SwitchToLookupTable(SwitchInst *SI,
   ConstantInt *MinCaseVal = CI.getCaseValue();
   ConstantInt *MaxCaseVal = CI.getCaseValue();
 
-  BasicBlock *CommonDest = NULL;
+  BasicBlock *CommonDest = 0;
   typedef SmallVector<std::pair<ConstantInt*, Constant*>, 4> ResultListTy;
   SmallDenseMap<PHINode*, ResultListTy> ResultLists;
   SmallDenseMap<PHINode*, Constant*> DefaultResults;
@@ -3148,7 +3601,8 @@ static bool SwitchToLookupTable(SwitchInst *SI,
     // Resulting value at phi nodes for this case value.
     typedef SmallVector<std::pair<PHINode*, Constant*>, 4> ResultsTy;
     ResultsTy Results;
-    if (!GetCaseResults(SI, CI.getCaseSuccessor(), &CommonDest, Results))
+    if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
+                        Results))
       return false;
 
     // Append the result from this case to the list for each phi.
@@ -3161,7 +3615,8 @@ static bool SwitchToLookupTable(SwitchInst *SI,
 
   // Get the resulting values for the default case.
   SmallVector<std::pair<PHINode*, Constant*>, 4> DefaultResultsList;
-  if (!GetCaseResults(SI, SI->getDefaultDest(), &CommonDest, DefaultResultsList))
+  if (!GetCaseResults(SI, 0, SI->getDefaultDest(), &CommonDest,
+                      DefaultResultsList))
     return false;
   for (size_t I = 0, E = DefaultResultsList.size(); I != E; ++I) {
     PHINode *PHI = DefaultResultsList[I].first;
@@ -3171,33 +3626,12 @@ static bool SwitchToLookupTable(SwitchInst *SI,
   }
 
   APInt RangeSpread = MaxCaseVal->getValue() - MinCaseVal->getValue();
-  // The table density should be at lest 40%. This is the same criterion as for
-  // jump tables, see SelectionDAGBuilder::handleJTSwitchCase.
-  // FIXME: Find the best cut-off.
-  // Be careful to avoid overlow in the density computation.
-  if (RangeSpread.zextOrSelf(64).ugt(UINT64_MAX / 4 - 1))
-    return false;
   uint64_t TableSize = RangeSpread.getLimitedValue() + 1;
-  if (SI->getNumCases() * 10 < TableSize * 4)
+  if (!ShouldBuildLookupTable(SI, TableSize, TTI, TD, ResultTypes))
     return false;
 
-  // Build the lookup tables.
-  SmallDenseMap<PHINode*, GlobalVariable*> LookupTables;
-  SmallDenseMap<PHINode*, Constant*> SingleResults;
-
-  Module &Mod = *CommonDest->getParent()->getParent();
-  for (SmallVector<PHINode*, 4>::iterator I = PHIs.begin(), E = PHIs.end();
-       I != E; ++I) {
-    PHINode *PHI = *I;
-
-    Constant *SingleResult = NULL;
-    LookupTables[PHI] = BuildLookupTable(Mod, TableSize, MinCaseVal,
-                                         ResultLists[PHI], DefaultResults[PHI],
-                                         &SingleResult);
-    SingleResults[PHI] = SingleResult;
-  }
-
   // Create the BB that does the lookups.
+  Module &Mod = *CommonDest->getParent()->getParent();
   BasicBlock *LookupBB = BasicBlock::Create(Mod.getContext(),
                                             "switch.lookup",
                                             CommonDest->getParent(),
@@ -3215,31 +3649,24 @@ static bool SwitchToLookupTable(SwitchInst *SI,
   // Populate the BB that does the lookups.
   Builder.SetInsertPoint(LookupBB);
   bool ReturnedEarly = false;
-  for (SmallVector<PHINode*, 4>::iterator I = PHIs.begin(), E = PHIs.end();
-       I != E; ++I) {
-    PHINode *PHI = *I;
-    // There was a single result for this phi; just use that.
-    if (Constant *SingleResult = SingleResults[PHI]) {
-      PHI->addIncoming(SingleResult, LookupBB);
-      continue;
-    }
+  for (size_t I = 0, E = PHIs.size(); I != E; ++I) {
+    PHINode *PHI = PHIs[I];
 
-    Value *GEPIndices[] = { Builder.getInt32(0), TableIndex };
-    Value *GEP = Builder.CreateInBoundsGEP(LookupTables[PHI], GEPIndices,
-                                           "switch.gep");
-    Value *Result = Builder.CreateLoad(GEP, "switch.load");
-
-    // If the result is only going to be used to return from the function,
-    // we want to do that right here.
-    if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin())) {
-      if (CommonDest->getFirstNonPHIOrDbg() == CommonDest->getTerminator()) {
-        Builder.CreateRet(Result);
-        ReturnedEarly = true;
-      }
+    SwitchLookupTable Table(Mod, TableSize, MinCaseVal, ResultLists[PHI],
+                            DefaultResults[PHI], TD);
+
+    Value *Result = Table.BuildLookup(TableIndex, Builder);
+
+    // If the result is used to return immediately from the function, we want to
+    // do that right here.
+    if (PHI->hasOneUse() && isa<ReturnInst>(*PHI->use_begin()) &&
+        *PHI->use_begin() == CommonDest->getFirstNonPHIOrDbg()) {
+      Builder.CreateRet(Result);
+      ReturnedEarly = true;
+      break;
     }
 
-    if (!ReturnedEarly)
-      PHI->addIncoming(Result, LookupBB);
+    PHI->addIncoming(Result, LookupBB);
   }
 
   if (!ReturnedEarly)
@@ -3258,46 +3685,44 @@ static bool SwitchToLookupTable(SwitchInst *SI,
 }
 
 bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
-  // If this switch is too complex to want to look at, ignore it.
-  if (!isValueEqualityComparison(SI))
-    return false;
-
   BasicBlock *BB = SI->getParent();
 
-  // If we only have one predecessor, and if it is a branch on this value,
-  // see if that predecessor totally determines the outcome of this switch.
-  if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
-    if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
-      return SimplifyCFG(BB) | true;
+  if (isValueEqualityComparison(SI)) {
+    // If we only have one predecessor, and if it is a branch on this value,
+    // see if that predecessor totally determines the outcome of this switch.
+    if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
+      if (SimplifyEqualityComparisonWithOnlyPredecessor(SI, OnlyPred, Builder))
+        return SimplifyCFG(BB, TTI, TD) | true;
 
-  Value *Cond = SI->getCondition();
-  if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
-    if (SimplifySwitchOnSelect(SI, Select))
-      return SimplifyCFG(BB) | true;
+    Value *Cond = SI->getCondition();
+    if (SelectInst *Select = dyn_cast<SelectInst>(Cond))
+      if (SimplifySwitchOnSelect(SI, Select))
+        return SimplifyCFG(BB, TTI, TD) | true;
 
-  // If the block only contains the switch, see if we can fold the block
-  // away into any preds.
-  BasicBlock::iterator BBI = BB->begin();
-  // Ignore dbg intrinsics.
-  while (isa<DbgInfoIntrinsic>(BBI))
-    ++BBI;
-  if (SI == &*BBI)
-    if (FoldValueComparisonIntoPredecessors(SI, Builder))
-      return SimplifyCFG(BB) | true;
+    // If the block only contains the switch, see if we can fold the block
+    // away into any preds.
+    BasicBlock::iterator BBI = BB->begin();
+    // Ignore dbg intrinsics.
+    while (isa<DbgInfoIntrinsic>(BBI))
+      ++BBI;
+    if (SI == &*BBI)
+      if (FoldValueComparisonIntoPredecessors(SI, Builder))
+        return SimplifyCFG(BB, TTI, TD) | true;
+  }
 
   // Try to transform the switch into an icmp and a branch.
   if (TurnSwitchRangeIntoICmp(SI, Builder))
-    return SimplifyCFG(BB) | true;
+    return SimplifyCFG(BB, TTI, TD) | true;
 
   // Remove unreachable cases.
   if (EliminateDeadSwitchCases(SI))
-    return SimplifyCFG(BB) | true;
+    return SimplifyCFG(BB, TTI, TD) | true;
 
   if (ForwardSwitchConditionToPHI(SI))
-    return SimplifyCFG(BB) | true;
+    return SimplifyCFG(BB, TTI, TD) | true;
 
-  if (SwitchToLookupTable(SI, Builder))
-    return SimplifyCFG(BB) | true;
+  if (SwitchToLookupTable(SI, Builder, TTI, TD))
+    return SimplifyCFG(BB, TTI, TD) | true;
 
   return false;
 }
@@ -3334,7 +3759,7 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
 
   if (SelectInst *SI = dyn_cast<SelectInst>(IBI->getAddress())) {
     if (SimplifyIndirectBrOnSelect(IBI, SI))
-      return SimplifyCFG(BB) | true;
+      return SimplifyCFG(BB, TTI, TD) | true;
   }
   return Changed;
 }
@@ -3342,6 +3767,9 @@ bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
 bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
   BasicBlock *BB = BI->getParent();
 
+  if (SinkCommon && SinkThenElseCodeToEnd(BI))
+    return true;
+
   // If the Terminator is the only non-phi instruction, simplify the block.
   BasicBlock::iterator I = BB->getFirstNonPHIOrDbgOrLifetime();
   if (I->isTerminator() && BB != &BB->getParent()->getEntryBlock() &&
@@ -3355,7 +3783,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
       for (++I; isa<DbgInfoIntrinsic>(I); ++I)
         ;
       if (I->isTerminator() &&
-          TryToSimplifyUncondBranchWithICmpInIt(ICI, TD, Builder))
+          TryToSimplifyUncondBranchWithICmpInIt(ICI, Builder, TTI, TD))
         return true;
     }
 
@@ -3364,7 +3792,7 @@ bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder){
   // predecessor and use logical operations to update the incoming value
   // for PHI nodes in common successor.
   if (FoldBranchToCommonDest(BI))
-    return SimplifyCFG(BB) | true;
+    return SimplifyCFG(BB, TTI, TD) | true;
   return false;
 }
 
@@ -3379,7 +3807,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     // switch.
     if (BasicBlock *OnlyPred = BB->getSinglePredecessor())
       if (SimplifyEqualityComparisonWithOnlyPredecessor(BI, OnlyPred, Builder))
-        return SimplifyCFG(BB) | true;
+        return SimplifyCFG(BB, TTI, TD) | true;
 
     // This block must be empty, except for the setcond inst, if it exists.
     // Ignore dbg intrinsics.
@@ -3389,14 +3817,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
       ++I;
     if (&*I == BI) {
       if (FoldValueComparisonIntoPredecessors(BI, Builder))
-        return SimplifyCFG(BB) | true;
+        return SimplifyCFG(BB, TTI, TD) | true;
     } else if (&*I == cast<Instruction>(BI->getCondition())){
       ++I;
       // Ignore dbg intrinsics.
       while (isa<DbgInfoIntrinsic>(I))
         ++I;
       if (&*I == BI && FoldValueComparisonIntoPredecessors(BI, Builder))
-        return SimplifyCFG(BB) | true;
+        return SimplifyCFG(BB, TTI, TD) | true;
     }
   }
 
@@ -3408,7 +3836,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   // branches to us and one of our successors, fold the comparison into the
   // predecessor and use logical operations to pick the right destination.
   if (FoldBranchToCommonDest(BI))
-    return SimplifyCFG(BB) | true;
+    return SimplifyCFG(BB, TTI, TD) | true;
 
   // We have a conditional branch to two blocks that are only reachable
   // from BI.  We know that the condbr dominates the two blocks, so see if
@@ -3417,7 +3845,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   if (BI->getSuccessor(0)->getSinglePredecessor() != 0) {
     if (BI->getSuccessor(1)->getSinglePredecessor() != 0) {
       if (HoistThenElseCodeToIf(BI))
-        return SimplifyCFG(BB) | true;
+        return SimplifyCFG(BB, TTI, TD) | true;
     } else {
       // If Successor #1 has multiple preds, we may be able to conditionally
       // execute Successor #0 if it branches to successor #1.
@@ -3425,7 +3853,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
       if (Succ0TI->getNumSuccessors() == 1 &&
           Succ0TI->getSuccessor(0) == BI->getSuccessor(1))
         if (SpeculativelyExecuteBB(BI, BI->getSuccessor(0)))
-          return SimplifyCFG(BB) | true;
+          return SimplifyCFG(BB, TTI, TD) | true;
     }
   } else if (BI->getSuccessor(1)->getSinglePredecessor() != 0) {
     // If Successor #0 has multiple preds, we may be able to conditionally
@@ -3434,7 +3862,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
     if (Succ1TI->getNumSuccessors() == 1 &&
         Succ1TI->getSuccessor(0) == BI->getSuccessor(0))
       if (SpeculativelyExecuteBB(BI, BI->getSuccessor(1)))
-        return SimplifyCFG(BB) | true;
+        return SimplifyCFG(BB, TTI, TD) | true;
   }
 
   // If this is a branch on a phi node in the current block, thread control
@@ -3442,14 +3870,14 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
     if (PN->getParent() == BI->getParent())
       if (FoldCondBranchOnPHI(BI, TD))
-        return SimplifyCFG(BB) | true;
+        return SimplifyCFG(BB, TTI, TD) | true;
 
   // Scan predecessor blocks for conditional branches.
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
     if (BranchInst *PBI = dyn_cast<BranchInst>((*PI)->getTerminator()))
       if (PBI != BI && PBI->isConditional())
         if (SimplifyCondBranchToCondBranch(PBI, BI))
-          return SimplifyCFG(BB) | true;
+          return SimplifyCFG(BB, TTI, TD) | true;
 
   return false;
 }
@@ -3460,11 +3888,12 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
   if (!C)
     return false;
 
-  if (!I->hasOneUse()) // Only look at single-use instructions, for compile time
+  if (I->use_empty())
     return false;
 
   if (C->isNullValue()) {
-    Instruction *Use = I->use_back();
+    // Only look at the first use, avoid hurting compile time with long uselists
+    User *Use = *I->use_begin();
 
     // Now make sure that there are no instructions in between that can alter
     // control flow (eg. calls)
@@ -3589,6 +4018,7 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
 /// eliminates unreachable basic blocks, and does other "peephole" optimization
 /// of the CFG.  It returns true if a modification was made.
 ///
-bool llvm::SimplifyCFG(BasicBlock *BB, const TargetData *TD) {
-  return SimplifyCFGOpt(TD).run(BB);
+bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
+                       const DataLayout *TD) {
+  return SimplifyCFGOpt(TTI, TD).run(BB);
 }
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 5d673f1..41c207c 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -15,18 +15,18 @@
 
 #define DEBUG_TYPE "indvars"
 
-#include "llvm/Instructions.h"
+#include "llvm/Transforms/Utils/SimplifyIndVar.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/IVUsers.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
-#include "llvm/Target/TargetData.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/Statistic.h"
 
 using namespace llvm;
 
@@ -44,7 +44,7 @@ namespace {
     Loop             *L;
     LoopInfo         *LI;
     ScalarEvolution  *SE;
-    const TargetData *TD; // May be NULL
+    const DataLayout *TD; // May be NULL
 
     SmallVectorImpl<WeakVH> &DeadInsts;
 
@@ -56,7 +56,7 @@ namespace {
       L(Loop),
       LI(LPM->getAnalysisIfAvailable<LoopInfo>()),
       SE(SE),
-      TD(LPM->getAnalysisIfAvailable<TargetData>()),
+      TD(LPM->getAnalysisIfAvailable<DataLayout>()),
       DeadInsts(Dead),
       Changed(false) {
       assert(LI && "IV simplification requires LoopInfo");
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index 528e6a1..f9687e4 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -15,17 +15,17 @@
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "instsimplify"
-#include "llvm/Function.h"
-#include "llvm/Pass.h"
-#include "llvm/Type.h"
+#include "llvm/Transforms/Scalar.h"
 #include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Dominators.h"
 #include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Target/TargetData.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Pass.h"
 #include "llvm/Target/TargetLibraryInfo.h"
-#include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
 using namespace llvm;
 
@@ -46,7 +46,7 @@ namespace {
     /// runOnFunction - Remove instructions that simplify.
     bool runOnFunction(Function &F) {
       const DominatorTree *DT = getAnalysisIfAvailable<DominatorTree>();
-      const TargetData *TD = getAnalysisIfAvailable<TargetData>();
+      const DataLayout *TD = getAnalysisIfAvailable<DataLayout>();
       const TargetLibraryInfo *TLI = &getAnalysis<TargetLibraryInfo>();
       SmallPtrSet<const Instruction*, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
       bool Changed = false;
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
new file mode 100644
index 0000000..83c74e7
--- /dev/null
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -0,0 +1,1894 @@
+//===------ SimplifyLibCalls.cpp - Library calls simplifier ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a utility pass used for testing the InstructionSimplify analysis.
+// The analysis is applied to every instruction, and if it simplifies then the
+// instruction is replaced by the simplification.  If you are looking for a pass
+// that performs serious instruction folding, use the instcombine pass instead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetLibraryInfo.h"
+#include "llvm/Transforms/Utils/BuildLibCalls.h"
+
+using namespace llvm;
+
+/// This class is the abstract base class for the set of optimizations that
+/// corresponds to one library call.
+namespace {
+class LibCallOptimization {
+protected:
+  Function *Caller;
+  const DataLayout *TD;
+  const TargetLibraryInfo *TLI;
+  const LibCallSimplifier *LCS;
+  LLVMContext* Context;
+public:
+  LibCallOptimization() { }
+  virtual ~LibCallOptimization() {}
+
+  /// callOptimizer - This pure virtual method is implemented by base classes to
+  /// do various optimizations.  If this returns null then no transformation was
+  /// performed.  If it returns CI, then it transformed the call and CI is to be
+  /// deleted.  If it returns something else, replace CI with the new value and
+  /// delete CI.
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B)
+    =0;
+
+  Value *optimizeCall(CallInst *CI, const DataLayout *TD,
+                      const TargetLibraryInfo *TLI,
+                      const LibCallSimplifier *LCS, IRBuilder<> &B) {
+    Caller = CI->getParent()->getParent();
+    this->TD = TD;
+    this->TLI = TLI;
+    this->LCS = LCS;
+    if (CI->getCalledFunction())
+      Context = &CI->getCalledFunction()->getContext();
+
+    // We never change the calling convention.
+    if (CI->getCallingConv() != llvm::CallingConv::C)
+      return NULL;
+
+    return callOptimizer(CI->getCalledFunction(), CI, B);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Helper Functions
+//===----------------------------------------------------------------------===//
+
+/// isOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
+/// value is equal or not-equal to zero.
+static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+      if (IC->isEquality())
+        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+          if (C->isNullValue())
+            continue;
+    // Unknown instruction.
+    return false;
+  }
+  return true;
+}
+
+/// isOnlyUsedInEqualityComparison - Return true if it is only used in equality
+/// comparisons with With.
+static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
+  for (Value::use_iterator UI = V->use_begin(), E = V->use_end();
+       UI != E; ++UI) {
+    if (ICmpInst *IC = dyn_cast<ICmpInst>(*UI))
+      if (IC->isEquality() && IC->getOperand(1) == With)
+        continue;
+    // Unknown instruction.
+    return false;
+  }
+  return true;
+}
+
+static bool callHasFloatingPointArgument(const CallInst *CI) {
+  for (CallInst::const_op_iterator it = CI->op_begin(), e = CI->op_end();
+       it != e; ++it) {
+    if ((*it)->getType()->isFloatingPointTy())
+      return true;
+  }
+  return false;
+}
+
+//===----------------------------------------------------------------------===//
+// Fortified Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct FortifiedLibCallOptimization : public LibCallOptimization {
+protected:
+  virtual bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp,
+			  bool isString) const = 0;
+};
+
+struct InstFortifiedLibCallOptimization : public FortifiedLibCallOptimization {
+  CallInst *CI;
+
+  bool isFoldable(unsigned SizeCIOp, unsigned SizeArgOp, bool isString) const {
+    if (CI->getArgOperand(SizeCIOp) == CI->getArgOperand(SizeArgOp))
+      return true;
+    if (ConstantInt *SizeCI =
+                           dyn_cast<ConstantInt>(CI->getArgOperand(SizeCIOp))) {
+      if (SizeCI->isAllOnesValue())
+        return true;
+      if (isString) {
+        uint64_t Len = GetStringLength(CI->getArgOperand(SizeArgOp));
+        // If the length is 0 we don't know how long it is and so we can't
+        // remove the check.
+        if (Len == 0) return false;
+        return SizeCI->getZExtValue() >= Len;
+      }
+      if (ConstantInt *Arg = dyn_cast<ConstantInt>(
+                                                  CI->getArgOperand(SizeArgOp)))
+        return SizeCI->getZExtValue() >= Arg->getZExtValue();
+    }
+    return false;
+  }
+};
+
+struct MemCpyChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(Context) ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return 0;
+
+    if (isFoldable(3, 2, false)) {
+      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                     CI->getArgOperand(2), 1);
+      return CI->getArgOperand(0);
+    }
+    return 0;
+  }
+};
+
+struct MemMoveChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(Context) ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return 0;
+
+    if (isFoldable(3, 2, false)) {
+      B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+                      CI->getArgOperand(2), 1);
+      return CI->getArgOperand(0);
+    }
+    return 0;
+  }
+};
+
+struct MemSetChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isIntegerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(Context) ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return 0;
+
+    if (isFoldable(3, 2, false)) {
+      Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(),
+                                   false);
+      B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+      return CI->getArgOperand(0);
+    }
+    return 0;
+  }
+};
+
+struct StrCpyChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    StringRef Name = Callee->getName();
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 3 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+        FT->getParamType(2) != TD->getIntPtrType(Context))
+      return 0;
+
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src)      // __strcpy_chk(x,x)  -> x
+      return Src;
+
+    // If a) we don't have any length information, or b) we know this will
+    // fit then just lower to a plain strcpy. Otherwise we'll keep our
+    // strcpy_chk call which may fail at runtime if the size is too long.
+    // TODO: It might be nice to get a maximum length out of the possible
+    // string lengths for varying.
+    if (isFoldable(2, 1, true)) {
+      Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6));
+      return Ret;
+    } else {
+      // Maybe we can stil fold __strcpy_chk to __memcpy_chk.
+      uint64_t Len = GetStringLength(Src);
+      if (Len == 0) return 0;
+
+      // This optimization require DataLayout.
+      if (!TD) return 0;
+
+      Value *Ret =
+	EmitMemCpyChk(Dst, Src,
+                      ConstantInt::get(TD->getIntPtrType(Context), Len),
+                      CI->getArgOperand(2), B, TD, TLI);
+      return Ret;
+    }
+    return 0;
+  }
+};
+
+struct StpCpyChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    StringRef Name = Callee->getName();
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 3 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+        FT->getParamType(2) != TD->getIntPtrType(FT->getParamType(0)))
+      return 0;
+
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src) {  // stpcpy(x,x)  -> x+strlen(x)
+      Value *StrLen = EmitStrLen(Src, B, TD, TLI);
+      return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0;
+    }
+
+    // If a) we don't have any length information, or b) we know this will
+    // fit then just lower to a plain stpcpy. Otherwise we'll keep our
+    // stpcpy_chk call which may fail at runtime if the size is too long.
+    // TODO: It might be nice to get a maximum length out of the possible
+    // string lengths for varying.
+    if (isFoldable(2, 1, true)) {
+      Value *Ret = EmitStrCpy(Dst, Src, B, TD, TLI, Name.substr(2, 6));
+      return Ret;
+    } else {
+      // Maybe we can stil fold __stpcpy_chk to __memcpy_chk.
+      uint64_t Len = GetStringLength(Src);
+      if (Len == 0) return 0;
+
+      // This optimization require DataLayout.
+      if (!TD) return 0;
+
+      Type *PT = FT->getParamType(0);
+      Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len);
+      Value *DstEnd = B.CreateGEP(Dst,
+                                  ConstantInt::get(TD->getIntPtrType(PT),
+                                                   Len - 1));
+      if (!EmitMemCpyChk(Dst, Src, LenV, CI->getArgOperand(2), B, TD, TLI))
+        return 0;
+      return DstEnd;
+    }
+    return 0;
+  }
+};
+
+struct StrNCpyChkOpt : public InstFortifiedLibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    this->CI = CI;
+    StringRef Name = Callee->getName();
+    FunctionType *FT = Callee->getFunctionType();
+    LLVMContext &Context = CI->getParent()->getContext();
+
+    // Check if this has the right signature.
+    if (FT->getNumParams() != 4 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != Type::getInt8PtrTy(Context) ||
+        !FT->getParamType(2)->isIntegerTy() ||
+        FT->getParamType(3) != TD->getIntPtrType(Context))
+      return 0;
+
+    if (isFoldable(3, 2, false)) {
+      Value *Ret = EmitStrNCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                               CI->getArgOperand(2), B, TD, TLI,
+                               Name.substr(2, 7));
+      return Ret;
+    }
+    return 0;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// String and Memory Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct StrCatOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcat" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != B.getInt8PtrTy() ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        FT->getParamType(1) != FT->getReturnType())
+      return 0;
+
+    // Extract some information from the instruction
+    Value *Dst = CI->getArgOperand(0);
+    Value *Src = CI->getArgOperand(1);
+
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+    --Len;  // Unbias length.
+
+    // Handle the simple, do-nothing case: strcat(x, "") -> x
+    if (Len == 0)
+      return Dst;
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    return emitStrLenMemCpy(Src, Dst, Len, B);
+  }
+
+  Value *emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
+                          IRBuilder<> &B) {
+    // We need to find the end of the destination string.  That's where the
+    // memory is to be moved to. We just generate a call to strlen.
+    Value *DstLen = EmitStrLen(Dst, B, TD, TLI);
+    if (!DstLen)
+      return 0;
+
+    // Now that we have the destination's length, we must index into the
+    // destination's pointer to get the actual memcpy destination (end of
+    // the string .. we're concatenating).
+    Value *CpyDst = B.CreateGEP(Dst, DstLen, "endptr");
+
+    // We have enough information to now generate the memcpy call to do the
+    // concatenation for us.  Make a memcpy to copy the nul byte with align = 1.
+    B.CreateMemCpy(CpyDst, Src,
+                   ConstantInt::get(TD->getIntPtrType(*Context), Len + 1), 1);
+    return Dst;
+  }
+};
+
+struct StrNCatOpt : public StrCatOpt {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strncat" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 ||
+        FT->getReturnType() != B.getInt8PtrTy() ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        FT->getParamType(1) != FT->getReturnType() ||
+        !FT->getParamType(2)->isIntegerTy())
+      return 0;
+
+    // Extract some information from the instruction
+    Value *Dst = CI->getArgOperand(0);
+    Value *Src = CI->getArgOperand(1);
+    uint64_t Len;
+
+    // We don't do anything if length is not constant
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+      Len = LengthArg->getZExtValue();
+    else
+      return 0;
+
+    // See if we can get the length of the input string.
+    uint64_t SrcLen = GetStringLength(Src);
+    if (SrcLen == 0) return 0;
+    --SrcLen;  // Unbias length.
+
+    // Handle the simple, do-nothing cases:
+    // strncat(x, "", c) -> x
+    // strncat(x,  c, 0) -> x
+    if (SrcLen == 0 || Len == 0) return Dst;
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // We don't optimize this case
+    if (Len < SrcLen) return 0;
+
+    // strncat(x, s, c) -> strcat(x, s)
+    // s is constant so the strcat can be optimized further
+    return emitStrLenMemCpy(Src, Dst, SrcLen, B);
+  }
+};
+
+struct StrChrOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strchr" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != B.getInt8PtrTy() ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        !FT->getParamType(1)->isIntegerTy(32))
+      return 0;
+
+    Value *SrcStr = CI->getArgOperand(0);
+
+    // If the second operand is non-constant, see if we can compute the length
+    // of the input string and turn this into memchr.
+    ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+    if (CharC == 0) {
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      uint64_t Len = GetStringLength(SrcStr);
+      if (Len == 0 || !FT->getParamType(1)->isIntegerTy(32))// memchr needs i32.
+        return 0;
+
+      return EmitMemChr(SrcStr, CI->getArgOperand(1), // include nul.
+                        ConstantInt::get(TD->getIntPtrType(*Context), Len),
+                        B, TD, TLI);
+    }
+
+    // Otherwise, the character is a constant, see if the first argument is
+    // a string literal.  If so, we can constant fold.
+    StringRef Str;
+    if (!getConstantStringInfo(SrcStr, Str))
+      return 0;
+
+    // Compute the offset, make sure to handle the case when we're searching for
+    // zero (a weird way to spell strlen).
+    size_t I = CharC->getSExtValue() == 0 ?
+        Str.size() : Str.find(CharC->getSExtValue());
+    if (I == StringRef::npos) // Didn't find the char.  strchr returns null.
+      return Constant::getNullValue(CI->getType());
+
+    // strchr(s+n,c)  -> gep(s+n+i,c)
+    return B.CreateGEP(SrcStr, B.getInt64(I), "strchr");
+  }
+};
+
+struct StrRChrOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strrchr" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != B.getInt8PtrTy() ||
+        FT->getParamType(0) != FT->getReturnType() ||
+        !FT->getParamType(1)->isIntegerTy(32))
+      return 0;
+
+    Value *SrcStr = CI->getArgOperand(0);
+    ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+
+    // Cannot fold anything if we're not looking for a constant.
+    if (!CharC)
+      return 0;
+
+    StringRef Str;
+    if (!getConstantStringInfo(SrcStr, Str)) {
+      // strrchr(s, 0) -> strchr(s, 0)
+      if (TD && CharC->isZero())
+        return EmitStrChr(SrcStr, '\0', B, TD, TLI);
+      return 0;
+    }
+
+    // Compute the offset.
+    size_t I = CharC->getSExtValue() == 0 ?
+        Str.size() : Str.rfind(CharC->getSExtValue());
+    if (I == StringRef::npos) // Didn't find the char. Return null.
+      return Constant::getNullValue(CI->getType());
+
+    // strrchr(s+n,c) -> gep(s+n+i,c)
+    return B.CreateGEP(SrcStr, B.getInt64(I), "strrchr");
+  }
+};
+
+struct StrCmpOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcmp" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        !FT->getReturnType()->isIntegerTy(32) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy())
+      return 0;
+
+    Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+    if (Str1P == Str2P)      // strcmp(x,x)  -> 0
+      return ConstantInt::get(CI->getType(), 0);
+
+    StringRef Str1, Str2;
+    bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+    bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+    // strcmp(x, y)  -> cnst  (if both x and y are constant strings)
+    if (HasStr1 && HasStr2)
+      return ConstantInt::get(CI->getType(), Str1.compare(Str2));
+
+    if (HasStr1 && Str1.empty()) // strcmp("", x) -> -*x
+      return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
+                                      CI->getType()));
+
+    if (HasStr2 && Str2.empty()) // strcmp(x,"") -> *x
+      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+    // strcmp(P, "x") -> memcmp(P, "x", 2)
+    uint64_t Len1 = GetStringLength(Str1P);
+    uint64_t Len2 = GetStringLength(Str2P);
+    if (Len1 && Len2) {
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      return EmitMemCmp(Str1P, Str2P,
+                        ConstantInt::get(TD->getIntPtrType(*Context),
+                        std::min(Len1, Len2)), B, TD, TLI);
+    }
+
+    return 0;
+  }
+};
+
+struct StrNCmpOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strncmp" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 ||
+        !FT->getReturnType()->isIntegerTy(32) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        !FT->getParamType(2)->isIntegerTy())
+      return 0;
+
+    Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
+    if (Str1P == Str2P)      // strncmp(x,x,n)  -> 0
+      return ConstantInt::get(CI->getType(), 0);
+
+    // Get the length argument if it is constant.
+    uint64_t Length;
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(CI->getArgOperand(2)))
+      Length = LengthArg->getZExtValue();
+    else
+      return 0;
+
+    if (Length == 0) // strncmp(x,y,0)   -> 0
+      return ConstantInt::get(CI->getType(), 0);
+
+    if (TD && Length == 1) // strncmp(x,y,1) -> memcmp(x,y,1)
+      return EmitMemCmp(Str1P, Str2P, CI->getArgOperand(2), B, TD, TLI);
+
+    StringRef Str1, Str2;
+    bool HasStr1 = getConstantStringInfo(Str1P, Str1);
+    bool HasStr2 = getConstantStringInfo(Str2P, Str2);
+
+    // strncmp(x, y)  -> cnst  (if both x and y are constant strings)
+    if (HasStr1 && HasStr2) {
+      StringRef SubStr1 = Str1.substr(0, Length);
+      StringRef SubStr2 = Str2.substr(0, Length);
+      return ConstantInt::get(CI->getType(), SubStr1.compare(SubStr2));
+    }
+
+    if (HasStr1 && Str1.empty())  // strncmp("", x, n) -> -*x
+      return B.CreateNeg(B.CreateZExt(B.CreateLoad(Str2P, "strcmpload"),
+                                      CI->getType()));
+
+    if (HasStr2 && Str2.empty())  // strncmp(x, "", n) -> *x
+      return B.CreateZExt(B.CreateLoad(Str1P, "strcmpload"), CI->getType());
+
+    return 0;
+  }
+};
+
+struct StrCpyOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "strcpy" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy())
+      return 0;
+
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src)      // strcpy(x,x)  -> x
+      return Src;
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+
+    // We have enough information to now generate the memcpy call to do the
+    // copy for us.  Make a memcpy to copy the nul byte with align = 1.
+    B.CreateMemCpy(Dst, Src,
+		   ConstantInt::get(TD->getIntPtrType(*Context), Len), 1);
+    return Dst;
+  }
+};
+
+struct StpCpyOpt: public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Verify the "stpcpy" function prototype.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy())
+      return 0;
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
+    if (Dst == Src) {  // stpcpy(x,x)  -> x+strlen(x)
+      Value *StrLen = EmitStrLen(Src, B, TD, TLI);
+      return StrLen ? B.CreateInBoundsGEP(Dst, StrLen) : 0;
+    }
+
+    // See if we can get the length of the input string.
+    uint64_t Len = GetStringLength(Src);
+    if (Len == 0) return 0;
+
+    Type *PT = FT->getParamType(0);
+    Value *LenV = ConstantInt::get(TD->getIntPtrType(PT), Len);
+    Value *DstEnd = B.CreateGEP(Dst,
+                                ConstantInt::get(TD->getIntPtrType(PT),
+                                                 Len - 1));
+
+    // We have enough information to now generate the memcpy call to do the
+    // copy for us.  Make a memcpy to copy the nul byte with align = 1.
+    B.CreateMemCpy(Dst, Src, LenV, 1);
+    return DstEnd;
+  }
+};
+
+struct StrNCpyOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        !FT->getParamType(2)->isIntegerTy())
+      return 0;
+
+    Value *Dst = CI->getArgOperand(0);
+    Value *Src = CI->getArgOperand(1);
+    Value *LenOp = CI->getArgOperand(2);
+
+    // See if we can get the length of the input string.
+    uint64_t SrcLen = GetStringLength(Src);
+    if (SrcLen == 0) return 0;
+    --SrcLen;
+
+    if (SrcLen == 0) {
+      // strncpy(x, "", y) -> memset(x, '\0', y, 1)
+      B.CreateMemSet(Dst, B.getInt8('\0'), LenOp, 1);
+      return Dst;
+    }
+
+    uint64_t Len;
+    if (ConstantInt *LengthArg = dyn_cast<ConstantInt>(LenOp))
+      Len = LengthArg->getZExtValue();
+    else
+      return 0;
+
+    if (Len == 0) return Dst; // strncpy(x, y, 0) -> x
+
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // Let strncpy handle the zero padding
+    if (Len > SrcLen+1) return 0;
+
+    Type *PT = FT->getParamType(0);
+    // strncpy(x, s, c) -> memcpy(x, s, c, 1) [s and c are constant]
+    B.CreateMemCpy(Dst, Src,
+                   ConstantInt::get(TD->getIntPtrType(PT), Len), 1);
+
+    return Dst;
+  }
+};
+
+struct StrLenOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 1 ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    Value *Src = CI->getArgOperand(0);
+
+    // Constant folding: strlen("xyz") -> 3
+    if (uint64_t Len = GetStringLength(Src))
+      return ConstantInt::get(CI->getType(), Len-1);
+
+    // strlen(x) != 0 --> *x != 0
+    // strlen(x) == 0 --> *x == 0
+    if (isOnlyUsedInZeroEqualityComparison(CI))
+      return B.CreateZExt(B.CreateLoad(Src, "strlenfirst"), CI->getType());
+    return 0;
+  }
+};
+
+struct StrPBrkOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        FT->getParamType(1) != FT->getParamType(0) ||
+        FT->getReturnType() != FT->getParamType(0))
+      return 0;
+
+    StringRef S1, S2;
+    bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+    bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+    // strpbrk(s, "") -> NULL
+    // strpbrk("", s) -> NULL
+    if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+      return Constant::getNullValue(CI->getType());
+
+    // Constant folding.
+    if (HasS1 && HasS2) {
+      size_t I = S1.find_first_of(S2);
+      if (I == std::string::npos) // No match.
+        return Constant::getNullValue(CI->getType());
+
+      return B.CreateGEP(CI->getArgOperand(0), B.getInt64(I), "strpbrk");
+    }
+
+    // strpbrk(s, "a") -> strchr(s, 'a')
+    if (TD && HasS2 && S2.size() == 1)
+      return EmitStrChr(CI->getArgOperand(0), S2[0], B, TD, TLI);
+
+    return 0;
+  }
+};
+
+struct StrToOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if ((FT->getNumParams() != 2 && FT->getNumParams() != 3) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy())
+      return 0;
+
+    Value *EndPtr = CI->getArgOperand(1);
+    if (isa<ConstantPointerNull>(EndPtr)) {
+      // With a null EndPtr, this function won't capture the main argument.
+      // It would be readonly too, except that it still may write to errno.
+      CI->addAttribute(1, Attribute::get(Callee->getContext(),
+                                          Attribute::NoCapture));
+    }
+
+    return 0;
+  }
+};
+
+struct StrSpnOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        FT->getParamType(1) != FT->getParamType(0) ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    StringRef S1, S2;
+    bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+    bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+    // strspn(s, "") -> 0
+    // strspn("", s) -> 0
+    if ((HasS1 && S1.empty()) || (HasS2 && S2.empty()))
+      return Constant::getNullValue(CI->getType());
+
+    // Constant folding.
+    if (HasS1 && HasS2) {
+      size_t Pos = S1.find_first_not_of(S2);
+      if (Pos == StringRef::npos) Pos = S1.size();
+      return ConstantInt::get(CI->getType(), Pos);
+    }
+
+    return 0;
+  }
+};
+
+struct StrCSpnOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        FT->getParamType(0) != B.getInt8PtrTy() ||
+        FT->getParamType(1) != FT->getParamType(0) ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    StringRef S1, S2;
+    bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
+    bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
+
+    // strcspn("", s) -> 0
+    if (HasS1 && S1.empty())
+      return Constant::getNullValue(CI->getType());
+
+    // Constant folding.
+    if (HasS1 && HasS2) {
+      size_t Pos = S1.find_first_of(S2);
+      if (Pos == StringRef::npos) Pos = S1.size();
+      return ConstantInt::get(CI->getType(), Pos);
+    }
+
+    // strcspn(s, "") -> strlen(s)
+    if (TD && HasS2 && S2.empty())
+      return EmitStrLen(CI->getArgOperand(0), B, TD, TLI);
+
+    return 0;
+  }
+};
+
+struct StrStrOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isPointerTy())
+      return 0;
+
+    // fold strstr(x, x) -> x.
+    if (CI->getArgOperand(0) == CI->getArgOperand(1))
+      return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+    // fold strstr(a, b) == a -> strncmp(a, b, strlen(b)) == 0
+    if (TD && isOnlyUsedInEqualityComparison(CI, CI->getArgOperand(0))) {
+      Value *StrLen = EmitStrLen(CI->getArgOperand(1), B, TD, TLI);
+      if (!StrLen)
+        return 0;
+      Value *StrNCmp = EmitStrNCmp(CI->getArgOperand(0), CI->getArgOperand(1),
+                                   StrLen, B, TD, TLI);
+      if (!StrNCmp)
+        return 0;
+      for (Value::use_iterator UI = CI->use_begin(), UE = CI->use_end();
+           UI != UE; ) {
+        ICmpInst *Old = cast<ICmpInst>(*UI++);
+        Value *Cmp = B.CreateICmp(Old->getPredicate(), StrNCmp,
+                                  ConstantInt::getNullValue(StrNCmp->getType()),
+                                  "cmp");
+        LCS->replaceAllUsesWith(Old, Cmp);
+      }
+      return CI;
+    }
+
+    // See if either input string is a constant string.
+    StringRef SearchStr, ToFindStr;
+    bool HasStr1 = getConstantStringInfo(CI->getArgOperand(0), SearchStr);
+    bool HasStr2 = getConstantStringInfo(CI->getArgOperand(1), ToFindStr);
+
+    // fold strstr(x, "") -> x.
+    if (HasStr2 && ToFindStr.empty())
+      return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
+
+    // If both strings are known, constant fold it.
+    if (HasStr1 && HasStr2) {
+      std::string::size_type Offset = SearchStr.find(ToFindStr);
+
+      if (Offset == StringRef::npos) // strstr("foo", "bar") -> null
+        return Constant::getNullValue(CI->getType());
+
+      // strstr("abcd", "bc") -> gep((char*)"abcd", 1)
+      Value *Result = CastToCStr(CI->getArgOperand(0), B);
+      Result = B.CreateConstInBoundsGEP1_64(Result, Offset, "strstr");
+      return B.CreateBitCast(Result, CI->getType());
+    }
+
+    // fold strstr(x, "y") -> strchr(x, 'y').
+    if (HasStr2 && ToFindStr.size() == 1) {
+      Value *StrChr= EmitStrChr(CI->getArgOperand(0), ToFindStr[0], B, TD, TLI);
+      return StrChr ? B.CreateBitCast(StrChr, CI->getType()) : 0;
+    }
+    return 0;
+  }
+};
+
+struct MemCmpOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy(32))
+      return 0;
+
+    Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
+
+    if (LHS == RHS)  // memcmp(s,s,x) -> 0
+      return Constant::getNullValue(CI->getType());
+
+    // Make sure we have a constant length.
+    ConstantInt *LenC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+    if (!LenC) return 0;
+    uint64_t Len = LenC->getZExtValue();
+
+    if (Len == 0) // memcmp(s1,s2,0) -> 0
+      return Constant::getNullValue(CI->getType());
+
+    // memcmp(S1,S2,1) -> *(unsigned char*)LHS - *(unsigned char*)RHS
+    if (Len == 1) {
+      Value *LHSV = B.CreateZExt(B.CreateLoad(CastToCStr(LHS, B), "lhsc"),
+                                 CI->getType(), "lhsv");
+      Value *RHSV = B.CreateZExt(B.CreateLoad(CastToCStr(RHS, B), "rhsc"),
+                                 CI->getType(), "rhsv");
+      return B.CreateSub(LHSV, RHSV, "chardiff");
+    }
+
+    // Constant folding: memcmp(x, y, l) -> cnst (all arguments are constant)
+    StringRef LHSStr, RHSStr;
+    if (getConstantStringInfo(LHS, LHSStr) &&
+        getConstantStringInfo(RHS, RHSStr)) {
+      // Make sure we're not reading out-of-bounds memory.
+      if (Len > LHSStr.size() || Len > RHSStr.size())
+        return 0;
+      // Fold the memcmp and normalize the result.  This way we get consistent
+      // results across multiple platforms.
+      uint64_t Ret = 0;
+      int Cmp = memcmp(LHSStr.data(), RHSStr.data(), Len);
+      if (Cmp < 0)
+        Ret = -1;
+      else if (Cmp > 0)
+        Ret = 1;
+      return ConstantInt::get(CI->getType(), Ret);
+    }
+
+    return 0;
+  }
+};
+
+struct MemCpyOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(*Context))
+      return 0;
+
+    // memcpy(x, y, n) -> llvm.memcpy(x, y, n, 1)
+    B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                   CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+};
+
+struct MemMoveOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(*Context))
+      return 0;
+
+    // memmove(x, y, n) -> llvm.memmove(x, y, n, 1)
+    B.CreateMemMove(CI->getArgOperand(0), CI->getArgOperand(1),
+                    CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+};
+
+struct MemSetOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 3 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isIntegerTy() ||
+        FT->getParamType(2) != TD->getIntPtrType(*Context))
+      return 0;
+
+    // memset(p, v, n) -> llvm.memset(p, v, n, 1)
+    Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
+    B.CreateMemSet(CI->getArgOperand(0), Val, CI->getArgOperand(2), 1);
+    return CI->getArgOperand(0);
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Math Library Optimizations
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Double -> Float Shrinking Optimizations for Unary Functions like 'floor'
+
+struct UnaryDoubleFPOpt : public LibCallOptimization {
+  bool CheckRetType;
+  UnaryDoubleFPOpt(bool CheckReturnType): CheckRetType(CheckReturnType) {}
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 1 || !FT->getReturnType()->isDoubleTy() ||
+        !FT->getParamType(0)->isDoubleTy())
+      return 0;
+
+    if (CheckRetType) {
+      // Check if all the uses for function like 'sin' are converted to float.
+      for (Value::use_iterator UseI = CI->use_begin(); UseI != CI->use_end();
+          ++UseI) {
+        FPTruncInst *Cast = dyn_cast<FPTruncInst>(*UseI);
+        if (Cast == 0 || !Cast->getType()->isFloatTy())
+          return 0;
+      }
+    }
+
+    // If this is something like 'floor((double)floatval)', convert to floorf.
+    FPExtInst *Cast = dyn_cast<FPExtInst>(CI->getArgOperand(0));
+    if (Cast == 0 || !Cast->getOperand(0)->getType()->isFloatTy())
+      return 0;
+
+    // floor((double)floatval) -> (double)floorf(floatval)
+    Value *V = Cast->getOperand(0);
+    V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes());
+    return B.CreateFPExt(V, B.getDoubleTy());
+  }
+};
+
+struct UnsafeFPLibCallOptimization : public LibCallOptimization {
+  bool UnsafeFPShrink;
+  UnsafeFPLibCallOptimization(bool UnsafeFPShrink) {
+    this->UnsafeFPShrink = UnsafeFPShrink;
+  }
+};
+
+struct CosOpt : public UnsafeFPLibCallOptimization {
+  CosOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "cos" &&
+        TLI->has(LibFunc::cosf)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
+    }
+
+    FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 1 argument of FP type, which matches the
+    // result type.
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isFloatingPointTy())
+      return Ret;
+
+    // cos(-x) -> cos(x)
+    Value *Op1 = CI->getArgOperand(0);
+    if (BinaryOperator::isFNeg(Op1)) {
+      BinaryOperator *BinExpr = cast<BinaryOperator>(Op1);
+      return B.CreateCall(Callee, BinExpr->getOperand(1), "cos");
+    }
+    return Ret;
+  }
+};
+
+struct PowOpt : public UnsafeFPLibCallOptimization {
+  PowOpt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "pow" &&
+        TLI->has(LibFunc::powf)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
+    }
+
+    FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 2 arguments of the same FP type, which match the
+    // result type.
+    if (FT->getNumParams() != 2 || FT->getReturnType() != FT->getParamType(0) ||
+        FT->getParamType(0) != FT->getParamType(1) ||
+        !FT->getParamType(0)->isFloatingPointTy())
+      return Ret;
+
+    Value *Op1 = CI->getArgOperand(0), *Op2 = CI->getArgOperand(1);
+    if (ConstantFP *Op1C = dyn_cast<ConstantFP>(Op1)) {
+      if (Op1C->isExactlyValue(1.0))  // pow(1.0, x) -> 1.0
+        return Op1C;
+      if (Op1C->isExactlyValue(2.0))  // pow(2.0, x) -> exp2(x)
+        return EmitUnaryFloatFnCall(Op2, "exp2", B, Callee->getAttributes());
+    }
+
+    ConstantFP *Op2C = dyn_cast<ConstantFP>(Op2);
+    if (Op2C == 0) return Ret;
+
+    if (Op2C->getValueAPF().isZero())  // pow(x, 0.0) -> 1.0
+      return ConstantFP::get(CI->getType(), 1.0);
+
+    if (Op2C->isExactlyValue(0.5)) {
+      // Expand pow(x, 0.5) to (x == -infinity ? +infinity : fabs(sqrt(x))).
+      // This is faster than calling pow, and still handles negative zero
+      // and negative infinity correctly.
+      // TODO: In fast-math mode, this could be just sqrt(x).
+      // TODO: In finite-only mode, this could be just fabs(sqrt(x)).
+      Value *Inf = ConstantFP::getInfinity(CI->getType());
+      Value *NegInf = ConstantFP::getInfinity(CI->getType(), true);
+      Value *Sqrt = EmitUnaryFloatFnCall(Op1, "sqrt", B,
+                                         Callee->getAttributes());
+      Value *FAbs = EmitUnaryFloatFnCall(Sqrt, "fabs", B,
+                                         Callee->getAttributes());
+      Value *FCmp = B.CreateFCmpOEQ(Op1, NegInf);
+      Value *Sel = B.CreateSelect(FCmp, Inf, FAbs);
+      return Sel;
+    }
+
+    if (Op2C->isExactlyValue(1.0))  // pow(x, 1.0) -> x
+      return Op1;
+    if (Op2C->isExactlyValue(2.0))  // pow(x, 2.0) -> x*x
+      return B.CreateFMul(Op1, Op1, "pow2");
+    if (Op2C->isExactlyValue(-1.0)) // pow(x, -1.0) -> 1.0/x
+      return B.CreateFDiv(ConstantFP::get(CI->getType(), 1.0),
+                          Op1, "powrecip");
+    return 0;
+  }
+};
+
+struct Exp2Opt : public UnsafeFPLibCallOptimization {
+  Exp2Opt(bool UnsafeFPShrink) : UnsafeFPLibCallOptimization(UnsafeFPShrink) {}
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    Value *Ret = NULL;
+    if (UnsafeFPShrink && Callee->getName() == "exp2" &&
+        TLI->has(LibFunc::exp2)) {
+      UnaryDoubleFPOpt UnsafeUnaryDoubleFP(true);
+      Ret = UnsafeUnaryDoubleFP.callOptimizer(Callee, CI, B);
+    }
+
+    FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 1 argument of FP type, which matches the
+    // result type.
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isFloatingPointTy())
+      return Ret;
+
+    Value *Op = CI->getArgOperand(0);
+    // Turn exp2(sitofp(x)) -> ldexp(1.0, sext(x))  if sizeof(x) <= 32
+    // Turn exp2(uitofp(x)) -> ldexp(1.0, zext(x))  if sizeof(x) < 32
+    Value *LdExpArg = 0;
+    if (SIToFPInst *OpC = dyn_cast<SIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() <= 32)
+        LdExpArg = B.CreateSExt(OpC->getOperand(0), B.getInt32Ty());
+    } else if (UIToFPInst *OpC = dyn_cast<UIToFPInst>(Op)) {
+      if (OpC->getOperand(0)->getType()->getPrimitiveSizeInBits() < 32)
+        LdExpArg = B.CreateZExt(OpC->getOperand(0), B.getInt32Ty());
+    }
+
+    if (LdExpArg) {
+      const char *Name;
+      if (Op->getType()->isFloatTy())
+        Name = "ldexpf";
+      else if (Op->getType()->isDoubleTy())
+        Name = "ldexp";
+      else
+        Name = "ldexpl";
+
+      Constant *One = ConstantFP::get(*Context, APFloat(1.0f));
+      if (!Op->getType()->isFloatTy())
+        One = ConstantExpr::getFPExtend(One, Op->getType());
+
+      Module *M = Caller->getParent();
+      Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
+                                             Op->getType(),
+                                             B.getInt32Ty(), NULL);
+      CallInst *CI = B.CreateCall2(Callee, One, LdExpArg);
+      if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
+        CI->setCallingConv(F->getCallingConv());
+
+      return CI;
+    }
+    return Ret;
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Integer Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct FFSOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // Just make sure this has 2 arguments of the same FP type, which match the
+    // result type.
+    if (FT->getNumParams() != 1 ||
+        !FT->getReturnType()->isIntegerTy(32) ||
+        !FT->getParamType(0)->isIntegerTy())
+      return 0;
+
+    Value *Op = CI->getArgOperand(0);
+
+    // Constant fold.
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Op)) {
+      if (CI->isZero()) // ffs(0) -> 0.
+        return B.getInt32(0);
+      // ffs(c) -> cttz(c)+1
+      return B.getInt32(CI->getValue().countTrailingZeros() + 1);
+    }
+
+    // ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
+    Type *ArgType = Op->getType();
+    Value *F = Intrinsic::getDeclaration(Callee->getParent(),
+                                         Intrinsic::cttz, ArgType);
+    Value *V = B.CreateCall2(F, Op, B.getFalse(), "cttz");
+    V = B.CreateAdd(V, ConstantInt::get(V->getType(), 1));
+    V = B.CreateIntCast(V, B.getInt32Ty(), false);
+
+    Value *Cond = B.CreateICmpNE(Op, Constant::getNullValue(ArgType));
+    return B.CreateSelect(Cond, V, B.getInt32(0));
+  }
+};
+
+struct AbsOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // We require integer(integer) where the types agree.
+    if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+        FT->getParamType(0) != FT->getReturnType())
+      return 0;
+
+    // abs(x) -> x >s -1 ? x : -x
+    Value *Op = CI->getArgOperand(0);
+    Value *Pos = B.CreateICmpSGT(Op, Constant::getAllOnesValue(Op->getType()),
+                                 "ispos");
+    Value *Neg = B.CreateNeg(Op, "neg");
+    return B.CreateSelect(Pos, Op, Neg);
+  }
+};
+
+struct IsDigitOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // We require integer(i32)
+    if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+        !FT->getParamType(0)->isIntegerTy(32))
+      return 0;
+
+    // isdigit(c) -> (c-'0') <u 10
+    Value *Op = CI->getArgOperand(0);
+    Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
+    Op = B.CreateICmpULT(Op, B.getInt32(10), "isdigit");
+    return B.CreateZExt(Op, CI->getType());
+  }
+};
+
+struct IsAsciiOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // We require integer(i32)
+    if (FT->getNumParams() != 1 || !FT->getReturnType()->isIntegerTy() ||
+        !FT->getParamType(0)->isIntegerTy(32))
+      return 0;
+
+    // isascii(c) -> c <u 128
+    Value *Op = CI->getArgOperand(0);
+    Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
+    return B.CreateZExt(Op, CI->getType());
+  }
+};
+
+struct ToAsciiOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    FunctionType *FT = Callee->getFunctionType();
+    // We require i32(i32)
+    if (FT->getNumParams() != 1 || FT->getReturnType() != FT->getParamType(0) ||
+        !FT->getParamType(0)->isIntegerTy(32))
+      return 0;
+
+    // toascii(c) -> c & 0x7f
+    return B.CreateAnd(CI->getArgOperand(0),
+                       ConstantInt::get(CI->getType(),0x7F));
+  }
+};
+
+//===----------------------------------------------------------------------===//
+// Formatting and IO Library Call Optimizations
+//===----------------------------------------------------------------------===//
+
+struct PrintFOpt : public LibCallOptimization {
+  Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
+                                   IRBuilder<> &B) {
+    // Check for a fixed format string.
+    StringRef FormatStr;
+    if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
+      return 0;
+
+    // Empty format string -> noop.
+    if (FormatStr.empty())  // Tolerate printf's declared void.
+      return CI->use_empty() ? (Value*)CI :
+                               ConstantInt::get(CI->getType(), 0);
+
+    // Do not do any of the following transformations if the printf return value
+    // is used, in general the printf return value is not compatible with either
+    // putchar() or puts().
+    if (!CI->use_empty())
+      return 0;
+
+    // printf("x") -> putchar('x'), even for '%'.
+    if (FormatStr.size() == 1) {
+      Value *Res = EmitPutChar(B.getInt32(FormatStr[0]), B, TD, TLI);
+      if (CI->use_empty() || !Res) return Res;
+      return B.CreateIntCast(Res, CI->getType(), true);
+    }
+
+    // printf("foo\n") --> puts("foo")
+    if (FormatStr[FormatStr.size()-1] == '\n' &&
+        FormatStr.find('%') == std::string::npos) {  // no format characters.
+      // Create a string literal with no \n on it.  We expect the constant merge
+      // pass to be run after this pass, to merge duplicate strings.
+      FormatStr = FormatStr.drop_back();
+      Value *GV = B.CreateGlobalString(FormatStr, "str");
+      Value *NewCI = EmitPutS(GV, B, TD, TLI);
+      return (CI->use_empty() || !NewCI) ?
+              NewCI :
+              ConstantInt::get(CI->getType(), FormatStr.size()+1);
+    }
+
+    // Optimize specific format strings.
+    // printf("%c", chr) --> putchar(chr)
+    if (FormatStr == "%c" && CI->getNumArgOperands() > 1 &&
+        CI->getArgOperand(1)->getType()->isIntegerTy()) {
+      Value *Res = EmitPutChar(CI->getArgOperand(1), B, TD, TLI);
+
+      if (CI->use_empty() || !Res) return Res;
+      return B.CreateIntCast(Res, CI->getType(), true);
+    }
+
+    // printf("%s\n", str) --> puts(str)
+    if (FormatStr == "%s\n" && CI->getNumArgOperands() > 1 &&
+        CI->getArgOperand(1)->getType()->isPointerTy()) {
+      return EmitPutS(CI->getArgOperand(1), B, TD, TLI);
+    }
+    return 0;
+  }
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require one fixed pointer argument and an integer/void result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
+        !(FT->getReturnType()->isIntegerTy() ||
+          FT->getReturnType()->isVoidTy()))
+      return 0;
+
+    if (Value *V = optimizeFixedFormatString(Callee, CI, B)) {
+      return V;
+    }
+
+    // printf(format, ...) -> iprintf(format, ...) if no floating point
+    // arguments.
+    if (TLI->has(LibFunc::iprintf) && !callHasFloatingPointArgument(CI)) {
+      Module *M = B.GetInsertBlock()->getParent()->getParent();
+      Constant *IPrintFFn =
+        M->getOrInsertFunction("iprintf", FT, Callee->getAttributes());
+      CallInst *New = cast<CallInst>(CI->clone());
+      New->setCalledFunction(IPrintFFn);
+      B.Insert(New);
+      return New;
+    }
+    return 0;
+  }
+};
+
+struct SPrintFOpt : public LibCallOptimization {
+  Value *OptimizeFixedFormatString(Function *Callee, CallInst *CI,
+                                   IRBuilder<> &B) {
+    // Check for a fixed format string.
+    StringRef FormatStr;
+    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+      return 0;
+
+    // If we just have a format string (nothing else crazy) transform it.
+    if (CI->getNumArgOperands() == 2) {
+      // Make sure there's no % in the constant array.  We could try to handle
+      // %% -> % in the future if we cared.
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')
+          return 0; // we found a format specifier, bail out.
+
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      // sprintf(str, fmt) -> llvm.memcpy(str, fmt, strlen(fmt)+1, 1)
+      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(1),
+                     ConstantInt::get(TD->getIntPtrType(*Context), // Copy the
+                                      FormatStr.size() + 1), 1);   // nul byte.
+      return ConstantInt::get(CI->getType(), FormatStr.size());
+    }
+
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+        CI->getNumArgOperands() < 3)
+      return 0;
+
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // sprintf(dst, "%c", chr) --> *(i8*)dst = chr; *((i8*)dst+1) = 0
+      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
+      Value *V = B.CreateTrunc(CI->getArgOperand(2), B.getInt8Ty(), "char");
+      Value *Ptr = CastToCStr(CI->getArgOperand(0), B);
+      B.CreateStore(V, Ptr);
+      Ptr = B.CreateGEP(Ptr, B.getInt32(1), "nul");
+      B.CreateStore(B.getInt8(0), Ptr);
+
+      return ConstantInt::get(CI->getType(), 1);
+    }
+
+    if (FormatStr[1] == 's') {
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      // sprintf(dest, "%s", str) -> llvm.memcpy(dest, str, strlen(str)+1, 1)
+      if (!CI->getArgOperand(2)->getType()->isPointerTy()) return 0;
+
+      Value *Len = EmitStrLen(CI->getArgOperand(2), B, TD, TLI);
+      if (!Len)
+        return 0;
+      Value *IncLen = B.CreateAdd(Len,
+                                  ConstantInt::get(Len->getType(), 1),
+                                  "leninc");
+      B.CreateMemCpy(CI->getArgOperand(0), CI->getArgOperand(2), IncLen, 1);
+
+      // The sprintf result is the unincremented number of bytes in the string.
+      return B.CreateIntCast(Len, CI->getType(), false);
+    }
+    return 0;
+  }
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed pointer arguments and an integer result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    if (Value *V = OptimizeFixedFormatString(Callee, CI, B)) {
+      return V;
+    }
+
+    // sprintf(str, format, ...) -> siprintf(str, format, ...) if no floating
+    // point arguments.
+    if (TLI->has(LibFunc::siprintf) && !callHasFloatingPointArgument(CI)) {
+      Module *M = B.GetInsertBlock()->getParent()->getParent();
+      Constant *SIPrintFFn =
+        M->getOrInsertFunction("siprintf", FT, Callee->getAttributes());
+      CallInst *New = cast<CallInst>(CI->clone());
+      New->setCalledFunction(SIPrintFFn);
+      B.Insert(New);
+      return New;
+    }
+    return 0;
+  }
+};
+
+struct FPrintFOpt : public LibCallOptimization {
+  Value *optimizeFixedFormatString(Function *Callee, CallInst *CI,
+                                   IRBuilder<> &B) {
+    // All the optimizations depend on the format string.
+    StringRef FormatStr;
+    if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
+      return 0;
+
+    // fprintf(F, "foo") --> fwrite("foo", 3, 1, F)
+    if (CI->getNumArgOperands() == 2) {
+      for (unsigned i = 0, e = FormatStr.size(); i != e; ++i)
+        if (FormatStr[i] == '%')  // Could handle %% -> % if we cared.
+          return 0; // We found a format specifier.
+
+      // These optimizations require DataLayout.
+      if (!TD) return 0;
+
+      Value *NewCI = EmitFWrite(CI->getArgOperand(1),
+                                ConstantInt::get(TD->getIntPtrType(*Context),
+                                                 FormatStr.size()),
+                                CI->getArgOperand(0), B, TD, TLI);
+      return NewCI ? ConstantInt::get(CI->getType(), FormatStr.size()) : 0;
+    }
+
+    // The remaining optimizations require the format string to be "%s" or "%c"
+    // and have an extra operand.
+    if (FormatStr.size() != 2 || FormatStr[0] != '%' ||
+        CI->getNumArgOperands() < 3)
+      return 0;
+
+    // Decode the second character of the format string.
+    if (FormatStr[1] == 'c') {
+      // fprintf(F, "%c", chr) --> fputc(chr, F)
+      if (!CI->getArgOperand(2)->getType()->isIntegerTy()) return 0;
+      Value *NewCI = EmitFPutC(CI->getArgOperand(2), CI->getArgOperand(0), B,
+                               TD, TLI);
+      return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
+    }
+
+    if (FormatStr[1] == 's') {
+      // fprintf(F, "%s", str) --> fputs(str, F)
+      if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty())
+        return 0;
+      return EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI);
+    }
+    return 0;
+  }
+
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require two fixed paramters as pointers and integer result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    if (Value *V = optimizeFixedFormatString(Callee, CI, B)) {
+      return V;
+    }
+
+    // fprintf(stream, format, ...) -> fiprintf(stream, format, ...) if no
+    // floating point arguments.
+    if (TLI->has(LibFunc::fiprintf) && !callHasFloatingPointArgument(CI)) {
+      Module *M = B.GetInsertBlock()->getParent()->getParent();
+      Constant *FIPrintFFn =
+        M->getOrInsertFunction("fiprintf", FT, Callee->getAttributes());
+      CallInst *New = cast<CallInst>(CI->clone());
+      New->setCalledFunction(FIPrintFFn);
+      B.Insert(New);
+      return New;
+    }
+    return 0;
+  }
+};
+
+struct FWriteOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require a pointer, an integer, an integer, a pointer, returning integer.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 4 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isIntegerTy() ||
+        !FT->getParamType(2)->isIntegerTy() ||
+        !FT->getParamType(3)->isPointerTy() ||
+        !FT->getReturnType()->isIntegerTy())
+      return 0;
+
+    // Get the element size and count.
+    ConstantInt *SizeC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
+    ConstantInt *CountC = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+    if (!SizeC || !CountC) return 0;
+    uint64_t Bytes = SizeC->getZExtValue()*CountC->getZExtValue();
+
+    // If this is writing zero records, remove the call (it's a noop).
+    if (Bytes == 0)
+      return ConstantInt::get(CI->getType(), 0);
+
+    // If this is writing one byte, turn it into fputc.
+    // This optimisation is only valid, if the return value is unused.
+    if (Bytes == 1 && CI->use_empty()) {  // fwrite(S,1,1,F) -> fputc(S[0],F)
+      Value *Char = B.CreateLoad(CastToCStr(CI->getArgOperand(0), B), "char");
+      Value *NewCI = EmitFPutC(Char, CI->getArgOperand(3), B, TD, TLI);
+      return NewCI ? ConstantInt::get(CI->getType(), 1) : 0;
+    }
+
+    return 0;
+  }
+};
+
+struct FPutsOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // These optimizations require DataLayout.
+    if (!TD) return 0;
+
+    // Require two pointers.  Also, we can't optimize if return value is used.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() != 2 || !FT->getParamType(0)->isPointerTy() ||
+        !FT->getParamType(1)->isPointerTy() ||
+        !CI->use_empty())
+      return 0;
+
+    // fputs(s,F) --> fwrite(s,1,strlen(s),F)
+    uint64_t Len = GetStringLength(CI->getArgOperand(0));
+    if (!Len) return 0;
+    // Known to have no uses (see above).
+    return EmitFWrite(CI->getArgOperand(0),
+                      ConstantInt::get(TD->getIntPtrType(*Context), Len-1),
+                      CI->getArgOperand(1), B, TD, TLI);
+  }
+};
+
+struct PutsOpt : public LibCallOptimization {
+  virtual Value *callOptimizer(Function *Callee, CallInst *CI, IRBuilder<> &B) {
+    // Require one fixed pointer argument and an integer/void result.
+    FunctionType *FT = Callee->getFunctionType();
+    if (FT->getNumParams() < 1 || !FT->getParamType(0)->isPointerTy() ||
+        !(FT->getReturnType()->isIntegerTy() ||
+          FT->getReturnType()->isVoidTy()))
+      return 0;
+
+    // Check for a constant string.
+    StringRef Str;
+    if (!getConstantStringInfo(CI->getArgOperand(0), Str))
+      return 0;
+
+    if (Str.empty() && CI->use_empty()) {
+      // puts("") -> putchar('\n')
+      Value *Res = EmitPutChar(B.getInt32('\n'), B, TD, TLI);
+      if (CI->use_empty() || !Res) return Res;
+      return B.CreateIntCast(Res, CI->getType(), true);
+    }
+
+    return 0;
+  }
+};
+
+} // End anonymous namespace.
+
+namespace llvm {
+
+class LibCallSimplifierImpl {
+  const DataLayout *TD;
+  const TargetLibraryInfo *TLI;
+  const LibCallSimplifier *LCS;
+  bool UnsafeFPShrink;
+  StringMap<LibCallOptimization*> Optimizations;
+
+  // Fortified library call optimizations.
+  MemCpyChkOpt MemCpyChk;
+  MemMoveChkOpt MemMoveChk;
+  MemSetChkOpt MemSetChk;
+  StrCpyChkOpt StrCpyChk;
+  StpCpyChkOpt StpCpyChk;
+  StrNCpyChkOpt StrNCpyChk;
+
+  // String library call optimizations.
+  StrCatOpt StrCat;
+  StrNCatOpt StrNCat;
+  StrChrOpt StrChr;
+  StrRChrOpt StrRChr;
+  StrCmpOpt StrCmp;
+  StrNCmpOpt StrNCmp;
+  StrCpyOpt StrCpy;
+  StpCpyOpt StpCpy;
+  StrNCpyOpt StrNCpy;
+  StrLenOpt StrLen;
+  StrPBrkOpt StrPBrk;
+  StrToOpt StrTo;
+  StrSpnOpt StrSpn;
+  StrCSpnOpt StrCSpn;
+  StrStrOpt StrStr;
+
+  // Memory library call optimizations.
+  MemCmpOpt MemCmp;
+  MemCpyOpt MemCpy;
+  MemMoveOpt MemMove;
+  MemSetOpt MemSet;
+
+  // Math library call optimizations.
+  UnaryDoubleFPOpt UnaryDoubleFP, UnsafeUnaryDoubleFP;
+  CosOpt Cos; PowOpt Pow; Exp2Opt Exp2;
+
+  // Integer library call optimizations.
+  FFSOpt FFS;
+  AbsOpt Abs;
+  IsDigitOpt IsDigit;
+  IsAsciiOpt IsAscii;
+  ToAsciiOpt ToAscii;
+
+  // Formatting and IO library call optimizations.
+  PrintFOpt PrintF;
+  SPrintFOpt SPrintF;
+  FPrintFOpt FPrintF;
+  FWriteOpt FWrite;
+  FPutsOpt FPuts;
+  PutsOpt Puts;
+
+  void initOptimizations();
+  void addOpt(LibFunc::Func F, LibCallOptimization* Opt);
+  void addOpt(LibFunc::Func F1, LibFunc::Func F2, LibCallOptimization* Opt);
+public:
+  LibCallSimplifierImpl(const DataLayout *TD, const TargetLibraryInfo *TLI,
+                        const LibCallSimplifier *LCS,
+                        bool UnsafeFPShrink = false)
+    : UnaryDoubleFP(false), UnsafeUnaryDoubleFP(true),
+      Cos(UnsafeFPShrink), Pow(UnsafeFPShrink), Exp2(UnsafeFPShrink) {
+    this->TD = TD;
+    this->TLI = TLI;
+    this->LCS = LCS;
+    this->UnsafeFPShrink = UnsafeFPShrink;
+  }
+
+  Value *optimizeCall(CallInst *CI);
+};
+
+void LibCallSimplifierImpl::initOptimizations() {
+  // Fortified library call optimizations.
+  Optimizations["__memcpy_chk"] = &MemCpyChk;
+  Optimizations["__memmove_chk"] = &MemMoveChk;
+  Optimizations["__memset_chk"] = &MemSetChk;
+  Optimizations["__strcpy_chk"] = &StrCpyChk;
+  Optimizations["__stpcpy_chk"] = &StpCpyChk;
+  Optimizations["__strncpy_chk"] = &StrNCpyChk;
+  Optimizations["__stpncpy_chk"] = &StrNCpyChk;
+
+  // String library call optimizations.
+  addOpt(LibFunc::strcat, &StrCat);
+  addOpt(LibFunc::strncat, &StrNCat);
+  addOpt(LibFunc::strchr, &StrChr);
+  addOpt(LibFunc::strrchr, &StrRChr);
+  addOpt(LibFunc::strcmp, &StrCmp);
+  addOpt(LibFunc::strncmp, &StrNCmp);
+  addOpt(LibFunc::strcpy, &StrCpy);
+  addOpt(LibFunc::stpcpy, &StpCpy);
+  addOpt(LibFunc::strncpy, &StrNCpy);
+  addOpt(LibFunc::strlen, &StrLen);
+  addOpt(LibFunc::strpbrk, &StrPBrk);
+  addOpt(LibFunc::strtol, &StrTo);
+  addOpt(LibFunc::strtod, &StrTo);
+  addOpt(LibFunc::strtof, &StrTo);
+  addOpt(LibFunc::strtoul, &StrTo);
+  addOpt(LibFunc::strtoll, &StrTo);
+  addOpt(LibFunc::strtold, &StrTo);
+  addOpt(LibFunc::strtoull, &StrTo);
+  addOpt(LibFunc::strspn, &StrSpn);
+  addOpt(LibFunc::strcspn, &StrCSpn);
+  addOpt(LibFunc::strstr, &StrStr);
+
+  // Memory library call optimizations.
+  addOpt(LibFunc::memcmp, &MemCmp);
+  addOpt(LibFunc::memcpy, &MemCpy);
+  addOpt(LibFunc::memmove, &MemMove);
+  addOpt(LibFunc::memset, &MemSet);
+
+  // Math library call optimizations.
+  addOpt(LibFunc::ceil, LibFunc::ceilf, &UnaryDoubleFP);
+  addOpt(LibFunc::fabs, LibFunc::fabsf, &UnaryDoubleFP);
+  addOpt(LibFunc::floor, LibFunc::floorf, &UnaryDoubleFP);
+  addOpt(LibFunc::rint, LibFunc::rintf, &UnaryDoubleFP);
+  addOpt(LibFunc::round, LibFunc::roundf, &UnaryDoubleFP);
+  addOpt(LibFunc::nearbyint, LibFunc::nearbyintf, &UnaryDoubleFP);
+  addOpt(LibFunc::trunc, LibFunc::truncf, &UnaryDoubleFP);
+
+  if(UnsafeFPShrink) {
+    addOpt(LibFunc::acos, LibFunc::acosf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::acosh, LibFunc::acoshf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::asin, LibFunc::asinf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::asinh, LibFunc::asinhf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::atan, LibFunc::atanf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::atanh, LibFunc::atanhf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::cbrt, LibFunc::cbrtf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::cosh, LibFunc::coshf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::exp, LibFunc::expf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::exp10, LibFunc::exp10f, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::expm1, LibFunc::expm1f, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::log, LibFunc::logf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::log10, LibFunc::log10f, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::log1p, LibFunc::log1pf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::log2, LibFunc::log2f, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::logb, LibFunc::logbf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::sin, LibFunc::sinf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::sinh, LibFunc::sinhf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::sqrt, LibFunc::sqrtf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::tan, LibFunc::tanf, &UnsafeUnaryDoubleFP);
+    addOpt(LibFunc::tanh, LibFunc::tanhf, &UnsafeUnaryDoubleFP);
+  }
+
+  addOpt(LibFunc::cosf, &Cos);
+  addOpt(LibFunc::cos, &Cos);
+  addOpt(LibFunc::cosl, &Cos);
+  addOpt(LibFunc::powf, &Pow);
+  addOpt(LibFunc::pow, &Pow);
+  addOpt(LibFunc::powl, &Pow);
+  Optimizations["llvm.pow.f32"] = &Pow;
+  Optimizations["llvm.pow.f64"] = &Pow;
+  Optimizations["llvm.pow.f80"] = &Pow;
+  Optimizations["llvm.pow.f128"] = &Pow;
+  Optimizations["llvm.pow.ppcf128"] = &Pow;
+  addOpt(LibFunc::exp2l, &Exp2);
+  addOpt(LibFunc::exp2, &Exp2);
+  addOpt(LibFunc::exp2f, &Exp2);
+  Optimizations["llvm.exp2.ppcf128"] = &Exp2;
+  Optimizations["llvm.exp2.f128"] = &Exp2;
+  Optimizations["llvm.exp2.f80"] = &Exp2;
+  Optimizations["llvm.exp2.f64"] = &Exp2;
+  Optimizations["llvm.exp2.f32"] = &Exp2;
+
+  // Integer library call optimizations.
+  addOpt(LibFunc::ffs, &FFS);
+  addOpt(LibFunc::ffsl, &FFS);
+  addOpt(LibFunc::ffsll, &FFS);
+  addOpt(LibFunc::abs, &Abs);
+  addOpt(LibFunc::labs, &Abs);
+  addOpt(LibFunc::llabs, &Abs);
+  addOpt(LibFunc::isdigit, &IsDigit);
+  addOpt(LibFunc::isascii, &IsAscii);
+  addOpt(LibFunc::toascii, &ToAscii);
+
+  // Formatting and IO library call optimizations.
+  addOpt(LibFunc::printf, &PrintF);
+  addOpt(LibFunc::sprintf, &SPrintF);
+  addOpt(LibFunc::fprintf, &FPrintF);
+  addOpt(LibFunc::fwrite, &FWrite);
+  addOpt(LibFunc::fputs, &FPuts);
+  addOpt(LibFunc::puts, &Puts);
+}
+
+Value *LibCallSimplifierImpl::optimizeCall(CallInst *CI) {
+  if (Optimizations.empty())
+    initOptimizations();
+
+  Function *Callee = CI->getCalledFunction();
+  LibCallOptimization *LCO = Optimizations.lookup(Callee->getName());
+  if (LCO) {
+    IRBuilder<> Builder(CI);
+    return LCO->optimizeCall(CI, TD, TLI, LCS, Builder);
+  }
+  return 0;
+}
+
+void LibCallSimplifierImpl::addOpt(LibFunc::Func F, LibCallOptimization* Opt) {
+  if (TLI->has(F))
+    Optimizations[TLI->getName(F)] = Opt;
+}
+
+void LibCallSimplifierImpl::addOpt(LibFunc::Func F1, LibFunc::Func F2,
+                                   LibCallOptimization* Opt) {
+  if (TLI->has(F1) && TLI->has(F2))
+    Optimizations[TLI->getName(F1)] = Opt;
+}
+
+LibCallSimplifier::LibCallSimplifier(const DataLayout *TD,
+                                     const TargetLibraryInfo *TLI,
+                                     bool UnsafeFPShrink) {
+  Impl = new LibCallSimplifierImpl(TD, TLI, this, UnsafeFPShrink);
+}
+
+LibCallSimplifier::~LibCallSimplifier() {
+  delete Impl;
+}
+
+Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
+  return Impl->optimizeCall(CI);
+}
+
+void LibCallSimplifier::replaceAllUsesWith(Instruction *I, Value *With) const {
+  I->replaceAllUsesWith(With);
+  I->eraseFromParent();
+}
+
+}
diff --git a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
index b1cad06..560f581 100644
--- a/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
+++ b/lib/Transforms/Utils/UnifyFunctionExitNodes.cpp
@@ -15,12 +15,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
-#include "llvm/Transforms/Scalar.h"
-#include "llvm/BasicBlock.h"
-#include "llvm/Function.h"
-#include "llvm/Instructions.h"
-#include "llvm/Type.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
 using namespace llvm;
 
 char UnifyFunctionExitNodes::ID = 0;
diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp
index 24e8c8f..5812d46 100644
--- a/lib/Transforms/Utils/Utils.cpp
+++ b/lib/Transforms/Utils/Utils.cpp
@@ -29,6 +29,7 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
   initializePromotePassPass(Registry);
   initializeUnifyFunctionExitNodesPass(Registry);
   initializeInstSimplifierPass(Registry);
+  initializeMetaRenamerPass(Registry);
 }
 
 /// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses.
diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp
index fc2538d..a5e1643 100644
--- a/lib/Transforms/Utils/ValueMapper.cpp
+++ b/lib/Transforms/Utils/ValueMapper.cpp
@@ -13,15 +13,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Utils/ValueMapper.h"
-#include "llvm/Constants.h"
-#include "llvm/Function.h"
-#include "llvm/InlineAsm.h"
-#include "llvm/Instructions.h"
-#include "llvm/Metadata.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Metadata.h"
 using namespace llvm;
 
 // Out of line method to get vtable etc for class.
-void ValueMapTypeRemapper::Anchor() {}
+void ValueMapTypeRemapper::anchor() {}
 
 Value *llvm::MapValue(const Value *V, ValueToValueMapTy &VM, RemapFlags Flags,
                       ValueMapTypeRemapper *TypeMapper) {