 include/llvm/Analysis/IVUsers.h                                |  235
 lib/Analysis/IVUsers.cpp                                        |  391
 lib/Target/README.txt                                           |   10
 lib/Transforms/Scalar/IndVarSimplify.cpp                        |  935
 lib/Transforms/Scalar/LoopStrengthReduce.cpp                    |  569
 test/CodeGen/X86/iv-users-in-other-loops.ll                     |    9
 test/CodeGen/X86/masked-iv-safe.ll                              |    7
 test/CodeGen/X86/subreg-to-reg-5.ll                             |    7
 test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll  |    3
 test/Transforms/IndVarSimplify/ada-loops.ll                     |   90
 test/Transforms/IndVarSimplify/iv-zext.ll                       |   33
 test/Transforms/IndVarSimplify/loop_evaluate_6.ll               |   29
 test/Transforms/LoopStrengthReduce/2009-04-28-no-reduce-mul.ll  |    2
 13 files changed, 1376 insertions, 944 deletions
diff --git a/include/llvm/Analysis/IVUsers.h b/include/llvm/Analysis/IVUsers.h
new file mode 100644
index 0000000..36ff07b
--- /dev/null
+++ b/include/llvm/Analysis/IVUsers.h
@@ -0,0 +1,235 @@
+//===- llvm/Analysis/IVUsers.h - Induction Variable Users -------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bookkeeping for "interesting" users of expressions
+// computed from induction variables.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_IVUSERS_H
+#define LLVM_ANALYSIS_IVUSERS_H
+
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/ADT/SmallVector.h"
+#include <map>
+
+namespace llvm {
+
+class DominatorTree;
+class Instruction;
+class Value;
+class IVUsersOfOneStride;
+
+/// IVStrideUse - Keep track of one use of a strided induction variable, where
+/// the stride is stored externally. The Offset member keeps track of the
+/// offset from the IV, User is the actual user of the operand, and
+/// 'OperandValToReplace' is the operand of the User that is the use.
+class IVStrideUse : public CallbackVH, public ilist_node<IVStrideUse> {
+public:
+ IVStrideUse(IVUsersOfOneStride *parent,
+ const SCEVHandle &offset,
+ Instruction* U, Value *O, bool issigned)
+ : CallbackVH(U), Parent(parent), Offset(offset),
+ OperandValToReplace(O), IsSigned(issigned),
+ IsUseOfPostIncrementedValue(false) {
+ }
+
+ /// getUser - Return the user instruction for this use.
+ Instruction *getUser() const {
+ return cast<Instruction>(getValPtr());
+ }
+
+ /// setUser - Assign a new user instruction for this use.
+ void setUser(Instruction *NewUser) {
+ setValPtr(NewUser);
+ }
+
+ /// getParent - Return a pointer to the IVUsersOfOneStride that owns
+ /// this IVStrideUse.
+ IVUsersOfOneStride *getParent() const { return Parent; }
+
+  /// getOffset - Return the offset to add to a theoretical induction
+  /// variable that starts at zero and counts up by the stride to compute
+  /// the value for the use. This always has the same type as the stride,
+  /// which may need to be cast to match the type of the use.
+ SCEVHandle getOffset() const { return Offset; }
+
+ /// setOffset - Assign a new offset to this use.
+ void setOffset(SCEVHandle Val) {
+ Offset = Val;
+ }
+
+ /// getOperandValToReplace - Return the Value of the operand in the user
+ /// instruction that this IVStrideUse is representing.
+ Value *getOperandValToReplace() const {
+ return OperandValToReplace;
+ }
+
+ /// setOperandValToReplace - Assign a new Value as the operand value
+ /// to replace.
+ void setOperandValToReplace(Value *Op) {
+ OperandValToReplace = Op;
+ }
+
+ /// isSigned - The stride (and thus also the Offset) of this use may be in
+ /// a narrower type than the use itself (OperandValToReplace->getType()).
+ /// When this is the case, isSigned() indicates whether the IV expression
+  /// should be sign-extended instead of zero-extended to fit the type of
+ /// the use.
+ bool isSigned() const { return IsSigned; }
+
+ /// isUseOfPostIncrementedValue - True if this should use the
+ /// post-incremented version of this IV, not the preincremented version.
+ /// This can only be set in special cases, such as the terminating setcc
+ /// instruction for a loop or uses dominated by the loop.
+ bool isUseOfPostIncrementedValue() const {
+ return IsUseOfPostIncrementedValue;
+ }
+
+  /// setIsUseOfPostIncrementedValue - Set the flag that indicates whether
+  /// this is a post-increment use.
+ void setIsUseOfPostIncrementedValue(bool Val) {
+ IsUseOfPostIncrementedValue = Val;
+ }
+
+private:
+ /// Parent - a pointer to the IVUsersOfOneStride that owns this IVStrideUse.
+ IVUsersOfOneStride *Parent;
+
+ /// Offset - The offset to add to the base induction expression.
+ SCEVHandle Offset;
+
+ /// OperandValToReplace - The Value of the operand in the user instruction
+ /// that this IVStrideUse is representing.
+ WeakVH OperandValToReplace;
+
+ /// IsSigned - Determines whether the replacement value is sign or
+ /// zero extended to the type of the use.
+ bool IsSigned;
+
+ /// IsUseOfPostIncrementedValue - True if this should use the
+ /// post-incremented version of this IV, not the preincremented version.
+ bool IsUseOfPostIncrementedValue;
+
+ /// Deleted - Implementation of CallbackVH virtual function to
+  /// receive notification when the User is deleted.
+ virtual void deleted();
+};
+
+template<> struct ilist_traits<IVStrideUse>
+ : public ilist_default_traits<IVStrideUse> {
+ // createSentinel is used to get hold of a node that marks the end of
+ // the list...
+ // The sentinel is relative to this instance, so we use a non-static
+ // method.
+ IVStrideUse *createSentinel() const {
+ // since i(p)lists always publicly derive from the corresponding
+ // traits, placing a data member in this class will augment i(p)list.
+ // But since the NodeTy is expected to publicly derive from
+ // ilist_node<NodeTy>, there is a legal viable downcast from it
+ // to NodeTy. We use this trick to superpose i(p)list with a "ghostly"
+ // NodeTy, which becomes the sentinel. Dereferencing the sentinel is
+ // forbidden (save the ilist_node<NodeTy>) so no one will ever notice
+ // the superposition.
+ return static_cast<IVStrideUse*>(&Sentinel);
+ }
+ static void destroySentinel(IVStrideUse*) {}
+
+ IVStrideUse *provideInitialHead() const { return createSentinel(); }
+ IVStrideUse *ensureHead(IVStrideUse*) const { return createSentinel(); }
+ static void noteHead(IVStrideUse*, IVStrideUse*) {}
+
+private:
+ mutable ilist_node<IVStrideUse> Sentinel;
+};
+
+/// IVUsersOfOneStride - This structure keeps track of all instructions that
+/// have an operand that is based on the trip count multiplied by some stride.
+struct IVUsersOfOneStride : public ilist_node<IVUsersOfOneStride> {
+private:
+ IVUsersOfOneStride(const IVUsersOfOneStride &I); // do not implement
+ void operator=(const IVUsersOfOneStride &I); // do not implement
+
+public:
+ IVUsersOfOneStride() : Stride(0) {}
+
+ explicit IVUsersOfOneStride(const SCEV *stride) : Stride(stride) {}
+
+ /// Stride - The stride for all the contained IVStrideUses. This is
+ /// a constant for affine strides.
+ const SCEV *Stride;
+
+ /// Users - Keep track of all of the users of this stride as well as the
+ /// initial value and the operand that uses the IV.
+ ilist<IVStrideUse> Users;
+
+  void addUser(const SCEVHandle &Offset, Instruction *User, Value *Operand,
+ bool isSigned) {
+ Users.push_back(new IVStrideUse(this, Offset, User, Operand, isSigned));
+ }
+};
+
+class IVUsers : public LoopPass {
+ friend class IVStrideUserVH;
+ Loop *L;
+ LoopInfo *LI;
+ DominatorTree *DT;
+ ScalarEvolution *SE;
+ SmallPtrSet<Instruction*,16> Processed;
+
+public:
+ /// IVUses - A list of all tracked IV uses of induction variable expressions
+ /// we are interested in.
+ ilist<IVUsersOfOneStride> IVUses;
+
+ /// IVUsesByStride - A mapping from the strides in StrideOrder to the
+ /// uses in IVUses.
+ std::map<SCEVHandle, IVUsersOfOneStride*> IVUsesByStride;
+
+ /// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
+ /// We use this to iterate over the IVUsesByStride collection without being
+ /// dependent on random ordering of pointers in the process.
+ SmallVector<SCEVHandle, 16> StrideOrder;
+
+private:
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const;
+
+ virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+ virtual void releaseMemory();
+
+public:
+ static char ID; // Pass ID, replacement for typeid
+ IVUsers();
+
+ /// AddUsersIfInteresting - Inspect the specified Instruction. If it is a
+ /// reducible SCEV, recursively add its users to the IVUsesByStride set and
+ /// return true. Otherwise, return false.
+ bool AddUsersIfInteresting(Instruction *I);
+
+ /// getReplacementExpr - Return a SCEV expression which computes the
+ /// value of the OperandValToReplace of the given IVStrideUse.
+ SCEVHandle getReplacementExpr(const IVStrideUse &U) const;
+
+ void print(raw_ostream &OS, const Module* = 0) const;
+ virtual void print(std::ostream &OS, const Module* = 0) const;
+ void print(std::ostream *OS, const Module* M = 0) const {
+ if (OS) print(*OS, M);
+ }
+
+ /// dump - This method is used for debugging.
+ void dump() const;
+};
+
+Pass *createIVUsersPass();
+
+}
+
+#endif
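
For orientation, here is a minimal sketch (not part of the patch) of how a client loop pass might walk the uses this analysis collects, visiting strides in the deterministic StrideOrder. It relies only on the interface declared above; the helper function name is illustrative.

// Sketch: iterate all recorded IV uses, grouped by stride.
#include "llvm/Analysis/IVUsers.h"
#include <cassert>
using namespace llvm;

static void walkIVUsers(IVUsers &IU) {
  for (unsigned i = 0, e = IU.StrideOrder.size(); i != e; ++i) {
    std::map<SCEVHandle, IVUsersOfOneStride*>::iterator SI =
      IU.IVUsesByStride.find(IU.StrideOrder[i]);
    assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!");
    for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
         E = SI->second->Users.end(); UI != E; ++UI) {
      Instruction *User = UI->getUser();         // instruction using the IV expression
      Value *Op = UI->getOperandValToReplace();  // the IV-derived operand within it
      SCEVHandle Offset = UI->getOffset();       // offset from a zero-based IV of this stride
      bool PostInc = UI->isUseOfPostIncrementedValue();
      (void)User; (void)Op; (void)Offset; (void)PostInc;
    }
  }
}
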
diff --git a/lib/Analysis/IVUsers.cpp b/lib/Analysis/IVUsers.cpp
new file mode 100644
index 0000000..9ec9cac
--- /dev/null
+++ b/lib/Analysis/IVUsers.cpp
@@ -0,0 +1,391 @@
+//===- IVUsers.cpp - Induction Variable Users -------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements bookkeeping for "interesting" users of expressions
+// computed from induction variables.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "iv-users"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+using namespace llvm;
+
+char IVUsers::ID = 0;
+static RegisterPass<IVUsers>
+X("iv-users", "Induction Variable Users", false, true);
+
+Pass *llvm::createIVUsersPass() {
+ return new IVUsers();
+}
+
+/// containsAddRecFromDifferentLoop - Determine whether expression S involves a
+/// subexpression that is an AddRec from a loop other than L. An outer loop
+/// of L is OK, but not an inner loop nor a disjoint loop.
+static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
+ // This is very common, put it first.
+ if (isa<SCEVConstant>(S))
+ return false;
+ if (const SCEVCommutativeExpr *AE = dyn_cast<SCEVCommutativeExpr>(S)) {
+ for (unsigned int i=0; i< AE->getNumOperands(); i++)
+ if (containsAddRecFromDifferentLoop(AE->getOperand(i), L))
+ return true;
+ return false;
+ }
+ if (const SCEVAddRecExpr *AE = dyn_cast<SCEVAddRecExpr>(S)) {
+ if (const Loop *newLoop = AE->getLoop()) {
+ if (newLoop == L)
+ return false;
+ // if newLoop is an outer loop of L, this is OK.
+ if (!LoopInfoBase<BasicBlock>::isNotAlreadyContainedIn(L, newLoop))
+ return false;
+ }
+ return true;
+ }
+ if (const SCEVUDivExpr *DE = dyn_cast<SCEVUDivExpr>(S))
+ return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
+ containsAddRecFromDifferentLoop(DE->getRHS(), L);
+#if 0
+ // SCEVSDivExpr has been backed out temporarily, but will be back; we'll
+ // need this when it is.
+ if (const SCEVSDivExpr *DE = dyn_cast<SCEVSDivExpr>(S))
+ return containsAddRecFromDifferentLoop(DE->getLHS(), L) ||
+ containsAddRecFromDifferentLoop(DE->getRHS(), L);
+#endif
+ if (const SCEVCastExpr *CE = dyn_cast<SCEVCastExpr>(S))
+ return containsAddRecFromDifferentLoop(CE->getOperand(), L);
+ return false;
+}
+
+/// getSCEVStartAndStride - Compute the start and stride of this expression,
+/// returning false if the expression is not a start/stride pair, or true if it
+/// is. The stride must be a loop invariant expression, but the start may be
+/// a mix of loop invariant and loop variant expressions. The start cannot,
+/// however, contain an AddRec from a different loop, unless that loop is an
+/// outer loop of the current loop.
+static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L, Loop *UseLoop,
+ SCEVHandle &Start, SCEVHandle &Stride,
+ bool &isSigned,
+ ScalarEvolution *SE, DominatorTree *DT) {
+ SCEVHandle TheAddRec = Start; // Initialize to zero.
+ bool isSExt = false;
+ bool isZExt = false;
+
+ // If the outer level is an AddExpr, the operands are all start values except
+ // for a nested AddRecExpr.
+ if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(SH)) {
+ for (unsigned i = 0, e = AE->getNumOperands(); i != e; ++i)
+ if (const SCEVAddRecExpr *AddRec =
+ dyn_cast<SCEVAddRecExpr>(AE->getOperand(i))) {
+ if (AddRec->getLoop() == L)
+ TheAddRec = SE->getAddExpr(AddRec, TheAddRec);
+ else
+ return false; // Nested IV of some sort?
+ } else {
+ Start = SE->getAddExpr(Start, AE->getOperand(i));
+ }
+
+ } else if (const SCEVZeroExtendExpr *Z = dyn_cast<SCEVZeroExtendExpr>(SH)) {
+ TheAddRec = Z->getOperand();
+ isZExt = true;
+ } else if (const SCEVSignExtendExpr *S = dyn_cast<SCEVSignExtendExpr>(SH)) {
+ TheAddRec = S->getOperand();
+ isSExt = true;
+ } else if (isa<SCEVAddRecExpr>(SH)) {
+ TheAddRec = SH;
+ } else {
+ return false; // not analyzable.
+ }
+
+ const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(TheAddRec);
+ if (!AddRec || AddRec->getLoop() != L) return false;
+
+ // Use getSCEVAtScope to attempt to simplify other loops out of
+ // the picture.
+ SCEVHandle AddRecStart = AddRec->getStart();
+ SCEVHandle BetterAddRecStart = SE->getSCEVAtScope(AddRecStart, UseLoop);
+ if (!isa<SCEVCouldNotCompute>(BetterAddRecStart))
+ AddRecStart = BetterAddRecStart;
+
+ // FIXME: If Start contains an SCEVAddRecExpr from a different loop, other
+ // than an outer loop of the current loop, reject it. LSR has no concept of
+ // operating on more than one loop at a time so don't confuse it with such
+ // expressions.
+ if (containsAddRecFromDifferentLoop(AddRecStart, L))
+ return false;
+
+ if (isSExt || isZExt)
+ Start = SE->getTruncateExpr(Start, AddRec->getType());
+
+ Start = SE->getAddExpr(Start, AddRecStart);
+
+ if (!isa<SCEVConstant>(AddRec->getStepRecurrence(*SE))) {
+ // If stride is an instruction, make sure it dominates the loop preheader.
+ // Otherwise we could end up with a use before def situation.
+ BasicBlock *Preheader = L->getLoopPreheader();
+ if (!AddRec->getStepRecurrence(*SE)->dominates(Preheader, DT))
+ return false;
+
+ DOUT << "[" << L->getHeader()->getName()
+ << "] Variable stride: " << *AddRec << "\n";
+ }
+
+ Stride = AddRec->getStepRecurrence(*SE);
+ isSigned = isSExt;
+ return true;
+}
+
+/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
+/// and now we need to decide whether the user should use the preinc or post-inc
+/// value. If this user should use the post-inc version of the IV, return true.
+///
+/// Choosing wrong here can break dominance properties (if we choose to use the
+/// post-inc value when we cannot) or it can end up adding extra live-ranges to
+/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
+/// should use the post-inc value).
+static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV,
+ Loop *L, LoopInfo *LI, DominatorTree *DT,
+ Pass *P) {
+ // If the user is in the loop, use the preinc value.
+ if (L->contains(User->getParent())) return false;
+
+ BasicBlock *LatchBlock = L->getLoopLatch();
+
+ // Ok, the user is outside of the loop. If it is dominated by the latch
+ // block, use the post-inc value.
+ if (DT->dominates(LatchBlock, User->getParent()))
+ return true;
+
+ // There is one case we have to be careful of: PHI nodes. These little guys
+ // can live in blocks that are not dominated by the latch block, but (since
+ // their uses occur in the predecessor block, not the block the PHI lives in)
+ // should still use the post-inc value. Check for this case now.
+ PHINode *PN = dyn_cast<PHINode>(User);
+ if (!PN) return false; // not a phi, not dominated by latch block.
+
+ // Look at all of the uses of IV by the PHI node. If any use corresponds to
+ // a block that is not dominated by the latch block, give up and use the
+ // preincremented value.
+ unsigned NumUses = 0;
+ for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+ if (PN->getIncomingValue(i) == IV) {
+ ++NumUses;
+ if (!DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
+ return false;
+ }
+
+ // Okay, all uses of IV by PN are in predecessor blocks that really are
+ // dominated by the latch block. Use the post-incremented value.
+ return true;
+}
+
+/// AddUsersIfInteresting - Inspect the specified instruction. If it is a
+/// reducible SCEV, recursively add its users to the IVUsesByStride set and
+/// return true. Otherwise, return false.
+bool IVUsers::AddUsersIfInteresting(Instruction *I) {
+ if (!SE->isSCEVable(I->getType()))
+ return false; // Void and FP expressions cannot be reduced.
+
+  // LSR is not APInt clean; do not touch integers bigger than 64 bits.
+ if (SE->getTypeSizeInBits(I->getType()) > 64)
+ return false;
+
+ if (!Processed.insert(I))
+ return true; // Instruction already handled.
+
+ // Get the symbolic expression for this instruction.
+ SCEVHandle ISE = SE->getSCEV(I);
+ if (isa<SCEVCouldNotCompute>(ISE)) return false;
+
+ // Get the start and stride for this expression.
+ Loop *UseLoop = LI->getLoopFor(I->getParent());
+ SCEVHandle Start = SE->getIntegerSCEV(0, ISE->getType());
+ SCEVHandle Stride = Start;
+ bool isSigned;
+
+ if (!getSCEVStartAndStride(ISE, L, UseLoop, Start, Stride, isSigned, SE, DT))
+ return false; // Non-reducible symbolic expression, bail out.
+
+ SmallPtrSet<Instruction *, 4> UniqueUsers;
+ for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+ UI != E; ++UI) {
+ Instruction *User = cast<Instruction>(*UI);
+ if (!UniqueUsers.insert(User))
+ continue;
+
+ // Do not infinitely recurse on PHI nodes.
+ if (isa<PHINode>(User) && Processed.count(User))
+ continue;
+
+ // Descend recursively, but not into PHI nodes outside the current loop.
+ // It's important to see the entire expression outside the loop to get
+ // choices that depend on addressing mode use right, although we won't
+    // consider references outside the loop in all cases.
+ // If User is already in Processed, we don't want to recurse into it again,
+ // but do want to record a second reference in the same instruction.
+ bool AddUserToIVUsers = false;
+ if (LI->getLoopFor(User->getParent()) != L) {
+ if (isa<PHINode>(User) || Processed.count(User) ||
+ !AddUsersIfInteresting(User)) {
+ DOUT << "FOUND USER in other loop: " << *User
+ << " OF SCEV: " << *ISE << "\n";
+ AddUserToIVUsers = true;
+ }
+ } else if (Processed.count(User) ||
+ !AddUsersIfInteresting(User)) {
+ DOUT << "FOUND USER: " << *User
+ << " OF SCEV: " << *ISE << "\n";
+ AddUserToIVUsers = true;
+ }
+
+ if (AddUserToIVUsers) {
+ IVUsersOfOneStride *StrideUses = IVUsesByStride[Stride];
+ if (!StrideUses) { // First occurrence of this stride?
+ StrideOrder.push_back(Stride);
+ StrideUses = new IVUsersOfOneStride(Stride);
+ IVUses.push_back(StrideUses);
+ IVUsesByStride[Stride] = StrideUses;
+ }
+
+ // Okay, we found a user that we cannot reduce. Analyze the instruction
+ // and decide what to do with it. If we are a use inside of the loop, use
+ // the value before incrementation, otherwise use it after incrementation.
+ if (IVUseShouldUsePostIncValue(User, I, L, LI, DT, this)) {
+ // The value used will be incremented by the stride more than we are
+ // expecting, so subtract this off.
+ SCEVHandle NewStart = SE->getMinusSCEV(Start, Stride);
+ StrideUses->addUser(NewStart, User, I, isSigned);
+ StrideUses->Users.back().setIsUseOfPostIncrementedValue(true);
+ DOUT << " USING POSTINC SCEV, START=" << *NewStart<< "\n";
+ } else {
+ StrideUses->addUser(Start, User, I, isSigned);
+ }
+ }
+ }
+ return true;
+}
+
+IVUsers::IVUsers()
+ : LoopPass(&ID) {
+}
+
+void IVUsers::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<LoopInfo>();
+ AU.addRequired<DominatorTree>();
+ AU.addRequired<ScalarEvolution>();
+ AU.setPreservesAll();
+}
+
+bool IVUsers::runOnLoop(Loop *l, LPPassManager &LPM) {
+
+ L = l;
+ LI = &getAnalysis<LoopInfo>();
+ DT = &getAnalysis<DominatorTree>();
+ SE = &getAnalysis<ScalarEvolution>();
+
+ // Find all uses of induction variables in this loop, and categorize
+ // them by stride. Start by finding all of the PHI nodes in the header for
+ // this loop. If they are induction variables, inspect their uses.
+ for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
+ AddUsersIfInteresting(I);
+
+ return false;
+}
+
+/// getReplacementExpr - Return a SCEV expression which computes the
+/// value of the OperandValToReplace of the given IVStrideUse.
+SCEVHandle IVUsers::getReplacementExpr(const IVStrideUse &U) const {
+ const Type *UseTy = U.getOperandValToReplace()->getType();
+ // Start with zero.
+ SCEVHandle RetVal = SE->getIntegerSCEV(0, U.getParent()->Stride->getType());
+ // Create the basic add recurrence.
+ RetVal = SE->getAddRecExpr(RetVal, U.getParent()->Stride, L);
+ // Add the offset in a separate step, because it may be loop-variant.
+ RetVal = SE->getAddExpr(RetVal, U.getOffset());
+ // For uses of post-incremented values, add an extra stride to compute
+ // the actual replacement value.
+ if (U.isUseOfPostIncrementedValue())
+ RetVal = SE->getAddExpr(RetVal, U.getParent()->Stride);
+ // Evaluate the expression out of the loop, if possible.
+ if (!L->contains(U.getUser()->getParent())) {
+ SCEVHandle ExitVal = SE->getSCEVAtScope(RetVal, L->getParentLoop());
+ if (!isa<SCEVCouldNotCompute>(ExitVal) && ExitVal->isLoopInvariant(L))
+ RetVal = ExitVal;
+ }
+ // Promote the result to the type of the use.
+ if (SE->getTypeSizeInBits(RetVal->getType()) !=
+ SE->getTypeSizeInBits(UseTy)) {
+ if (U.isSigned())
+ RetVal = SE->getSignExtendExpr(RetVal, UseTy);
+ else
+ RetVal = SE->getZeroExtendExpr(RetVal, UseTy);
+ }
+ return RetVal;
+}
+
+void IVUsers::print(raw_ostream &OS, const Module *M) const {
+ OS << "IV Users for loop ";
+ WriteAsOperand(OS, L->getHeader(), false);
+ if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+ OS << " with backedge-taken count "
+ << *SE->getBackedgeTakenCount(L);
+ }
+ OS << ":\n";
+
+ for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) {
+ std::map<SCEVHandle, IVUsersOfOneStride*>::const_iterator SI =
+ IVUsesByStride.find(StrideOrder[Stride]);
+ assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
+ OS << " Stride " << *SI->first->getType() << " " << *SI->first << ":\n";
+
+ for (ilist<IVStrideUse>::const_iterator UI = SI->second->Users.begin(),
+ E = SI->second->Users.end(); UI != E; ++UI) {
+ OS << " ";
+ WriteAsOperand(OS, UI->getOperandValToReplace(), false);
+ OS << " = ";
+ OS << *getReplacementExpr(*UI);
+ if (UI->isUseOfPostIncrementedValue())
+ OS << " (post-inc)";
+ OS << " in ";
+ UI->getUser()->print(OS);
+ }
+ }
+}
+
+void IVUsers::print(std::ostream &o, const Module *M) const {
+ raw_os_ostream OS(o);
+ print(OS, M);
+}
+
+void IVUsers::dump() const {
+ print(errs());
+}
+
+void IVUsers::releaseMemory() {
+ IVUsesByStride.clear();
+ StrideOrder.clear();
+ Processed.clear();
+}
+
+void IVStrideUse::deleted() {
+ // Remove this user from the list.
+ Parent->Users.erase(this);
+ // this now dangles!
+}
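
The pass is registered above as an analysis ("iv-users", analysis-only, preserves all), so it can be scheduled programmatically through the factory declared in the header, or inspected from the command line. A hedged sketch, assuming the legacy PassManager interface of this period; the module is whatever the caller has already parsed:

// Sketch: schedule the IVUsers analysis on a module via the legacy PassManager.
#include "llvm/Module.h"
#include "llvm/PassManager.h"
#include "llvm/Analysis/IVUsers.h"
using namespace llvm;

void runIVUsers(Module &M) {
  PassManager PM;
  PM.add(createIVUsersPass());  // the -iv-users analysis registered above
  PM.run(M);                    // required analyses (LoopInfo, DominatorTree,
                                // ScalarEvolution) are scheduled automatically
}

From opt, the collected uses can be printed with something like "opt -analyze -iv-users input.ll", which invokes the print() routine above for each loop in the input.
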
diff --git a/lib/Target/README.txt b/lib/Target/README.txt
index 538d137..f68cf0e 100644
--- a/lib/Target/README.txt
+++ b/lib/Target/README.txt
@@ -749,16 +749,6 @@ be done safely if "b" isn't modified between the strlen and memcpy of course.
//===---------------------------------------------------------------------===//
-We should be able to evaluate this loop:
-
-int test(int x_offs) {
- while (x_offs > 4)
- x_offs -= 4;
- return x_offs;
-}
-
-//===---------------------------------------------------------------------===//
-
Reassociate should turn things like:
int factorial(int X) {
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 3d9017d..80d34f6 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -43,6 +43,8 @@
#include "llvm/Constants.h"
#include "llvm/Instructions.h"
#include "llvm/Type.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
@@ -51,11 +53,12 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/GetElementPtrTypeIterator.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
using namespace llvm;
STATISTIC(NumRemoved , "Number of aux indvars removed");
@@ -65,6 +68,7 @@ STATISTIC(NumLFTR , "Number of loop exit tests replaced");
namespace {
class VISIBILITY_HIDDEN IndVarSimplify : public LoopPass {
+ IVUsers *IU;
LoopInfo *LI;
ScalarEvolution *SE;
bool Changed;
@@ -76,12 +80,15 @@ namespace {
virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTree>();
AU.addRequired<ScalarEvolution>();
AU.addRequiredID(LCSSAID);
AU.addRequiredID(LoopSimplifyID);
AU.addRequired<LoopInfo>();
+ AU.addRequired<IVUsers>();
AU.addPreserved<ScalarEvolution>();
AU.addPreservedID(LoopSimplifyID);
+ AU.addPreserved<IVUsers>();
AU.addPreservedID(LCSSAID);
AU.setPreservesCFG();
}
@@ -90,17 +97,21 @@ namespace {
void RewriteNonIntegerIVs(Loop *L);
- void LinearFunctionTestReplace(Loop *L, SCEVHandle BackedgeTakenCount,
+ ICmpInst *LinearFunctionTestReplace(Loop *L, SCEVHandle BackedgeTakenCount,
Value *IndVar,
BasicBlock *ExitingBlock,
BranchInst *BI,
SCEVExpander &Rewriter);
void RewriteLoopExitValues(Loop *L, const SCEV *BackedgeTakenCount);
- void DeleteTriviallyDeadInstructions(SmallPtrSet<Instruction*, 16> &Insts);
+ void RewriteIVExpressions(Loop *L, const Type *LargestType,
+ SCEVExpander &Rewriter);
- void HandleFloatingPointIV(Loop *L, PHINode *PH,
- SmallPtrSet<Instruction*, 16> &DeadInsts);
+ void SinkUnusedInvariants(Loop *L, SCEVExpander &Rewriter);
+
+ void FixUsesBeforeDefs(Loop *L, SCEVExpander &Rewriter);
+
+ void HandleFloatingPointIV(Loop *L, PHINode *PH);
};
}
@@ -112,31 +123,12 @@ Pass *llvm::createIndVarSimplifyPass() {
return new IndVarSimplify();
}
-/// DeleteTriviallyDeadInstructions - If any of the instructions is the
-/// specified set are trivially dead, delete them and see if this makes any of
-/// their operands subsequently dead.
-void IndVarSimplify::
-DeleteTriviallyDeadInstructions(SmallPtrSet<Instruction*, 16> &Insts) {
- while (!Insts.empty()) {
- Instruction *I = *Insts.begin();
- Insts.erase(I);
- if (isInstructionTriviallyDead(I)) {
- for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
- if (Instruction *U = dyn_cast<Instruction>(I->getOperand(i)))
- Insts.insert(U);
- DOUT << "INDVARS: Deleting: " << *I;
- I->eraseFromParent();
- Changed = true;
- }
- }
-}
-
/// LinearFunctionTestReplace - This method rewrites the exit condition of the
/// loop to be a canonical != comparison against the incremented loop induction
/// variable. This pass is able to rewrite the exit tests of any loop where the
/// SCEV analysis can determine a loop-invariant trip count of the loop, which
/// is actually a much broader range than just linear tests.
-void IndVarSimplify::LinearFunctionTestReplace(Loop *L,
+ICmpInst *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
SCEVHandle BackedgeTakenCount,
Value *IndVar,
BasicBlock *ExitingBlock,
@@ -196,10 +188,15 @@ void IndVarSimplify::LinearFunctionTestReplace(Loop *L,
<< (Opcode == ICmpInst::ICMP_NE ? "!=" : "==") << "\n"
<< " RHS:\t" << *RHS << "\n";
- Value *Cond = new ICmpInst(Opcode, CmpIndVar, ExitCnt, "exitcond", BI);
- BI->setCondition(Cond);
+ ICmpInst *Cond = new ICmpInst(Opcode, CmpIndVar, ExitCnt, "exitcond", BI);
+
+ Instruction *OrigCond = cast<Instruction>(BI->getCondition());
+ OrigCond->replaceAllUsesWith(Cond);
+ RecursivelyDeleteTriviallyDeadInstructions(OrigCond);
+
++NumLFTR;
Changed = true;
+ return Cond;
}
/// RewriteLoopExitValues - Check to see if this loop has a computable
@@ -207,8 +204,16 @@ void IndVarSimplify::LinearFunctionTestReplace(Loop *L,
/// final value of any expressions that are recurrent in the loop, and
/// substitute the exit values from the loop into any instructions outside of
/// the loop that use the final values of the current expressions.
+///
+/// This is mostly redundant with the regular IndVarSimplify activities that
+/// happen later, except that it's more powerful in some cases, because it's
+/// able to brute-force evaluate arbitrary instructions as long as they have
+/// constant operands at the beginning of the loop.
void IndVarSimplify::RewriteLoopExitValues(Loop *L,
const SCEV *BackedgeTakenCount) {
+  // Verify the input to the pass is already in LCSSA form.
+ assert(L->isLCSSAForm());
+
BasicBlock *Preheader = L->getLoopPreheader();
// Scan all of the instructions in the loop, looking at those that have
@@ -226,9 +231,6 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L,
BlockToInsertInto = Preheader;
BasicBlock::iterator InsertPt = BlockToInsertInto->getFirstNonPHI();
- bool HasConstantItCount = isa<SCEVConstant>(BackedgeTakenCount);
-
- SmallPtrSet<Instruction*, 16> InstructionsToDelete;
std::map<Instruction*, Value*> ExitValues;
// Find all values that are computed inside the loop, but used outside of it.
@@ -268,18 +270,11 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L,
if (!L->contains(Inst->getParent()))
continue;
- // We require that this value either have a computable evolution or that
- // the loop have a constant iteration count. In the case where the loop
- // has a constant iteration count, we can sometimes force evaluation of
- // the exit value through brute force.
- SCEVHandle SH = SE->getSCEV(Inst);
- if (!SH->hasComputableLoopEvolution(L) && !HasConstantItCount)
- continue; // Cannot get exit evolution for the loop value.
-
// Okay, this instruction has a user outside of the current loop
// and varies predictably *inside* the loop. Evaluate the value it
// contains when the loop exits, if possible.
- SCEVHandle ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+ SCEVHandle SH = SE->getSCEV(Inst);
+ SCEVHandle ExitValue = SE->getSCEVAtScope(SH, L->getParentLoop());
if (isa<SCEVCouldNotCompute>(ExitValue) ||
!ExitValue->isLoopInvariant(L))
continue;
@@ -298,9 +293,8 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L,
PN->setIncomingValue(i, ExitVal);
- // If this instruction is dead now, schedule it to be removed.
- if (Inst->use_empty())
- InstructionsToDelete.insert(Inst);
+ // If this instruction is dead now, delete it.
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
// See if this is a single-entry LCSSA PHI node. If so, we can (and
// have to) remove
@@ -308,14 +302,12 @@ void IndVarSimplify::RewriteLoopExitValues(Loop *L,
// in the loop, so we don't need an LCSSA phi node anymore.
if (NumPreds == 1) {
PN->replaceAllUsesWith(ExitVal);
- PN->eraseFromParent();
+ RecursivelyDeleteTriviallyDeadInstructions(PN);
break;
}
}
}
}
-
- DeleteTriviallyDeadInstructions(InstructionsToDelete);
}
void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {
@@ -325,266 +317,24 @@ void IndVarSimplify::RewriteNonIntegerIVs(Loop *L) {
//
BasicBlock *Header = L->getHeader();
- SmallPtrSet<Instruction*, 16> DeadInsts;
- for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- HandleFloatingPointIV(L, PN, DeadInsts);
- }
+ SmallVector<WeakVH, 8> PHIs;
+ for (BasicBlock::iterator I = Header->begin();
+ PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ PHIs.push_back(PN);
+
+ for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
+ if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i]))
+ HandleFloatingPointIV(L, PN);
// If the loop previously had floating-point IV, ScalarEvolution
// may not have been able to compute a trip count. Now that we've done some
// re-writing, the trip count may be computable.
if (Changed)
SE->forgetLoopBackedgeTakenCount(L);
-
- if (!DeadInsts.empty())
- DeleteTriviallyDeadInstructions(DeadInsts);
-}
-
-/// getEffectiveIndvarType - Determine the widest type that the
-/// induction-variable PHINode Phi is cast to.
-///
-static const Type *getEffectiveIndvarType(const PHINode *Phi,
- const ScalarEvolution *SE) {
- const Type *Ty = Phi->getType();
-
- for (Value::use_const_iterator UI = Phi->use_begin(), UE = Phi->use_end();
- UI != UE; ++UI) {
- const Type *CandidateType = NULL;
- if (const ZExtInst *ZI = dyn_cast<ZExtInst>(UI))
- CandidateType = ZI->getDestTy();
- else if (const SExtInst *SI = dyn_cast<SExtInst>(UI))
- CandidateType = SI->getDestTy();
- else if (const IntToPtrInst *IP = dyn_cast<IntToPtrInst>(UI))
- CandidateType = IP->getDestTy();
- else if (const PtrToIntInst *PI = dyn_cast<PtrToIntInst>(UI))
- CandidateType = PI->getDestTy();
- if (CandidateType &&
- SE->isSCEVable(CandidateType) &&
- SE->getTypeSizeInBits(CandidateType) > SE->getTypeSizeInBits(Ty))
- Ty = CandidateType;
- }
-
- return Ty;
-}
-
-/// TestOrigIVForWrap - Analyze the original induction variable that
-/// controls the loop's iteration to determine whether it would ever
-/// undergo signed or unsigned overflow.
-///
-/// In addition to setting the NoSignedWrap and NoUnsignedWrap
-/// variables to true when appropriate (they are not set to false here),
-/// return the PHI for this induction variable. Also record the initial
-/// and final values and the increment; these are not meaningful unless
-/// either NoSignedWrap or NoUnsignedWrap is true, and are always meaningful
-/// in that case, although the final value may be 0 indicating a nonconstant.
-///
-/// TODO: This duplicates a fair amount of ScalarEvolution logic.
-/// Perhaps this can be merged with
-/// ScalarEvolution::getBackedgeTakenCount
-/// and/or ScalarEvolution::get{Sign,Zero}ExtendExpr.
-///
-static const PHINode *TestOrigIVForWrap(const Loop *L,
- const BranchInst *BI,
- const Instruction *OrigCond,
- const ScalarEvolution &SE,
- bool &NoSignedWrap,
- bool &NoUnsignedWrap,
- const ConstantInt* &InitialVal,
- const ConstantInt* &IncrVal,
- const ConstantInt* &LimitVal) {
- // Verify that the loop is sane and find the exit condition.
- const ICmpInst *Cmp = dyn_cast<ICmpInst>(OrigCond);
- if (!Cmp) return 0;
-
- const Value *CmpLHS = Cmp->getOperand(0);
- const Value *CmpRHS = Cmp->getOperand(1);
- const BasicBlock *TrueBB = BI->getSuccessor(0);
- const BasicBlock *FalseBB = BI->getSuccessor(1);
- ICmpInst::Predicate Pred = Cmp->getPredicate();
-
- // Canonicalize a constant to the RHS.
- if (isa<ConstantInt>(CmpLHS)) {
- Pred = ICmpInst::getSwappedPredicate(Pred);
- std::swap(CmpLHS, CmpRHS);
- }
- // Canonicalize SLE to SLT.
- if (Pred == ICmpInst::ICMP_SLE)
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS))
- if (!CI->getValue().isMaxSignedValue()) {
- CmpRHS = ConstantInt::get(CI->getValue() + 1);
- Pred = ICmpInst::ICMP_SLT;
- }
- // Canonicalize SGT to SGE.
- if (Pred == ICmpInst::ICMP_SGT)
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS))
- if (!CI->getValue().isMaxSignedValue()) {
- CmpRHS = ConstantInt::get(CI->getValue() + 1);
- Pred = ICmpInst::ICMP_SGE;
- }
- // Canonicalize SGE to SLT.
- if (Pred == ICmpInst::ICMP_SGE) {
- std::swap(TrueBB, FalseBB);
- Pred = ICmpInst::ICMP_SLT;
- }
- // Canonicalize ULE to ULT.
- if (Pred == ICmpInst::ICMP_ULE)
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS))
- if (!CI->getValue().isMaxValue()) {
- CmpRHS = ConstantInt::get(CI->getValue() + 1);
- Pred = ICmpInst::ICMP_ULT;
- }
- // Canonicalize UGT to UGE.
- if (Pred == ICmpInst::ICMP_UGT)
- if (const ConstantInt *CI = dyn_cast<ConstantInt>(CmpRHS))
- if (!CI->getValue().isMaxValue()) {
- CmpRHS = ConstantInt::get(CI->getValue() + 1);
- Pred = ICmpInst::ICMP_UGE;
- }
- // Canonicalize UGE to ULT.
- if (Pred == ICmpInst::ICMP_UGE) {
- std::swap(TrueBB, FalseBB);
- Pred = ICmpInst::ICMP_ULT;
- }
- // For now, analyze only LT loops for signed overflow.
- if (Pred != ICmpInst::ICMP_SLT && Pred != ICmpInst::ICMP_ULT)
- return 0;
-
- bool isSigned = Pred == ICmpInst::ICMP_SLT;
-
- // Get the increment instruction. Look past casts if we will
- // be able to prove that the original induction variable doesn't
- // undergo signed or unsigned overflow, respectively.
- const Value *IncrInst = CmpLHS;
- if (isSigned) {
- if (const SExtInst *SI = dyn_cast<SExtInst>(CmpLHS)) {
- if (!isa<ConstantInt>(CmpRHS) ||
- !cast<ConstantInt>(CmpRHS)->getValue()
- .isSignedIntN(SE.getTypeSizeInBits(IncrInst->getType())))
- return 0;
- IncrInst = SI->getOperand(0);
- }
- } else {
- if (const ZExtInst *ZI = dyn_cast<ZExtInst>(CmpLHS)) {
- if (!isa<ConstantInt>(CmpRHS) ||
- !cast<ConstantInt>(CmpRHS)->getValue()
- .isIntN(SE.getTypeSizeInBits(IncrInst->getType())))
- return 0;
- IncrInst = ZI->getOperand(0);
- }
- }
-
- // For now, only analyze induction variables that have simple increments.
- const BinaryOperator *IncrOp = dyn_cast<BinaryOperator>(IncrInst);
- if (!IncrOp || IncrOp->getOpcode() != Instruction::Add)
- return 0;
- IncrVal = dyn_cast<ConstantInt>(IncrOp->getOperand(1));
- if (!IncrVal)
- return 0;
-
- // Make sure the PHI looks like a normal IV.
- const PHINode *PN = dyn_cast<PHINode>(IncrOp->getOperand(0));
- if (!PN || PN->getNumIncomingValues() != 2)
- return 0;
- unsigned IncomingEdge = L->contains(PN->getIncomingBlock(0));
- unsigned BackEdge = !IncomingEdge;
- if (!L->contains(PN->getIncomingBlock(BackEdge)) ||
- PN->getIncomingValue(BackEdge) != IncrOp)
- return 0;
- if (!L->contains(TrueBB))
- return 0;
-
- // For now, only analyze loops with a constant start value, so that
- // we can easily determine if the start value is not a maximum value
- // which would wrap on the first iteration.
- InitialVal = dyn_cast<ConstantInt>(PN->getIncomingValue(IncomingEdge));
- if (!InitialVal)
- return 0;
-
- // The upper limit need not be a constant; we'll check later.
- LimitVal = dyn_cast<ConstantInt>(CmpRHS);
-
- // We detect the impossibility of wrapping in two cases, both of
- // which require starting with a non-max value:
- // - The IV counts up by one, and the loop iterates only while it remains
- // less than a limiting value (any) in the same type.
- // - The IV counts up by a positive increment other than 1, and the
- // constant limiting value + the increment is less than the max value
- // (computed as max-increment to avoid overflow)
- if (isSigned && !InitialVal->getValue().isMaxSignedValue()) {
- if (IncrVal->equalsInt(1))
- NoSignedWrap = true; // LimitVal need not be constant
- else if (LimitVal) {
- uint64_t numBits = LimitVal->getValue().getBitWidth();
- if (IncrVal->getValue().sgt(APInt::getNullValue(numBits)) &&
- (APInt::getSignedMaxValue(numBits) - IncrVal->getValue())
- .sgt(LimitVal->getValue()))
- NoSignedWrap = true;
- }
- } else if (!isSigned && !InitialVal->getValue().isMaxValue()) {
- if (IncrVal->equalsInt(1))
- NoUnsignedWrap = true; // LimitVal need not be constant
- else if (LimitVal) {
- uint64_t numBits = LimitVal->getValue().getBitWidth();
- if (IncrVal->getValue().ugt(APInt::getNullValue(numBits)) &&
- (APInt::getMaxValue(numBits) - IncrVal->getValue())
- .ugt(LimitVal->getValue()))
- NoUnsignedWrap = true;
- }
- }
- return PN;
-}
-
-static Value *getSignExtendedTruncVar(const SCEVAddRecExpr *AR,
- ScalarEvolution *SE,
- const Type *LargestType, Loop *L,
- const Type *myType,
- SCEVExpander &Rewriter) {
- SCEVHandle ExtendedStart =
- SE->getSignExtendExpr(AR->getStart(), LargestType);
- SCEVHandle ExtendedStep =
- SE->getSignExtendExpr(AR->getStepRecurrence(*SE), LargestType);
- SCEVHandle ExtendedAddRec =
- SE->getAddRecExpr(ExtendedStart, ExtendedStep, L);
- if (LargestType != myType)
- ExtendedAddRec = SE->getTruncateExpr(ExtendedAddRec, myType);
- return Rewriter.expandCodeFor(ExtendedAddRec, myType);
-}
-
-static Value *getZeroExtendedTruncVar(const SCEVAddRecExpr *AR,
- ScalarEvolution *SE,
- const Type *LargestType, Loop *L,
- const Type *myType,
- SCEVExpander &Rewriter) {
- SCEVHandle ExtendedStart =
- SE->getZeroExtendExpr(AR->getStart(), LargestType);
- SCEVHandle ExtendedStep =
- SE->getZeroExtendExpr(AR->getStepRecurrence(*SE), LargestType);
- SCEVHandle ExtendedAddRec =
- SE->getAddRecExpr(ExtendedStart, ExtendedStep, L);
- if (LargestType != myType)
- ExtendedAddRec = SE->getTruncateExpr(ExtendedAddRec, myType);
- return Rewriter.expandCodeFor(ExtendedAddRec, myType);
-}
-
-/// allUsesAreSameTyped - See whether all Uses of I are instructions
-/// with the same Opcode and the same type.
-static bool allUsesAreSameTyped(unsigned int Opcode, Instruction *I) {
- const Type* firstType = NULL;
- for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
- UI != UE; ++UI) {
- Instruction *II = dyn_cast<Instruction>(*UI);
- if (!II || II->getOpcode() != Opcode)
- return false;
- if (!firstType)
- firstType = II->getType();
- else if (firstType != II->getType())
- return false;
- }
- return true;
}
bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+ IU = &getAnalysis<IVUsers>();
LI = &getAnalysis<LoopInfo>();
SE = &getAnalysis<ScalarEvolution>();
Changed = false;
@@ -594,11 +344,8 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
RewriteNonIntegerIVs(L);
BasicBlock *Header = L->getHeader();
- BasicBlock *ExitingBlock = L->getExitingBlock();
- SmallPtrSet<Instruction*, 16> DeadInsts;
-
- // Verify the input to the pass in already in LCSSA form.
- assert(L->isLCSSAForm());
+ BasicBlock *ExitingBlock = L->getExitingBlock(); // may be null
+ SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
// Check to see if this loop has a computable loop-invariant execution count.
// If so, this means that we can compute the final value of any expressions
@@ -606,59 +353,45 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// loop into any instructions outside of the loop that use the final values of
// the current expressions.
//
- SCEVHandle BackedgeTakenCount = SE->getBackedgeTakenCount(L);
if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount))
RewriteLoopExitValues(L, BackedgeTakenCount);
- // Next, analyze all of the induction variables in the loop, canonicalizing
- // auxillary induction variables.
- std::vector<std::pair<PHINode*, SCEVHandle> > IndVars;
-
- for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
- PHINode *PN = cast<PHINode>(I);
- if (SE->isSCEVable(PN->getType())) {
- SCEVHandle SCEV = SE->getSCEV(PN);
- // FIXME: It is an extremely bad idea to indvar substitute anything more
- // complex than affine induction variables. Doing so will put expensive
- // polynomial evaluations inside of the loop, and the str reduction pass
- // currently can only reduce affine polynomials. For now just disable
- // indvar subst on anything more complex than an affine addrec.
- if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SCEV))
- if (AR->getLoop() == L && AR->isAffine())
- IndVars.push_back(std::make_pair(PN, SCEV));
- }
- }
-
- // Compute the type of the largest recurrence expression, and collect
- // the set of the types of the other recurrence expressions.
+ // Compute the type of the largest recurrence expression, and decide whether
+ // a canonical induction variable should be inserted.
const Type *LargestType = 0;
- SmallSetVector<const Type *, 4> SizesToInsert;
+ bool NeedCannIV = false;
if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
LargestType = BackedgeTakenCount->getType();
LargestType = SE->getEffectiveSCEVType(LargestType);
- SizesToInsert.insert(LargestType);
+ // If we have a known trip count and a single exit block, we'll be
+ // rewriting the loop exit test condition below, which requires a
+ // canonical induction variable.
+ if (ExitingBlock)
+ NeedCannIV = true;
}
- for (unsigned i = 0, e = IndVars.size(); i != e; ++i) {
- const PHINode *PN = IndVars[i].first;
- const Type *PNTy = PN->getType();
- PNTy = SE->getEffectiveSCEVType(PNTy);
- SizesToInsert.insert(PNTy);
- const Type *EffTy = getEffectiveIndvarType(PN, SE);
- EffTy = SE->getEffectiveSCEVType(EffTy);
- SizesToInsert.insert(EffTy);
+ for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+ SCEVHandle Stride = IU->StrideOrder[i];
+ const Type *Ty = SE->getEffectiveSCEVType(Stride->getType());
if (!LargestType ||
- SE->getTypeSizeInBits(EffTy) >
+ SE->getTypeSizeInBits(Ty) >
SE->getTypeSizeInBits(LargestType))
- LargestType = EffTy;
+ LargestType = Ty;
+
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[i]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+
+ if (!SI->second->Users.empty())
+ NeedCannIV = true;
}
// Create a rewriter object which we'll use to transform the code with.
SCEVExpander Rewriter(*SE, *LI);
- // Now that we know the largest of of the induction variables in this loop,
- // insert a canonical induction variable of the largest size.
+  // Now that we know the largest of the induction variable expressions
+ // in this loop, insert a canonical induction variable of the largest size.
Value *IndVar = 0;
- if (!SizesToInsert.empty()) {
+ if (NeedCannIV) {
IndVar = Rewriter.getOrInsertCanonicalInductionVariable(L,LargestType);
++NumInserted;
Changed = true;
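
For intuition, the canonical induction variable inserted here is the zero-based, unit-stride counter of LargestType (the recurrence {0,+,1} for this loop); the rewrites below re-express the other interesting IV expressions as affine functions of it, and LinearFunctionTestReplace turns the exit test into a canonical != comparison against the trip count. A rough source-level analogy, purely illustrative:

// Source-level analogy of the canonical IV and a rewritten IV expression.
#include <cstdint>

void canonicalIVAnalogy(uint64_t Start, uint64_t Stride, uint64_t TripCount) {
  // indvar plays the role of the canonical IV; the exit test is the
  // canonical != comparison produced by LinearFunctionTestReplace.
  for (uint64_t indvar = 0; indvar != TripCount; ++indvar) {
    uint64_t i = Start + Stride * indvar;  // a rewritten IV expression
    (void)i;                               // ... loop body would use 'i' ...
  }
}
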
@@ -667,229 +400,291 @@ bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
// If we have a trip count expression, rewrite the loop's exit condition
// using it. We can currently only handle loops with a single exit.
- bool NoSignedWrap = false;
- bool NoUnsignedWrap = false;
- const ConstantInt* InitialVal, * IncrVal, * LimitVal;
- const PHINode *OrigControllingPHI = 0;
- if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && ExitingBlock)
+ ICmpInst *NewICmp = 0;
+ if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) && ExitingBlock) {
+ assert(NeedCannIV &&
+ "LinearFunctionTestReplace requires a canonical induction variable");
// Can't rewrite non-branch yet.
- if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator())) {
- if (Instruction *OrigCond = dyn_cast<Instruction>(BI->getCondition())) {
- // Determine if the OrigIV will ever undergo overflow.
- OrigControllingPHI =
- TestOrigIVForWrap(L, BI, OrigCond, *SE,
- NoSignedWrap, NoUnsignedWrap,
- InitialVal, IncrVal, LimitVal);
-
- // We'll be replacing the original condition, so it'll be dead.
- DeadInsts.insert(OrigCond);
- }
+ if (BranchInst *BI = dyn_cast<BranchInst>(ExitingBlock->getTerminator()))
+ NewICmp = LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
+ ExitingBlock, BI, Rewriter);
+ }
- LinearFunctionTestReplace(L, BackedgeTakenCount, IndVar,
- ExitingBlock, BI, Rewriter);
- }
+ Rewriter.setInsertionPoint(Header->getFirstNonPHI());
- // Now that we have a canonical induction variable, we can rewrite any
- // recurrences in terms of the induction variable. Start with the auxillary
- // induction variables, and recursively rewrite any of their uses.
- BasicBlock::iterator InsertPt = Header->getFirstNonPHI();
- Rewriter.setInsertionPoint(InsertPt);
-
- // If there were induction variables of other sizes, cast the primary
- // induction variable to the right size for them, avoiding the need for the
- // code evaluation methods to insert induction variables of different sizes.
- for (unsigned i = 0, e = SizesToInsert.size(); i != e; ++i) {
- const Type *Ty = SizesToInsert[i];
- if (Ty != LargestType) {
- Instruction *New = new TruncInst(IndVar, Ty, "indvar", InsertPt);
- Rewriter.addInsertedValue(New, SE->getSCEV(New));
- DOUT << "INDVARS: Made trunc IV for type " << *Ty << ": "
- << *New << "\n";
- }
- }
+ // Rewrite IV-derived expressions.
+ RewriteIVExpressions(L, LargestType, Rewriter);
- // Rewrite all induction variables in terms of the canonical induction
- // variable.
- while (!IndVars.empty()) {
- PHINode *PN = IndVars.back().first;
- const SCEVAddRecExpr *AR = cast<SCEVAddRecExpr>(IndVars.back().second);
- Value *NewVal = Rewriter.expandCodeFor(AR, PN->getType());
- DOUT << "INDVARS: Rewrote IV '" << *AR << "' " << *PN
- << " into = " << *NewVal << "\n";
- NewVal->takeName(PN);
-
- /// If the new canonical induction variable is wider than the original,
- /// and the original has uses that are casts to wider types, see if the
- /// truncate and extend can be omitted.
- if (PN == OrigControllingPHI && PN->getType() != LargestType)
- for (Value::use_iterator UI = PN->use_begin(), UE = PN->use_end();
- UI != UE; ++UI) {
- Instruction *UInst = dyn_cast<Instruction>(*UI);
- if (UInst && isa<SExtInst>(UInst) && NoSignedWrap) {
- Value *TruncIndVar = getSignExtendedTruncVar(AR, SE, LargestType, L,
- UInst->getType(), Rewriter);
- UInst->replaceAllUsesWith(TruncIndVar);
- DeadInsts.insert(UInst);
- }
- // See if we can figure out sext(i+constant) doesn't wrap, so we can
- // use a larger add. This is common in subscripting.
- if (UInst && UInst->getOpcode()==Instruction::Add &&
- !UInst->use_empty() &&
- allUsesAreSameTyped(Instruction::SExt, UInst) &&
- isa<ConstantInt>(UInst->getOperand(1)) &&
- NoSignedWrap && LimitVal) {
- uint64_t oldBitSize = LimitVal->getValue().getBitWidth();
- uint64_t newBitSize = LargestType->getPrimitiveSizeInBits();
- ConstantInt* AddRHS = dyn_cast<ConstantInt>(UInst->getOperand(1));
- if (((APInt::getSignedMaxValue(oldBitSize) - IncrVal->getValue()) -
- AddRHS->getValue()).sgt(LimitVal->getValue())) {
- // We've determined this is (i+constant) and it won't overflow.
- if (isa<SExtInst>(UInst->use_begin())) {
- SExtInst* oldSext = dyn_cast<SExtInst>(UInst->use_begin());
- uint64_t truncSize = oldSext->getType()->getPrimitiveSizeInBits();
- Value *TruncIndVar = getSignExtendedTruncVar(AR, SE, LargestType,
- L, oldSext->getType(), Rewriter);
- APInt APnewAddRHS = APInt(AddRHS->getValue()).sext(newBitSize);
- if (newBitSize > truncSize)
- APnewAddRHS = APnewAddRHS.trunc(truncSize);
- ConstantInt* newAddRHS =ConstantInt::get(APnewAddRHS);
- Value *NewAdd =
- BinaryOperator::CreateAdd(TruncIndVar, newAddRHS,
- UInst->getName()+".nosex", UInst);
- for (Value::use_iterator UI2 = UInst->use_begin(),
- UE2 = UInst->use_end(); UI2 != UE2; ++UI2) {
- Instruction *II = dyn_cast<Instruction>(UI2);
- II->replaceAllUsesWith(NewAdd);
- DeadInsts.insert(II);
- }
- DeadInsts.insert(UInst);
- }
- }
- }
- // Try for sext(i | constant). This is safe as long as the
- // high bit of the constant is not set.
- if (UInst && UInst->getOpcode()==Instruction::Or &&
- !UInst->use_empty() &&
- allUsesAreSameTyped(Instruction::SExt, UInst) && NoSignedWrap &&
- isa<ConstantInt>(UInst->getOperand(1))) {
- ConstantInt* RHS = dyn_cast<ConstantInt>(UInst->getOperand(1));
- if (!RHS->getValue().isNegative()) {
- uint64_t newBitSize = LargestType->getPrimitiveSizeInBits();
- SExtInst* oldSext = dyn_cast<SExtInst>(UInst->use_begin());
- uint64_t truncSize = oldSext->getType()->getPrimitiveSizeInBits();
- Value *TruncIndVar = getSignExtendedTruncVar(AR, SE, LargestType,
- L, oldSext->getType(), Rewriter);
- APInt APnewOrRHS = APInt(RHS->getValue()).sext(newBitSize);
- if (newBitSize > truncSize)
- APnewOrRHS = APnewOrRHS.trunc(truncSize);
- ConstantInt* newOrRHS =ConstantInt::get(APnewOrRHS);
- Value *NewOr =
- BinaryOperator::CreateOr(TruncIndVar, newOrRHS,
- UInst->getName()+".nosex", UInst);
- for (Value::use_iterator UI2 = UInst->use_begin(),
- UE2 = UInst->use_end(); UI2 != UE2; ++UI2) {
- Instruction *II = dyn_cast<Instruction>(UI2);
- II->replaceAllUsesWith(NewOr);
- DeadInsts.insert(II);
- }
- DeadInsts.insert(UInst);
- }
- }
- // A zext of a signed variable known not to overflow is still safe.
- if (UInst && isa<ZExtInst>(UInst) && (NoUnsignedWrap || NoSignedWrap)) {
- Value *TruncIndVar = getZeroExtendedTruncVar(AR, SE, LargestType, L,
- UInst->getType(), Rewriter);
- UInst->replaceAllUsesWith(TruncIndVar);
- DeadInsts.insert(UInst);
- }
- // If we have zext(i&constant), it's always safe to use the larger
- // variable. This is not common but is a bottleneck in Openssl.
- // (RHS doesn't have to be constant. There should be a better approach
- // than bottom-up pattern matching for this...)
- if (UInst && UInst->getOpcode()==Instruction::And &&
- !UInst->use_empty() &&
- allUsesAreSameTyped(Instruction::ZExt, UInst) &&
- isa<ConstantInt>(UInst->getOperand(1))) {
- uint64_t newBitSize = LargestType->getPrimitiveSizeInBits();
- ConstantInt* AndRHS = dyn_cast<ConstantInt>(UInst->getOperand(1));
- ZExtInst* oldZext = dyn_cast<ZExtInst>(UInst->use_begin());
- uint64_t truncSize = oldZext->getType()->getPrimitiveSizeInBits();
- Value *TruncIndVar = getSignExtendedTruncVar(AR, SE, LargestType,
- L, oldZext->getType(), Rewriter);
- APInt APnewAndRHS = APInt(AndRHS->getValue()).zext(newBitSize);
- if (newBitSize > truncSize)
- APnewAndRHS = APnewAndRHS.trunc(truncSize);
- ConstantInt* newAndRHS = ConstantInt::get(APnewAndRHS);
- Value *NewAnd =
- BinaryOperator::CreateAnd(TruncIndVar, newAndRHS,
- UInst->getName()+".nozex", UInst);
- for (Value::use_iterator UI2 = UInst->use_begin(),
- UE2 = UInst->use_end(); UI2 != UE2; ++UI2) {
- Instruction *II = dyn_cast<Instruction>(UI2);
- II->replaceAllUsesWith(NewAnd);
- DeadInsts.insert(II);
+ // Loop-invariant instructions in the preheader that aren't used in the
+ // loop may be sunk below the loop to reduce register pressure.
+ SinkUnusedInvariants(L, Rewriter);
+
+ // Reorder instructions to avoid use-before-def conditions.
+ FixUsesBeforeDefs(L, Rewriter);
+
+ // For completeness, inform IVUsers of the IV use in the newly-created
+ // loop exit test instruction.
+ if (NewICmp)
+ IU->AddUsersIfInteresting(cast<Instruction>(NewICmp->getOperand(0)));
+
+ // Clean up dead instructions.
+ DeleteDeadPHIs(L->getHeader());
+ // Check a post-condition.
+ assert(L->isLCSSAForm() && "Indvars did not leave the loop in lcssa form!");
+ return Changed;
+}
+
+void IndVarSimplify::RewriteIVExpressions(Loop *L, const Type *LargestType,
+ SCEVExpander &Rewriter) {
+ SmallVector<WeakVH, 16> DeadInsts;
+
+ // Rewrite all induction variable expressions in terms of the canonical
+ // induction variable.
+ //
+ // If there were induction variables of other sizes or offsets, manually
+ // add the offsets to the primary induction variable and cast, avoiding
+ // the need for the code evaluation methods to insert induction variables
+ // of different sizes.
+ for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+ SCEVHandle Stride = IU->StrideOrder[i];
+
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[i]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+ ilist<IVStrideUse> &List = SI->second->Users;
+ for (ilist<IVStrideUse>::iterator UI = List.begin(),
+ E = List.end(); UI != E; ++UI) {
+ SCEVHandle Offset = UI->getOffset();
+ Value *Op = UI->getOperandValToReplace();
+ Instruction *User = UI->getUser();
+ bool isSigned = UI->isSigned();
+
+ // Compute the final addrec to expand into code.
+ SCEVHandle AR = IU->getReplacementExpr(*UI);
+
+ // FIXME: It is an extremely bad idea to indvar substitute anything more
+ // complex than affine induction variables. Doing so will put expensive
+ // polynomial evaluations inside of the loop, and the str reduction pass
+ // currently can only reduce affine polynomials. For now just disable
+ // indvar subst on anything more complex than an affine addrec, unless
+ // it can be expanded to a trivial value.
+ if (!Stride->isLoopInvariant(L) &&
+ !isa<SCEVConstant>(AR) &&
+ L->contains(User->getParent()))
+ continue;
+
+ Value *NewVal = 0;
+ if (AR->isLoopInvariant(L)) {
+ BasicBlock::iterator I = Rewriter.getInsertionPoint();
+ // Expand loop-invariant values in the loop preheader. They will
+ // be sunk to the exit block later, if possible.
+ NewVal =
+ Rewriter.expandCodeFor(AR, LargestType,
+ L->getLoopPreheader()->getTerminator());
+ Rewriter.setInsertionPoint(I);
+ ++NumReplaced;
+ } else {
+ const Type *IVTy = Offset->getType();
+ const Type *UseTy = Op->getType();
+
+ // Promote the Offset and Stride up to the canonical induction
+ // variable's bit width.
+ SCEVHandle PromotedOffset = Offset;
+ SCEVHandle PromotedStride = Stride;
+ if (SE->getTypeSizeInBits(IVTy) != SE->getTypeSizeInBits(LargestType)) {
+ // It doesn't matter for correctness whether zero or sign extension
+ // is used here, since the value is truncated away below, but if the
+ // value is signed, sign extension is more likely to be folded.
+ if (isSigned) {
+ PromotedOffset = SE->getSignExtendExpr(PromotedOffset, LargestType);
+ PromotedStride = SE->getSignExtendExpr(PromotedStride, LargestType);
+ } else {
+ PromotedOffset = SE->getZeroExtendExpr(PromotedOffset, LargestType);
+ // If the stride is obviously negative, use sign extension to
+ // produce things like x-1 instead of x+255.
+ if (isa<SCEVConstant>(PromotedStride) &&
+ cast<SCEVConstant>(PromotedStride)
+ ->getValue()->getValue().isNegative())
+ PromotedStride = SE->getSignExtendExpr(PromotedStride,
+ LargestType);
+ else
+ PromotedStride = SE->getZeroExtendExpr(PromotedStride,
+ LargestType);
}
- DeadInsts.insert(UInst);
}
- // If we have zext((i+constant)&constant), we can use the larger
- // variable even if the add does overflow. This works whenever the
- // constant being ANDed is the same size as i, which it presumably is.
- // We don't need to restrict the expression being and'ed to i+const,
- // but we have to promote everything in it, so it's convenient.
- // zext((i | constant)&constant) is also valid and accepted here.
- if (UInst && (UInst->getOpcode()==Instruction::Add ||
- UInst->getOpcode()==Instruction::Or) &&
- UInst->hasOneUse() &&
- isa<ConstantInt>(UInst->getOperand(1))) {
- uint64_t newBitSize = LargestType->getPrimitiveSizeInBits();
- ConstantInt* AddRHS = dyn_cast<ConstantInt>(UInst->getOperand(1));
- Instruction *UInst2 = dyn_cast<Instruction>(UInst->use_begin());
- if (UInst2 && UInst2->getOpcode() == Instruction::And &&
- !UInst2->use_empty() &&
- allUsesAreSameTyped(Instruction::ZExt, UInst2) &&
- isa<ConstantInt>(UInst2->getOperand(1))) {
- ZExtInst* oldZext = dyn_cast<ZExtInst>(UInst2->use_begin());
- uint64_t truncSize = oldZext->getType()->getPrimitiveSizeInBits();
- Value *TruncIndVar = getSignExtendedTruncVar(AR, SE, LargestType,
- L, oldZext->getType(), Rewriter);
- ConstantInt* AndRHS = dyn_cast<ConstantInt>(UInst2->getOperand(1));
- APInt APnewAddRHS = APInt(AddRHS->getValue()).zext(newBitSize);
- if (newBitSize > truncSize)
- APnewAddRHS = APnewAddRHS.trunc(truncSize);
- ConstantInt* newAddRHS = ConstantInt::get(APnewAddRHS);
- Value *NewAdd = ((UInst->getOpcode()==Instruction::Add) ?
- BinaryOperator::CreateAdd(TruncIndVar, newAddRHS,
- UInst->getName()+".nozex", UInst2) :
- BinaryOperator::CreateOr(TruncIndVar, newAddRHS,
- UInst->getName()+".nozex", UInst2));
- APInt APcopy2 = APInt(AndRHS->getValue());
- ConstantInt* newAndRHS = ConstantInt::get(APcopy2.zext(newBitSize));
- Value *NewAnd =
- BinaryOperator::CreateAnd(NewAdd, newAndRHS,
- UInst->getName()+".nozex", UInst2);
- for (Value::use_iterator UI2 = UInst2->use_begin(),
- UE2 = UInst2->use_end(); UI2 != UE2; ++UI2) {
- Instruction *II = dyn_cast<Instruction>(UI2);
- II->replaceAllUsesWith(NewAnd);
- DeadInsts.insert(II);
- }
- DeadInsts.insert(UInst);
- DeadInsts.insert(UInst2);
+
+ // Create the SCEV representing the offset from the canonical
+ // induction variable, still in the canonical induction variable's
+ // type, so that all expanded arithmetic is done in the same type.
+ SCEVHandle NewAR = SE->getAddRecExpr(SE->getIntegerSCEV(0, LargestType),
+ PromotedStride, L);
+ // Add the PromotedOffset as a separate step, because it may not be
+ // loop-invariant.
+ NewAR = SE->getAddExpr(NewAR, PromotedOffset);
+
+ // Expand the addrec into instructions.
+ Value *V = Rewriter.expandCodeFor(NewAR, LargestType);
+
+ // Insert an explicit cast if necessary to truncate the value
+ // down to the original stride type. This is done outside of
+ // SCEVExpander because in SCEV expressions, a truncate of an
+ // addrec is always folded.
+ if (LargestType != IVTy) {
+ if (SE->getTypeSizeInBits(IVTy) != SE->getTypeSizeInBits(LargestType))
+ NewAR = SE->getTruncateExpr(NewAR, IVTy);
+ if (Rewriter.isInsertedExpression(NewAR))
+ V = Rewriter.expandCodeFor(NewAR, IVTy);
+ else {
+ V = Rewriter.InsertCastOfTo(CastInst::getCastOpcode(V, false,
+ IVTy, false),
+ V, IVTy);
+ assert(!isa<SExtInst>(V) && !isa<ZExtInst>(V) &&
+ "LargestType wasn't actually the largest type!");
+ // Force the rewriter to use this trunc whenever this addrec
+ // appears so that it doesn't insert new phi nodes or
+ // arithmetic in a different type.
+ Rewriter.addInsertedValue(V, NewAR);
}
}
+
+ DOUT << "INDVARS: Made offset-and-trunc IV for offset "
+ << *IVTy << " " << *Offset << ": ";
+ DEBUG(WriteAsOperand(*DOUT, V, false));
+ DOUT << "\n";
+
+ // Now expand it into actual Instructions and patch it into place.
+ NewVal = Rewriter.expandCodeFor(AR, UseTy);
}
- // Replace the old PHI Node with the inserted computation.
- PN->replaceAllUsesWith(NewVal);
- DeadInsts.insert(PN);
- IndVars.pop_back();
- ++NumRemoved;
- Changed = true;
+ // Patch the new value into place.
+ if (Op->hasName())
+ NewVal->takeName(Op);
+ User->replaceUsesOfWith(Op, NewVal);
+ UI->setOperandValToReplace(NewVal);
+ DOUT << "INDVARS: Rewrote IV '" << *AR << "' " << *Op
+ << " into = " << *NewVal << "\n";
+ ++NumRemoved;
+ Changed = true;
+
+ // The old value may be dead now.
+ DeadInsts.push_back(Op);
+ }
}
- DeleteTriviallyDeadInstructions(DeadInsts);
- assert(L->isLCSSAForm());
- return Changed;
+ // Now that we're done iterating through lists, clean up any instructions
+ // which are now dead.
+ while (!DeadInsts.empty()) {
+ Instruction *Inst = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
+ if (Inst)
+ RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
+}
+
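RewriteIVExpressions promotes each use's offset and stride to the widest IV type, expands one canonical addrec there, and truncates the result back to the use's type; as the comments above note, that truncation is what makes the zero-vs-sign-extension choice a code-quality question rather than a correctness one. A minimal standalone sketch of the equivalence (plain C++, not the LLVM API; the i8 IV {5,+,-1} is a made-up example):

#include <cassert>
#include <cstdint>

int main() {
  const int8_t Stride = -1, Offset = 5;           // hypothetical i8 IV {5,+,-1}
  for (uint64_t i = 0; i < 1000; ++i) {
    // Reference: the IV value computed directly in 8 bits.
    uint8_t Direct = (uint8_t)(Offset + Stride * (int64_t)i);
    // Promoted: stride and offset extended to 64 bits (either way), the
    // addrec evaluated there, and the result truncated back to 8 bits.
    uint64_t SExtVal = (uint64_t)((int64_t)Stride * (int64_t)i + Offset);
    uint64_t ZExtVal = (uint64_t)(uint8_t)Stride * i + (uint64_t)(uint8_t)Offset;
    assert((uint8_t)SExtVal == Direct && (uint8_t)ZExtVal == Direct);
  }
  return 0;
}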
+/// If there's a single exit block, sink any loop-invariant values that
+/// were defined in the preheader but not used inside the loop into the
+/// exit block to reduce register pressure in the loop.
+void IndVarSimplify::SinkUnusedInvariants(Loop *L, SCEVExpander &Rewriter) {
+ BasicBlock *ExitBlock = L->getExitBlock();
+ if (!ExitBlock) return;
+
+ Instruction *NonPHI = ExitBlock->getFirstNonPHI();
+ BasicBlock *Preheader = L->getLoopPreheader();
+ BasicBlock::iterator I = Preheader->getTerminator();
+ while (I != Preheader->begin()) {
+ --I;
+ // New instructions were inserted at the end of the preheader. Only
+ // consider those new instructions.
+ if (!Rewriter.isInsertedInstruction(I))
+ break;
+ // Determine if there is a use in or before the loop (direct or
+ // otherwise).
+ bool UsedInLoop = false;
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE; ++UI) {
+ BasicBlock *UseBB = cast<Instruction>(UI)->getParent();
+ if (PHINode *P = dyn_cast<PHINode>(UI)) {
+ unsigned i =
+ PHINode::getIncomingValueNumForOperand(UI.getOperandNo());
+ UseBB = P->getIncomingBlock(i);
+ }
+ if (UseBB == Preheader || L->contains(UseBB)) {
+ UsedInLoop = true;
+ break;
+ }
+ }
+ // If there is, the def must remain in the preheader.
+ if (UsedInLoop)
+ continue;
+ // Otherwise, sink it to the exit block.
+ Instruction *ToMove = I;
+ bool Done = false;
+ if (I != Preheader->begin())
+ --I;
+ else
+ Done = true;
+ ToMove->moveBefore(NonPHI);
+ if (Done)
+ break;
+ }
+}
+
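SinkUnusedInvariants scans the preheader bottom-up, stopping at the first instruction the expander did not insert, and moves every inserted value with no use in (or before) the loop down to the exit block. A minimal standalone sketch of that scan, assuming a toy instruction record in place of the real IR (not the LLVM API):

#include <iostream>
#include <list>
#include <string>

struct Inst { std::string Name; bool Inserted; bool UsedInLoop; };

int main() {
  // Hypothetical preheader contents, oldest first.
  std::list<Inst> Preheader = {
    {"n.addr", false, true},   // pre-existing: the scan stops here
    {"inv0",   true,  false},  // inserted, unused in the loop -> sink
    {"inv1",   true,  true},   // inserted, used in the loop   -> stays
    {"inv2",   true,  false},  // inserted, unused in the loop -> sink
  };
  std::list<Inst> ExitBlock;

  std::list<Inst>::iterator I = Preheader.end();
  while (I != Preheader.begin()) {
    --I;
    if (!I->Inserted) break;      // only consider newly inserted instructions
    if (I->UsedInLoop) continue;  // the def must remain in the preheader
    std::list<Inst>::iterator ToMove = I;
    bool Done = (ToMove == Preheader.begin());
    ExitBlock.push_back(*ToMove); // stands in for moveBefore(first non-PHI)
    I = Preheader.erase(ToMove);  // sink it out of the preheader
    if (Done) break;
  }

  for (const Inst &X : ExitBlock) std::cout << "sunk: " << X.Name << "\n";
  return 0;
}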
+/// Re-schedule the inserted instructions to put defs before uses. This
+/// fixes problems that arise when SCEV expressions contain loop-variant
+/// values unrelated to the induction variable which are defined inside the
+/// loop. FIXME: It would be better to insert instructions in the right
+/// place so that this step isn't needed.
+void IndVarSimplify::FixUsesBeforeDefs(Loop *L, SCEVExpander &Rewriter) {
+ // Visit all the blocks in the loop in pre-order dom-tree dfs order.
+ DominatorTree *DT = &getAnalysis<DominatorTree>();
+ std::map<Instruction *, unsigned> NumPredsLeft;
+ SmallVector<DomTreeNode *, 16> Worklist;
+ Worklist.push_back(DT->getNode(L->getHeader()));
+ do {
+ DomTreeNode *Node = Worklist.pop_back_val();
+ for (DomTreeNode::iterator I = Node->begin(), E = Node->end(); I != E; ++I)
+ if (L->contains((*I)->getBlock()))
+ Worklist.push_back(*I);
+ BasicBlock *BB = Node->getBlock();
+ // Visit all the instructions in the block top down.
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+ // Count the number of operands that aren't properly dominating.
+ unsigned NumPreds = 0;
+ if (Rewriter.isInsertedInstruction(I) && !isa<PHINode>(I))
+ for (User::op_iterator OI = I->op_begin(), OE = I->op_end();
+ OI != OE; ++OI)
+ if (Instruction *Inst = dyn_cast<Instruction>(OI))
+ if (L->contains(Inst->getParent()) && !NumPredsLeft.count(Inst))
+ ++NumPreds;
+ NumPredsLeft[I] = NumPreds;
+ // Notify uses of the position of this instruction, and move the
+ // users (and their dependents, recursively) into place after this
+ // instruction if it is their last outstanding operand.
+ for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+ UI != UE; ++UI) {
+ Instruction *Inst = cast<Instruction>(UI);
+ std::map<Instruction *, unsigned>::iterator Z = NumPredsLeft.find(Inst);
+ if (Z != NumPredsLeft.end() && Z->second != 0 && --Z->second == 0) {
+ SmallVector<Instruction *, 4> UseWorkList;
+ UseWorkList.push_back(Inst);
+ BasicBlock::iterator InsertPt = next(I);
+ while (isa<PHINode>(InsertPt)) ++InsertPt;
+ do {
+ Instruction *Use = UseWorkList.pop_back_val();
+ Use->moveBefore(InsertPt);
+ NumPredsLeft.erase(Use);
+ for (Value::use_iterator IUI = Use->use_begin(),
+ IUE = Use->use_end(); IUI != IUE; ++IUI) {
+ Instruction *IUIInst = cast<Instruction>(IUI);
+ if (L->contains(IUIInst->getParent()) &&
+ Rewriter.isInsertedInstruction(IUIInst) &&
+ !isa<PHINode>(IUIInst))
+ UseWorkList.push_back(IUIInst);
+ }
+ } while (!UseWorkList.empty());
+ }
+ }
+ }
+ } while (!Worklist.empty());
}
/// Return true if it is OK to use SIToFPInst for an induction variable
@@ -933,8 +728,7 @@ static bool convertToInt(const APFloat &APF, uint64_t *intVal) {
/// for(int i = 0; i < 10000; ++i)
/// bar((double)i);
///
-void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PH,
- SmallPtrSet<Instruction*, 16> &DeadInsts) {
+void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PH) {
unsigned IncomingEdge = L->contains(PH->getIncomingBlock(0));
unsigned BackEdge = IncomingEdge^1;
@@ -1041,25 +835,34 @@ void IndVarSimplify::HandleFloatingPointIV(Loop *L, PHINode *PH,
ICmpInst *NewEC = new ICmpInst(NewPred, LHS, RHS, EC->getNameStart(),
EC->getParent()->getTerminator());
+ // In the following deletions, PH may become dead and may be deleted.
+ // Use a WeakVH to observe whether this happens.
+ WeakVH WeakPH = PH;
+
 // Delete old, floating point, exit comparison instruction.
EC->replaceAllUsesWith(NewEC);
- DeadInsts.insert(EC);
+ RecursivelyDeleteTriviallyDeadInstructions(EC);
// Delete old, floating point, increment instruction.
Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
- DeadInsts.insert(Incr);
-
- // Replace floating induction variable. Give SIToFPInst preference over
- // UIToFPInst because it is faster on platforms that are widely used.
- if (useSIToFPInst(*InitValue, *EV, newInitValue, intEV)) {
- SIToFPInst *Conv = new SIToFPInst(NewPHI, PH->getType(), "indvar.conv",
- PH->getParent()->getFirstNonPHI());
- PH->replaceAllUsesWith(Conv);
- } else {
- UIToFPInst *Conv = new UIToFPInst(NewPHI, PH->getType(), "indvar.conv",
- PH->getParent()->getFirstNonPHI());
- PH->replaceAllUsesWith(Conv);
+ RecursivelyDeleteTriviallyDeadInstructions(Incr);
+
+ // Replace floating induction variable, if it isn't already deleted.
+ // Give SIToFPInst preference over UIToFPInst because it is faster on
+ // platforms that are widely used.
+ if (WeakPH && !PH->use_empty()) {
+ if (useSIToFPInst(*InitValue, *EV, newInitValue, intEV)) {
+ SIToFPInst *Conv = new SIToFPInst(NewPHI, PH->getType(), "indvar.conv",
+ PH->getParent()->getFirstNonPHI());
+ PH->replaceAllUsesWith(Conv);
+ } else {
+ UIToFPInst *Conv = new UIToFPInst(NewPHI, PH->getType(), "indvar.conv",
+ PH->getParent()->getFirstNonPHI());
+ PH->replaceAllUsesWith(Conv);
+ }
+ RecursivelyDeleteTriviallyDeadInstructions(PH);
}
- DeadInsts.insert(PH);
-}
+ // Add a new IVUsers entry for the newly-created integer PHI.
+ IU->AddUsersIfInteresting(NewPHI);
+}
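HandleFloatingPointIV now registers the replacement integer PHI with IVUsers and uses a WeakVH so it notices if PH itself is deleted by the cleanup calls. At the source level, the transform it performs amounts to the following (a hand-written illustration, not compiler output):

// Before: floating-point induction variable, FP increment and FP compare.
void before(void (*bar)(double)) {
  for (double d = 0.0; d < 10000.0; d += 1.0)
    bar(d);
}

// After: integer induction variable with a single int->FP conversion;
// SIToFP is preferred over UIToFP where the range allows it.
void after(void (*bar)(double)) {
  for (int i = 0; i < 10000; ++i)
    bar((double)i);
}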
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 127ef56..4f6d531 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -20,6 +20,7 @@
#include "llvm/Type.h"
#include "llvm/DerivedTypes.h"
#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolutionExpander.h"
@@ -53,40 +54,6 @@ namespace {
struct BasedUser;
- /// IVStrideUse - Keep track of one use of a strided induction variable, where
- /// the stride is stored externally. The Offset member keeps track of the
- /// offset from the IV, User is the actual user of the operand, and
- /// 'OperandValToReplace' is the operand of the User that is the use.
- struct VISIBILITY_HIDDEN IVStrideUse {
- SCEVHandle Offset;
- Instruction *User;
- Value *OperandValToReplace;
-
- // isUseOfPostIncrementedValue - True if this should use the
- // post-incremented version of this IV, not the preincremented version.
- // This can only be set in special cases, such as the terminating setcc
- // instruction for a loop or uses dominated by the loop.
- bool isUseOfPostIncrementedValue;
-
- IVStrideUse(const SCEVHandle &Offs, Instruction *U, Value *O)
- : Offset(Offs), User(U), OperandValToReplace(O),
- isUseOfPostIncrementedValue(false) {}
- };
-
- /// IVUsersOfOneStride - This structure keeps track of all instructions that
- /// have an operand that is based on the trip count multiplied by some stride.
- /// The stride for all of these users is common and kept external to this
- /// structure.
- struct VISIBILITY_HIDDEN IVUsersOfOneStride {
- /// Users - Keep track of all of the users of this stride as well as the
- /// initial value and the operand that uses the IV.
- std::vector<IVStrideUse> Users;
-
- void addUser(const SCEVHandle &Offset,Instruction *User, Value *Operand) {
- Users.push_back(IVStrideUse(Offset, User, Operand));
- }
- };
-
/// IVInfo - This structure keeps track of one IV expression inserted during
/// StrengthReduceStridedIVUsers. It contains the stride, the common base, as
/// well as the PHI node and increment value created for rewrite.
@@ -110,15 +77,12 @@ namespace {
};
class VISIBILITY_HIDDEN LoopStrengthReduce : public LoopPass {
+ IVUsers *IU;
LoopInfo *LI;
DominatorTree *DT;
ScalarEvolution *SE;
bool Changed;
- /// IVUsesByStride - Keep track of all uses of induction variables that we
- /// are interested in. The key of the map is the stride of the access.
- std::map<SCEVHandle, IVUsersOfOneStride> IVUsesByStride;
-
/// IVsByStride - Keep track of all IVs that have been inserted for a
/// particular stride.
std::map<SCEVHandle, IVsOfOneStride> IVsByStride;
@@ -127,14 +91,9 @@ namespace {
/// reused (nor should they be rewritten to reuse other strides).
SmallSet<SCEVHandle, 4> StrideNoReuse;
- /// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
- /// We use this to iterate over the IVUsesByStride collection without being
- /// dependent on random ordering of pointers in the process.
- SmallVector<SCEVHandle, 16> StrideOrder;
-
/// DeadInsts - Keep track of instructions we may have made dead, so that
/// we can remove them after we are done working.
- SmallVector<Instruction*, 16> DeadInsts;
+ SmallVector<WeakVH, 16> DeadInsts;
/// TLI - Keep a pointer of a TargetLowering to consult for determining
/// transformation profitability.
@@ -161,11 +120,11 @@ namespace {
AU.addRequired<DominatorTree>();
AU.addRequired<ScalarEvolution>();
AU.addPreserved<ScalarEvolution>();
+ AU.addRequired<IVUsers>();
+ AU.addPreserved<IVUsers>();
}
private:
- bool AddUsersIfInteresting(Instruction *I, Loop *L,
- SmallPtrSet<Instruction*,16> &Processed);
ICmpInst *ChangeCompareStride(Loop *L, ICmpInst *Cond,
IVStrideUse* &CondUse,
const SCEVHandle* &CondStride);
@@ -191,6 +150,8 @@ namespace {
const std::vector<BasedUser>& UsersToProcess);
bool ValidScale(bool, int64_t,
const std::vector<BasedUser>& UsersToProcess);
+ bool ValidOffset(bool, int64_t, int64_t,
+ const std::vector<BasedUser>& UsersToProcess);
SCEVHandle CollectIVUsers(const SCEVHandle &Stride,
IVUsersOfOneStride &Uses,
Loop *L,
@@ -242,21 +203,8 @@ Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
void LoopStrengthReduce::DeleteTriviallyDeadInstructions() {
if (DeadInsts.empty()) return;
- // Sort the deadinsts list so that we can trivially eliminate duplicates as we
- // go. The code below never adds a non-dead instruction to the worklist, but
- // callers may not be so careful.
- array_pod_sort(DeadInsts.begin(), DeadInsts.end());
-
- // Drop duplicate instructions and those with uses.
- for (unsigned i = 0, e = DeadInsts.size()-1; i < e; ++i) {
- Instruction *I = DeadInsts[i];
- if (!I->use_empty()) DeadInsts[i] = 0;
- while (i != e && DeadInsts[i+1] == I)
- DeadInsts[++i] = 0;
- }
-
while (!DeadInsts.empty()) {
- Instruction *I = DeadInsts.back();
+ Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.back());
DeadInsts.pop_back();
if (I == 0 || !isInstructionTriviallyDead(I))
@@ -313,111 +261,6 @@ static bool containsAddRecFromDifferentLoop(SCEVHandle S, Loop *L) {
return false;
}
-/// getSCEVStartAndStride - Compute the start and stride of this expression,
-/// returning false if the expression is not a start/stride pair, or true if it
-/// is. The stride must be a loop invariant expression, but the start may be
-/// a mix of loop invariant and loop variant expressions. The start cannot,
-/// however, contain an AddRec from a different loop, unless that loop is an
-/// outer loop of the current loop.
-static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L,
- SCEVHandle &Start, SCEVHandle &Stride,
- ScalarEvolution *SE, DominatorTree *DT) {
- SCEVHandle TheAddRec = Start; // Initialize to zero.
-
- // If the outer level is an AddExpr, the operands are all start values except
- // for a nested AddRecExpr.
- if (const SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(SH)) {
- for (unsigned i = 0, e = AE->getNumOperands(); i != e; ++i)
- if (const SCEVAddRecExpr *AddRec =
- dyn_cast<SCEVAddRecExpr>(AE->getOperand(i))) {
- if (AddRec->getLoop() == L)
- TheAddRec = SE->getAddExpr(AddRec, TheAddRec);
- else
- return false; // Nested IV of some sort?
- } else {
- Start = SE->getAddExpr(Start, AE->getOperand(i));
- }
-
- } else if (isa<SCEVAddRecExpr>(SH)) {
- TheAddRec = SH;
- } else {
- return false; // not analyzable.
- }
-
- const SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(TheAddRec);
- if (!AddRec || AddRec->getLoop() != L) return false;
-
- // FIXME: Generalize to non-affine IV's.
- if (!AddRec->isAffine()) return false;
-
- // If Start contains an SCEVAddRecExpr from a different loop, other than an
- // outer loop of the current loop, reject it. SCEV has no concept of
- // operating on more than one loop at a time so don't confuse it with such
- // expressions.
- if (containsAddRecFromDifferentLoop(AddRec->getOperand(0), L))
- return false;
-
- Start = SE->getAddExpr(Start, AddRec->getOperand(0));
-
- if (!isa<SCEVConstant>(AddRec->getOperand(1))) {
- // If stride is an instruction, make sure it dominates the loop preheader.
- // Otherwise we could end up with a use before def situation.
- BasicBlock *Preheader = L->getLoopPreheader();
- if (!AddRec->getOperand(1)->dominates(Preheader, DT))
- return false;
-
- DOUT << "[" << L->getHeader()->getName()
- << "] Variable stride: " << *AddRec << "\n";
- }
-
- Stride = AddRec->getOperand(1);
- return true;
-}
-
-/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
-/// and now we need to decide whether the user should use the preinc or post-inc
-/// value. If this user should use the post-inc version of the IV, return true.
-///
-/// Choosing wrong here can break dominance properties (if we choose to use the
-/// post-inc value when we cannot) or it can end up adding extra live-ranges to
-/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
-/// should use the post-inc value).
-static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV,
- Loop *L, DominatorTree *DT, Pass *P,
- SmallVectorImpl<Instruction*> &DeadInsts){
- // If the user is in the loop, use the preinc value.
- if (L->contains(User->getParent())) return false;
-
- BasicBlock *LatchBlock = L->getLoopLatch();
-
- // Ok, the user is outside of the loop. If it is dominated by the latch
- // block, use the post-inc value.
- if (DT->dominates(LatchBlock, User->getParent()))
- return true;
-
- // There is one case we have to be careful of: PHI nodes. These little guys
- // can live in blocks that do not dominate the latch block, but (since their
- // uses occur in the predecessor block, not the block the PHI lives in) should
- // still use the post-inc value. Check for this case now.
- PHINode *PN = dyn_cast<PHINode>(User);
- if (!PN) return false; // not a phi, not dominated by latch block.
-
- // Look at all of the uses of IV by the PHI node. If any use corresponds to
- // a block that is not dominated by the latch block, give up and use the
- // preincremented value.
- unsigned NumUses = 0;
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
- if (PN->getIncomingValue(i) == IV) {
- ++NumUses;
- if (!DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
- return false;
- }
-
- // Okay, all uses of IV by PN are in predecessor blocks that really are
- // dominated by the latch block. Use the post-incremented value.
- return true;
-}
-
/// isAddressUse - Returns true if the specified instruction is using the
/// specified value as an address.
static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
@@ -467,90 +310,6 @@ static const Type *getAccessType(const Instruction *Inst) {
return UseTy;
}
-/// AddUsersIfInteresting - Inspect the specified instruction. If it is a
-/// reducible SCEV, recursively add its users to the IVUsesByStride set and
-/// return true. Otherwise, return false.
-bool LoopStrengthReduce::AddUsersIfInteresting(Instruction *I, Loop *L,
- SmallPtrSet<Instruction*,16> &Processed) {
- if (!SE->isSCEVable(I->getType()))
- return false; // Void and FP expressions cannot be reduced.
-
- // LSR is not APInt clean, do not touch integers bigger than 64-bits.
- if (SE->getTypeSizeInBits(I->getType()) > 64)
- return false;
-
- if (!Processed.insert(I))
- return true; // Instruction already handled.
-
- // Get the symbolic expression for this instruction.
- SCEVHandle ISE = SE->getSCEV(I);
- if (isa<SCEVCouldNotCompute>(ISE)) return false;
-
- // Get the start and stride for this expression.
- SCEVHandle Start = SE->getIntegerSCEV(0, ISE->getType());
- SCEVHandle Stride = Start;
- if (!getSCEVStartAndStride(ISE, L, Start, Stride, SE, DT))
- return false; // Non-reducible symbolic expression, bail out.
-
- std::vector<Instruction *> IUsers;
- // Collect all I uses now because IVUseShouldUsePostIncValue may
- // invalidate use_iterator.
- for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; ++UI)
- IUsers.push_back(cast<Instruction>(*UI));
-
- for (unsigned iused_index = 0, iused_size = IUsers.size();
- iused_index != iused_size; ++iused_index) {
-
- Instruction *User = IUsers[iused_index];
-
- // Do not infinitely recurse on PHI nodes.
- if (isa<PHINode>(User) && Processed.count(User))
- continue;
-
- // Descend recursively, but not into PHI nodes outside the current loop.
- // It's important to see the entire expression outside the loop to get
- // choices that depend on addressing mode use right, although we won't
- // consider references ouside the loop in all cases.
- // If User is already in Processed, we don't want to recurse into it again,
- // but do want to record a second reference in the same instruction.
- bool AddUserToIVUsers = false;
- if (LI->getLoopFor(User->getParent()) != L) {
- if (isa<PHINode>(User) || Processed.count(User) ||
- !AddUsersIfInteresting(User, L, Processed)) {
- DOUT << "FOUND USER in other loop: " << *User
- << " OF SCEV: " << *ISE << "\n";
- AddUserToIVUsers = true;
- }
- } else if (Processed.count(User) ||
- !AddUsersIfInteresting(User, L, Processed)) {
- DOUT << "FOUND USER: " << *User
- << " OF SCEV: " << *ISE << "\n";
- AddUserToIVUsers = true;
- }
-
- if (AddUserToIVUsers) {
- IVUsersOfOneStride &StrideUses = IVUsesByStride[Stride];
- if (StrideUses.Users.empty()) // First occurrence of this stride?
- StrideOrder.push_back(Stride);
-
- // Okay, we found a user that we cannot reduce. Analyze the instruction
- // and decide what to do with it. If we are a use inside of the loop, use
- // the value before incrementation, otherwise use it after incrementation.
- if (IVUseShouldUsePostIncValue(User, I, L, DT, this, DeadInsts)) {
- // The value used will be incremented by the stride more than we are
- // expecting, so subtract this off.
- SCEVHandle NewStart = SE->getMinusSCEV(Start, Stride);
- StrideUses.addUser(NewStart, User, I);
- StrideUses.Users.back().isUseOfPostIncrementedValue = true;
- DOUT << " USING POSTINC SCEV, START=" << *NewStart<< "\n";
- } else {
- StrideUses.addUser(Start, User, I);
- }
- }
- }
- return true;
-}
-
namespace {
/// BasedUser - For a particular base value, keep information about how we've
/// partitioned the expression so far.
@@ -571,6 +330,13 @@ namespace {
/// EmittedBase.
Value *OperandValToReplace;
+ /// isSigned - The stride (and thus also the Base) of this use may be in
+ /// a narrower type than the use itself (OperandValToReplace->getType()).
+ /// When this is the case, the isSigned field indicates whether the
+ /// IV expression should be signed-extended instead of zero-extended to
+ /// fit the type of the use.
+ bool isSigned;
+
/// Imm - The immediate value that should be added to the base immediately
/// before Inst, because it will be folded into the imm field of the
/// instruction. This is also sometimes used for loop-variant values that
@@ -589,10 +355,11 @@ namespace {
bool isUseOfPostIncrementedValue;
BasedUser(IVStrideUse &IVSU, ScalarEvolution *se)
- : SE(se), Base(IVSU.Offset), Inst(IVSU.User),
- OperandValToReplace(IVSU.OperandValToReplace),
+ : SE(se), Base(IVSU.getOffset()), Inst(IVSU.getUser()),
+ OperandValToReplace(IVSU.getOperandValToReplace()),
+ isSigned(IVSU.isSigned()),
Imm(SE->getIntegerSCEV(0, Base->getType())),
- isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue) {}
+ isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue()) {}
// Once we rewrite the code to insert the new IVs we want, update the
// operands of Inst to use the new expression 'NewBase', with 'Imm' added
@@ -600,7 +367,7 @@ namespace {
void RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
Instruction *InsertPt,
SCEVExpander &Rewriter, Loop *L, Pass *P,
- SmallVectorImpl<Instruction*> &DeadInsts);
+ SmallVectorImpl<WeakVH> &DeadInsts);
Value *InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
const Type *Ty,
@@ -638,19 +405,27 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
InsertLoop = InsertLoop->getParentLoop();
}
- Value *Base = Rewriter.expandCodeFor(NewBase, Ty, BaseInsertPt);
+ Value *Base = Rewriter.expandCodeFor(NewBase, NewBase->getType(),
+ BaseInsertPt);
+
+ SCEVHandle NewValSCEV = SE->getUnknown(Base);
// If there is no immediate value, skip the next part.
- if (Imm->isZero())
- return Base;
+ if (!Imm->isZero()) {
+ // If we are inserting the base and imm values in the same block, make sure
+ // to adjust the IP position if insertion reused a result.
+ if (IP == BaseInsertPt)
+ IP = Rewriter.getInsertionPoint();
+
+ // Always emit the immediate (if non-zero) into the same block as the user.
+ NewValSCEV = SE->getAddExpr(NewValSCEV, Imm);
+ }
+
+ if (isSigned)
+ NewValSCEV = SE->getTruncateOrSignExtend(NewValSCEV, Ty);
+ else
+ NewValSCEV = SE->getTruncateOrZeroExtend(NewValSCEV, Ty);
- // If we are inserting the base and imm values in the same block, make sure to
- // adjust the IP position if insertion reused a result.
- if (IP == BaseInsertPt)
- IP = Rewriter.getInsertionPoint();
-
- // Always emit the immediate (if non-zero) into the same block as the user.
- SCEVHandle NewValSCEV = SE->getAddExpr(SE->getUnknown(Base), Imm);
return Rewriter.expandCodeFor(NewValSCEV, Ty, IP);
}
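The isSigned flag carried on BasedUser matters in InsertCodeForBaseAtPosition when the expanded value must be widened to the use's type: truncation is the same either way, but sign and zero extension disagree on negative values. A tiny standalone illustration (plain C++, made-up values):

#include <cassert>
#include <cstdint>

int main() {
  int8_t NarrowVal = -1;                        // IV value computed in i8
  int64_t SExt = (int64_t)NarrowVal;            // a signed use should see -1
  int64_t ZExt = (int64_t)(uint8_t)NarrowVal;   // an unsigned use should see 255
  assert(SExt == -1 && ZExt == 255);
  return 0;
}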
@@ -664,7 +439,7 @@ Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase,
void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
Instruction *NewBasePt,
SCEVExpander &Rewriter, Loop *L, Pass *P,
- SmallVectorImpl<Instruction*> &DeadInsts){
+ SmallVectorImpl<WeakVH> &DeadInsts) {
if (!isa<PHINode>(Inst)) {
// By default, insert code at the user instruction.
BasicBlock::iterator InsertPt = Inst;
@@ -1158,6 +933,39 @@ bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale,
return true;
}
+/// ValidOffset - Check whether the given Offset is valid for all loads and
+/// stores in UsersToProcess.
+///
+bool LoopStrengthReduce::ValidOffset(bool HasBaseReg,
+ int64_t Offset,
+ int64_t Scale,
+ const std::vector<BasedUser>& UsersToProcess) {
+ if (!TLI)
+ return true;
+
+ for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) {
+ // If this is a load or other access, pass the type of the access in.
+ const Type *AccessTy = Type::VoidTy;
+ if (isAddressUse(UsersToProcess[i].Inst,
+ UsersToProcess[i].OperandValToReplace))
+ AccessTy = getAccessType(UsersToProcess[i].Inst);
+ else if (isa<PHINode>(UsersToProcess[i].Inst))
+ continue;
+
+ TargetLowering::AddrMode AM;
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(UsersToProcess[i].Imm))
+ AM.BaseOffs = SC->getValue()->getSExtValue();
+ AM.BaseOffs = (uint64_t)AM.BaseOffs + (uint64_t)Offset;
+ AM.HasBaseReg = HasBaseReg || !UsersToProcess[i].Base->isZero();
+ AM.Scale = Scale;
+
+ // If load[imm+r*scale] is illegal, bail out.
+ if (!TLI->isLegalAddressingMode(AM, AccessTy))
+ return false;
+ }
+ return true;
+}
+
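ValidOffset parallels ValidScale: for each user it folds the candidate offset into the user's immediate and asks the target whether the resulting base + displacement + reg * scale form is legal. A hypothetical standalone sketch of that loop; the AddrMode struct and the legality predicate here are stand-ins, not the TargetLowering API:

#include <cstdint>
#include <vector>

struct AddrMode { long long BaseOffs = 0; bool HasBaseReg = false; long Scale = 0; };

// Stand-in for TargetLowering::isLegalAddressingMode; a real target would
// apply its actual addressing-mode limits here.
static bool isLegalAddrMode(const AddrMode &AM) {
  bool ScaleOK = AM.Scale == 0 || AM.Scale == 1 || AM.Scale == 2 ||
                 AM.Scale == 4 || AM.Scale == 8;
  bool OffsOK = AM.BaseOffs >= INT32_MIN && AM.BaseOffs <= INT32_MAX;
  return ScaleOK && OffsOK;
}

static bool validOffset(long long Offset, long Scale, bool HasBaseReg,
                        const std::vector<long long> &UserImms) {
  for (long long Imm : UserImms) {
    AddrMode AM;
    AM.BaseOffs = Imm + Offset;   // fold the reuse offset into the immediate
    AM.HasBaseReg = HasBaseReg;
    AM.Scale = Scale;
    if (!isLegalAddrMode(AM))
      return false;               // a single illegal user rejects the offset
  }
  return true;
}

int main() {
  std::vector<long long> Imms = {0, 8, 16};       // made-up user immediates
  return validOffset(-32, 2, true, Imms) ? 0 : 1; // e.g. reuse offset -32
}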
/// RequiresTypeConversion - Returns true if converting Ty1 to Ty2 is not
/// a nop.
bool LoopStrengthReduce::RequiresTypeConversion(const Type *Ty1,
@@ -1196,10 +1004,10 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) {
int64_t SInt = SC->getValue()->getSExtValue();
- for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e;
- ++NewStride) {
+ for (unsigned NewStride = 0, e = IU->StrideOrder.size();
+ NewStride != e; ++NewStride) {
std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
- IVsByStride.find(StrideOrder[NewStride]);
+ IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
StrideNoReuse.count(SI->first))
continue;
@@ -1215,24 +1023,44 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
// multiplications.
if (Scale == 1 ||
(AllUsesAreAddresses &&
- ValidScale(HasBaseReg, Scale, UsersToProcess)))
+ ValidScale(HasBaseReg, Scale, UsersToProcess))) {
+ // Prefer to reuse an IV with a base of zero.
for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
IE = SI->second.IVs.end(); II != IE; ++II)
- // FIXME: Only handle base == 0 for now.
- // Only reuse previous IV if it would not require a type conversion.
+ // Only reuse previous IV if it would not require a type conversion
+ // and if the base difference can be folded.
if (II->Base->isZero() &&
!RequiresTypeConversion(II->Base->getType(), Ty)) {
IV = *II;
return SE->getIntegerSCEV(Scale, Stride->getType());
}
+ // Otherwise, settle for an IV with a foldable base.
+ if (AllUsesAreAddresses)
+ for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+ IE = SI->second.IVs.end(); II != IE; ++II)
+ // Only reuse previous IV if it would not require a type conversion
+ // and if the base difference can be folded.
+ if (SE->getEffectiveSCEVType(II->Base->getType()) ==
+ SE->getEffectiveSCEVType(Ty) &&
+ isa<SCEVConstant>(II->Base)) {
+ int64_t Base =
+ cast<SCEVConstant>(II->Base)->getValue()->getSExtValue();
+ if (Base > INT32_MIN && Base <= INT32_MAX &&
+ ValidOffset(HasBaseReg, -Base * Scale,
+ Scale, UsersToProcess)) {
+ IV = *II;
+ return SE->getIntegerSCEV(Scale, Stride->getType());
+ }
+ }
+ }
}
} else if (AllUsesAreOutsideLoop) {
// Accept nonconstant strides here; it is really really right to substitute
// an existing IV if we can.
- for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e;
- ++NewStride) {
+ for (unsigned NewStride = 0, e = IU->StrideOrder.size();
+ NewStride != e; ++NewStride) {
std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
- IVsByStride.find(StrideOrder[NewStride]);
+ IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
continue;
int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
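The fallback added a few lines above (settling for an existing IV with a constant, foldable base when no zero-base IV is available) is easiest to see with concrete numbers. A standalone check of the arithmetic, with made-up values:

#include <cassert>

int main() {
  // Hypothetical reuse: an existing IV has constant base 16 and stride 4,
  // and a use needs stride 8, i.e. Scale = 2; rewriting the use against the
  // existing IV subtracts Base * Scale (-32), which must be a foldable
  // addressing-mode offset (what ValidOffset checks).
  const long Base = 16, Stride = 4, Scale = 2;
  long iv = Base;
  for (long i = 0; i < 8; ++i, iv += Stride) {
    long Wanted    = i * Stride * Scale;        // 0, 8, 16, 24, ...
    long Rewritten = iv * Scale - Base * Scale; // scaled existing IV + offset
    assert(Wanted == Rewritten);
  }
  return 0;
}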
@@ -1249,10 +1077,10 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
}
// Special case, old IV is -1*x and this one is x. Can treat this one as
// -1*old.
- for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e;
- ++NewStride) {
+ for (unsigned NewStride = 0, e = IU->StrideOrder.size();
+ NewStride != e; ++NewStride) {
std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
- IVsByStride.find(StrideOrder[NewStride]);
+ IVsByStride.find(IU->StrideOrder[NewStride]);
if (SI == IVsByStride.end())
continue;
if (const SCEVMulExpr *ME = dyn_cast<SCEVMulExpr>(SI->first))
@@ -1303,10 +1131,15 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
bool &AllUsesAreAddresses,
bool &AllUsesAreOutsideLoop,
std::vector<BasedUser> &UsersToProcess) {
+ // FIXME: Generalize to non-affine IV's.
+ if (!Stride->isLoopInvariant(L))
+ return SE->getIntegerSCEV(0, Stride->getType());
+
UsersToProcess.reserve(Uses.Users.size());
- for (unsigned i = 0, e = Uses.Users.size(); i != e; ++i) {
- UsersToProcess.push_back(BasedUser(Uses.Users[i], SE));
-
+ for (ilist<IVStrideUse>::iterator I = Uses.Users.begin(),
+ E = Uses.Users.end(); I != E; ++I) {
+ UsersToProcess.push_back(BasedUser(*I, SE));
+
// Move any loop variant operands from the offset field to the immediate
// field of the use, so that we don't try to use something before it is
// computed.
@@ -1404,7 +1237,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
// TODO: For now, don't do full strength reduction if there could
// potentially be greater-stride multiples of the current stride
// which could reuse the current stride IV.
- if (StrideOrder.back() != Stride)
+ if (IU->StrideOrder.back() != Stride)
return false;
// Iterate through the uses to find conditions that automatically rule out
@@ -1853,8 +1686,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
- if (SE->getTypeSizeInBits(RewriteOp->getType()) !=
- SE->getTypeSizeInBits(ReplacedTy)) {
+ if (SE->getEffectiveSCEVType(RewriteOp->getType()) !=
+ SE->getEffectiveSCEVType(ReplacedTy)) {
assert(SE->getTypeSizeInBits(RewriteOp->getType()) >
SE->getTypeSizeInBits(ReplacedTy) &&
"Unexpected widening cast!");
@@ -1884,8 +1717,8 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// it here.
if (!ReuseIV.Base->isZero()) {
SCEVHandle typedBase = ReuseIV.Base;
- if (SE->getTypeSizeInBits(RewriteExpr->getType()) !=
- SE->getTypeSizeInBits(ReuseIV.Base->getType())) {
+ if (SE->getEffectiveSCEVType(RewriteExpr->getType()) !=
+ SE->getEffectiveSCEVType(ReuseIV.Base->getType())) {
// It's possible the original IV is a larger type than the new IV,
// in which case we have to truncate the Base. We checked in
// RequiresTypeConversion that this is valid.
@@ -1929,7 +1762,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// Mark old value we replaced as possibly dead, so that it is eliminated
// if we just replaced the last use of that value.
- DeadInsts.push_back(cast<Instruction>(User.OperandValToReplace));
+ DeadInsts.push_back(User.OperandValToReplace);
UsersToProcess.pop_back();
++NumReduced;
@@ -1949,19 +1782,19 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
/// false.
bool LoopStrengthReduce::FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse,
const SCEVHandle *&CondStride) {
- for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e && !CondUse;
- ++Stride) {
- std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
- IVUsesByStride.find(StrideOrder[Stride]);
- assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
-
- for (std::vector<IVStrideUse>::iterator UI = SI->second.Users.begin(),
- E = SI->second.Users.end(); UI != E; ++UI)
- if (UI->User == Cond) {
+ for (unsigned Stride = 0, e = IU->StrideOrder.size();
+ Stride != e && !CondUse; ++Stride) {
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+
+ for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
+ E = SI->second->Users.end(); UI != E; ++UI)
+ if (UI->getUser() == Cond) {
// NOTE: we could handle setcc instructions with multiple uses here, but
// InstCombine does it as well for simple uses, it's not clear that it
// occurs enough in real life to handle.
- CondUse = &*UI;
+ CondUse = UI;
CondStride = &SI->first;
return true;
}
@@ -2022,9 +1855,18 @@ namespace {
ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
IVStrideUse* &CondUse,
const SCEVHandle* &CondStride) {
- if (StrideOrder.size() < 2 ||
- IVUsesByStride[*CondStride].Users.size() != 1)
+ // If there's only one stride in the loop, there's nothing to do here.
+ if (IU->StrideOrder.size() < 2)
+ return Cond;
+ // If there are other users of the condition's stride, don't bother
+ // trying to change the condition because the stride will still
+ // remain.
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator I =
+ IU->IVUsesByStride.find(*CondStride);
+ if (I == IU->IVUsesByStride.end() ||
+ I->second->Users.size() != 1)
return Cond;
+ // Only handle constant strides for now.
const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride);
if (!SC) return Cond;
@@ -2051,9 +1893,9 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
return Cond;
// Look for a suitable stride / iv as replacement.
- for (unsigned i = 0, e = StrideOrder.size(); i != e; ++i) {
- std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
- IVUsesByStride.find(StrideOrder[i]);
+ for (unsigned i = 0, e = IU->StrideOrder.size(); i != e; ++i) {
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[i]);
if (!isa<SCEVConstant>(SI->first))
continue;
int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
@@ -2069,6 +1911,9 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
// Check for overflow.
if (!Mul.isSignedIntN(BitWidth))
continue;
+ // Check for overflow in the stride's type too.
+ if (!Mul.isSignedIntN(SE->getTypeSizeInBits(SI->first->getType())))
+ continue;
// Watch out for overflow.
if (ICmpInst::isSignedPredicate(Predicate) &&
@@ -2079,9 +1924,27 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
continue;
// Pick the best iv to use trying to avoid a cast.
NewCmpLHS = NULL;
- for (std::vector<IVStrideUse>::iterator UI = SI->second.Users.begin(),
- E = SI->second.Users.end(); UI != E; ++UI) {
- NewCmpLHS = UI->OperandValToReplace;
+ for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
+ E = SI->second->Users.end(); UI != E; ++UI) {
+ Value *Op = UI->getOperandValToReplace();
+
+ // If the IVStrideUse implies a cast, check for an actual cast which
+ // can be used to find the original IV expression.
+ if (SE->getEffectiveSCEVType(Op->getType()) !=
+ SE->getEffectiveSCEVType(SI->first->getType())) {
+ CastInst *CI = dyn_cast<CastInst>(Op);
+ // If it's not a simple cast, it's complicated.
+ if (!CI)
+ continue;
+ // If it's a cast from a type other than the stride type,
+ // it's complicated.
+ if (CI->getOperand(0)->getType() != SI->first->getType())
+ continue;
+ // Ok, we found the IV expression in the stride's type.
+ Op = CI->getOperand(0);
+ }
+
+ NewCmpLHS = Op;
if (NewCmpLHS->getType() == CmpTy)
break;
}
@@ -2105,13 +1968,13 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
// Don't rewrite if use offset is non-constant and the new type is
// of a different type.
// FIXME: too conservative?
- if (NewTyBits != TyBits && !isa<SCEVConstant>(CondUse->Offset))
+ if (NewTyBits != TyBits && !isa<SCEVConstant>(CondUse->getOffset()))
continue;
bool AllUsesAreAddresses = true;
bool AllUsesAreOutsideLoop = true;
std::vector<BasedUser> UsersToProcess;
- SCEVHandle CommonExprs = CollectIVUsers(SI->first, SI->second, L,
+ SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
AllUsesAreAddresses,
AllUsesAreOutsideLoop,
UsersToProcess);
@@ -2127,7 +1990,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
if (Scale < 0 && !Cond->isEquality())
Predicate = ICmpInst::getSwappedPredicate(Predicate);
- NewStride = &StrideOrder[i];
+ NewStride = &IU->StrideOrder[i];
if (!isa<PointerType>(NewCmpTy))
NewCmpRHS = ConstantInt::get(NewCmpTy, NewCmpVal);
else {
@@ -2135,10 +1998,11 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
NewCmpRHS = ConstantExpr::getIntToPtr(CI, NewCmpTy);
}
NewOffset = TyBits == NewTyBits
- ? SE->getMulExpr(CondUse->Offset,
+ ? SE->getMulExpr(CondUse->getOffset(),
SE->getConstant(ConstantInt::get(CmpTy, Scale)))
: SE->getConstant(ConstantInt::get(NewCmpIntTy,
- cast<SCEVConstant>(CondUse->Offset)->getValue()->getSExtValue()*Scale));
+ cast<SCEVConstant>(CondUse->getOffset())->getValue()
+ ->getSExtValue()*Scale));
break;
}
}
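The intent of ChangeCompareStride shows up most clearly with concrete (made-up) numbers: a loop carries IVs of stride 1 and stride 4, and the stride-1 IV exists only to feed the exit compare. Rewriting the compare against the stride-4 IV scales the bound by 4, after which the stride-1 IV can die; the scaled bound has to fit both the compare type and the stride type, which is what the overflow checks guard. A standalone sanity check of the rewrite:

#include <cassert>
#include <cstdint>

int main() {
  const int64_t CmpVal = 100, OldStride = 1, NewStride = 4;
  const int64_t Scale = NewStride / OldStride;   // 4
  const int64_t NewCmpVal = CmpVal * Scale;      // 400; must not overflow
  for (int64_t i1 = 0, i4 = 0; ; i1 += OldStride, i4 += NewStride) {
    bool OldExit = !(i1 < CmpVal);     // original test on the stride-1 IV
    bool NewExit = !(i4 < NewCmpVal);  // rewritten test on the stride-4 IV
    assert(OldExit == NewExit);        // both fire on the same iteration
    if (NewExit) break;
  }
  return 0;
}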
@@ -2165,13 +2029,12 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
OldCond);
// Remove the old compare instruction. The old indvar is probably dead too.
- DeadInsts.push_back(cast<Instruction>(CondUse->OperandValToReplace));
+ DeadInsts.push_back(CondUse->getOperandValToReplace());
OldCond->replaceAllUsesWith(Cond);
OldCond->eraseFromParent();
- IVUsesByStride[*CondStride].Users.pop_back();
- IVUsesByStride[*NewStride].addUser(NewOffset, Cond, NewCmpLHS);
- CondUse = &IVUsesByStride[*NewStride].Users.back();
+ IU->IVUsesByStride[*NewStride]->addUser(NewOffset, Cond, NewCmpLHS, false);
+ CondUse = &IU->IVUsesByStride[*NewStride]->Users.back();
CondStride = NewStride;
++NumEliminated;
Changed = true;
@@ -2287,12 +2150,12 @@ ICmpInst *LoopStrengthReduce::OptimizeSMax(Loop *L, ICmpInst *Cond,
// Delete the max calculation instructions.
Cond->replaceAllUsesWith(NewCond);
- Cond->eraseFromParent();
+ CondUse->setUser(NewCond);
Instruction *Cmp = cast<Instruction>(Sel->getOperand(0));
+ Cond->eraseFromParent();
Sel->eraseFromParent();
if (Cmp->use_empty())
Cmp->eraseFromParent();
- CondUse->User = NewCond;
return NewCond;
}
@@ -2304,19 +2167,19 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
return;
- for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e;
+ for (unsigned Stride = 0, e = IU->StrideOrder.size(); Stride != e;
++Stride) {
- std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
- IVUsesByStride.find(StrideOrder[Stride]);
- assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
if (!isa<SCEVConstant>(SI->first))
continue;
- for (std::vector<IVStrideUse>::iterator UI = SI->second.Users.begin(),
- E = SI->second.Users.end(); UI != E; /* empty */) {
- std::vector<IVStrideUse>::iterator CandidateUI = UI;
+ for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(),
+ E = SI->second->Users.end(); UI != E; /* empty */) {
+ ilist<IVStrideUse>::iterator CandidateUI = UI;
++UI;
- Instruction *ShadowUse = CandidateUI->User;
+ Instruction *ShadowUse = CandidateUI->getUser();
const Type *DestTy = NULL;
/* If shadow use is a int->float cast then insert a second IV
@@ -2331,9 +2194,9 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
for (unsigned i = 0; i < n; ++i, ++d)
foo(d);
*/
- if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->User))
+ if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser()))
DestTy = UCast->getDestTy();
- else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->User))
+ else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser()))
DestTy = SCast->getDestTy();
if (!DestTy) continue;
@@ -2400,7 +2263,6 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
/* Remove cast operation */
ShadowUse->replaceAllUsesWith(NewPH);
ShadowUse->eraseFromParent();
- SI->second.Users.erase(CandidateUI);
NumShadow++;
break;
}
@@ -2450,11 +2312,12 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// transform the icmp to use post-inc iv. Otherwise do so only if it would
// not reuse another iv and its iv would be reused by other uses. We are
// optimizing for the case where the icmp is the only use of the iv.
- IVUsersOfOneStride &StrideUses = IVUsesByStride[*CondStride];
- for (unsigned i = 0, e = StrideUses.Users.size(); i != e; ++i) {
- if (StrideUses.Users[i].User == Cond)
+ IVUsersOfOneStride &StrideUses = *IU->IVUsesByStride[*CondStride];
+ for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(),
+ E = StrideUses.Users.end(); I != E; ++I) {
+ if (I->getUser() == Cond)
continue;
- if (!StrideUses.Users[i].isUseOfPostIncrementedValue)
+ if (!I->isUseOfPostIncrementedValue())
return;
}
@@ -2463,10 +2326,10 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// StrengthReduceStridedIVUsers?
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride)) {
int64_t SInt = SC->getValue()->getSExtValue();
- for (unsigned NewStride = 0, ee = StrideOrder.size(); NewStride != ee;
+ for (unsigned NewStride = 0, ee = IU->StrideOrder.size(); NewStride != ee;
++NewStride) {
- std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
- IVUsesByStride.find(StrideOrder[NewStride]);
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[NewStride]);
if (!isa<SCEVConstant>(SI->first) || SI->first == *CondStride)
continue;
int64_t SSInt =
@@ -2479,7 +2342,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
bool AllUsesAreAddresses = true;
bool AllUsesAreOutsideLoop = true;
std::vector<BasedUser> UsersToProcess;
- SCEVHandle CommonExprs = CollectIVUsers(SI->first, SI->second, L,
+ SCEVHandle CommonExprs = CollectIVUsers(SI->first, *SI->second, L,
AllUsesAreAddresses,
AllUsesAreOutsideLoop,
UsersToProcess);
@@ -2518,17 +2381,18 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
LatchBlock->getInstList().insert(TermBr, Cond);
// Clone the IVUse, as the old use still exists!
- IVUsesByStride[*CondStride].addUser(CondUse->Offset, Cond,
- CondUse->OperandValToReplace);
- CondUse = &IVUsesByStride[*CondStride].Users.back();
+ IU->IVUsesByStride[*CondStride]->addUser(CondUse->getOffset(), Cond,
+ CondUse->getOperandValToReplace(),
+ false);
+ CondUse = &IU->IVUsesByStride[*CondStride]->Users.back();
}
}
// If we get to here, we know that we can transform the setcc instruction to
// use the post-incremented version of the IV, allowing us to coalesce the
// live ranges for the IV correctly.
- CondUse->Offset = SE->getMinusSCEV(CondUse->Offset, *CondStride);
- CondUse->isUseOfPostIncrementedValue = true;
+ CondUse->setOffset(SE->getMinusSCEV(CondUse->getOffset(), *CondStride));
+ CondUse->setIsUseOfPostIncrementedValue(true);
Changed = true;
++NumLoopCond;
@@ -2644,19 +2508,13 @@ void LoopStrengthReduce::OptimizeLoopCountIV(Loop *L) {
bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
+ IU = &getAnalysis<IVUsers>();
LI = &getAnalysis<LoopInfo>();
DT = &getAnalysis<DominatorTree>();
SE = &getAnalysis<ScalarEvolution>();
Changed = false;
- // Find all uses of induction variables in this loop, and categorize
- // them by stride. Start by finding all of the PHI nodes in the header for
- // this loop. If they are induction variables, inspect their uses.
- SmallPtrSet<Instruction*,16> Processed; // Don't reprocess instructions.
- for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
- AddUsersIfInteresting(I, L, Processed);
-
- if (!IVUsesByStride.empty()) {
+ if (!IU->IVUsesByStride.empty()) {
#ifndef NDEBUG
DOUT << "\nLSR on \"" << L->getHeader()->getParent()->getNameStart()
<< "\" ";
@@ -2664,7 +2522,8 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
#endif
// Sort the StrideOrder so we process larger strides first.
- std::stable_sort(StrideOrder.begin(), StrideOrder.end(), StrideCompare(SE));
+ std::stable_sort(IU->StrideOrder.begin(), IU->StrideOrder.end(),
+ StrideCompare(SE));
// Optimize induction variables. Some indvar uses can be transformed to use
// strides that will be needed for other purposes. A common example of this
@@ -2695,11 +2554,15 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
// Also, note that we iterate over IVUsesByStride indirectly by using
// StrideOrder. This extra layer of indirection makes the ordering of
// strides deterministic - not dependent on map order.
- for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) {
- std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
- IVUsesByStride.find(StrideOrder[Stride]);
- assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
- StrengthReduceStridedIVUsers(SI->first, SI->second, L);
+ for (unsigned Stride = 0, e = IU->StrideOrder.size();
+ Stride != e; ++Stride) {
+ std::map<SCEVHandle, IVUsersOfOneStride *>::iterator SI =
+ IU->IVUsesByStride.find(IU->StrideOrder[Stride]);
+ assert(SI != IU->IVUsesByStride.end() && "Stride doesn't exist!");
+ // FIXME: Generalize to non-affine IV's.
+ if (!SI->first->isLoopInvariant(L))
+ continue;
+ StrengthReduceStridedIVUsers(SI->first, *SI->second, L);
}
}
@@ -2708,9 +2571,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
OptimizeLoopCountIV(L);
// We're done analyzing this loop; release all the state we built up for it.
- IVUsesByStride.clear();
IVsByStride.clear();
- StrideOrder.clear();
StrideNoReuse.clear();
// Clean up after ourselves
diff --git a/test/CodeGen/X86/iv-users-in-other-loops.ll b/test/CodeGen/X86/iv-users-in-other-loops.ll
index d11b025..275feba 100644
--- a/test/CodeGen/X86/iv-users-in-other-loops.ll
+++ b/test/CodeGen/X86/iv-users-in-other-loops.ll
@@ -1,10 +1,11 @@
; RUN: llvm-as < %s | llc -march=x86-64 -f -o %t
; RUN: grep inc %t | count 1
; RUN: grep dec %t | count 2
-; RUN: grep addq %t | count 13
-; RUN: grep leaq %t | count 8
-; RUN: grep leal %t | count 4
-; RUN: grep movq %t | count 5
+; RUN: grep addq %t | count 8
+; RUN: grep addb %t | count 2
+; RUN: grep leaq %t | count 12
+; RUN: grep leal %t | count 2
+; RUN: grep movq %t | count 4
; IV users in each of the loops from other loops shouldn't cause LSR
; to insert new induction variables. Previously it would create a
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index e9b80a4..e102535 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -3,14 +3,13 @@
; RUN: not grep movz %t
; RUN: not grep sar %t
; RUN: not grep shl %t
-; RUN: grep add %t | count 6
-; RUN: grep inc %t | count 2
-; RUN: grep dec %t | count 4
+; RUN: grep add %t | count 2
+; RUN: grep inc %t | count 4
+; RUN: grep dec %t | count 2
; RUN: grep lea %t | count 2
; Optimize away zext-inreg and sext-inreg on the loop induction
; variable using trip-count information.
-; Also, the loop-reversal algorithm kicks in twice.
define void @count_up(double* %d, i64 %n) nounwind {
entry:
diff --git a/test/CodeGen/X86/subreg-to-reg-5.ll b/test/CodeGen/X86/subreg-to-reg-5.ll
index eee751a..81b262a 100644
--- a/test/CodeGen/X86/subreg-to-reg-5.ll
+++ b/test/CodeGen/X86/subreg-to-reg-5.ll
@@ -8,7 +8,8 @@ entry:
bb2: ; preds = %bb3, %entry
%B_addr.0.rec = phi i64 [ %indvar.next154, %bb3 ], [ 0, %entry ] ; <i64> [#uses=2]
- br i1 false, label %bb3, label %bb4
+ %z = icmp slt i64 %B_addr.0.rec, 20000
+ br i1 %z, label %bb3, label %bb4
bb3: ; preds = %bb2
%indvar.next154 = add i64 %B_addr.0.rec, 1 ; <i64> [#uses=1]
@@ -17,7 +18,7 @@ bb3: ; preds = %bb2
bb4: ; preds = %bb2
%B_addr.0 = getelementptr float* %B, i64 %B_addr.0.rec ; <float*> [#uses=1]
%t1 = ptrtoint float* %B_addr.0 to i64 ; <i64> [#uses=1]
- %t2 = and i64 %t1, 15 ; <i64> [#uses=1]
+ %t2 = and i64 %t1, 4294967295 ; <i64> [#uses=1]
%t3 = icmp eq i64 %t2, 0 ; <i1> [#uses=1]
br i1 %t3, label %bb5, label %bb10.preheader
@@ -25,7 +26,7 @@ bb10.preheader: ; preds = %bb4
br label %bb9
bb5: ; preds = %bb4
- unreachable
+ ret float 7.0
bb9: ; preds = %bb10.preheader
%t5 = getelementptr float* %B, i64 0 ; <float*> [#uses=1]
diff --git a/test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll b/test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll
index 5cc595e..4d26803 100644
--- a/test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll
+++ b/test/Transforms/IndVarSimplify/2009-04-15-shorten-iv-vars-2.ll
@@ -1,5 +1,4 @@
-; RUN: llvm-as < %s | opt -indvars | llvm-dis | not grep {sext}
-; RUN: llvm-as < %s | opt -indvars | llvm-dis | not grep {zext}
+; RUN: llvm-as < %s | opt -indvars -instcombine | llvm-dis | not grep {\[sz\]ext}
; ModuleID = '<stdin>'
;extern int *a, *b, *c, *d, *e, *f; /* 64 bit */
;extern int K[256];
diff --git a/test/Transforms/IndVarSimplify/ada-loops.ll b/test/Transforms/IndVarSimplify/ada-loops.ll
new file mode 100644
index 0000000..56325b3
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/ada-loops.ll
@@ -0,0 +1,90 @@
+; RUN: llvm-as < %s | opt -indvars | llvm-dis > %t
+; RUN: grep phi %t | count 4
+; RUN: grep {= phi i32} %t | count 4
+; RUN: not grep {sext i} %t
+; RUN: not grep {zext i} %t
+; RUN: not grep {trunc i} %t
+; RUN: not grep {add i8} %t
+; PR1301
+
+; Do a bunch of analysis and prove that the loops can use an i32 trip
+; count without casting.
+
+; ModuleID = 'ada.bc'
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
+target triple = "i686-pc-linux-gnu"
+
+define void @kinds__sbytezero([256 x i32]* nocapture %a) nounwind {
+bb.thread:
+ %tmp46 = getelementptr [256 x i32]* %a, i32 0, i32 0 ; <i32*> [#uses=1]
+ store i32 0, i32* %tmp46
+ br label %bb
+
+bb: ; preds = %bb, %bb.thread
+ %i.0.reg2mem.0 = phi i8 [ -128, %bb.thread ], [ %tmp8, %bb ] ; <i8> [#uses=1]
+ %tmp8 = add i8 %i.0.reg2mem.0, 1 ; <i8> [#uses=3]
+ %tmp1 = sext i8 %tmp8 to i32 ; <i32> [#uses=1]
+ %tmp3 = add i32 %tmp1, 128 ; <i32> [#uses=1]
+ %tmp4 = getelementptr [256 x i32]* %a, i32 0, i32 %tmp3 ; <i32*> [#uses=1]
+ store i32 0, i32* %tmp4
+ %0 = icmp eq i8 %tmp8, 127 ; <i1> [#uses=1]
+ br i1 %0, label %return, label %bb
+
+return: ; preds = %bb
+ ret void
+}
+
+define void @kinds__ubytezero([256 x i32]* nocapture %a) nounwind {
+bb.thread:
+ %tmp35 = getelementptr [256 x i32]* %a, i32 0, i32 0 ; <i32*> [#uses=1]
+ store i32 0, i32* %tmp35
+ br label %bb
+
+bb: ; preds = %bb, %bb.thread
+ %i.0.reg2mem.0 = phi i8 [ 0, %bb.thread ], [ %tmp7, %bb ] ; <i8> [#uses=1]
+ %tmp7 = add i8 %i.0.reg2mem.0, 1 ; <i8> [#uses=3]
+ %tmp1 = zext i8 %tmp7 to i32 ; <i32> [#uses=1]
+ %tmp3 = getelementptr [256 x i32]* %a, i32 0, i32 %tmp1 ; <i32*> [#uses=1]
+ store i32 0, i32* %tmp3
+ %0 = icmp eq i8 %tmp7, -1 ; <i1> [#uses=1]
+ br i1 %0, label %return, label %bb
+
+return: ; preds = %bb
+ ret void
+}
+
+define void @kinds__srangezero([21 x i32]* nocapture %a) nounwind {
+bb.thread:
+ br label %bb
+
+bb: ; preds = %bb, %bb.thread
+ %i.0.reg2mem.0 = phi i8 [ -10, %bb.thread ], [ %tmp7, %bb ] ; <i8> [#uses=2]
+ %tmp12 = sext i8 %i.0.reg2mem.0 to i32 ; <i32> [#uses=1]
+ %tmp4 = add i32 %tmp12, 10 ; <i32> [#uses=1]
+ %tmp5 = getelementptr [21 x i32]* %a, i32 0, i32 %tmp4 ; <i32*> [#uses=1]
+ store i32 0, i32* %tmp5
+ %tmp7 = add i8 %i.0.reg2mem.0, 1 ; <i8> [#uses=2]
+ %0 = icmp sgt i8 %tmp7, 10 ; <i1> [#uses=1]
+ br i1 %0, label %return, label %bb
+
+return: ; preds = %bb
+ ret void
+}
+
+define void @kinds__urangezero([21 x i32]* nocapture %a) nounwind {
+bb.thread:
+ br label %bb
+
+bb: ; preds = %bb, %bb.thread
+ %i.0.reg2mem.0 = phi i8 [ 10, %bb.thread ], [ %tmp7, %bb ] ; <i8> [#uses=2]
+ %tmp12 = sext i8 %i.0.reg2mem.0 to i32 ; <i32> [#uses=1]
+ %tmp4 = add i32 %tmp12, -10 ; <i32> [#uses=1]
+ %tmp5 = getelementptr [21 x i32]* %a, i32 0, i32 %tmp4 ; <i32*> [#uses=1]
+ store i32 0, i32* %tmp5
+ %tmp7 = add i8 %i.0.reg2mem.0, 1 ; <i8> [#uses=2]
+ %0 = icmp sgt i8 %tmp7, 30 ; <i1> [#uses=1]
+ br i1 %0, label %return, label %bb
+
+return: ; preds = %bb
+ ret void
+}
diff --git a/test/Transforms/IndVarSimplify/iv-zext.ll b/test/Transforms/IndVarSimplify/iv-zext.ll
new file mode 100644
index 0000000..76d48de
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/iv-zext.ll
@@ -0,0 +1,33 @@
+; RUN: llvm-as < %s | opt -indvars | llvm-dis > %t
+; RUN: not grep and %t
+; RUN: not grep zext %t
+
+target datalayout = "-p:64:64:64"
+
+define void @foo(double* %d, i64 %n) nounwind {
+entry:
+ br label %loop
+
+loop:
+ %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %loop ]
+ %indvar.i8 = and i64 %indvar, 255
+ %t0 = getelementptr double* %d, i64 %indvar.i8
+ %t1 = load double* %t0
+ %t2 = mul double %t1, 0.1
+ store double %t2, double* %t0
+ %indvar.i24 = and i64 %indvar, 16777215
+ %t3 = getelementptr double* %d, i64 %indvar.i24
+ %t4 = load double* %t3
+ %t5 = mul double %t4, 2.3
+ store double %t5, double* %t3
+ %t6 = getelementptr double* %d, i64 %indvar
+ %t7 = load double* %t6
+ %t8 = mul double %t7, 4.5
+ store double %t8, double* %t6
+ %indvar.next = add i64 %indvar, 1
+ %exitcond = icmp eq i64 %indvar.next, 10
+ br i1 %exitcond, label %return, label %loop
+
+return:
+ ret void
+}
diff --git a/test/Transforms/IndVarSimplify/loop_evaluate_6.ll b/test/Transforms/IndVarSimplify/loop_evaluate_6.ll
new file mode 100644
index 0000000..35fbf52
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/loop_evaluate_6.ll
@@ -0,0 +1,29 @@
+; RUN: llvm-as < %s | opt -indvars -loop-deletion | llvm-dis | grep phi | count 1
+
+; Indvars should be able to evaluate this loop, allowing loop deletion
+; to delete it.
+
+define i32 @test(i32 %x_offs) nounwind readnone {
+entry:
+ %0 = icmp sgt i32 %x_offs, 4 ; <i1> [#uses=1]
+ br i1 %0, label %bb.nph, label %bb2
+
+bb.nph: ; preds = %entry
+ br label %bb
+
+bb: ; preds = %bb1, %bb.nph
+ %x_offs_addr.01 = phi i32 [ %1, %bb1 ], [ %x_offs, %bb.nph ] ; <i32> [#uses=1]
+ %1 = add i32 %x_offs_addr.01, -4 ; <i32> [#uses=3]
+ br label %bb1
+
+bb1: ; preds = %bb
+ %2 = icmp sgt i32 %1, 4 ; <i1> [#uses=1]
+ br i1 %2, label %bb, label %bb1.bb2_crit_edge
+
+bb1.bb2_crit_edge: ; preds = %bb1
+ br label %bb2
+
+bb2: ; preds = %bb1.bb2_crit_edge, %entry
+ %x_offs_addr.0.lcssa = phi i32 [ %1, %bb1.bb2_crit_edge ], [ %x_offs, %entry ] ; <i32> [#uses=1]
+ ret i32 %x_offs_addr.0.lcssa
+}
diff --git a/test/Transforms/LoopStrengthReduce/2009-04-28-no-reduce-mul.ll b/test/Transforms/LoopStrengthReduce/2009-04-28-no-reduce-mul.ll
index 153a181..f873b3d 100644
--- a/test/Transforms/LoopStrengthReduce/2009-04-28-no-reduce-mul.ll
+++ b/test/Transforms/LoopStrengthReduce/2009-04-28-no-reduce-mul.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | opt -loop-reduce | llvm-dis | grep mul | count 3
+; RUN: llvm-as < %s | opt -loop-reduce | llvm-dis | grep {mul.*%lsr.iv} | count 2
; The multiply in bb2 must not be reduced to an add, as the sext causes the
; %1 argument to become negative after a while.
; ModuleID = '<stdin>'