 lib/Transforms/Scalar/LoopStrengthReduce.cpp                                  | 182
 test/CodeGen/X86/lsr-loop-exit-cond.ll                                        | 134
 test/CodeGen/X86/lsr-negative-stride.ll                                       |   2
 test/CodeGen/X86/remat-mov-1.ll (renamed from test/CodeGen/X86/remat-mov0.ll) |   2
 4 files changed, 275 insertions, 45 deletions
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 9568449..127ef56 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -43,6 +43,7 @@ STATISTIC(NumVariable, "Number of PHIs with variable strides");
STATISTIC(NumEliminated, "Number of strides eliminated");
STATISTIC(NumShadow, "Number of Shadow IVs optimized");
STATISTIC(NumImmSunk, "Number of common expr immediates sunk into uses");
+STATISTIC(NumLoopCond, "Number of loop terminating conds optimized");
static cl::opt<bool> EnableFullLSRMode("enable-full-lsr",
cl::init(false),
@@ -122,6 +123,10 @@ namespace {
/// particular stride.
std::map<SCEVHandle, IVsOfOneStride> IVsByStride;
+ /// StrideNoReuse - Keep track of all the strides whose ivs cannot be
+ /// reused (nor should they be rewritten to reuse other strides).
+ SmallSet<SCEVHandle, 4> StrideNoReuse;
+
/// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
/// We use this to iterate over the IVUsesByStride collection without being
/// dependent on random ordering of pointers in the process.
@@ -184,8 +189,8 @@ namespace {
SCEVHandle CheckForIVReuse(bool, bool, bool, const SCEVHandle&,
IVExpr&, const Type*,
const std::vector<BasedUser>& UsersToProcess);
- bool ValidStride(bool, int64_t,
- const std::vector<BasedUser>& UsersToProcess);
+ bool ValidScale(bool, int64_t,
+ const std::vector<BasedUser>& UsersToProcess);
SCEVHandle CollectIVUsers(const SCEVHandle &Stride,
IVUsersOfOneStride &Uses,
Loop *L,
@@ -213,6 +218,7 @@ namespace {
SCEVHandle Stride,
SCEVHandle CommonExprs,
Value *CommonBaseV,
+ Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &PreheaderRewriter);
void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
@@ -799,7 +805,7 @@ static bool fitsInAddressMode(const SCEVHandle &V, const Type *UseTy,
/// MoveLoopVariantsToImmediateField - Move any subexpressions from Val that are
/// loop varying to the Imm operand.
static void MoveLoopVariantsToImmediateField(SCEVHandle &Val, SCEVHandle &Imm,
- Loop *L, ScalarEvolution *SE) {
+ Loop *L, ScalarEvolution *SE) {
if (Val->isLoopInvariant(L)) return; // Nothing to do.
if (const SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
@@ -1122,16 +1128,15 @@ RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses,
return Result;
}
-/// ValidStride - Check whether the given Scale is valid for all loads and
+/// ValidScale - Check whether the given Scale is valid for all loads and
/// stores in UsersToProcess.
///
-bool LoopStrengthReduce::ValidStride(bool HasBaseReg,
- int64_t Scale,
+bool LoopStrengthReduce::ValidScale(bool HasBaseReg, int64_t Scale,
const std::vector<BasedUser>& UsersToProcess) {
if (!TLI)
return true;
- for (unsigned i=0, e = UsersToProcess.size(); i!=e; ++i) {
+ for (unsigned i = 0, e = UsersToProcess.size(); i!=e; ++i) {
// If this is a load or other access, pass the type of the access in.
const Type *AccessTy = Type::VoidTy;
if (isAddressUse(UsersToProcess[i].Inst,
@@ -1186,13 +1191,17 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
const SCEVHandle &Stride,
IVExpr &IV, const Type *Ty,
const std::vector<BasedUser>& UsersToProcess) {
+ if (StrideNoReuse.count(Stride))
+ return SE->getIntegerSCEV(0, Stride->getType());
+
if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) {
int64_t SInt = SC->getValue()->getSExtValue();
for (unsigned NewStride = 0, e = StrideOrder.size(); NewStride != e;
++NewStride) {
std::map<SCEVHandle, IVsOfOneStride>::iterator SI =
IVsByStride.find(StrideOrder[NewStride]);
- if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first))
+ if (SI == IVsByStride.end() || !isa<SCEVConstant>(SI->first) ||
+ StrideNoReuse.count(SI->first))
continue;
int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
if (SI->first != Stride &&
@@ -1206,7 +1215,7 @@ SCEVHandle LoopStrengthReduce::CheckForIVReuse(bool HasBaseReg,
// multiplications.
if (Scale == 1 ||
(AllUsesAreAddresses &&
- ValidStride(HasBaseReg, Scale, UsersToProcess)))
+ ValidScale(HasBaseReg, Scale, UsersToProcess)))
for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
IE = SI->second.IVs.end(); II != IE; ++II)
// FIXME: Only handle base == 0 for now.
@@ -1302,7 +1311,7 @@ SCEVHandle LoopStrengthReduce::CollectIVUsers(const SCEVHandle &Stride,
// field of the use, so that we don't try to use something before it is
// computed.
MoveLoopVariantsToImmediateField(UsersToProcess.back().Base,
- UsersToProcess.back().Imm, L, SE);
+ UsersToProcess.back().Imm, L, SE);
assert(UsersToProcess.back().Base->isLoopInvariant(L) &&
"Base value is not loop invariant!");
}
@@ -1452,6 +1461,7 @@ bool LoopStrengthReduce::ShouldUseFullStrengthReductionMode(
/// Return the created phi node.
///
static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
+ Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &Rewriter) {
assert(Start->isLoopInvariant(L) && "New PHI start is not loop invariant!");
@@ -1475,16 +1485,17 @@ static PHINode *InsertAffinePhi(SCEVHandle Start, SCEVHandle Step,
IncAmount = Rewriter.SE.getNegativeSCEV(Step);
// Insert an add instruction right before the terminator corresponding
- // to the back-edge.
+ // to the back-edge or just before the only use. The location is determined
+ // by the caller and passed in as IVIncInsertPt.
Value *StepV = Rewriter.expandCodeFor(IncAmount, Ty,
Preheader->getTerminator());
Instruction *IncV;
if (isNegative) {
IncV = BinaryOperator::CreateSub(PN, StepV, "lsr.iv.next",
- LatchBlock->getTerminator());
+ IVIncInsertPt);
} else {
IncV = BinaryOperator::CreateAdd(PN, StepV, "lsr.iv.next",
- LatchBlock->getTerminator());
+ IVIncInsertPt);
}
if (!isa<ConstantInt>(StepV)) ++NumVariable;
@@ -1541,6 +1552,7 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
// Rewrite the UsersToProcess records, creating a separate PHI for each
// unique Base value.
+ Instruction *IVIncInsertPt = L->getLoopLatch()->getTerminator();
for (unsigned i = 0, e = UsersToProcess.size(); i != e; ) {
// TODO: The uses are grouped by base, but not sorted. We arbitrarily
// pick the first Imm value here to start with, and adjust it for the
@@ -1548,7 +1560,7 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
SCEVHandle Imm = UsersToProcess[i].Imm;
SCEVHandle Base = UsersToProcess[i].Base;
SCEVHandle Start = SE->getAddExpr(CommonExprs, Base, Imm);
- PHINode *Phi = InsertAffinePhi(Start, Stride, L,
+ PHINode *Phi = InsertAffinePhi(Start, Stride, IVIncInsertPt, L,
PreheaderRewriter);
// Loop over all the users with the same base.
do {
@@ -1561,6 +1573,18 @@ LoopStrengthReduce::PrepareToStrengthReduceFully(
}
}
+/// FindIVIncInsertPt - Return the location to insert the increment instruction.
+/// If the only use is a use of the postinc value (it must be the loop
+/// termination condition), then insert it just before the use.
+static Instruction *FindIVIncInsertPt(std::vector<BasedUser> &UsersToProcess,
+ const Loop *L) {
+ if (UsersToProcess.size() == 1 &&
+ UsersToProcess[0].isUseOfPostIncrementedValue &&
+ L->contains(UsersToProcess[0].Inst->getParent()))
+ return UsersToProcess[0].Inst;
+ return L->getLoopLatch()->getTerminator();
+}
+
/// PrepareToStrengthReduceWithNewPhi - Insert a new induction variable for the
/// given users to share.
///
@@ -1570,12 +1594,13 @@ LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
SCEVHandle Stride,
SCEVHandle CommonExprs,
Value *CommonBaseV,
+ Instruction *IVIncInsertPt,
const Loop *L,
SCEVExpander &PreheaderRewriter) {
DOUT << " Inserting new PHI:\n";
PHINode *Phi = InsertAffinePhi(SE->getUnknown(CommonBaseV),
- Stride, L,
+ Stride, IVIncInsertPt, L,
PreheaderRewriter);
// Remember this in case a later stride is multiple of this.
@@ -1590,8 +1615,8 @@ LoopStrengthReduce::PrepareToStrengthReduceWithNewPhi(
DOUT << "\n";
}
-/// PrepareToStrengthReduceWithNewPhi - Prepare for the given users to reuse
-/// an induction variable with a stride that is a factor of the current
+/// PrepareToStrengthReduceFromSmallerStride - Prepare for the given users to
+/// reuse an induction variable with a stride that is a factor of the current
/// induction variable.
///
void
@@ -1727,6 +1752,7 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
BasicBlock *Preheader = L->getLoopPreheader();
Instruction *PreInsertPt = Preheader->getTerminator();
BasicBlock *LatchBlock = L->getLoopLatch();
+ Instruction *IVIncInsertPt = LatchBlock->getTerminator();
Value *CommonBaseV = Constant::getNullValue(ReplacedTy);
@@ -1755,13 +1781,15 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
AllUsesAreOutsideLoop,
Stride, ReuseIV, ReplacedTy,
UsersToProcess);
- if (isa<SCEVConstant>(RewriteFactor) &&
- cast<SCEVConstant>(RewriteFactor)->isZero())
- PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs,
- CommonBaseV, L, PreheaderRewriter);
- else
+ if (!RewriteFactor->isZero())
PrepareToStrengthReduceFromSmallerStride(UsersToProcess, CommonBaseV,
ReuseIV, PreInsertPt);
+ else {
+ IVIncInsertPt = FindIVIncInsertPt(UsersToProcess, L);
+ PrepareToStrengthReduceWithNewPhi(UsersToProcess, Stride, CommonExprs,
+ CommonBaseV, IVIncInsertPt,
+ L, PreheaderRewriter);
+ }
}
// Process all the users now, replacing their strided uses with
@@ -1800,7 +1828,12 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
// FIXME: Use emitted users to emit other users.
BasedUser &User = UsersToProcess.back();
- DOUT << " Examining use ";
+ DOUT << " Examining ";
+ if (User.isUseOfPostIncrementedValue)
+ DOUT << "postinc";
+ else
+ DOUT << "preinc";
+ DOUT << " use ";
DEBUG(WriteAsOperand(*DOUT, UsersToProcess.back().OperandValToReplace,
/*PrintType=*/false));
DOUT << " in Inst: " << *(User.Inst);
@@ -1810,11 +1843,12 @@ void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
Value *RewriteOp = User.Phi;
if (User.isUseOfPostIncrementedValue) {
RewriteOp = User.Phi->getIncomingValueForBlock(LatchBlock);
-
// If this user is in the loop, make sure it is the last thing in the
- // loop to ensure it is dominated by the increment.
- if (L->contains(User.Inst->getParent()))
- User.Inst->moveBefore(LatchBlock->getTerminator());
+ // loop to ensure it is dominated by the increment. In case it's the
+ // only use of the iv, the increment instruction is already before the
+ // use.
+ if (L->contains(User.Inst->getParent()) && User.Inst != IVIncInsertPt)
+ User.Inst->moveBefore(IVIncInsertPt);
}
SCEVHandle RewriteExpr = SE->getUnknown(RewriteOp);
@@ -2085,7 +2119,7 @@ ICmpInst *LoopStrengthReduce::ChangeCompareStride(Loop *L, ICmpInst *Cond,
// if it's likely the new stride uses will be rewritten using the
// stride of the compare instruction.
if (AllUsesAreAddresses &&
- ValidStride(!CommonExprs->isZero(), Scale, UsersToProcess))
+ ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
continue;
// If scale is negative, use swapped predicate unless it's testing
@@ -2304,8 +2338,8 @@ void LoopStrengthReduce::OptimizeShadowIV(Loop *L) {
if (!DestTy) continue;
if (TLI) {
- /* If target does not support DestTy natively then do not apply
- this transformation. */
+ // If target does not support DestTy natively then do not apply
+ // this transformation.
MVT DVT = TLI->getValueType(DestTy);
if (!TLI->isTypeLegal(DVT)) continue;
}
@@ -2380,8 +2414,6 @@ void LoopStrengthReduce::OptimizeIndvars(Loop *L) {
// TODO: implement optzns here.
OptimizeShadowIV(L);
-
- OptimizeLoopTermCond(L);
}
/// OptimizeLoopTermCond - Change loop terminating condition to use the
@@ -2391,23 +2423,78 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// can, we want to change it to use a post-incremented version of its
// induction variable, to allow coalescing the live ranges for the IV into
// one register value.
- PHINode *SomePHI = cast<PHINode>(L->getHeader()->begin());
- BasicBlock *Preheader = L->getLoopPreheader();
- BasicBlock *LatchBlock =
- SomePHI->getIncomingBlock(SomePHI->getIncomingBlock(0) == Preheader);
- BranchInst *TermBr = dyn_cast<BranchInst>(LatchBlock->getTerminator());
- if (!TermBr || TermBr->isUnconditional() ||
- !isa<ICmpInst>(TermBr->getCondition()))
+ BasicBlock *LatchBlock = L->getLoopLatch();
+ BasicBlock *ExitBlock = L->getExitingBlock();
+ if (!ExitBlock)
+ // Multiple exits, just look at the exit in the latch block if there is one.
+ ExitBlock = LatchBlock;
+ BranchInst *TermBr = dyn_cast<BranchInst>(ExitBlock->getTerminator());
+ if (!TermBr)
+ return;
+ if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition()))
return;
- ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
// Search IVUsesByStride to find Cond's IVUse if there is one.
IVStrideUse *CondUse = 0;
const SCEVHandle *CondStride = 0;
-
+ ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
if (!FindIVUserForCond(Cond, CondUse, CondStride))
return; // setcc doesn't use the IV.
+ if (ExitBlock != LatchBlock) {
+ if (!Cond->hasOneUse())
+ // See below, we don't want the condition to be cloned.
+ return;
+
+ // If exiting block is the latch block, we know it's safe and profitable to
+ // transform the icmp to use post-inc iv. Otherwise do so only if it would
+ // not reuse another iv and its iv would be reused by other uses. We are
+ // optimizing for the case where the icmp is the only use of the iv.
+ IVUsersOfOneStride &StrideUses = IVUsesByStride[*CondStride];
+ for (unsigned i = 0, e = StrideUses.Users.size(); i != e; ++i) {
+ if (StrideUses.Users[i].User == Cond)
+ continue;
+ if (!StrideUses.Users[i].isUseOfPostIncrementedValue)
+ return;
+ }
+
+ // FIXME: This is expensive, and worse still ChangeCompareStride does a
+ // similar check. Can we perform all the icmp related transformations after
+ // StrengthReduceStridedIVUsers?
+ if (const SCEVConstant *SC = dyn_cast<SCEVConstant>(*CondStride)) {
+ int64_t SInt = SC->getValue()->getSExtValue();
+ for (unsigned NewStride = 0, ee = StrideOrder.size(); NewStride != ee;
+ ++NewStride) {
+ std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI =
+ IVUsesByStride.find(StrideOrder[NewStride]);
+ if (!isa<SCEVConstant>(SI->first) || SI->first == *CondStride)
+ continue;
+ int64_t SSInt =
+ cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+ if (SSInt == SInt)
+ return; // This can definitely be reused.
+ if (unsigned(abs(SSInt)) < SInt || (SSInt % SInt) != 0)
+ continue;
+ int64_t Scale = SSInt / SInt;
+ bool AllUsesAreAddresses = true;
+ bool AllUsesAreOutsideLoop = true;
+ std::vector<BasedUser> UsersToProcess;
+ SCEVHandle CommonExprs = CollectIVUsers(SI->first, SI->second, L,
+ AllUsesAreAddresses,
+ AllUsesAreOutsideLoop,
+ UsersToProcess);
+ // Avoid rewriting the compare instruction with an iv of new stride
+ // if it's likely the new stride uses will be rewritten using the
+ // stride of the compare instruction.
+ if (AllUsesAreAddresses &&
+ ValidScale(!CommonExprs->isZero(), Scale, UsersToProcess))
+ return;
+ }
+ }
+
+ StrideNoReuse.insert(*CondStride);
+ }
+
// If the trip count is computed in terms of an smax (due to ScalarEvolution
// being unable to find a sufficient guard, for example), change the loop
// comparison to use SLT instead of NE.
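The ExitBlock != LatchBlock path added above only switches the terminating icmp to the post-incremented IV when that icmp is the sole pre-increment user of its stride and the stride is unlikely to be rewritten in terms of a larger one. A minimal, hypothetical IR sketch of the loop shape being targeted (invented names, not taken from this commit): the exit test lives in the header, the latch does its work through a separate pointer IV, and the compare is the only in-loop use of %i.

; Hypothetical example (not part of this patch): the exiting block is the
; header, not the latch, and the icmp is the only pre-inc use of %i.
define void @header_exit(i8* %p, i64 %n) nounwind {
entry:
  br label %header

header:                                           ; exiting block
  %i = phi i64 [ 0, %entry ], [ %i.next, %latch ]
  %q = phi i8* [ %p, %entry ], [ %q.next, %latch ]
  %done = icmp eq i64 %i, %n
  br i1 %done, label %exit, label %latch

latch:                                            ; loop latch
  store i8 0, i8* %q, align 1
  %q.next = getelementptr i8* %q, i64 1
  %i.next = add i64 %i, 1
  br label %header

exit:
  ret void
}

For a loop of this shape the intent is that FindIVIncInsertPt places the new increment immediately before %done, and the compare is then rewritten against the post-incremented value.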
@@ -2415,7 +2502,8 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// If possible, change stride and operands of the compare instruction to
// eliminate one stride.
- Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
+ if (ExitBlock == LatchBlock)
+ Cond = ChangeCompareStride(L, Cond, CondUse, CondStride);
// It's possible for the setcc instruction to be anywhere in the loop, and
// possible for it to have multiple users. If it is not immediately before
@@ -2431,7 +2519,7 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
// Clone the IVUse, as the old use still exists!
IVUsesByStride[*CondStride].addUser(CondUse->Offset, Cond,
- CondUse->OperandValToReplace);
+ CondUse->OperandValToReplace);
CondUse = &IVUsesByStride[*CondStride].Users.back();
}
}
@@ -2442,6 +2530,8 @@ void LoopStrengthReduce::OptimizeLoopTermCond(Loop *L) {
CondUse->Offset = SE->getMinusSCEV(CondUse->Offset, *CondStride);
CondUse->isUseOfPostIncrementedValue = true;
Changed = true;
+
+ ++NumLoopCond;
}
// OptimizeLoopCountIV - If, after all sharing of IVs, the IV used for deciding
@@ -2582,6 +2672,11 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
// computation of some other indvar to decide when to terminate the loop.
OptimizeIndvars(L);
+ // Change loop terminating condition to use the postinc iv when possible
+ // and optimize loop terminating compare. FIXME: Move this after
+ // StrengthReduceStridedIVUsers?
+ OptimizeLoopTermCond(L);
+
// FIXME: We can shrink overlarge IV's here. e.g. if the code has
// computation in i64 values and the target doesn't support i64, demote
// the computation to 32-bit if safe.
@@ -2616,6 +2711,7 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
IVUsesByStride.clear();
IVsByStride.clear();
StrideOrder.clear();
+ StrideNoReuse.clear();
// Clean up after ourselves
if (!DeadInsts.empty())
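Before the new test: the net effect of the LoopStrengthReduce.cpp changes is that, when the terminating compare is the only such user of its IV, the IV update and the exit test end up adjacent and the compare reads the post-incremented value, allowing the IV's live ranges to be coalesced into one register (the new lsr-loop-exit-cond.ll test below greps for a decq feeding a jne). A minimal, hypothetical sketch of that post-transform shape, with invented names:

; Hypothetical post-transform shape (illustration only, not from the patch):
; the IV update sits right before the exit test and the test uses the
; post-incremented value. Assumes %n >= 1.
define void @zero_bytes(i8* %p, i64 %n) nounwind {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %addr = getelementptr i8* %p, i64 %iv
  store i8 0, i8* %addr, align 1
  %iv.next = add i64 %iv, 1
  %exitcond = icmp eq i64 %iv.next, %n
  br i1 %exitcond, label %exit, label %loop

exit:
  ret void
}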
diff --git a/test/CodeGen/X86/lsr-loop-exit-cond.ll b/test/CodeGen/X86/lsr-loop-exit-cond.ll
new file mode 100644
index 0000000..c998268
--- /dev/null
+++ b/test/CodeGen/X86/lsr-loop-exit-cond.ll
@@ -0,0 +1,134 @@
+; RUN: llvm-as < %s | llc -march=x86-64 | %prcontext decq 1 | grep jne
+
+@Te0 = external global [256 x i32] ; <[256 x i32]*> [#uses=5]
+@Te1 = external global [256 x i32] ; <[256 x i32]*> [#uses=4]
+@Te3 = external global [256 x i32] ; <[256 x i32]*> [#uses=2]
+
+define void @t(i8* nocapture %in, i8* nocapture %out, i32* nocapture %rk, i32 %r) nounwind ssp {
+entry:
+ %0 = load i32* %rk, align 4 ; <i32> [#uses=1]
+ %1 = getelementptr i32* %rk, i64 1 ; <i32*> [#uses=1]
+ %2 = load i32* %1, align 4 ; <i32> [#uses=1]
+ %tmp15 = add i32 %r, -1 ; <i32> [#uses=1]
+ %tmp.16 = zext i32 %tmp15 to i64 ; <i64> [#uses=2]
+ br label %bb
+
+bb: ; preds = %bb1, %entry
+ %indvar = phi i64 [ 0, %entry ], [ %indvar.next, %bb1 ] ; <i64> [#uses=3]
+ %s1.0 = phi i32 [ %2, %entry ], [ %56, %bb1 ] ; <i32> [#uses=2]
+ %s0.0 = phi i32 [ %0, %entry ], [ %43, %bb1 ] ; <i32> [#uses=2]
+ %tmp18 = shl i64 %indvar, 4 ; <i64> [#uses=4]
+ %rk26 = bitcast i32* %rk to i8* ; <i8*> [#uses=6]
+ %3 = lshr i32 %s0.0, 24 ; <i32> [#uses=1]
+ %4 = zext i32 %3 to i64 ; <i64> [#uses=1]
+ %5 = getelementptr [256 x i32]* @Te0, i64 0, i64 %4 ; <i32*> [#uses=1]
+ %6 = load i32* %5, align 4 ; <i32> [#uses=1]
+ %7 = lshr i32 %s1.0, 16 ; <i32> [#uses=1]
+ %8 = and i32 %7, 255 ; <i32> [#uses=1]
+ %9 = zext i32 %8 to i64 ; <i64> [#uses=1]
+ %10 = getelementptr [256 x i32]* @Te1, i64 0, i64 %9 ; <i32*> [#uses=1]
+ %11 = load i32* %10, align 4 ; <i32> [#uses=1]
+ %ctg2.sum2728 = or i64 %tmp18, 8 ; <i64> [#uses=1]
+ %12 = getelementptr i8* %rk26, i64 %ctg2.sum2728 ; <i8*> [#uses=1]
+ %13 = bitcast i8* %12 to i32* ; <i32*> [#uses=1]
+ %14 = load i32* %13, align 4 ; <i32> [#uses=1]
+ %15 = xor i32 %11, %6 ; <i32> [#uses=1]
+ %16 = xor i32 %15, %14 ; <i32> [#uses=3]
+ %17 = lshr i32 %s1.0, 24 ; <i32> [#uses=1]
+ %18 = zext i32 %17 to i64 ; <i64> [#uses=1]
+ %19 = getelementptr [256 x i32]* @Te0, i64 0, i64 %18 ; <i32*> [#uses=1]
+ %20 = load i32* %19, align 4 ; <i32> [#uses=1]
+ %21 = and i32 %s0.0, 255 ; <i32> [#uses=1]
+ %22 = zext i32 %21 to i64 ; <i64> [#uses=1]
+ %23 = getelementptr [256 x i32]* @Te3, i64 0, i64 %22 ; <i32*> [#uses=1]
+ %24 = load i32* %23, align 4 ; <i32> [#uses=1]
+ %ctg2.sum2930 = or i64 %tmp18, 12 ; <i64> [#uses=1]
+ %25 = getelementptr i8* %rk26, i64 %ctg2.sum2930 ; <i8*> [#uses=1]
+ %26 = bitcast i8* %25 to i32* ; <i32*> [#uses=1]
+ %27 = load i32* %26, align 4 ; <i32> [#uses=1]
+ %28 = xor i32 %24, %20 ; <i32> [#uses=1]
+ %29 = xor i32 %28, %27 ; <i32> [#uses=4]
+ %30 = lshr i32 %16, 24 ; <i32> [#uses=1]
+ %31 = zext i32 %30 to i64 ; <i64> [#uses=1]
+ %32 = getelementptr [256 x i32]* @Te0, i64 0, i64 %31 ; <i32*> [#uses=1]
+ %33 = load i32* %32, align 4 ; <i32> [#uses=2]
+ %exitcond = icmp eq i64 %indvar, %tmp.16 ; <i1> [#uses=1]
+ br i1 %exitcond, label %bb2, label %bb1
+
+bb1: ; preds = %bb
+ %ctg2.sum31 = add i64 %tmp18, 16 ; <i64> [#uses=1]
+ %34 = getelementptr i8* %rk26, i64 %ctg2.sum31 ; <i8*> [#uses=1]
+ %35 = bitcast i8* %34 to i32* ; <i32*> [#uses=1]
+ %36 = lshr i32 %29, 16 ; <i32> [#uses=1]
+ %37 = and i32 %36, 255 ; <i32> [#uses=1]
+ %38 = zext i32 %37 to i64 ; <i64> [#uses=1]
+ %39 = getelementptr [256 x i32]* @Te1, i64 0, i64 %38 ; <i32*> [#uses=1]
+ %40 = load i32* %39, align 4 ; <i32> [#uses=1]
+ %41 = load i32* %35, align 4 ; <i32> [#uses=1]
+ %42 = xor i32 %40, %33 ; <i32> [#uses=1]
+ %43 = xor i32 %42, %41 ; <i32> [#uses=1]
+ %44 = lshr i32 %29, 24 ; <i32> [#uses=1]
+ %45 = zext i32 %44 to i64 ; <i64> [#uses=1]
+ %46 = getelementptr [256 x i32]* @Te0, i64 0, i64 %45 ; <i32*> [#uses=1]
+ %47 = load i32* %46, align 4 ; <i32> [#uses=1]
+ %48 = and i32 %16, 255 ; <i32> [#uses=1]
+ %49 = zext i32 %48 to i64 ; <i64> [#uses=1]
+ %50 = getelementptr [256 x i32]* @Te3, i64 0, i64 %49 ; <i32*> [#uses=1]
+ %51 = load i32* %50, align 4 ; <i32> [#uses=1]
+ %ctg2.sum32 = add i64 %tmp18, 20 ; <i64> [#uses=1]
+ %52 = getelementptr i8* %rk26, i64 %ctg2.sum32 ; <i8*> [#uses=1]
+ %53 = bitcast i8* %52 to i32* ; <i32*> [#uses=1]
+ %54 = load i32* %53, align 4 ; <i32> [#uses=1]
+ %55 = xor i32 %51, %47 ; <i32> [#uses=1]
+ %56 = xor i32 %55, %54 ; <i32> [#uses=1]
+ %indvar.next = add i64 %indvar, 1 ; <i64> [#uses=1]
+ br label %bb
+
+bb2: ; preds = %bb
+ %tmp10 = shl i64 %tmp.16, 4 ; <i64> [#uses=2]
+ %ctg2.sum = add i64 %tmp10, 16 ; <i64> [#uses=1]
+ %tmp1213 = getelementptr i8* %rk26, i64 %ctg2.sum ; <i8*> [#uses=1]
+ %57 = bitcast i8* %tmp1213 to i32* ; <i32*> [#uses=1]
+ %58 = and i32 %33, -16777216 ; <i32> [#uses=1]
+ %59 = lshr i32 %29, 16 ; <i32> [#uses=1]
+ %60 = and i32 %59, 255 ; <i32> [#uses=1]
+ %61 = zext i32 %60 to i64 ; <i64> [#uses=1]
+ %62 = getelementptr [256 x i32]* @Te1, i64 0, i64 %61 ; <i32*> [#uses=1]
+ %63 = load i32* %62, align 4 ; <i32> [#uses=1]
+ %64 = and i32 %63, 16711680 ; <i32> [#uses=1]
+ %65 = or i32 %64, %58 ; <i32> [#uses=1]
+ %66 = load i32* %57, align 4 ; <i32> [#uses=1]
+ %67 = xor i32 %65, %66 ; <i32> [#uses=2]
+ %68 = lshr i32 %29, 8 ; <i32> [#uses=1]
+ %69 = zext i32 %68 to i64 ; <i64> [#uses=1]
+ %70 = getelementptr [256 x i32]* @Te0, i64 0, i64 %69 ; <i32*> [#uses=1]
+ %71 = load i32* %70, align 4 ; <i32> [#uses=1]
+ %72 = and i32 %71, -16777216 ; <i32> [#uses=1]
+ %73 = and i32 %16, 255 ; <i32> [#uses=1]
+ %74 = zext i32 %73 to i64 ; <i64> [#uses=1]
+ %75 = getelementptr [256 x i32]* @Te1, i64 0, i64 %74 ; <i32*> [#uses=1]
+ %76 = load i32* %75, align 4 ; <i32> [#uses=1]
+ %77 = and i32 %76, 16711680 ; <i32> [#uses=1]
+ %78 = or i32 %77, %72 ; <i32> [#uses=1]
+ %ctg2.sum25 = add i64 %tmp10, 20 ; <i64> [#uses=1]
+ %79 = getelementptr i8* %rk26, i64 %ctg2.sum25 ; <i8*> [#uses=1]
+ %80 = bitcast i8* %79 to i32* ; <i32*> [#uses=1]
+ %81 = load i32* %80, align 4 ; <i32> [#uses=1]
+ %82 = xor i32 %78, %81 ; <i32> [#uses=2]
+ %83 = lshr i32 %67, 24 ; <i32> [#uses=1]
+ %84 = trunc i32 %83 to i8 ; <i8> [#uses=1]
+ store i8 %84, i8* %out, align 1
+ %85 = lshr i32 %67, 16 ; <i32> [#uses=1]
+ %86 = trunc i32 %85 to i8 ; <i8> [#uses=1]
+ %87 = getelementptr i8* %out, i64 1 ; <i8*> [#uses=1]
+ store i8 %86, i8* %87, align 1
+ %88 = getelementptr i8* %out, i64 4 ; <i8*> [#uses=1]
+ %89 = lshr i32 %82, 24 ; <i32> [#uses=1]
+ %90 = trunc i32 %89 to i8 ; <i8> [#uses=1]
+ store i8 %90, i8* %88, align 1
+ %91 = lshr i32 %82, 16 ; <i32> [#uses=1]
+ %92 = trunc i32 %91 to i8 ; <i8> [#uses=1]
+ %93 = getelementptr i8* %out, i64 5 ; <i8*> [#uses=1]
+ store i8 %92, i8* %93, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/lsr-negative-stride.ll b/test/CodeGen/X86/lsr-negative-stride.ll
index 43b507b..28d041f 100644
--- a/test/CodeGen/X86/lsr-negative-stride.ll
+++ b/test/CodeGen/X86/lsr-negative-stride.ll
@@ -16,7 +16,7 @@
;}
-define i32 @t(i32 %a, i32 %b) {
+define i32 @t(i32 %a, i32 %b) nounwind {
entry:
%tmp1434 = icmp eq i32 %a, %b ; <i1> [#uses=1]
br i1 %tmp1434, label %bb17, label %bb.outer
diff --git a/test/CodeGen/X86/remat-mov0.ll b/test/CodeGen/X86/remat-mov-1.ll
index 360628c..98b7bb4 100644
--- a/test/CodeGen/X86/remat-mov0.ll
+++ b/test/CodeGen/X86/remat-mov-1.ll
@@ -1,4 +1,4 @@
-; RUN: llvm-as < %s | llc -march=x86 | grep xor | count 2
+; RUN: llvm-as < %s | llc -march=x86 | grep 4294967295 | grep mov | count 2
%struct.FILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
%struct.ImgT = type { i8, i8*, i8*, %struct.FILE*, i32, i32, i32, i32, i8*, double*, float*, float*, float*, i32*, double, double, i32*, double*, i32*, i32* }