author    | Dan Gohman <gohman@apple.com> | 2011-05-03 00:46:49 +0000
committer | Dan Gohman <gohman@apple.com> | 2011-05-03 00:46:49 +0000
commit    | cca82149adef8306a295abdc963213ae3b11bbb6
tree      | 2d38fc521b67e22982d8a8535c98bafe4cdbd709
parent    | f7710af4ba78aa7a0cc9c226f334d8f2b6ab31bf
Add an unfolded offset field to LSR's Formula record. This is used to
model constants which can be added to base registers via add-immediate
instructions which don't require an additional register to materialize
the immediate.
git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@130743 91177308-0d34-0410-b5e6-96231b3b80d8
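
To make the intent of the new field concrete, here is a minimal, self-contained C++ sketch of the trade-off the patch models; it is not LLVM code. The `targetIsLegalAddImmediate` helper and the `FormulaSketch` type are hypothetical stand-ins for the new `TargetLowering::isLegalAddImmediate` hook and LSR's `Formula` record, and the 0..255 immediate range is an arbitrary assumption for the demo, not ARM's actual SOImm rule.

```c++
#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for TargetLowering::isLegalAddImmediate(). A real
// target answers this from its encoding rules; here we simply accept small
// non-negative immediates for demonstration purposes.
static bool targetIsLegalAddImmediate(int64_t Imm) {
  return Imm >= 0 && Imm <= 255;
}

// Simplified model of LSR's Formula: a set of base registers plus the
// UnfoldedOffset field this patch introduces.
struct FormulaSketch {
  std::vector<const char *> BaseRegs; // names stand in for SCEV base registers
  int64_t UnfoldedOffset = 0;
};

// Sketch of the choice made when reassociating an add: a constant term either
// folds into UnfoldedOffset (usable by an add-immediate, no extra register to
// materialize it) or has to become one more base register.
static void addConstantTerm(FormulaSketch &F, int64_t C) {
  if (targetIsLegalAddImmediate(F.UnfoldedOffset + C))
    F.UnfoldedOffset += C;       // lives in the add-immediate field
  else
    F.BaseRegs.push_back("imm"); // must be materialized into a register
}

int main() {
  FormulaSketch F;
  F.BaseRegs.push_back("%base");
  addConstantTerm(F, 16);      // foldable: UnfoldedOffset becomes 16
  addConstantTerm(F, 1 << 20); // too large for this demo: needs a register
  std::cout << "base regs: " << F.BaseRegs.size()
            << ", unfolded offset: " << F.UnfoldedOffset << "\n";
  return 0;
}
```

The parallel in the patch: `Cost::RateFormula` now counts the unfolded offset as one more addend inside the loop (`NumBaseParts`), while `GenerateReassociations` only folds a constant when the target accepts the combined immediate.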
-rw-r--r-- | include/llvm/Target/TargetLowering.h | 8
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.cpp | 8
-rw-r--r-- | lib/Target/ARM/ARMISelLowering.h | 6
-rw-r--r-- | lib/Transforms/Scalar/LoopStrengthReduce.cpp | 71
-rw-r--r-- | test/CodeGen/ARM/lsr-unfolded-offset.ll | 80
5 files changed, 164 insertions, 9 deletions
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 17d761c..15db9c5 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -1583,6 +1583,14 @@ public:
     return true;
   }
 
+  /// isLegalAddImmediate - Return true if the specified immediate is legal
+  /// add immediate, that is the target has add instructions which can add
+  /// a register with the immediate without having to materialize the
+  /// immediate into a register.
+  virtual bool isLegalAddImmediate(int64_t Imm) const {
+    return true;
+  }
+
   //===--------------------------------------------------------------------===//
   // Div utility functions
   //
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 0a31b87..8533b6d 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -6984,6 +6984,14 @@ bool ARMTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
   return Imm >= 0 && Imm <= 255;
 }
 
+/// isLegalAddImmediate - Return true if the specified immediate is legal
+/// add immediate, that is the target has add instructions which can add
+/// a register with the immediate without having to materialize the
+/// immediate into a register.
+bool ARMTargetLowering::isLegalAddImmediate(int64_t Imm) const {
+  return ARM_AM::getSOImmVal(Imm) != -1;
+}
+
 static bool getARMIndexedAddressParts(SDNode *Ptr, EVT VT,
                                       bool isSEXTLoad, SDValue &Base,
                                       SDValue &Offset, bool &isInc,
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index a2e6260..82d167e 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -264,6 +264,12 @@ namespace llvm {
     /// the immediate into a register.
     virtual bool isLegalICmpImmediate(int64_t Imm) const;
 
+    /// isLegalAddImmediate - Return true if the specified immediate is legal
+    /// add immediate, that is the target has add instructions which can
+    /// add a register and the immediate without having to materialize
+    /// the immediate into a register.
+    virtual bool isLegalAddImmediate(int64_t Imm) const;
+
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
     /// can be legally represented as pre-indexed load / store address.
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 5abc790..0d617bf 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -209,7 +209,12 @@ struct Formula {
   /// when AM.Scale is not zero.
   const SCEV *ScaledReg;
 
-  Formula() : ScaledReg(0) {}
+  /// UnfoldedOffset - An additional constant offset which added near the
+  /// use. This requires a temporary register, but the offset itself can
+  /// live in an add immediate field rather than a register.
+  int64_t UnfoldedOffset;
+
+  Formula() : ScaledReg(0), UnfoldedOffset(0) {}
 
   void InitialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
 
@@ -379,6 +384,10 @@ void Formula::print(raw_ostream &OS) const {
       OS << "<unknown>";
     OS << ')';
   }
+  if (UnfoldedOffset != 0) {
+    if (!First) OS << " + "; else First = false;
+    OS << "imm(" << UnfoldedOffset << ')';
+  }
 }
 
 void Formula::dump() const {
@@ -771,8 +780,10 @@ void Cost::RateFormula(const Formula &F,
     RatePrimaryRegister(BaseReg, Regs, L, SE, DT);
   }
 
-  if (F.BaseRegs.size() > 1)
-    NumBaseAdds += F.BaseRegs.size() - 1;
+  // Determine how many (unfolded) adds we'll need inside the loop.
+  size_t NumBaseParts = F.BaseRegs.size() + (F.UnfoldedOffset != 0);
+  if (NumBaseParts > 1)
+    NumBaseAdds += NumBaseParts - 1;
 
   // Tally up the non-zero immediates.
   for (SmallVectorImpl<int64_t>::const_iterator I = Offsets.begin(),
@@ -1945,7 +1956,8 @@ LSRInstance::FindUseWithSimilarFormula(const Formula &OrigF,
     if (F.BaseRegs == OrigF.BaseRegs &&
         F.ScaledReg == OrigF.ScaledReg &&
         F.AM.BaseGV == OrigF.AM.BaseGV &&
-        F.AM.Scale == OrigF.AM.Scale) {
+        F.AM.Scale == OrigF.AM.Scale &&
+        F.UnfoldedOffset == OrigF.UnfoldedOffset) {
       if (F.AM.BaseOffs == 0)
         return &LU;
       // This is the formula where all the registers and symbols matched;
@@ -2313,8 +2325,29 @@ void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
       if (InnerSum->isZero())
         continue;
       Formula F = Base;
-      F.BaseRegs[i] = InnerSum;
-      F.BaseRegs.push_back(*J);
+
+      // Add the remaining pieces of the add back into the new formula.
+      const SCEVConstant *InnerSumSC = dyn_cast<SCEVConstant>(InnerSum);
+      if (TLI && InnerSumSC &&
+          SE.getTypeSizeInBits(InnerSumSC->getType()) <= 64 &&
+          TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                   InnerSumSC->getValue()->getZExtValue())) {
+        F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+                           InnerSumSC->getValue()->getZExtValue();
+        F.BaseRegs.erase(F.BaseRegs.begin() + i);
+      } else
+        F.BaseRegs[i] = InnerSum;
+
+      // Add J as its own register, or an unfolded immediate.
+      const SCEVConstant *SC = dyn_cast<SCEVConstant>(*J);
+      if (TLI && SC && SE.getTypeSizeInBits(SC->getType()) <= 64 &&
+          TLI->isLegalAddImmediate((uint64_t)F.UnfoldedOffset +
+                                   SC->getValue()->getZExtValue()))
+        F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset +
+                           SC->getValue()->getZExtValue();
+      else
+        F.BaseRegs.push_back(*J);
+
       if (InsertFormula(LU, LUIdx, F))
         // If that formula hadn't been seen before, recurse to find more like
         // it.
@@ -2482,6 +2515,13 @@ void LSRInstance::GenerateICmpZeroScales(LSRUse &LU, unsigned LUIdx,
         continue;
     }
 
+    // Check that multiplying with the unfolded offset doesn't overflow.
+    if (F.UnfoldedOffset != 0) {
+      F.UnfoldedOffset = (uint64_t)F.UnfoldedOffset * Factor;
+      if (F.UnfoldedOffset / Factor != Base.UnfoldedOffset)
+        continue;
+    }
+
     // If we make it here and it's legal, add it.
     (void)InsertFormula(LU, LUIdx, F);
   next:;
@@ -2664,7 +2704,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
     // other orig regs.
     ImmMapTy::const_iterator OtherImms[] = {
       Imms.begin(), prior(Imms.end()),
-      Imms.upper_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
+      Imms.lower_bound((Imms.begin()->first + prior(Imms.end())->first) / 2)
     };
     for (size_t i = 0, e = array_lengthof(OtherImms); i != e; ++i) {
       ImmMapTy::const_iterator M = OtherImms[i];
@@ -2738,8 +2778,13 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
         Formula NewF = F;
         NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm;
         if (!isLegalUse(NewF.AM, LU.MinOffset, LU.MaxOffset,
-                        LU.Kind, LU.AccessTy, TLI))
-          continue;
+                        LU.Kind, LU.AccessTy, TLI)) {
+          if (!TLI ||
+              !TLI->isLegalAddImmediate((uint64_t)NewF.UnfoldedOffset + Imm))
+            continue;
+          NewF = F;
+          NewF.UnfoldedOffset = (uint64_t)NewF.UnfoldedOffset + Imm;
+        }
         NewF.BaseRegs[N] = SE.getAddExpr(NegImmS, BaseReg);
 
         // If the new formula has a constant in a register, and adding the
@@ -3488,6 +3533,14 @@ Value *LSRInstance::Expand(const LSRFixup &LF,
     }
   }
 
+  // Expand the unfolded offset portion.
+  int64_t UnfoldedOffset = F.UnfoldedOffset;
+  if (UnfoldedOffset != 0) {
+    // Just add the immediate values.
+    Ops.push_back(SE.getUnknown(ConstantInt::getSigned(IntTy,
+                                                       UnfoldedOffset)));
+  }
+
   // Emit instructions summing all the operands.
   const SCEV *FullS = Ops.empty() ?
                       SE.getConstant(IntTy, 0) :
diff --git a/test/CodeGen/ARM/lsr-unfolded-offset.ll b/test/CodeGen/ARM/lsr-unfolded-offset.ll
new file mode 100644
index 0000000..a3342c6
--- /dev/null
+++ b/test/CodeGen/ARM/lsr-unfolded-offset.ll
@@ -0,0 +1,80 @@
+; RUN: llc < %s | FileCheck %s
+
+; LSR shouldn't introduce more induction variables than needed, increasing
+; register pressure and therefore spilling. There is more room for improvement
+; here.
+
+; CHECK: sub sp, #32
+
+; CHECK: ldr r{{.*}}, [sp, #4]
+; CHECK-NEXT: ldr r{{.*}}, [sp, #16]
+; CHECK-NEXT: ldr r{{.*}}, [sp, #12]
+; CHECK-NEXT: adds
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
+target triple = "thumbv7-apple-macosx10.7.0"
+
+%struct.partition_entry = type { i32, i32, i64, i64 }
+
+define i32 @partition_overlap_check(%struct.partition_entry* nocapture %part, i32 %num_entries) nounwind readonly optsize ssp {
+entry:
+  %cmp79 = icmp sgt i32 %num_entries, 0
+  br i1 %cmp79, label %outer.loop, label %for.end72
+
+outer.loop:                                       ; preds = %for.inc69, %entry
+  %overlap.081 = phi i32 [ %overlap.4, %for.inc69 ], [ 0, %entry ]
+  %0 = phi i32 [ %inc71, %for.inc69 ], [ 0, %entry ]
+  %offset = getelementptr %struct.partition_entry* %part, i32 %0, i32 2
+  %len = getelementptr %struct.partition_entry* %part, i32 %0, i32 3
+  %tmp5 = load i64* %offset, align 4, !tbaa !0
+  %tmp15 = load i64* %len, align 4, !tbaa !0
+  %add = add nsw i64 %tmp15, %tmp5
+  br label %inner.loop
+
+inner.loop:                                       ; preds = %for.inc, %outer.loop
+  %overlap.178 = phi i32 [ %overlap.081, %outer.loop ], [ %overlap.4, %for.inc ]
+  %1 = phi i32 [ 0, %outer.loop ], [ %inc, %for.inc ]
+  %cmp23 = icmp eq i32 %0, %1
+  br i1 %cmp23, label %for.inc, label %if.end
+
+if.end:                                           ; preds = %inner.loop
+  %len39 = getelementptr %struct.partition_entry* %part, i32 %1, i32 3
+  %offset28 = getelementptr %struct.partition_entry* %part, i32 %1, i32 2
+  %tmp29 = load i64* %offset28, align 4, !tbaa !0
+  %tmp40 = load i64* %len39, align 4, !tbaa !0
+  %add41 = add nsw i64 %tmp40, %tmp29
+  %cmp44 = icmp sge i64 %tmp29, %tmp5
+  %cmp47 = icmp slt i64 %tmp29, %add
+  %or.cond = and i1 %cmp44, %cmp47
+  %overlap.2 = select i1 %or.cond, i32 1, i32 %overlap.178
+  %cmp52 = icmp sle i64 %add41, %add
+  %cmp56 = icmp sgt i64 %add41, %tmp5
+  %or.cond74 = and i1 %cmp52, %cmp56
+  %overlap.3 = select i1 %or.cond74, i32 1, i32 %overlap.2
+  %cmp61 = icmp sgt i64 %tmp29, %tmp5
+  %cmp65 = icmp slt i64 %add41, %add
+  %or.cond75 = or i1 %cmp61, %cmp65
+  br i1 %or.cond75, label %for.inc, label %if.then66
+
+if.then66:                                        ; preds = %if.end
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end, %if.then66, %inner.loop
+  %overlap.4 = phi i32 [ %overlap.178, %inner.loop ], [ 1, %if.then66 ], [ %overlap.3, %if.end ]
+  %inc = add nsw i32 %1, 1
+  %exitcond = icmp eq i32 %inc, %num_entries
+  br i1 %exitcond, label %for.inc69, label %inner.loop
+
+for.inc69:                                        ; preds = %for.inc
+  %inc71 = add nsw i32 %0, 1
+  %exitcond83 = icmp eq i32 %inc71, %num_entries
+  br i1 %exitcond83, label %for.end72, label %outer.loop
+
+for.end72:                                        ; preds = %for.inc69, %entry
+  %overlap.0.lcssa = phi i32 [ 0, %entry ], [ %overlap.4, %for.inc69 ]
+  ret i32 %overlap.0.lcssa
+}
+
+!0 = metadata !{metadata !"long long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}