author     Richard Sandiford <rsandifo@linux.vnet.ibm.com>   2013-08-05 11:23:46 +0000
committer  Richard Sandiford <rsandifo@linux.vnet.ibm.com>   2013-08-05 11:23:46 +0000
commit     93795574785de252703591e7fcc8f052c762f25e (patch)
tree       de693d743c5334444b688797de354cdc279bbdbe
parent     f8e16c6f5a3a0d2cc6f7ae6dae0a8f55a89cfb2f (diff)
[SystemZ] Use BRCT and BRCTG to eliminate add-&-compare sequences
This patch just uses a peephole test for "add; compare; branch" sequences within a single block. The IR optimizers already convert loops to decrement-and-branch-on-nonzero form in some cases, so even this simplistic test triggers many times during a clang bootstrap and projects/test-suite run.

It looks like there are still cases where we need to more strongly prefer branches on nonzero though. E.g. I saw a case where a loop that started out with a check for 0 ended up with a check for -1. I'll try to look at that sometime.

I ended up adding the Reference class because MachineInstr::readsRegister() doesn't check for subregisters (by design, as far as I could tell).

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187723 91177308-0d34-0410-b5e6-96231b3b80d8
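As a rough, self-contained illustration of the rewrite (not the pass itself, which works on MachineInstrs through optimizeCompareZero), the sketch below matches an adjacent "add -1; compare with 0; branch on not-equal" triple and fuses it into a single branch-on-count record. The Insn struct and the adjacency requirement are simplifications invented for the example; the real peephole tolerates unrelated instructions in between and tracks references to CC and to the counter register.

#include <iostream>
#include <string>
#include <vector>

// Hypothetical instruction records for the sketch; not LLVM's MachineInstr.
struct Insn {
  std::string op;      // "AHI", "CHI", "JLH", "BRCT", ...
  int reg = -1;        // register operand, if any
  long imm = 0;        // immediate operand, if any
  std::string target;  // branch target, if any
};

// Rewrite "AHI r,-1; CHI r,0; JLH label" at position I into "BRCT r,label".
// Returns true if the rewrite applied.
bool fuseToBRCT(std::vector<Insn> &Block, size_t I) {
  if (I + 2 >= Block.size())
    return false;
  const Insn &Add = Block[I], &Cmp = Block[I + 1], &Br = Block[I + 2];
  if (Add.op != "AHI" || Add.imm != -1)                        // addition of -1?
    return false;
  if (Cmp.op != "CHI" || Cmp.reg != Add.reg || Cmp.imm != 0)   // compare with zero?
    return false;
  if (Br.op != "JLH")                                          // branch on "not equal"?
    return false;
  Insn BRCT{"BRCT", Add.reg, 0, Br.target};   // decrement and branch if nonzero
  Block.erase(Block.begin() + I, Block.begin() + I + 3);
  Block.insert(Block.begin() + I, BRCT);
  return true;
}

int main() {
  std::vector<Insn> Block = {{"AHI", 1, -1, ""},
                             {"CHI", 1, 0, ""},
                             {"JLH", -1, 0, "loop"}};
  fuseToBRCT(Block, 0);
  for (const Insn &I : Block)
    std::cout << I.op << ' ' << I.target << '\n';   // prints "BRCT loop"
}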
-rw-r--r--  lib/Target/SystemZ/SystemZElimCompare.cpp        147
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.cpp            8
-rw-r--r--  lib/Target/SystemZ/SystemZInstrInfo.h             10
-rw-r--r--  lib/Target/SystemZ/SystemZLongBranch.cpp          31
-rw-r--r--  lib/Target/SystemZ/SystemZTargetMachine.cpp        3
-rw-r--r--  test/CodeGen/SystemZ/Large/branch-range-07.py     68
-rw-r--r--  test/CodeGen/SystemZ/Large/branch-range-08.py     69
-rw-r--r--  test/CodeGen/SystemZ/loop-01.ll                  101
8 files changed, 419 insertions(+), 18 deletions(-)
diff --git a/lib/Target/SystemZ/SystemZElimCompare.cpp b/lib/Target/SystemZ/SystemZElimCompare.cpp
index bcdc5b7..07afc86 100644
--- a/lib/Target/SystemZ/SystemZElimCompare.cpp
+++ b/lib/Target/SystemZ/SystemZElimCompare.cpp
@@ -28,10 +28,38 @@
using namespace llvm;
+STATISTIC(BranchOnCounts, "Number of branch-on-count instructions");
STATISTIC(EliminatedComparisons, "Number of eliminated comparisons");
STATISTIC(FusedComparisons, "Number of fused compare-and-branch instructions");
namespace {
+ // Represents the references to a particular register in one or more
+ // instructions.
+ struct Reference {
+ Reference()
+ : Def(false), Use(false), IndirectDef(false), IndirectUse(false) {}
+
+ Reference &operator|=(const Reference &Other) {
+ Def |= Other.Def;
+ IndirectDef |= Other.IndirectDef;
+ Use |= Other.Use;
+ IndirectUse |= Other.IndirectUse;
+ return *this;
+ }
+
+ operator bool() const { return Def || Use; }
+
+ // True if the register is defined or used in some form, either directly or
+ // via a sub- or super-register.
+ bool Def;
+ bool Use;
+
+ // True if the register is defined or used indirectly, by a sub- or
+ // super-register.
+ bool IndirectDef;
+ bool IndirectUse;
+ };
+
class SystemZElimCompare : public MachineFunctionPass {
public:
static char ID;
@@ -46,6 +74,9 @@ namespace {
bool runOnMachineFunction(MachineFunction &F);
private:
+ Reference getRegReferences(MachineInstr *MI, unsigned Reg);
+ bool convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers);
bool convertToLoadAndTest(MachineInstr *MI);
bool adjustCCMasksForInstr(MachineInstr *MI, MachineInstr *Compare,
SmallVectorImpl<MachineInstr *> &CCUsers);
@@ -99,6 +130,80 @@ static bool resultTests(MachineInstr *MI, unsigned Reg, unsigned SubReg) {
return false;
}
+// Describe the references to Reg in MI, including sub- and super-registers.
+Reference SystemZElimCompare::getRegReferences(MachineInstr *MI, unsigned Reg) {
+ Reference Ref;
+ for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
+ const MachineOperand &MO = MI->getOperand(I);
+ if (MO.isReg()) {
+ if (unsigned MOReg = MO.getReg()) {
+ if (MOReg == Reg || TRI->regsOverlap(MOReg, Reg)) {
+ if (MO.isUse()) {
+ Ref.Use = true;
+ Ref.IndirectUse |= (MOReg != Reg);
+ }
+ if (MO.isDef()) {
+ Ref.Def = true;
+ Ref.IndirectDef |= (MOReg != Reg);
+ }
+ }
+ }
+ }
+ }
+ return Ref;
+}
+
+// Compare compares the result of MI against zero. If MI is an addition
+// of -1 and if CCUsers is a single branch on nonzero, eliminate the addition
+// and convert the branch to a BRCT(G). Return true on success.
+bool
+SystemZElimCompare::convertToBRCT(MachineInstr *MI, MachineInstr *Compare,
+ SmallVectorImpl<MachineInstr *> &CCUsers) {
+ // Check whether we have an addition of -1.
+ unsigned Opcode = MI->getOpcode();
+ unsigned BRCT;
+ if (Opcode == SystemZ::AHI)
+ BRCT = SystemZ::BRCT;
+ else if (Opcode == SystemZ::AGHI)
+ BRCT = SystemZ::BRCTG;
+ else
+ return false;
+ if (MI->getOperand(2).getImm() != -1)
+ return false;
+
+ // Check whether we have a single JLH.
+ if (CCUsers.size() != 1)
+ return false;
+ MachineInstr *Branch = CCUsers[0];
+ if (Branch->getOpcode() != SystemZ::BRC ||
+ Branch->getOperand(0).getImm() != SystemZ::CCMASK_ICMP ||
+ Branch->getOperand(1).getImm() != SystemZ::CCMASK_CMP_NE)
+ return false;
+
+ // We already know that there are no references to the register between
+ // MI and Compare. Make sure that there are also no references between
+ // Compare and Branch.
+ unsigned SrcReg = Compare->getOperand(0).getReg();
+ MachineBasicBlock::iterator MBBI = Compare, MBBE = Branch;
+ for (++MBBI; MBBI != MBBE; ++MBBI)
+ if (getRegReferences(MBBI, SrcReg))
+ return false;
+
+ // The transformation is OK. Rebuild Branch as a BRCT(G).
+ MachineOperand Target(Branch->getOperand(2));
+ Branch->RemoveOperand(2);
+ Branch->RemoveOperand(1);
+ Branch->RemoveOperand(0);
+ Branch->setDesc(TII->get(BRCT));
+ MachineInstrBuilder(*Branch->getParent()->getParent(), Branch)
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1))
+ .addOperand(Target)
+ .addReg(SystemZ::CC, RegState::ImplicitDefine);
+ MI->removeFromParent();
+ return true;
+}
+
// If MI is a load instruction, try to convert it into a LOAD AND TEST.
// Return true on success.
bool SystemZElimCompare::convertToLoadAndTest(MachineInstr *MI) {
@@ -210,21 +315,32 @@ optimizeCompareZero(MachineInstr *Compare,
unsigned SrcSubReg = Compare->getOperand(0).getSubReg();
MachineBasicBlock *MBB = Compare->getParent();
MachineBasicBlock::iterator MBBI = Compare, MBBE = MBB->begin();
- bool SeenUseOfCC = false;
+ Reference CCRefs;
+ Reference SrcRefs;
while (MBBI != MBBE) {
--MBBI;
MachineInstr *MI = MBBI;
- if (resultTests(MI, SrcReg, SrcSubReg) &&
- ((!SeenUseOfCC && convertToLoadAndTest(MI)) ||
- adjustCCMasksForInstr(MI, Compare, CCUsers))) {
- EliminatedComparisons += 1;
- return true;
+ if (resultTests(MI, SrcReg, SrcSubReg)) {
+ // Try to remove both MI and Compare by converting a branch to BRCT(G).
+ // We don't care in this case whether CC is modified between MI and
+ // Compare.
+ if (!CCRefs.Use && !SrcRefs && convertToBRCT(MI, Compare, CCUsers)) {
+ BranchOnCounts += 1;
+ return true;
+ }
+ // Try to eliminate Compare by reusing a CC result from MI.
+ if ((!CCRefs && convertToLoadAndTest(MI)) ||
+ (!CCRefs.Def && adjustCCMasksForInstr(MI, Compare, CCUsers))) {
+ EliminatedComparisons += 1;
+ return true;
+ }
}
- if (MI->modifiesRegister(SrcReg, TRI) ||
- MI->modifiesRegister(SystemZ::CC, TRI))
+ SrcRefs |= getRegReferences(MI, SrcReg);
+ if (SrcRefs.Def)
+ return false;
+ CCRefs |= getRegReferences(MI, SystemZ::CC);
+ if (CCRefs.Use && CCRefs.Def)
return false;
- if (MI->readsRegister(SystemZ::CC, TRI))
- SeenUseOfCC = true;
}
return false;
}
@@ -316,13 +432,12 @@ bool SystemZElimCompare::processBlock(MachineBasicBlock *MBB) {
continue;
}
- if (MI->definesRegister(SystemZ::CC, TRI)) {
+ Reference CCRefs(getRegReferences(MI, SystemZ::CC));
+ if (CCRefs.Def) {
CCUsers.clear();
- CompleteCCUsers = true;
- } else if (MI->modifiesRegister(SystemZ::CC, TRI))
- CompleteCCUsers = false;
-
- if (CompleteCCUsers && MI->readsRegister(SystemZ::CC, TRI))
+ CompleteCCUsers = !CCRefs.IndirectDef;
+ }
+ if (CompleteCCUsers && CCRefs.Use)
CCUsers.push_back(MI);
}
return Changed;
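For reference, here is a standalone model of the Reference bookkeeping added above, with plain integers standing in for registers and a made-up regsOverlap relation in place of TargetRegisterInfo. It shows the subregister case that, as the commit message notes, MachineInstr::readsRegister() does not report; the same situation is exercised by f4 in loop-01.ll further down.

#include <iostream>
#include <vector>

// Simplified stand-in for the Reference struct added to SystemZElimCompare.cpp.
struct Reference {
  bool Def = false, Use = false;
  bool IndirectDef = false, IndirectUse = false;
  Reference &operator|=(const Reference &Other) {
    Def |= Other.Def;
    Use |= Other.Use;
    IndirectDef |= Other.IndirectDef;
    IndirectUse |= Other.IndirectUse;
    return *this;
  }
  operator bool() const { return Def || Use; }
};

// Hypothetical operand/instruction records, and a toy overlap relation in
// which register 2k is a 64-bit register and 2k+1 is its 32-bit low half.
struct Operand { int Reg; bool IsDef; };
struct Insn { std::vector<Operand> Ops; };
bool regsOverlap(int A, int B) { return A / 2 == B / 2; }

// Mirrors getRegReferences: record direct references as well as references
// made through an overlapping (sub- or super-) register.
Reference getRegReferences(const Insn &MI, int Reg) {
  Reference Ref;
  for (const Operand &MO : MI.Ops) {
    if (!regsOverlap(MO.Reg, Reg))
      continue;
    if (MO.IsDef) {
      Ref.Def = true;
      Ref.IndirectDef |= MO.Reg != Reg;
    } else {
      Ref.Use = true;
      Ref.IndirectUse |= MO.Reg != Reg;
    }
  }
  return Ref;
}

int main() {
  // Register 4 is a 64-bit counter; register 5 is its 32-bit low half.
  Insn Store{{{5, false}}};   // an instruction that uses only the low half
  Reference R = getRegReferences(Store, 4);
  if (R)                      // any def or use, direct or indirect?
    std::cout << "use=" << R.Use << " indirect use=" << R.IndirectUse << '\n';
  // Prints "use=1 indirect use=1", so a backward scan that ORs these
  // references together (as optimizeCompareZero now does) sees the blocker.
}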
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 5dd8d98..c2a6a7f 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -684,6 +684,14 @@ SystemZInstrInfo::getBranchInfo(const MachineInstr *MI) const {
MI->getOperand(0).getImm(),
MI->getOperand(1).getImm(), &MI->getOperand(2));
+ case SystemZ::BRCT:
+ return SystemZII::Branch(SystemZII::BranchCT, SystemZ::CCMASK_ICMP,
+ SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
+
+ case SystemZ::BRCTG:
+ return SystemZII::Branch(SystemZII::BranchCTG, SystemZ::CCMASK_ICMP,
+ SystemZ::CCMASK_CMP_NE, &MI->getOperand(2));
+
case SystemZ::CIJ:
case SystemZ::CRJ:
return SystemZII::Branch(SystemZII::BranchC, SystemZ::CCMASK_ICMP,
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index 1392745..b12b471 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -69,7 +69,15 @@ namespace SystemZII {
// An instruction that performs a 64-bit signed comparison and branches
// on the result.
- BranchCG
+ BranchCG,
+
+ // An instruction that decrements a 32-bit register and branches if
+ // the result is nonzero.
+ BranchCT,
+
+ // An instruction that decrements a 64-bit register and branches if
+ // the result is nonzero.
+ BranchCTG
};
// Information about a branch instruction.
struct Branch {
diff --git a/lib/Target/SystemZ/SystemZLongBranch.cpp b/lib/Target/SystemZ/SystemZLongBranch.cpp
index c5c4cab..114f74e 100644
--- a/lib/Target/SystemZ/SystemZLongBranch.cpp
+++ b/lib/Target/SystemZ/SystemZLongBranch.cpp
@@ -148,6 +148,7 @@ namespace {
bool mustRelaxBranch(const TerminatorInfo &Terminator, uint64_t Address);
bool mustRelaxABranch();
void setWorstCaseAddresses();
+ void splitBranchOnCount(MachineInstr *MI, unsigned AddOpcode);
void splitCompareBranch(MachineInstr *MI, unsigned CompareOpcode);
void relaxBranch(TerminatorInfo &Terminator);
void relaxBranches();
@@ -218,6 +219,11 @@ TerminatorInfo SystemZLongBranch::describeTerminator(MachineInstr *MI) {
// Relaxes to BRCL, which is 2 bytes longer.
Terminator.ExtraRelaxSize = 2;
break;
+ case SystemZ::BRCT:
+ case SystemZ::BRCTG:
+ // Relaxes to A(G)HI and BRCL, which is 6 bytes longer.
+ Terminator.ExtraRelaxSize = 6;
+ break;
case SystemZ::CRJ:
// Relaxes to a CR/BRCL sequence, which is 2 bytes longer.
Terminator.ExtraRelaxSize = 2;
@@ -330,6 +336,25 @@ void SystemZLongBranch::setWorstCaseAddresses() {
}
}
+// Split BRANCH ON COUNT MI into the addition given by AddOpcode followed
+// by a BRCL on the result.
+void SystemZLongBranch::splitBranchOnCount(MachineInstr *MI,
+ unsigned AddOpcode) {
+ MachineBasicBlock *MBB = MI->getParent();
+ DebugLoc DL = MI->getDebugLoc();
+ BuildMI(*MBB, MI, DL, TII->get(AddOpcode))
+ .addOperand(MI->getOperand(0))
+ .addOperand(MI->getOperand(1))
+ .addImm(-1);
+ MachineInstr *BRCL = BuildMI(*MBB, MI, DL, TII->get(SystemZ::BRCL))
+ .addImm(SystemZ::CCMASK_ICMP)
+ .addImm(SystemZ::CCMASK_CMP_NE)
+ .addOperand(MI->getOperand(2));
+ // The implicit use of CC is a killing use.
+ BRCL->addRegisterKilled(SystemZ::CC, &TII->getRegisterInfo());
+ MI->eraseFromParent();
+}
+
// Split MI into the comparison given by CompareOpcode followed
// by a BRCL on the result.
void SystemZLongBranch::splitCompareBranch(MachineInstr *MI,
@@ -358,6 +383,12 @@ void SystemZLongBranch::relaxBranch(TerminatorInfo &Terminator) {
case SystemZ::BRC:
Branch->setDesc(TII->get(SystemZ::BRCL));
break;
+ case SystemZ::BRCT:
+ splitBranchOnCount(Branch, SystemZ::AHI);
+ break;
+ case SystemZ::BRCTG:
+ splitBranchOnCount(Branch, SystemZ::AGHI);
+ break;
case SystemZ::CRJ:
splitCompareBranch(Branch, SystemZ::CR);
break;
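To connect the numbers in this file: BRCT and BRCTG are 4-byte instructions whose relative target is, assuming the usual 16-bit signed halfword-offset encoding, at most 0x10000 bytes behind the branch, and the relaxed form replaces that one instruction with a 4-byte AHI/AGHI plus a 6-byte BRCL, hence the ExtraRelaxSize of 6 above. A small sketch of that worst-case reasoning, with the sizes hard-coded from the comments in this patch (the helper name and address model are invented for the example):

#include <cstdint>
#include <iostream>

// Instruction sizes in bytes, as stated in the patch comments:
// BRCT/BRCTG and AHI/AGHI are 4 bytes, BRCL is 6.
constexpr uint64_t SizeBRCT = 4, SizeAHI = 4, SizeBRCL = 6;

// Assumed encoding detail: the BRCT(G) offset is a 16-bit signed count of
// halfwords, so the farthest backward target is 0x8000 halfwords away.
constexpr uint64_t MaxBackwardBytes = 0x8000 * 2;

// Worst-case test in the spirit of SystemZLongBranch::mustRelaxBranch for a
// backward branch-on-count: if the target might be out of reach, plan for
// the split AHI/AGHI + BRCL form instead.
bool mustRelaxBRCT(uint64_t BranchAddress, uint64_t TargetAddress) {
  return BranchAddress - TargetAddress > MaxBackwardBytes;
}

int main() {
  std::cout << SizeAHI + SizeBRCL - SizeBRCT << '\n';   // 6: the ExtraRelaxSize
  std::cout << mustRelaxBRCT(0x10000, 0) << '\n';       // 0: just in range
  std::cout << mustRelaxBRCT(0x10002, 0) << '\n';       // 1: must be split
}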
diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp
index 2bacc2b..856183c 100644
--- a/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -82,6 +82,9 @@ bool SystemZPassConfig::addPreEmitPass() {
// CC values (while still being worthwhile) and others that happen to make
// the CC result more useful than it was originally.
//
+ // Another reason is that we only want to use BRANCH ON COUNT in cases
+ // where we know that the count register is not going to be spilled.
+ //
// Doing it so late makes it more likely that a register will be reused
// between the comparison and the branch, but it isn't clear whether
// preventing that would be a win or not.
diff --git a/test/CodeGen/SystemZ/Large/branch-range-07.py b/test/CodeGen/SystemZ/Large/branch-range-07.py
new file mode 100644
index 0000000..90c4420
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-07.py
@@ -0,0 +1,68 @@
+# Test 32-bit BRANCH RELATIVE ON COUNT in cases where some branches are out
+# of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# loopN:
+# load of countN
+# ...
+# loop0:
+# 0xffd8 bytes, from MVIY instructions
+# conditional branch to main
+# after0:
+# ...
+# decrement of countN
+# conditional branch to loopN
+# afterN:
+#
+# Each load occupies 4 bytes. Each decrement and branch occupies 4
+# bytes if BRCT can be used, otherwise it occupies 10 bytes (AHI + BRCL).
+# This means that loop 6 contains 5 * 4 + 0xffd8 + 5 * 4 == 0x10000 bytes
+# and is therefore (just) in range. Loop 7 is out of range.
+#
+# CHECK: brct {{%r[0-9]+}}
+# CHECK: brct {{%r[0-9]+}}
+# CHECK: brct {{%r[0-9]+}}
+# CHECK: brct {{%r[0-9]+}}
+# CHECK: brct {{%r[0-9]+}}
+# CHECK: brct {{%r[0-9]+}}
+# CHECK: ahi {{%r[0-9]+}}, -1
+# CHECK: jglh
+# CHECK: ahi {{%r[0-9]+}}, -1
+# CHECK: jglh
+
+branch_blocks = 8
+main_size = 0xffd8
+
+print 'define void @f1(i8 *%base, i32 *%counts) {'
+print 'entry:'
+
+for i in xrange(branch_blocks - 1, -1, -1):
+ print ' %%countptr%d = getelementptr i32 *%%counts, i64 %d' % (i, i)
+ print ' %%initcount%d = load i32 *%%countptr%d' % (i, i)
+ print ' br label %%loop%d' % i
+
+ print 'loop%d:' % i
+ block1 = 'entry' if i == branch_blocks - 1 else 'loop%d' % (i + 1)
+ block2 = 'loop0' if i == 0 else 'after%d' % (i - 1)
+ print (' %%count%d = phi i32 [ %%initcount%d, %%%s ],'
+ ' [ %%nextcount%d, %%%s ]' % (i, i, block1, i, block2))
+
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+ a, b = b, a + b
+ offset = 4096 + b % 500000
+ value = a % 256
+ print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+ print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+ print ' %%nextcount%d = add i32 %%count%d, -1' % (i, i)
+ print ' %%test%d = icmp ne i32 %%nextcount%d, 0' % (i, i)
+ print ' br i1 %%test%d, label %%loop%d, label %%after%d' % (i, i, i)
+ print ''
+ print 'after%d:' % i
+
+print ' ret void'
+print '}'
diff --git a/test/CodeGen/SystemZ/Large/branch-range-08.py b/test/CodeGen/SystemZ/Large/branch-range-08.py
new file mode 100644
index 0000000..ac1b137
--- /dev/null
+++ b/test/CodeGen/SystemZ/Large/branch-range-08.py
@@ -0,0 +1,69 @@
+# Test 64-bit BRANCH RELATIVE ON COUNT in cases where some branches are out
+# of range.
+# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s
+
+# Construct:
+#
+# loopN:
+# load of countN
+# ...
+# loop0:
+# 0xffd8 bytes, from MVIY instructions
+# conditional branch to main
+# after0:
+# ...
+# decrement of countN
+# conditional branch to loopN
+# afterN:
+#
+# Each load occupies 6 bytes. Each decrement and branch occupies 4
+# bytes if BRCTG can be used, otherwise it occupies 10 bytes (AGHI + BRCL).
+# This means that loop 5 contains 4 * 6 + 0xffd8 + 4 * 4 == 0x10000 bytes
+# and is therefore (just) in range. Loop 6 is out of range.
+#
+# CHECK: brctg {{%r[0-9]+}}
+# CHECK: brctg {{%r[0-9]+}}
+# CHECK: brctg {{%r[0-9]+}}
+# CHECK: brctg {{%r[0-9]+}}
+# CHECK: brctg {{%r[0-9]+}}
+# CHECK: aghi {{%r[0-9]+}}, -1
+# CHECK: jglh
+# CHECK: aghi {{%r[0-9]+}}, -1
+# CHECK: jglh
+# CHECK: aghi {{%r[0-9]+}}, -1
+# CHECK: jglh
+
+branch_blocks = 8
+main_size = 0xffd8
+
+print 'define void @f1(i8 *%base, i64 *%counts) {'
+print 'entry:'
+
+for i in xrange(branch_blocks - 1, -1, -1):
+ print ' %%countptr%d = getelementptr i64 *%%counts, i64 %d' % (i, i)
+ print ' %%initcount%d = load i64 *%%countptr%d' % (i, i)
+ print ' br label %%loop%d' % i
+
+ print 'loop%d:' % i
+ block1 = 'entry' if i == branch_blocks - 1 else 'loop%d' % (i + 1)
+ block2 = 'loop0' if i == 0 else 'after%d' % (i - 1)
+ print (' %%count%d = phi i64 [ %%initcount%d, %%%s ],'
+ ' [ %%nextcount%d, %%%s ]' % (i, i, block1, i, block2))
+
+a, b = 1, 1
+for i in xrange(0, main_size, 6):
+ a, b = b, a + b
+ offset = 4096 + b % 500000
+ value = a % 256
+ print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset)
+ print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i)
+
+for i in xrange(branch_blocks):
+ print ' %%nextcount%d = add i64 %%count%d, -1' % (i, i)
+ print ' %%test%d = icmp ne i64 %%nextcount%d, 0' % (i, i)
+ print ' br i1 %%test%d, label %%loop%d, label %%after%d' % (i, i, i)
+ print ''
+ print 'after%d:' % i
+
+print ' ret void'
+print '}'
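The byte counts quoted in the two generator comments can be sanity-checked directly; the figures below are copied from those comments (4-byte loads and BRCTs in the 32-bit test, 6-byte loads and 4-byte BRCTGs in the 64-bit test), so both innermost loops land exactly on the 0x10000-byte limit discussed above.

#include <cstdio>

int main() {
  // branch-range-07.py: 5 loads of 4 bytes, the 0xffd8-byte block of stores,
  // and 5 four-byte BRCT sequences, per that file's comment.
  static_assert(5 * 4 + 0xffd8 + 5 * 4 == 0x10000, "32-bit loop size");
  // branch-range-08.py: 4 loads of 6 bytes, the same store block, and
  // 4 four-byte BRCTG sequences, per that file's comment.
  static_assert(4 * 6 + 0xffd8 + 4 * 4 == 0x10000, "64-bit loop size");
  std::puts("both loops are exactly 0x10000 bytes, the largest size that "
            "still lets the backward branch-on-count reach its target");
}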
diff --git a/test/CodeGen/SystemZ/loop-01.ll b/test/CodeGen/SystemZ/loop-01.ll
index 025a34e..5800801 100644
--- a/test/CodeGen/SystemZ/loop-01.ll
+++ b/test/CodeGen/SystemZ/loop-01.ll
@@ -5,7 +5,7 @@
; Test that strength reduction is applied to addresses with a scale factor,
; but that indexed addressing can still be used.
define void @f1(i32 *%dest, i32 %a) {
-; CHECK-LABEL: f1
+; CHECK-LABEL: f1:
; CHECK-NOT: sllg
; CHECK: st %r3, 0({{%r[1-5],%r[1-5]}})
; CHECK: br %r14
@@ -23,3 +23,102 @@ loop:
exit:
ret void
}
+
+; Test a loop that should be converted into dbr form and then use BRCT.
+define void @f2(i32 *%src, i32 *%dest) {
+; CHECK-LABEL: f2:
+; CHECK: lhi [[REG:%r[0-5]]], 100
+; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
+; CHECK: brct [[REG]], [[LABEL]]
+; CHECK: br %r14
+entry:
+ br label %loop
+
+loop:
+ %count = phi i32 [ 0, %entry ], [ %next, %loop.next ]
+ %next = add i32 %count, 1
+ %val = load volatile i32 *%src
+ %cmp = icmp eq i32 %val, 0
+ br i1 %cmp, label %loop.next, label %loop.store
+
+loop.store:
+ %add = add i32 %val, 1
+ store volatile i32 %add, i32 *%dest
+ br label %loop.next
+
+loop.next:
+ %cont = icmp ne i32 %next, 100
+ br i1 %cont, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+; Like f2, but for BRCTG.
+define void @f3(i64 *%src, i64 *%dest) {
+; CHECK-LABEL: f3:
+; CHECK: lghi [[REG:%r[0-5]]], 100
+; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop
+; CHECK: brctg [[REG]], [[LABEL]]
+; CHECK: br %r14
+entry:
+ br label %loop
+
+loop:
+ %count = phi i64 [ 0, %entry ], [ %next, %loop.next ]
+ %next = add i64 %count, 1
+ %val = load volatile i64 *%src
+ %cmp = icmp eq i64 %val, 0
+ br i1 %cmp, label %loop.next, label %loop.store
+
+loop.store:
+ %add = add i64 %val, 1
+ store volatile i64 %add, i64 *%dest
+ br label %loop.next
+
+loop.next:
+ %cont = icmp ne i64 %next, 100
+ br i1 %cont, label %loop, label %exit
+
+exit:
+ ret void
+}
+
+; Test a loop with a 64-bit decremented counter in which the 32-bit
+; low part of the counter is used after the decrement. This is an example
+; of a subregister use being the only thing that blocks a conversion to BRCTG.
+define void @f4(i32 *%src, i32 *%dest, i64 *%dest2, i64 %count) {
+; CHECK-LABEL: f4:
+; CHECK: aghi [[REG:%r[0-5]]], -1
+; CHECK: lr [[REG2:%r[0-5]]], [[REG]]
+; CHECK: stg [[REG2]],
+; CHECK: jne {{\..*}}
+; CHECK: br %r14
+entry:
+ br label %loop
+
+loop:
+ %left = phi i64 [ %count, %entry ], [ %next, %loop.next ]
+ store volatile i64 %left, i64 *%dest2
+ %val = load volatile i32 *%src
+ %cmp = icmp eq i32 %val, 0
+ br i1 %cmp, label %loop.next, label %loop.store
+
+loop.store:
+ %add = add i32 %val, 1
+ store volatile i32 %add, i32 *%dest
+ br label %loop.next
+
+loop.next:
+ %next = add i64 %left, -1
+ %ext = zext i32 %val to i64
+ %shl = shl i64 %ext, 32
+ %and = and i64 %next, 4294967295
+ %or = or i64 %shl, %and
+ store volatile i64 %or, i64 *%dest2
+ %cont = icmp ne i64 %next, 0
+ br i1 %cont, label %loop, label %exit
+
+exit:
+ ret void
+}