From 93795574785de252703591e7fcc8f052c762f25e Mon Sep 17 00:00:00 2001 From: Richard Sandiford Date: Mon, 5 Aug 2013 11:23:46 +0000 Subject: [SystemZ] Use BRCT and BRCTG to eliminate add-&-compare sequences This patch just uses a peephole test for "add; compare; branch" sequences within a single block. The IR optimizers already convert loops to decrement-and-branch-on-nonzero form in some cases, so even this simplistic test triggers many times during a clang bootstrap and projects/test-suite run. It looks like there are still cases where we need to more strongly prefer branches on nonzero though. E.g. I saw a case where a loop that started out with a check for 0 ended up with a check for -1. I'll try to look at that sometime. I ended up adding the Reference class because MachineInstr::readsRegister() doesn't check for subregisters (by design, as far as I could tell). git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@187723 91177308-0d34-0410-b5e6-96231b3b80d8 --- test/CodeGen/SystemZ/Large/branch-range-07.py | 68 +++++++++++++++++ test/CodeGen/SystemZ/Large/branch-range-08.py | 69 ++++++++++++++++++ test/CodeGen/SystemZ/loop-01.ll | 101 +++++++++++++++++++++++++- 3 files changed, 237 insertions(+), 1 deletion(-) create mode 100644 test/CodeGen/SystemZ/Large/branch-range-07.py create mode 100644 test/CodeGen/SystemZ/Large/branch-range-08.py (limited to 'test') diff --git a/test/CodeGen/SystemZ/Large/branch-range-07.py b/test/CodeGen/SystemZ/Large/branch-range-07.py new file mode 100644 index 0000000..90c4420 --- /dev/null +++ b/test/CodeGen/SystemZ/Large/branch-range-07.py @@ -0,0 +1,68 @@ +# Test 32-bit BRANCH RELATIVE ON COUNT in cases where some branches are out +# of range. +# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s + +# Construct: +# +# loopN: +# load of countN +# ... +# loop0: +# 0xffd8 bytes, from MVIY instructions +# conditional branch to main +# after0: +# ... +# decrement of countN +# conditional branch to loopN +# afterN: +# +# Each load occupies 4 bytes. Each decrement and branch occupies 4 +# bytes if BRCT can be used, otherwise it occupies 10 bytes (AHI + BRCL). +# This means that loop 6 contains 5 * 4 + 0xffd8 + 5 * 4 == 0x10000 bytes +# and is therefore (just) in range. Loop 7 is out of range. +# +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: brct {{%r[0-9]+}} +# CHECK: ahi {{%r[0-9]+}}, -1 +# CHECK: jglh +# CHECK: ahi {{%r[0-9]+}}, -1 +# CHECK: jglh + +branch_blocks = 8 +main_size = 0xffd8 + +print 'define void @f1(i8 *%base, i32 *%counts) {' +print 'entry:' + +for i in xrange(branch_blocks - 1, -1, -1): + print ' %%countptr%d = getelementptr i32 *%%counts, i64 %d' % (i, i) + print ' %%initcount%d = load i32 *%%countptr%d' % (i, i) + print ' br label %%loop%d' % i + + print 'loop%d:' % i + block1 = 'entry' if i == branch_blocks - 1 else 'loop%d' % (i + 1) + block2 = 'loop0' if i == 0 else 'after%d' % (i - 1) + print (' %%count%d = phi i32 [ %%initcount%d, %%%s ],' + ' [ %%nextcount%d, %%%s ]' % (i, i, block1, i, block2)) + +a, b = 1, 1 +for i in xrange(0, main_size, 6): + a, b = b, a + b + offset = 4096 + b % 500000 + value = a % 256 + print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset) + print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i) + +for i in xrange(branch_blocks): + print ' %%nextcount%d = add i32 %%count%d, -1' % (i, i) + print ' %%test%d = icmp ne i32 %%nextcount%d, 0' % (i, i) + print ' br i1 %%test%d, label %%loop%d, label %%after%d' % (i, i, i) + print '' + print 'after%d:' % i + +print ' ret void' +print '}' diff --git a/test/CodeGen/SystemZ/Large/branch-range-08.py b/test/CodeGen/SystemZ/Large/branch-range-08.py new file mode 100644 index 0000000..ac1b137 --- /dev/null +++ b/test/CodeGen/SystemZ/Large/branch-range-08.py @@ -0,0 +1,69 @@ +# Test 64-bit BRANCH RELATIVE ON COUNT in cases where some branches are out +# of range. +# RUN: python %s | llc -mtriple=s390x-linux-gnu | FileCheck %s + +# Construct: +# +# loopN: +# load of countN +# ... +# loop0: +# 0xffd8 bytes, from MVIY instructions +# conditional branch to main +# after0: +# ... +# decrement of countN +# conditional branch to loopN +# afterN: +# +# Each load occupies 6 bytes. Each decrement and branch occupies 4 +# bytes if BRCTG can be used, otherwise it occupies 10 bytes (AGHI + BRCL). +# This means that loop 5 contains 4 * 6 + 0xffd8 + 4 * 4 == 0x10000 bytes +# and is therefore (just) in range. Loop 6 is out of range. +# +# CHECK: brctg {{%r[0-9]+}} +# CHECK: brctg {{%r[0-9]+}} +# CHECK: brctg {{%r[0-9]+}} +# CHECK: brctg {{%r[0-9]+}} +# CHECK: brctg {{%r[0-9]+}} +# CHECK: aghi {{%r[0-9]+}}, -1 +# CHECK: jglh +# CHECK: aghi {{%r[0-9]+}}, -1 +# CHECK: jglh +# CHECK: aghi {{%r[0-9]+}}, -1 +# CHECK: jglh + +branch_blocks = 8 +main_size = 0xffd8 + +print 'define void @f1(i8 *%base, i64 *%counts) {' +print 'entry:' + +for i in xrange(branch_blocks - 1, -1, -1): + print ' %%countptr%d = getelementptr i64 *%%counts, i64 %d' % (i, i) + print ' %%initcount%d = load i64 *%%countptr%d' % (i, i) + print ' br label %%loop%d' % i + + print 'loop%d:' % i + block1 = 'entry' if i == branch_blocks - 1 else 'loop%d' % (i + 1) + block2 = 'loop0' if i == 0 else 'after%d' % (i - 1) + print (' %%count%d = phi i64 [ %%initcount%d, %%%s ],' + ' [ %%nextcount%d, %%%s ]' % (i, i, block1, i, block2)) + +a, b = 1, 1 +for i in xrange(0, main_size, 6): + a, b = b, a + b + offset = 4096 + b % 500000 + value = a % 256 + print ' %%ptr%d = getelementptr i8 *%%base, i64 %d' % (i, offset) + print ' store volatile i8 %d, i8 *%%ptr%d' % (value, i) + +for i in xrange(branch_blocks): + print ' %%nextcount%d = add i64 %%count%d, -1' % (i, i) + print ' %%test%d = icmp ne i64 %%nextcount%d, 0' % (i, i) + print ' br i1 %%test%d, label %%loop%d, label %%after%d' % (i, i, i) + print '' + print 'after%d:' % i + +print ' ret void' +print '}' diff --git a/test/CodeGen/SystemZ/loop-01.ll b/test/CodeGen/SystemZ/loop-01.ll index 025a34e..5800801 100644 --- a/test/CodeGen/SystemZ/loop-01.ll +++ b/test/CodeGen/SystemZ/loop-01.ll @@ -5,7 +5,7 @@ ; Test that strength reduction is applied to addresses with a scale factor, ; but that indexed addressing can still be used. define void @f1(i32 *%dest, i32 %a) { -; CHECK-LABEL: f1 +; CHECK-LABEL: f1: ; CHECK-NOT: sllg ; CHECK: st %r3, 0({{%r[1-5],%r[1-5]}}) ; CHECK: br %r14 @@ -23,3 +23,102 @@ loop: exit: ret void } + +; Test a loop that should be converted into dbr form and then use BRCT. +define void @f2(i32 *%src, i32 *%dest) { +; CHECK-LABEL: f2: +; CHECK: lhi [[REG:%r[0-5]]], 100 +; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop +; CHECK: brct [[REG]], [[LABEL]] +; CHECK: br %r14 +entry: + br label %loop + +loop: + %count = phi i32 [ 0, %entry ], [ %next, %loop.next ] + %next = add i32 %count, 1 + %val = load volatile i32 *%src + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %loop.next, label %loop.store + +loop.store: + %add = add i32 %val, 1 + store volatile i32 %add, i32 *%dest + br label %loop.next + +loop.next: + %cont = icmp ne i32 %next, 100 + br i1 %cont, label %loop, label %exit + +exit: + ret void +} + +; Like f2, but for BRCTG. +define void @f3(i64 *%src, i64 *%dest) { +; CHECK-LABEL: f3: +; CHECK: lghi [[REG:%r[0-5]]], 100 +; CHECK: [[LABEL:\.[^:]*]]:{{.*}} %loop +; CHECK: brctg [[REG]], [[LABEL]] +; CHECK: br %r14 +entry: + br label %loop + +loop: + %count = phi i64 [ 0, %entry ], [ %next, %loop.next ] + %next = add i64 %count, 1 + %val = load volatile i64 *%src + %cmp = icmp eq i64 %val, 0 + br i1 %cmp, label %loop.next, label %loop.store + +loop.store: + %add = add i64 %val, 1 + store volatile i64 %add, i64 *%dest + br label %loop.next + +loop.next: + %cont = icmp ne i64 %next, 100 + br i1 %cont, label %loop, label %exit + +exit: + ret void +} + +; Test a loop with a 64-bit decremented counter in which the 32-bit +; low part of the counter is used after the decrement. This is an example +; of a subregister use being the only thing that blocks a conversion to BRCTG. +define void @f4(i32 *%src, i32 *%dest, i64 *%dest2, i64 %count) { +; CHECK-LABEL: f4: +; CHECK: aghi [[REG:%r[0-5]]], -1 +; CHECK: lr [[REG2:%r[0-5]]], [[REG]] +; CHECK: stg [[REG2]], +; CHECK: jne {{\..*}} +; CHECK: br %r14 +entry: + br label %loop + +loop: + %left = phi i64 [ %count, %entry ], [ %next, %loop.next ] + store volatile i64 %left, i64 *%dest2 + %val = load volatile i32 *%src + %cmp = icmp eq i32 %val, 0 + br i1 %cmp, label %loop.next, label %loop.store + +loop.store: + %add = add i32 %val, 1 + store volatile i32 %add, i32 *%dest + br label %loop.next + +loop.next: + %next = add i64 %left, -1 + %ext = zext i32 %val to i64 + %shl = shl i64 %ext, 32 + %and = and i64 %next, 4294967295 + %or = or i64 %shl, %and + store volatile i64 %or, i64 *%dest2 + %cont = icmp ne i64 %next, 0 + br i1 %cont, label %loop, label %exit + +exit: + ret void +} -- cgit v1.1