summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Eric Anholt <eric@anholt.net> 2014-12-08 16:52:53 -0800
committer: Eric Anholt <eric@anholt.net> 2014-12-09 01:04:46 -0800
commit: 8420a956924c720b3c4932a577623f836758c21c (patch)
tree: 09097890b2dabbb4bff4bf81e4b5d55ddcf3bc6a
parent: ab1b1fa6fbd72b05c48f83c9df5036c2bfe893a3 (diff)
download: external_mesa3d-8420a956924c720b3c4932a577623f836758c21c.zip
external_mesa3d-8420a956924c720b3c4932a577623f836758c21c.tar.gz
external_mesa3d-8420a956924c720b3c4932a577623f836758c21c.tar.bz2
vc4: Reserve rb31 instead of r3 for raddr conflict spills.
This increases the cost of a raddr B conflict spill (save r3 to rb31, move src1 to r3, then move rb31 back to r3 when done, instead of just moving src1 to r3), but on average, thanks to instruction pairing, it's more worthwhile to have another accumulator.

total instructions in shared programs: 46428 -> 46171 (-0.55%)
instructions in affected programs: 38030 -> 37773 (-0.68%)
-rw-r--r-- src/gallium/drivers/vc4/vc4_qpu_emit.c | 50
-rw-r--r-- src/gallium/drivers/vc4/vc4_register_allocate.c | 6
2 files changed, 45 insertions, 11 deletions
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index 856f844..f2620c0 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -93,21 +93,41 @@ swap_file(struct qpu_reg *src)
* In that case, we need to move one to a temporary that can be used in the
* instruction, instead.
*/
-static void
+static bool
fixup_raddr_conflict(struct vc4_compile *c,
- struct qpu_reg *src0, struct qpu_reg *src1)
+ struct qpu_reg dst,
+ struct qpu_reg *src0, struct qpu_reg *src1,
+ bool r3_live)
{
if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) ||
src0->mux != src1->mux ||
src0->addr == src1->addr) {
- return;
+ return false;
}
if (swap_file(src0) || swap_file(src1))
- return;
+ return false;
+
+ if (src0->mux == QPU_MUX_A) {
+ /* If we're conflicting over the A regfile, then we can just
+ * use the reserved rb31.
+ */
+ queue(c, qpu_a_MOV(qpu_rb(31), *src1));
+ *src1 = qpu_rb(31);
+ return false;
+ } else {
+ /* Otherwise, we need a non-B regfile. So, we spill r3 out to
+ * rb31, then store our desired value in r3, and tell the
+ * caller to put rb31 back into r3 when we're done.
+ */
+ if (r3_live)
+ queue(c, qpu_a_MOV(qpu_rb(31), qpu_r3()));
+ queue(c, qpu_a_MOV(qpu_r3(), *src1));
+
+ *src1 = qpu_r3();
- queue(c, qpu_a_MOV(qpu_r3(), *src1));
- *src1 = qpu_r3();
+ return r3_live && dst.mux != QPU_MUX_R3;
+ }
}
void
@@ -118,6 +138,8 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
uint32_t inputs_remaining = c->num_inputs;
uint32_t vpm_read_fifo_count = 0;
uint32_t vpm_read_offset = 0;
+ bool written_r3 = false;
+ bool needs_restore;
make_empty_list(&c->qpu_inst_list);
@@ -416,8 +438,12 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
break;
case QOP_TEX_DIRECT:
- fixup_raddr_conflict(c, &src[0], &src[1]);
+ needs_restore = fixup_raddr_conflict(c, dst,
+ &src[0], &src[1],
+ written_r3);
queue(c, qpu_a_ADD(qpu_rb(QPU_W_TMU0_S), src[0], src[1]));
+ if (needs_restore)
+ queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
break;
case QOP_TEX_RESULT:
@@ -477,7 +503,9 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
if (qir_get_op_nsrc(qinst->op) == 1)
src[1] = src[0];
- fixup_raddr_conflict(c, &src[0], &src[1]);
+ needs_restore = fixup_raddr_conflict(c, dst,
+ &src[0], &src[1],
+ written_r3);
if (translate[qinst->op].is_mul) {
queue(c, qpu_m_alu2(translate[qinst->op].op,
@@ -488,8 +516,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
dst,
src[0], src[1]));
}
+ if (needs_restore)
+ queue(c, qpu_a_MOV(qpu_r3(), qpu_rb(31)));
+
break;
}
+
+ if (dst.mux == QPU_MUX_R3)
+ written_r3 = true;
}
qpu_schedule_instructions(c);
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index 3001900..85f29e5 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -117,10 +117,10 @@ vc4_alloc_reg_set(struct vc4_context *vc4)
vc4->reg_class_any = ra_alloc_reg_class(vc4->regs);
for (uint32_t i = 0; i < ARRAY_SIZE(vc4_regs); i++) {
- /* Reserve r3 for now, since we're using it for spilling-like
- * operations in vc4_qpu_emit.c
+ /* Reserve rb31 for spilling fixup_raddr_conflict() in
+ * vc4_qpu_emit.c
*/
- if (vc4_regs[i].mux == QPU_MUX_R3)
+ if (vc4_regs[i].mux == QPU_MUX_B && vc4_regs[i].addr == 31)
continue;
/* R4 can't be written as a general purpose register. (it's