summaryrefslogtreecommitdiffstats
path: root/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
diff options
context:
space:
mode:
authorJason Ekstrand <jason.ekstrand@intel.com>2014-08-18 14:27:55 -0700
committerJason Ekstrand <jason.ekstrand@intel.com>2014-09-30 10:29:14 -0700
commit7210583eb84a5d49803dbe37b0960373b4224d10 (patch)
treed355daca1d46348861122d6daf505eb6a75fd1ec /src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
parent4232a776a699d80601496802ab2d817374a31f56 (diff)
downloadexternal_mesa3d-7210583eb84a5d49803dbe37b0960373b4224d10.zip
external_mesa3d-7210583eb84a5d49803dbe37b0960373b4224d10.tar.gz
external_mesa3d-7210583eb84a5d49803dbe37b0960373b4224d10.tar.bz2
i965/fs_reg: Allocate double the number of vgrfs in SIMD16 mode
This is actually the squash of a bunch of different changes. Individual commit titles follow: i965/fs: Always 2-align registers SIMD16 for gen <= 5 i965/fs: Use the register width when applying offsets This reworks both byte_offset() and offset() to be more intelligent. The byte_offset() function now supports offsets bigger than 32. The offset() function uses the byte_offset() function together with the register width and the type size to offset the register by the correct amount. i965/fs: Change regs_read to be in hardware registers i965/fs: Change regs_written to be actual hardware registers i965/fs: Properly handle register widths in LOAD_PAYLOAD The LOAD_PAYLOAD instruction is a bit special because it collects a bunch of registers (with possibly different widths) into a single payload block. Once the payload is constructed, it's treated as a single block of data and most of the information such as register widths doesn't matter anymore. In particular, the offset of any particular source register is the accumulation of the sizes of the previous source registers. i965/fs: Properly set writemasks in LOAD_PAYLOAD i965/fs: Handle register widths in demote_pull_constants i965/fs: Get rid of implicit register doubling in the allocator i965/fs: Reserve enough registers for PLN instructions i965/fs: Make sources and destinations interfere in 16-wide i965/fs: Properly handle register widths in CSE i965/fs: Properly handle register widths in register_coalesce i965/fs: Properly handle widths in copy propagation i965/fs: Properly handle register widths in VARYING_PULL_CONSTANT_LOAD i965/fs: Properly handle register widths and odd register sizes in spilling i965/fs: Don't waste a register on texture lookups for gen >= 7 Previously, we were waisting a register in SIMD16 mode because we could only allocate registers in pairs. Now that we can allocate and address odd-sized registers, let's get rid of this special-case. Signed-off-by: Jason Ekstrand <jason.ekstrand@intel.com> Reviewed-by: Matt Turner <mattst88@gmail.com>
Diffstat (limited to 'src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp')
-rw-r--r--src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp47
1 files changed, 31 insertions, 16 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index bd502c4..b4f4431 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -42,6 +42,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */
struct acp_entry : public exec_node {
fs_reg dst;
fs_reg src;
+ uint8_t regs_written;
enum opcode opcode;
bool saturate;
};
@@ -295,11 +296,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
/* Bail if inst is reading a range that isn't contained in the range
* that entry is writing.
*/
- int reg_size = dispatch_width * sizeof(float);
if (inst->src[arg].reg_offset < entry->dst.reg_offset ||
- (inst->src[arg].reg_offset * reg_size + inst->src[arg].subreg_offset +
- inst->regs_read(this, arg) * inst->src[arg].stride * reg_size) >
- (entry->dst.reg_offset + 1) * reg_size)
+ (inst->src[arg].reg_offset * 32 + inst->src[arg].subreg_offset +
+ inst->regs_read(this, arg) * inst->src[arg].stride * 32) >
+ (entry->dst.reg_offset + entry->regs_written) * 32)
return false;
/* See resolve_ud_negate() and comment in brw_fs_emit.cpp. */
@@ -371,16 +371,25 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
inst->saturate = inst->saturate || entry->saturate;
switch (entry->src.file) {
+ case UNIFORM:
+ assert(entry->src.width == 1);
case BAD_FILE:
case HW_REG:
- case UNIFORM:
+ inst->src[arg].width = entry->src.width;
inst->src[arg].reg_offset = entry->src.reg_offset;
inst->src[arg].subreg_offset = entry->src.subreg_offset;
break;
case GRF:
{
- /* In this case, we have to deal with mapping parts of vgrfs to
- * other parts of vgrfs so we have to do some reg_offset magic.
+ assert(entry->src.width % inst->src[arg].width == 0);
+ /* In this case, we'll just leave the width alone. The source
+ * register could have different widths depending on how it is
+ * being used. For instance, if only half of the register was
+ * used then we want to preserve that and continue to only use
+ * half.
+ *
+ * Also, we have to deal with mapping parts of vgrfs to other
+ * parts of vgrfs so we have to do some reg_offset magic.
*/
/* Compute the offset of inst->src[arg] relative to inst->dst */
@@ -389,10 +398,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
int rel_suboffset = inst->src[arg].subreg_offset;
/* Compute the final register offset (in bytes) */
- int offset = entry->src.reg_offset * reg_size + entry->src.subreg_offset;
- offset += rel_offset * reg_size + rel_suboffset;
- inst->src[arg].reg_offset = offset / reg_size;
- inst->src[arg].subreg_offset = offset % reg_size;
+ int offset = entry->src.reg_offset * 32 + entry->src.subreg_offset;
+ offset += rel_offset * 32 + rel_suboffset;
+ inst->src[arg].reg_offset = offset / 32;
+ inst->src[arg].subreg_offset = offset % 32;
}
break;
default:
@@ -429,11 +438,10 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
/* Bail if inst is reading a range that isn't contained in the range
* that entry is writing.
*/
- int reg_size = dispatch_width * sizeof(float);
if (inst->src[i].reg_offset < entry->dst.reg_offset ||
- (inst->src[i].reg_offset * reg_size + inst->src[i].subreg_offset +
- inst->regs_read(this, i) * inst->src[i].stride * reg_size) >
- (entry->dst.reg_offset + 1) * reg_size)
+ (inst->src[i].reg_offset * 32 + inst->src[i].subreg_offset +
+ inst->regs_read(this, i) * inst->src[i].stride * 32) >
+ (entry->dst.reg_offset + entry->regs_written) * 32)
continue;
/* Don't bother with cases that should have been taken care of by the
@@ -623,17 +631,23 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
entry->dst = inst->dst;
entry->src = inst->src[0];
+ entry->regs_written = inst->regs_written;
entry->opcode = inst->opcode;
entry->saturate = inst->saturate;
acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
} else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
inst->dst.file == GRF) {
+ int offset = 0;
for (int i = 0; i < inst->sources; i++) {
+ int regs_written = ((inst->src[i].effective_width(this) *
+ type_sz(inst->src[i].type)) + 31) / 32;
if (inst->src[i].file == GRF) {
acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
entry->dst = inst->dst;
- entry->dst.reg_offset = i;
+ entry->dst.reg_offset = offset;
+ entry->dst.width = inst->src[i].effective_width(this);
entry->src = inst->src[i];
+ entry->regs_written = regs_written;
entry->opcode = inst->opcode;
if (!entry->dst.equals(inst->src[i])) {
acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
@@ -641,6 +655,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
ralloc_free(entry);
}
}
+ offset += regs_written;
}
}
}