summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.cpp 153
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs.h 36
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp 47
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_cse.cpp 22
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_generator.cpp 11
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp 193
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp 19
-rw-r--r-- src/mesa/drivers/dri/i965/brw_fs_visitor.cpp 36
-rw-r--r-- src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp 11
9 files changed, 371 insertions, 157 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5d7e867..9a9dbda 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -65,7 +65,21 @@ fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
this->conditional_mod = BRW_CONDITIONAL_NONE;
/* This will be the case for almost all instructions. */
- this->regs_written = 1;
+ switch (dst.file) {
+ case GRF:
+ case HW_REG:
+ case MRF:
+ this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
+ break;
+ case BAD_FILE:
+ this->regs_written = 0;
+ break;
+ case IMM:
+ case UNIFORM:
+ unreachable("Invalid destination register file");
+ default:
+ unreachable("Invalid register file");
+ }
this->writes_accumulator = false;
}
@@ -252,7 +266,16 @@ fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
{
fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
sources);
- inst->regs_written = sources;
+ inst->regs_written = 0;
+ for (int i = 0; i < sources; ++i) {
+ /* The LOAD_PAYLOAD instruction only really makes sense if we are
+ * dealing with whole registers. If this ever changes, we can deal
+ * with it later.
+ */
+ int size = src[i].effective_width(this) * type_sz(src[i].type);
+ assert(size % 32 == 0);
+ inst->regs_written += (size + 31) / 32;
+ }
return inst;
}
@@ -282,7 +305,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
varying_offset, fs_reg(const_offset & ~3)));
int scale = 1;
- if (brw->gen == 4 && dispatch_width == 8) {
+ if (brw->gen == 4 && dst.width == 8) {
/* Pre-gen5, we can either use a SIMD8 message that requires (header,
* u, v, r) as parameters, or we can just use the SIMD16 message
* consisting of (header, u). We choose the second, at the cost of a
@@ -296,9 +319,13 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
else
op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
- fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
+
+ assert(dst.width % 8 == 0);
+ int regs_written = 4 * (dst.width / 8) * scale;
+ fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
+ dst.type, dst.width);
inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
- inst->regs_written = 4 * scale;
+ inst->regs_written = regs_written;
instructions.push_tail(inst);
if (brw->gen < 7) {
@@ -802,12 +829,27 @@ int
fs_inst::regs_read(fs_visitor *v, int arg) const
{
if (is_tex() && arg == 0 && src[0].file == GRF) {
- if (v->dispatch_width == 16)
- return (mlen + 1) / 2;
- else
- return mlen;
+ return mlen;
+ }
+
+ switch (src[arg].file) {
+ case BAD_FILE:
+ case UNIFORM:
+ case IMM:
+ return 1;
+ case GRF:
+ case HW_REG:
+ if (src[arg].stride == 0) {
+ return 1;
+ } else {
+ int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
+ return (size + 31) / 32;
+ }
+ case MRF:
+ unreachable("MRF registers are not allowed as sources");
+ default:
+ unreachable("Invalid register file");
}
- return 1;
}
bool
@@ -948,9 +990,10 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
{
init();
+ int reg_width = v->dispatch_width / 8;
this->file = GRF;
- this->reg = v->virtual_grf_alloc(v->type_size(type));
+ this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
this->reg_offset = 0;
this->type = brw_type_for_base_type(type);
this->width = v->dispatch_width;
@@ -2096,6 +2139,7 @@ fs_visitor::demote_pull_constants()
inst->src[i].file = GRF;
inst->src[i].reg = dst.reg;
inst->src[i].reg_offset = 0;
+ inst->src[i].width = dispatch_width;
}
}
invalidate_live_intervals();
@@ -2241,12 +2285,12 @@ fs_visitor::opt_register_renaming()
if (depth == 0 &&
inst->dst.file == GRF &&
- virtual_grf_sizes[inst->dst.reg] == 1 &&
+ virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
!inst->is_partial_write()) {
if (remap[dst] == -1) {
remap[dst] = dst;
} else {
- remap[dst] = virtual_grf_alloc(1);
+ remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
inst->dst.reg = remap[dst];
progress = true;
}
@@ -2338,7 +2382,7 @@ fs_visitor::compute_to_mrf()
/* Things returning more than one register would need us to
* understand coalescing out more than one MOV at a time.
*/
- if (scan_inst->regs_written > 1)
+ if (scan_inst->regs_written > scan_inst->dst.width / 8)
break;
/* SEND instructions can't have MRF as a destination. */
@@ -2599,8 +2643,7 @@ void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
fs_inst *inst)
{
- int reg_size = dispatch_width / 8;
- int write_len = inst->regs_written * reg_size;
+ int write_len = inst->regs_written;
int first_write_grf = inst->dst.reg;
bool needs_dep[BRW_MAX_MRF];
assert(write_len < (int)sizeof(needs_dep) - 1);
@@ -2639,7 +2682,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
*/
if (scan_inst->dst.file == GRF) {
for (int i = 0; i < scan_inst->regs_written; i++) {
- int reg = scan_inst->dst.reg + i * reg_size;
+ int reg = scan_inst->dst.reg + i;
if (reg >= first_write_grf &&
reg < first_write_grf + write_len &&
@@ -2677,7 +2720,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
{
- int write_len = inst->regs_written * dispatch_width / 8;
+ int write_len = inst->regs_written;
int first_write_grf = inst->dst.reg;
bool needs_dep[BRW_MAX_MRF];
assert(write_len < (int)sizeof(needs_dep) - 1);
@@ -2829,19 +2872,77 @@ fs_visitor::lower_load_payload()
{
bool progress = false;
+ int vgrf_to_reg[virtual_grf_count];
+ int reg_count = 16; /* Leave room for MRF */
+ for (int i = 0; i < virtual_grf_count; ++i) {
+ vgrf_to_reg[i] = reg_count;
+ reg_count += virtual_grf_sizes[i];
+ }
+
+ struct {
+ bool written:1; /* Whether this register has ever been written */
+ bool force_writemask_all:1;
+ bool force_sechalf:1;
+ } metadata[reg_count];
+ memset(metadata, 0, sizeof(metadata));
+
foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+ int dst_reg;
+ if (inst->dst.file == MRF) {
+ dst_reg = inst->dst.reg;
+ } else if (inst->dst.file == GRF) {
+ dst_reg = vgrf_to_reg[inst->dst.reg];
+ }
+
+ if (inst->dst.file == MRF || inst->dst.file == GRF) {
+ bool force_sechalf = inst->force_sechalf;
+ bool toggle_sechalf = inst->dst.width == 16 &&
+ type_sz(inst->dst.type) == 4;
+ for (int i = 0; i < inst->regs_written; ++i) {
+ metadata[dst_reg + i].written = true;
+ metadata[dst_reg + i].force_sechalf = force_sechalf;
+ metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
+ force_sechalf = (toggle_sechalf != force_sechalf);
+ }
+ }
+
if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
+ assert(inst->dst.file == MRF || inst->dst.file == GRF);
fs_reg dst = inst->dst;
- /* src[0] represents the (optional) message header. */
- if (inst->src[0].file != BAD_FILE) {
- inst->insert_before(block, MOV(dst, inst->src[0]));
- }
- dst.reg_offset++;
+ for (int i = 0; i < inst->sources; i++) {
+ dst.width = inst->src[i].effective_width(this);
+ dst.type = inst->src[i].type;
+
+ if (inst->src[i].file == BAD_FILE) {
+ /* Do nothing but otherwise increment as normal */
+ } else {
+ fs_inst *mov = MOV(dst, inst->src[i]);
+ if (inst->src[i].file == GRF) {
+ int src_reg = vgrf_to_reg[inst->src[i].reg] +
+ inst->src[i].reg_offset;
+ mov->force_sechalf = metadata[src_reg].force_sechalf;
+ mov->force_writemask_all = metadata[src_reg].force_writemask_all;
+ metadata[dst_reg] = metadata[src_reg];
+ if (dst.width * type_sz(dst.type) > 32) {
+ assert((!metadata[src_reg].written ||
+ !metadata[src_reg].force_sechalf) &&
+ (!metadata[src_reg + 1].written ||
+ metadata[src_reg + 1].force_sechalf));
+ metadata[dst_reg + 1] = metadata[src_reg + 1];
+ }
+ } else {
+ metadata[dst_reg].force_writemask_all = false;
+ metadata[dst_reg].force_sechalf = false;
+ if (dst.width == 16) {
+ metadata[dst_reg + 1].force_writemask_all = false;
+ metadata[dst_reg + 1].force_sechalf = true;
+ }
+ }
+ inst->insert_before(block, mov);
+ }
- for (int i = 1; i < inst->sources; i++) {
- inst->insert_before(block, MOV(dst, inst->src[i]));
- dst.reg_offset++;
+ dst = offset(dst, 1);
}
inst->remove(block);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 0d3931e..05fb71d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -124,18 +124,40 @@ retype(fs_reg reg, enum brw_reg_type type)
}
static inline fs_reg
-offset(fs_reg reg, unsigned delta)
+byte_offset(fs_reg reg, unsigned delta)
{
- assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM));
- reg.reg_offset += delta;
+ switch (reg.file) {
+ case BAD_FILE:
+ break;
+ case GRF:
+ reg.reg_offset += delta / 32;
+ break;
+ case MRF:
+ reg.reg += delta / 32;
+ break;
+ default:
+ assert(delta == 0);
+ }
+ reg.subreg_offset += delta % 32;
return reg;
}
static inline fs_reg
-byte_offset(fs_reg reg, unsigned delta)
+offset(fs_reg reg, unsigned delta)
{
- assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM));
- reg.subreg_offset += delta;
+ assert(reg.stride > 0);
+ switch (reg.file) {
+ case BAD_FILE:
+ break;
+ case GRF:
+ case MRF:
+ return byte_offset(reg, delta * reg.width * reg.stride * type_sz(reg.type));
+ case UNIFORM:
+ reg.reg_offset += delta;
+ break;
+ default:
+ assert(delta == 0);
+ }
return reg;
}
@@ -426,6 +448,8 @@ public:
void emit_if_gen6(ir_if *ir);
void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
uint32_t spill_offset, int count);
+ void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg,
+ uint32_t spill_offset, int count);
void emit_fragment_program_code();
void setup_fp_regs();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index bd502c4..b4f4431 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -42,6 +42,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */
struct acp_entry : public exec_node {
fs_reg dst;
fs_reg src;
+ uint8_t regs_written;
enum opcode opcode;
bool saturate;
};
@@ -295,11 +296,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
/* Bail if inst is reading a range that isn't contained in the range
* that entry is writing.
*/
- int reg_size = dispatch_width * sizeof(float);
if (inst->src[arg].reg_offset < entry->dst.reg_offset ||
- (inst->src[arg].reg_offset * reg_size + inst->src[arg].subreg_offset +
- inst->regs_read(this, arg) * inst->src[arg].stride * reg_size) >
- (entry->dst.reg_offset + 1) * reg_size)
+ (inst->src[arg].reg_offset * 32 + inst->src[arg].subreg_offset +
+ inst->regs_read(this, arg) * inst->src[arg].stride * 32) >
+ (entry->dst.reg_offset + entry->regs_written) * 32)
return false;
/* See resolve_ud_negate() and comment in brw_fs_emit.cpp. */
@@ -371,16 +371,25 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
inst->saturate = inst->saturate || entry->saturate;
switch (entry->src.file) {
+ case UNIFORM:
+ assert(entry->src.width == 1);
case BAD_FILE:
case HW_REG:
- case UNIFORM:
+ inst->src[arg].width = entry->src.width;
inst->src[arg].reg_offset = entry->src.reg_offset;
inst->src[arg].subreg_offset = entry->src.subreg_offset;
break;
case GRF:
{
- /* In this case, we have to deal with mapping parts of vgrfs to
- * other parts of vgrfs so we have to do some reg_offset magic.
+ assert(entry->src.width % inst->src[arg].width == 0);
+ /* In this case, we'll just leave the width alone. The source
+ * register could have different widths depending on how it is
+ * being used. For instance, if only half of the register was
+ * used then we want to preserve that and continue to only use
+ * half.
+ *
+ * Also, we have to deal with mapping parts of vgrfs to other
+ * parts of vgrfs so we have to do some reg_offset magic.
*/
/* Compute the offset of inst->src[arg] relative to inst->dst */
@@ -389,10 +398,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
int rel_suboffset = inst->src[arg].subreg_offset;
/* Compute the final register offset (in bytes) */
- int offset = entry->src.reg_offset * reg_size + entry->src.subreg_offset;
- offset += rel_offset * reg_size + rel_suboffset;
- inst->src[arg].reg_offset = offset / reg_size;
- inst->src[arg].subreg_offset = offset % reg_size;
+ int offset = entry->src.reg_offset * 32 + entry->src.subreg_offset;
+ offset += rel_offset * 32 + rel_suboffset;
+ inst->src[arg].reg_offset = offset / 32;
+ inst->src[arg].subreg_offset = offset % 32;
}
break;
default:
@@ -429,11 +438,10 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
/* Bail if inst is reading a range that isn't contained in the range
* that entry is writing.
*/
- int reg_size = dispatch_width * sizeof(float);
if (inst->src[i].reg_offset < entry->dst.reg_offset ||
- (inst->src[i].reg_offset * reg_size + inst->src[i].subreg_offset +
- inst->regs_read(this, i) * inst->src[i].stride * reg_size) >
- (entry->dst.reg_offset + 1) * reg_size)
+ (inst->src[i].reg_offset * 32 + inst->src[i].subreg_offset +
+ inst->regs_read(this, i) * inst->src[i].stride * 32) >
+ (entry->dst.reg_offset + entry->regs_written) * 32)
continue;
/* Don't bother with cases that should have been taken care of by the
@@ -623,17 +631,23 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
entry->dst = inst->dst;
entry->src = inst->src[0];
+ entry->regs_written = inst->regs_written;
entry->opcode = inst->opcode;
entry->saturate = inst->saturate;
acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
} else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
inst->dst.file == GRF) {
+ int offset = 0;
for (int i = 0; i < inst->sources; i++) {
+ int regs_written = ((inst->src[i].effective_width(this) *
+ type_sz(inst->src[i].type)) + 31) / 32;
if (inst->src[i].file == GRF) {
acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
entry->dst = inst->dst;
- entry->dst.reg_offset = i;
+ entry->dst.reg_offset = offset;
+ entry->dst.width = inst->src[i].effective_width(this);
entry->src = inst->src[i];
+ entry->regs_written = regs_written;
entry->opcode = inst->opcode;
if (!entry->dst.equals(inst->src[i])) {
acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
@@ -641,6 +655,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
ralloc_free(entry);
}
}
+ offset += regs_written;
}
}
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 7edbe19..817fc1f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -202,19 +202,21 @@ fs_visitor::opt_cse_local(bblock_t *block)
bool no_existing_temp = entry->tmp.file == BAD_FILE;
if (no_existing_temp && !entry->generator->dst.is_null()) {
int written = entry->generator->regs_written;
+ int dst_width = entry->generator->dst.width / 8;
+ assert(written % dst_width == 0);
fs_reg orig_dst = entry->generator->dst;
fs_reg tmp = fs_reg(GRF, virtual_grf_alloc(written),
- orig_dst.type);
+ orig_dst.type, orig_dst.width);
entry->tmp = tmp;
entry->generator->dst = tmp;
fs_inst *copy;
- if (written > 1) {
- fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written);
- for (int i = 0; i < written; i++)
+ if (written > dst_width) {
+ fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width);
+ for (int i = 0; i < written / dst_width; i++)
sources[i] = offset(tmp, i);
- copy = LOAD_PAYLOAD(orig_dst, sources, written);
+ copy = LOAD_PAYLOAD(orig_dst, sources, written / dst_width);
} else {
copy = MOV(orig_dst, tmp);
copy->force_writemask_all =
@@ -226,16 +228,18 @@ fs_visitor::opt_cse_local(bblock_t *block)
/* dest <- temp */
if (!inst->dst.is_null()) {
int written = inst->regs_written;
+ int dst_width = inst->dst.width / 8;
assert(written == entry->generator->regs_written);
+ assert(dst_width == entry->generator->dst.width / 8);
assert(inst->dst.type == entry->tmp.type);
fs_reg dst = inst->dst;
fs_reg tmp = entry->tmp;
fs_inst *copy;
- if (written > 1) {
- fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written);
- for (int i = 0; i < written; i++)
+ if (written > dst_width) {
+ fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width);
+ for (int i = 0; i < written / dst_width; i++)
sources[i] = offset(tmp, i);
- copy = LOAD_PAYLOAD(dst, sources, written);
+ copy = LOAD_PAYLOAD(dst, sources, written / dst_width);
} else {
copy = MOV(dst, tmp);
copy->force_writemask_all = inst->force_writemask_all;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 122a43f..5bfc559 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -542,15 +542,8 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
dst = vec16(dst);
}
- if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) {
- /* The send-from-GRF for SIMD16 texturing with a header has an extra
- * hardware register allocated to it, which we need to skip over (since
- * our coordinates in the payload are in the even-numbered registers,
- * and the header comes right before the first one).
- */
- assert(src.file == BRW_GENERAL_REGISTER_FILE);
- src.nr++;
- }
+ assert(brw->gen < 7 || !inst->header_present ||
+ src.file == BRW_GENERAL_REGISTER_FILE);
assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index a627b64..095b45c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -31,11 +31,11 @@
#include "glsl/ir_optimization.h"
static void
-assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width)
+assign_reg(int *reg_hw_locations, fs_reg *reg)
{
if (reg->file == GRF) {
assert(reg->reg_offset >= 0);
- reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset * reg_width;
+ reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset;
reg->reg_offset = 0;
}
}
@@ -51,14 +51,14 @@ fs_visitor::assign_regs_trivial()
hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
for (i = 1; i <= this->virtual_grf_count; i++) {
hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
- this->virtual_grf_sizes[i - 1] * reg_width);
+ this->virtual_grf_sizes[i - 1]);
}
this->grf_used = hw_reg_mapping[this->virtual_grf_count];
foreach_block_and_inst(block, fs_inst, inst, cfg) {
- assign_reg(hw_reg_mapping, &inst->dst, reg_width);
+ assign_reg(hw_reg_mapping, &inst->dst);
for (i = 0; i < inst->sources; i++) {
- assign_reg(hw_reg_mapping, &inst->src[i], reg_width);
+ assign_reg(hw_reg_mapping, &inst->src[i]);
}
}
@@ -75,7 +75,7 @@ static void
brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
{
const struct brw_device_info *devinfo = screen->devinfo;
- int base_reg_count = BRW_MAX_GRF / reg_width;
+ int base_reg_count = BRW_MAX_GRF;
int index = reg_width - 1;
/* The registers used to make up almost all values handled in the compiler
@@ -105,8 +105,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
int class_sizes[BRW_MAX_MRF];
if (devinfo->gen >= 7) {
- for (class_count = 0; class_count < MAX_SAMPLER_MESSAGE_SIZE;
- class_count++)
+ for (class_count = 0; class_count < BRW_MAX_MRF; class_count++)
class_sizes[class_count] = class_count + 1;
} else {
for (class_count = 0; class_count < 4; class_count++)
@@ -117,7 +116,21 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
/* Compute the total number of registers across all classes. */
int ra_reg_count = 0;
for (int i = 0; i < class_count; i++) {
- ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+ if (devinfo->gen <= 5 && reg_width == 2) {
+ /* From the G45 PRM:
+ *
+ * In order to reduce the hardware complexity, the following
+ * rules and restrictions apply to the compressed instruction:
+ * ...
+ * * Operand Alignment Rule: With the exceptions listed below, a
+ * source/destination operand in general should be aligned to
+ * even 256-bit physical register with a region size equal to
+ * two 256-bit physical register
+ */
+ ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
+ } else {
+ ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+ }
}
uint8_t *ra_reg_to_grf = ralloc_array(screen, uint8_t, ra_reg_count);
@@ -134,27 +147,48 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
int pairs_base_reg = 0;
int pairs_reg_count = 0;
for (int i = 0; i < class_count; i++) {
- int class_reg_count = base_reg_count - (class_sizes[i] - 1);
+ int class_reg_count;
+ if (devinfo->gen <= 5 && reg_width == 2) {
+ class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
+ } else {
+ class_reg_count = base_reg_count - (class_sizes[i] - 1);
+ }
classes[i] = ra_alloc_reg_class(regs);
/* Save this off for the aligned pair class at the end. */
if (class_sizes[i] == 2) {
- pairs_base_reg = reg;
- pairs_reg_count = class_reg_count;
+ pairs_base_reg = reg;
+ pairs_reg_count = class_reg_count;
}
- for (int j = 0; j < class_reg_count; j++) {
- ra_class_add_reg(regs, classes[i], reg);
+ if (devinfo->gen <= 5 && reg_width == 2) {
+ for (int j = 0; j < class_reg_count; j++) {
+ ra_class_add_reg(regs, classes[i], reg);
- ra_reg_to_grf[reg] = j;
+ ra_reg_to_grf[reg] = j * 2;
- for (int base_reg = j;
- base_reg < j + class_sizes[i];
- base_reg++) {
- ra_add_transitive_reg_conflict(regs, base_reg, reg);
- }
+ for (int base_reg = j * 2;
+ base_reg < j * 2 + class_sizes[i];
+ base_reg++) {
+ ra_add_transitive_reg_conflict(regs, base_reg, reg);
+ }
+
+ reg++;
+ }
+ } else {
+ for (int j = 0; j < class_reg_count; j++) {
+ ra_class_add_reg(regs, classes[i], reg);
+
+ ra_reg_to_grf[reg] = j;
- reg++;
+ for (int base_reg = j;
+ base_reg < j + class_sizes[i];
+ base_reg++) {
+ ra_add_transitive_reg_conflict(regs, base_reg, reg);
+ }
+
+ reg++;
+ }
}
}
assert(reg == ra_reg_count);
@@ -162,7 +196,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
/* Add a special class for aligned pairs, which we'll put delta_x/y
* in on gen5 so that we can do PLN.
*/
- if (devinfo->has_pln && reg_width == 1 && devinfo->gen < 6) {
+ if (devinfo->has_pln && devinfo->gen < 6) {
aligned_pairs_class = ra_alloc_reg_class(regs);
for (int i = 0; i < pairs_reg_count; i++) {
@@ -236,7 +270,6 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
int payload_node_count,
int first_payload_node)
{
- int reg_width = dispatch_width / 8;
int loop_depth = 0;
int loop_end_ip = 0;
@@ -276,7 +309,7 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == HW_REG &&
inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
- int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width;
+ int node_nr = inst->src[i].fixed_hw_reg.nr;
if (node_nr >= payload_node_count)
continue;
@@ -292,25 +325,26 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
* sideband. It also really freaks out driver developers to see g0
* used in unusual places, so just always reserve it.
*/
- payload_last_use_ip[0 / reg_width] = use_ip;
- payload_last_use_ip[1 / reg_width] = use_ip;
+ payload_last_use_ip[0] = use_ip;
+ payload_last_use_ip[1] = use_ip;
break;
case FS_OPCODE_LINTERP:
- /* On gen6+ in SIMD16, there are 4 adjacent registers (so 2 nodes)
- * used by PLN's sourcing of the deltas, while we list only the first
- * two in the arguments (1 node). Pre-gen6, the deltas are computed
- * in normal VGRFs.
+ /* On gen6+ in SIMD16, there are 4 adjacent registers used by
+ * PLN's sourcing of the deltas, while we list only the first one
+ * in the arguments. Pre-gen6, the deltas are computed in normal
+ * VGRFs.
*/
if (brw->gen >= 6) {
int delta_x_arg = 0;
if (inst->src[delta_x_arg].file == HW_REG &&
inst->src[delta_x_arg].fixed_hw_reg.file ==
BRW_GENERAL_REGISTER_FILE) {
- int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr /
- reg_width) + 1;
- assert(sechalf_node < payload_node_count);
- payload_last_use_ip[sechalf_node] = use_ip;
+ for (int i = 1; i < 4; ++i) {
+ int node = inst->src[delta_x_arg].fixed_hw_reg.nr + i;
+ assert(node < payload_node_count);
+ payload_last_use_ip[node] = use_ip;
+ }
}
}
break;
@@ -391,8 +425,6 @@ fs_visitor::get_used_mrfs(bool *mrf_used)
void
fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
{
- int reg_width = dispatch_width / 8;
-
bool mrf_used[BRW_MAX_MRF];
get_used_mrfs(mrf_used);
@@ -402,8 +434,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
* The alternative would be to have per-physical-register classes, which
* would just be silly.
*/
- ra_set_node_reg(g, first_mrf_node + i,
- (GEN7_MRF_HACK_START + i) / reg_width);
+ ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i);
/* Since we don't have any live/dead analysis on the MRFs, just mark all
* that are used as conflicting with all virtual GRFs.
@@ -428,8 +459,7 @@ fs_visitor::assign_regs(bool allow_spilling)
*/
int reg_width = dispatch_width / 8;
int hw_reg_mapping[this->virtual_grf_count];
- int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) /
- reg_width);
+ int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width);
int rsi = reg_width - 1; /* Which screen->wm_reg_sets[] to use */
calculate_live_intervals();
@@ -478,6 +508,30 @@ fs_visitor::assign_regs(bool allow_spilling)
if (brw->gen >= 7)
setup_mrf_hack_interference(g, first_mrf_hack_node);
+ if (dispatch_width > 8) {
+ /* In 16-wide dispatch we have an issue where a compressed
+ * instruction is actually two instructions executed simultaneously.
+ * It's actually ok to have the source and destination registers be
+ * the same. In this case, each instruction over-writes its own
+ * source and there's no problem. The real problem here is if the
+ * source and destination registers are off by one. Then you can end
+ * up in a scenario where the first instruction over-writes the
+ * source of the second instruction. Since the compiler doesn't know
+ * about this level of granularity, we simply make the source and
+ * destination interfere.
+ */
+ foreach_block_and_inst(block, fs_inst, inst, cfg) {
+ if (inst->dst.file != GRF)
+ continue;
+
+ for (int i = 0; i < inst->sources; ++i) {
+ if (inst->src[i].file == GRF) {
+ ra_add_node_interference(g, inst->dst.reg, inst->src[i].reg);
+ }
+ }
+ }
+ }
+
/* Debug of register spilling: Go spill everything. */
if (0) {
int reg = choose_spill_reg(g);
@@ -511,20 +565,19 @@ fs_visitor::assign_regs(bool allow_spilling)
* regs in the register classes back down to real hardware reg
* numbers.
*/
- this->grf_used = payload_node_count * reg_width;
+ this->grf_used = payload_node_count;
for (int i = 0; i < this->virtual_grf_count; i++) {
int reg = ra_get_node_reg(g, i);
- hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg] * reg_width;
+ hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg];
this->grf_used = MAX2(this->grf_used,
- hw_reg_mapping[i] + this->virtual_grf_sizes[i] *
- reg_width);
+ hw_reg_mapping[i] + this->virtual_grf_sizes[i]);
}
foreach_block_and_inst(block, fs_inst, inst, cfg) {
- assign_reg(hw_reg_mapping, &inst->dst, reg_width);
+ assign_reg(hw_reg_mapping, &inst->dst);
for (int i = 0; i < inst->sources; i++) {
- assign_reg(hw_reg_mapping, &inst->src[i], reg_width);
+ assign_reg(hw_reg_mapping, &inst->src[i]);
}
}
@@ -539,7 +592,11 @@ void
fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
uint32_t spill_offset, int count)
{
- for (int i = 0; i < count; i++) {
+ int reg_size = 1;
+ if (count % 2 == 0)
+ reg_size = 2;
+
+ for (int i = 0; i < count / reg_size; i++) {
/* The gen7 descriptor-based offset is 12 bits of HWORD units. */
bool gen7_read = brw->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE;
@@ -558,8 +615,32 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
}
inst->insert_before(block, unspill_inst);
- dst = offset(dst, 1);
- spill_offset += dispatch_width * sizeof(float);
+ dst.reg_offset += reg_size;
+ spill_offset += reg_size * 8 * sizeof(float);
+ }
+}
+
+/**
+ * Emit scratch writes that spill `count` registers starting at `src`.
+ *
+ * One SHADER_OPCODE_GEN4_SCRATCH_WRITE is inserted after `inst` per chunk;
+ * a chunk is two registers when `count` is even and one register otherwise.
+ * `spill_offset` is the scratch offset of the first register, in bytes
+ * (the same units emit_unspill() advances its offset in).
+ */
+void
+fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
+                       uint32_t spill_offset, int count)
+{
+   int spill_base_mrf = dispatch_width > 8 ? 13 : 14;
+
+   /* Write register pairs when possible, single registers otherwise. */
+   int reg_size = 1;
+   if (count % 2 == 0)
+      reg_size = 2;
+
+   for (int i = 0; i < count / reg_size; i++) {
+      fs_inst *spill_inst =
+         new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
+                              reg_null_f, src);
+      src.reg_offset += reg_size;
+      /* reg_size counts registers, so scale it by the size of one register
+       * (8 floats) to get this chunk's byte offset.  Without the scaling
+       * consecutive writes would land only 1 or 2 bytes apart and overlap.
+       */
+      spill_inst->offset = spill_offset + i * reg_size * 8 * sizeof(float);
+      spill_inst->ir = inst->ir;
+      spill_inst->annotation = inst->annotation;
+      spill_inst->mlen = 1 + reg_size; /* header, value */
+      spill_inst->base_mrf = spill_base_mrf;
+      inst->insert_after(block, spill_inst);
    }
 }
@@ -712,18 +793,8 @@ fs_visitor::spill_reg(int spill_reg)
inst->regs_written);
}
- for (int chan = 0; chan < inst->regs_written; chan++) {
- fs_inst *spill_inst =
- new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
- reg_null_f, spill_src);
- spill_src = offset(spill_src, 1);
- spill_inst->offset = subset_spill_offset + chan * reg_size;
- spill_inst->ir = inst->ir;
- spill_inst->annotation = inst->annotation;
- spill_inst->mlen = 1 + dispatch_width / 8; /* header, value */
- spill_inst->base_mrf = spill_base_mrf;
- inst->insert_after(block, spill_inst);
- }
+ emit_spill(block, inst, spill_src, subset_spill_offset,
+ inst->regs_written);
}
}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 73f18f9..9546dcd 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -69,16 +69,12 @@ is_copy_payload(const fs_visitor *v, const fs_inst *inst)
if (v->virtual_grf_sizes[inst->src[0].reg] != inst->regs_written)
return false;
- const int reg = inst->src[0].reg;
- if (inst->src[0].reg_offset != 0)
- return false;
+ fs_reg reg = inst->src[0];
- for (int i = 1; i < inst->sources; i++) {
- if (inst->src[i].reg != reg ||
- inst->src[i].reg_offset != i) {
+ for (int i = 0; i < inst->sources; i++)
+ if (!inst->src[i].equals(offset(reg, i)))
return false;
- }
- }
+
return true;
}
@@ -186,6 +182,7 @@ fs_visitor::register_coalesce()
src_size = virtual_grf_sizes[inst->src[0].reg];
assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
+ assert(inst->src[0].width % 8 == 0);
channels_remaining = src_size;
memset(mov, 0, sizeof(mov));
@@ -200,12 +197,14 @@ fs_visitor::register_coalesce()
reg_to_offset[i] = i;
}
mov[0] = inst;
- channels_remaining -= inst->sources;
+ channels_remaining -= inst->regs_written;
} else {
const int offset = inst->src[0].reg_offset;
reg_to_offset[offset] = inst->dst.reg_offset;
+ if (inst->src[0].width == 16)
+ reg_to_offset[offset + 1] = inst->dst.reg_offset + 1;
mov[offset] = inst;
- channels_remaining--;
+ channels_remaining -= inst->regs_written;
}
if (channels_remaining)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 72ffe1f..3b31e34 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -287,7 +287,8 @@ fs_visitor::try_emit_saturate(ir_expression *ir)
* src, just set the saturate flag instead of emmitting a separate mov.
*/
fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
- if (modify && modify->regs_written == 1 && modify->can_do_saturate()) {
+ if (modify && modify->regs_written == modify->dst.width / 8 &&
+ modify->can_do_saturate()) {
modify->saturate = true;
this->result = src;
return true;
@@ -1434,7 +1435,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
inst->base_mrf = base_mrf;
inst->mlen = mlen;
inst->header_present = header_present;
- inst->regs_written = 4;
+ inst->regs_written = 4 * reg_width;
if (mlen > MAX_SAMPLER_MESSAGE_SIZE) {
fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
@@ -1480,7 +1481,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
* need to offset the Sampler State Pointer in the header.
*/
header_present = true;
- sources[length] = reg_undef;
+ sources[0] = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
length++;
}
@@ -1618,7 +1619,13 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
}
}
- fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(length),
+ int mlen;
+ if (reg_width == 2)
+ mlen = length * reg_width - header_present;
+ else
+ mlen = length * reg_width;
+
+ fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(mlen),
BRW_REGISTER_TYPE_F);
emit(LOAD_PAYLOAD(src_payload, sources, length));
@@ -1645,12 +1652,9 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
}
fs_inst *inst = emit(opcode, dst, src_payload, sampler);
inst->base_mrf = -1;
- if (reg_width == 2)
- inst->mlen = length * reg_width - header_present;
- else
- inst->mlen = length * reg_width;
+ inst->mlen = mlen;
inst->header_present = header_present;
- inst->regs_written = 4;
+ inst->regs_written = 4 * reg_width;
if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
@@ -1784,7 +1788,7 @@ fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler)
{
int reg_width = dispatch_width / 8;
int length = ir->coordinate->type->vector_elements;
- fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length),
+ fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length * reg_width),
BRW_REGISTER_TYPE_F);
fs_reg dest = fs_reg(this, glsl_type::uvec4_type);
fs_reg *sources = ralloc_array(mem_ctx, fs_reg, length);
@@ -1802,9 +1806,10 @@ fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler)
inst->base_mrf = -1;
inst->mlen = length * reg_width;
inst->header_present = false;
- inst->regs_written = 4; /* we only care about one reg of response,
- * but the sampler always writes 4/8
- */
+ inst->regs_written = 4 * reg_width; /* we only care about one reg of
+ * response, but the sampler always
+ * writes 4/8
+ */
return dest;
}
@@ -1979,14 +1984,15 @@ fs_visitor::visit(ir_texture *ir)
emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
- for (int i = 0; i < inst->regs_written; i++) {
+ int components = inst->regs_written / (dst.width / 8);
+ for (int i = 0; i < components; i++) {
if (i == 2) {
fixed_payload[i] = fixed_depth;
} else {
fixed_payload[i] = offset(dst, i);
}
}
- emit(LOAD_PAYLOAD(dst, fixed_payload, inst->regs_written));
+ emit(LOAD_PAYLOAD(dst, fixed_payload, components));
}
}
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index b963bda..5e8c98a 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -795,7 +795,7 @@ fs_instruction_scheduler::calculate_deps()
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == GRF) {
if (post_reg_alloc) {
- for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
+ for (int r = 0; r < inst->regs_read(v, i); r++)
add_dep(last_grf_write[inst->src[i].reg + r], n);
} else {
for (int r = 0; r < inst->regs_read(v, i); r++) {
@@ -847,7 +847,7 @@ fs_instruction_scheduler::calculate_deps()
/* write-after-write deps. */
if (inst->dst.file == GRF) {
if (post_reg_alloc) {
- for (int r = 0; r < inst->regs_written * reg_width; r++) {
+ for (int r = 0; r < inst->regs_written; r++) {
add_dep(last_grf_write[inst->dst.reg + r], n);
last_grf_write[inst->dst.reg + r] = n;
}
@@ -923,7 +923,7 @@ fs_instruction_scheduler::calculate_deps()
for (int i = 0; i < inst->sources; i++) {
if (inst->src[i].file == GRF) {
if (post_reg_alloc) {
- for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
+ for (int r = 0; r < inst->regs_read(v, i); r++)
add_dep(n, last_grf_write[inst->src[i].reg + r]);
} else {
for (int r = 0; r < inst->regs_read(v, i); r++) {
@@ -977,7 +977,7 @@ fs_instruction_scheduler::calculate_deps()
*/
if (inst->dst.file == GRF) {
if (post_reg_alloc) {
- for (int r = 0; r < inst->regs_written * reg_width; r++)
+ for (int r = 0; r < inst->regs_written; r++)
last_grf_write[inst->dst.reg + r] = n;
} else {
for (int r = 0; r < inst->regs_written; r++) {
@@ -1300,7 +1300,8 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
* single-result send is probably actually reducing register
* pressure.
*/
- if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+ if (inst->regs_written <= inst->dst.width / 8 &&
+ chosen_inst->regs_written > chosen_inst->dst.width / 8) {
chosen = n;
continue;
} else if (inst->regs_written > chosen_inst->regs_written) {