diff options
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.cpp | 153 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs.h | 36 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp | 47 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_cse.cpp | 22 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 11 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 193 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp | 19 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 36 | ||||
-rw-r--r-- | src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp | 11 |
9 files changed, 371 insertions, 157 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 5d7e867..9a9dbda 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -65,7 +65,21 @@ fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources) this->conditional_mod = BRW_CONDITIONAL_NONE; /* This will be the case for almost all instructions. */ - this->regs_written = 1; + switch (dst.file) { + case GRF: + case HW_REG: + case MRF: + this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32; + break; + case BAD_FILE: + this->regs_written = 0; + break; + case IMM: + case UNIFORM: + unreachable("Invalid destination register file"); + default: + unreachable("Invalid register file"); + } this->writes_accumulator = false; } @@ -252,7 +266,16 @@ fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources) { fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources); - inst->regs_written = sources; + inst->regs_written = 0; + for (int i = 0; i < sources; ++i) { + /* The LOAD_PAYLOAD instruction only really makes sense if we are + * dealing with whole registers. If this ever changes, we can deal + * with it later. + */ + int size = src[i].effective_width(this) * type_sz(src[i].type); + assert(size % 32 == 0); + inst->regs_written += (size + 31) / 32; + } return inst; } @@ -282,7 +305,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst, varying_offset, fs_reg(const_offset & ~3))); int scale = 1; - if (brw->gen == 4 && dispatch_width == 8) { + if (brw->gen == 4 && dst.width == 8) { /* Pre-gen5, we can either use a SIMD8 message that requires (header, * u, v, r) as parameters, or we can just use the SIMD16 message * consisting of (header, u). 
We choose the second, at the cost of a @@ -296,9 +319,13 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst, op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7; else op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD; - fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type); + + assert(dst.width % 8 == 0); + int regs_written = 4 * (dst.width / 8) * scale; + fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written), + dst.type, dst.width); inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset); - inst->regs_written = 4 * scale; + inst->regs_written = regs_written; instructions.push_tail(inst); if (brw->gen < 7) { @@ -802,12 +829,27 @@ int fs_inst::regs_read(fs_visitor *v, int arg) const { if (is_tex() && arg == 0 && src[0].file == GRF) { - if (v->dispatch_width == 16) - return (mlen + 1) / 2; - else - return mlen; + return mlen; + } + + switch (src[arg].file) { + case BAD_FILE: + case UNIFORM: + case IMM: + return 1; + case GRF: + case HW_REG: + if (src[arg].stride == 0) { + return 1; + } else { + int size = src[arg].width * src[arg].stride * type_sz(src[arg].type); + return (size + 31) / 32; + } + case MRF: + unreachable("MRF registers are not allowed as sources"); + default: + unreachable("Invalid register file"); } - return 1; } bool @@ -948,9 +990,10 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type, fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type) { init(); + int reg_width = v->dispatch_width / 8; this->file = GRF; - this->reg = v->virtual_grf_alloc(v->type_size(type)); + this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width); this->reg_offset = 0; this->type = brw_type_for_base_type(type); this->width = v->dispatch_width; @@ -2096,6 +2139,7 @@ fs_visitor::demote_pull_constants() inst->src[i].file = GRF; inst->src[i].reg = dst.reg; inst->src[i].reg_offset = 0; + inst->src[i].width = dispatch_width; } } invalidate_live_intervals(); @@ -2241,12 +2285,12 @@ 
fs_visitor::opt_register_renaming() if (depth == 0 && inst->dst.file == GRF && - virtual_grf_sizes[inst->dst.reg] == 1 && + virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 && !inst->is_partial_write()) { if (remap[dst] == -1) { remap[dst] = dst; } else { - remap[dst] = virtual_grf_alloc(1); + remap[dst] = virtual_grf_alloc(inst->dst.width / 8); inst->dst.reg = remap[dst]; progress = true; } @@ -2338,7 +2382,7 @@ fs_visitor::compute_to_mrf() /* Things returning more than one register would need us to * understand coalescing out more than one MOV at a time. */ - if (scan_inst->regs_written > 1) + if (scan_inst->regs_written > scan_inst->dst.width / 8) break; /* SEND instructions can't have MRF as a destination. */ @@ -2599,8 +2643,7 @@ void fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, fs_inst *inst) { - int reg_size = dispatch_width / 8; - int write_len = inst->regs_written * reg_size; + int write_len = inst->regs_written; int first_write_grf = inst->dst.reg; bool needs_dep[BRW_MAX_MRF]; assert(write_len < (int)sizeof(needs_dep) - 1); @@ -2639,7 +2682,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, */ if (scan_inst->dst.file == GRF) { for (int i = 0; i < scan_inst->regs_written; i++) { - int reg = scan_inst->dst.reg + i * reg_size; + int reg = scan_inst->dst.reg + i; if (reg >= first_write_grf && reg < first_write_grf + write_len && @@ -2677,7 +2720,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block, void fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst) { - int write_len = inst->regs_written * dispatch_width / 8; + int write_len = inst->regs_written; int first_write_grf = inst->dst.reg; bool needs_dep[BRW_MAX_MRF]; assert(write_len < (int)sizeof(needs_dep) - 1); @@ -2829,19 +2872,77 @@ fs_visitor::lower_load_payload() { bool progress = false; + int vgrf_to_reg[virtual_grf_count]; + int reg_count = 16; /* Leave room for MRF */ + for (int 
i = 0; i < virtual_grf_count; ++i) { + vgrf_to_reg[i] = reg_count; + reg_count += virtual_grf_sizes[i]; + } + + struct { + bool written:1; /* Whether this register has ever been written */ + bool force_writemask_all:1; + bool force_sechalf:1; + } metadata[reg_count]; + memset(metadata, 0, sizeof(metadata)); + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { + int dst_reg; + if (inst->dst.file == MRF) { + dst_reg = inst->dst.reg; + } else if (inst->dst.file == GRF) { + dst_reg = vgrf_to_reg[inst->dst.reg]; + } + + if (inst->dst.file == MRF || inst->dst.file == GRF) { + bool force_sechalf = inst->force_sechalf; + bool toggle_sechalf = inst->dst.width == 16 && + type_sz(inst->dst.type) == 4; + for (int i = 0; i < inst->regs_written; ++i) { + metadata[dst_reg + i].written = true; + metadata[dst_reg + i].force_sechalf = force_sechalf; + metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all; + force_sechalf = (toggle_sechalf != force_sechalf); + } + } + if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) { + assert(inst->dst.file == MRF || inst->dst.file == GRF); fs_reg dst = inst->dst; - /* src[0] represents the (optional) message header. 
*/ - if (inst->src[0].file != BAD_FILE) { - inst->insert_before(block, MOV(dst, inst->src[0])); - } - dst.reg_offset++; + for (int i = 0; i < inst->sources; i++) { + dst.width = inst->src[i].effective_width(this); + dst.type = inst->src[i].type; + + if (inst->src[i].file == BAD_FILE) { + /* Do nothing but otherwise increment as normal */ + } else { + fs_inst *mov = MOV(dst, inst->src[i]); + if (inst->src[i].file == GRF) { + int src_reg = vgrf_to_reg[inst->src[i].reg] + + inst->src[i].reg_offset; + mov->force_sechalf = metadata[src_reg].force_sechalf; + mov->force_writemask_all = metadata[src_reg].force_writemask_all; + metadata[dst_reg] = metadata[src_reg]; + if (dst.width * type_sz(dst.type) > 32) { + assert((!metadata[src_reg].written || + !metadata[src_reg].force_sechalf) && + (!metadata[src_reg + 1].written || + metadata[src_reg + 1].force_sechalf)); + metadata[dst_reg + 1] = metadata[src_reg + 1]; + } + } else { + metadata[dst_reg].force_writemask_all = false; + metadata[dst_reg].force_sechalf = false; + if (dst.width == 16) { + metadata[dst_reg + 1].force_writemask_all = false; + metadata[dst_reg + 1].force_sechalf = true; + } + } + inst->insert_before(block, mov); + } - for (int i = 1; i < inst->sources; i++) { - inst->insert_before(block, MOV(dst, inst->src[i])); - dst.reg_offset++; + dst = offset(dst, 1); } inst->remove(block); diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index 0d3931e..05fb71d 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -124,18 +124,40 @@ retype(fs_reg reg, enum brw_reg_type type) } static inline fs_reg -offset(fs_reg reg, unsigned delta) +byte_offset(fs_reg reg, unsigned delta) { - assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM)); - reg.reg_offset += delta; + switch (reg.file) { + case BAD_FILE: + break; + case GRF: + reg.reg_offset += delta / 32; + break; + case MRF: + reg.reg += delta / 32; + break; + default: + assert(delta == 0); + } 
+ reg.subreg_offset += delta % 32; return reg; } static inline fs_reg -byte_offset(fs_reg reg, unsigned delta) +offset(fs_reg reg, unsigned delta) { - assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM)); - reg.subreg_offset += delta; + assert(reg.stride > 0); + switch (reg.file) { + case BAD_FILE: + break; + case GRF: + case MRF: + return byte_offset(reg, delta * reg.width * reg.stride * type_sz(reg.type)); + case UNIFORM: + reg.reg_offset += delta; + break; + default: + assert(delta == 0); + } return reg; } @@ -426,6 +448,8 @@ public: void emit_if_gen6(ir_if *ir); void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg, uint32_t spill_offset, int count); + void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg, + uint32_t spill_offset, int count); void emit_fragment_program_code(); void setup_fp_regs(); diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index bd502c4..b4f4431 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -42,6 +42,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */ struct acp_entry : public exec_node { fs_reg dst; fs_reg src; + uint8_t regs_written; enum opcode opcode; bool saturate; }; @@ -295,11 +296,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) /* Bail if inst is reading a range that isn't contained in the range * that entry is writing. 
*/ - int reg_size = dispatch_width * sizeof(float); if (inst->src[arg].reg_offset < entry->dst.reg_offset || - (inst->src[arg].reg_offset * reg_size + inst->src[arg].subreg_offset + - inst->regs_read(this, arg) * inst->src[arg].stride * reg_size) > - (entry->dst.reg_offset + 1) * reg_size) + (inst->src[arg].reg_offset * 32 + inst->src[arg].subreg_offset + + inst->regs_read(this, arg) * inst->src[arg].stride * 32) > + (entry->dst.reg_offset + entry->regs_written) * 32) return false; /* See resolve_ud_negate() and comment in brw_fs_emit.cpp. */ @@ -371,16 +371,25 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) inst->saturate = inst->saturate || entry->saturate; switch (entry->src.file) { + case UNIFORM: + assert(entry->src.width == 1); case BAD_FILE: case HW_REG: - case UNIFORM: + inst->src[arg].width = entry->src.width; inst->src[arg].reg_offset = entry->src.reg_offset; inst->src[arg].subreg_offset = entry->src.subreg_offset; break; case GRF: { - /* In this case, we have to deal with mapping parts of vgrfs to - * other parts of vgrfs so we have to do some reg_offset magic. + assert(entry->src.width % inst->src[arg].width == 0); + /* In this case, we'll just leave the width alone. The source + * register could have different widths depending on how it is + * being used. For instance, if only half of the register was + * used then we want to preserve that and continue to only use + * half. + * + * Also, we have to deal with mapping parts of vgrfs to other + * parts of vgrfs so we have to do some reg_offset magic. 
*/ /* Compute the offset of inst->src[arg] relative to inst->dst */ @@ -389,10 +398,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry) int rel_suboffset = inst->src[arg].subreg_offset; /* Compute the final register offset (in bytes) */ - int offset = entry->src.reg_offset * reg_size + entry->src.subreg_offset; - offset += rel_offset * reg_size + rel_suboffset; - inst->src[arg].reg_offset = offset / reg_size; - inst->src[arg].subreg_offset = offset % reg_size; + int offset = entry->src.reg_offset * 32 + entry->src.subreg_offset; + offset += rel_offset * 32 + rel_suboffset; + inst->src[arg].reg_offset = offset / 32; + inst->src[arg].subreg_offset = offset % 32; } break; default: @@ -429,11 +438,10 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry) /* Bail if inst is reading a range that isn't contained in the range * that entry is writing. */ - int reg_size = dispatch_width * sizeof(float); if (inst->src[i].reg_offset < entry->dst.reg_offset || - (inst->src[i].reg_offset * reg_size + inst->src[i].subreg_offset + - inst->regs_read(this, i) * inst->src[i].stride * reg_size) > - (entry->dst.reg_offset + 1) * reg_size) + (inst->src[i].reg_offset * 32 + inst->src[i].subreg_offset + + inst->regs_read(this, i) * inst->src[i].stride * 32) > + (entry->dst.reg_offset + entry->regs_written) * 32) continue; /* Don't bother with cases that should have been taken care of by the @@ -623,17 +631,23 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); entry->dst = inst->dst; entry->src = inst->src[0]; + entry->regs_written = inst->regs_written; entry->opcode = inst->opcode; entry->saturate = inst->saturate; acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry); } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD && inst->dst.file == GRF) { + int offset = 0; for (int i = 0; i < inst->sources; i++) { + int regs_written = ((inst->src[i].effective_width(this) * + 
type_sz(inst->src[i].type)) + 31) / 32; if (inst->src[i].file == GRF) { acp_entry *entry = ralloc(copy_prop_ctx, acp_entry); entry->dst = inst->dst; - entry->dst.reg_offset = i; + entry->dst.reg_offset = offset; + entry->dst.width = inst->src[i].effective_width(this); entry->src = inst->src[i]; + entry->regs_written = regs_written; entry->opcode = inst->opcode; if (!entry->dst.equals(inst->src[i])) { acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry); @@ -641,6 +655,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, ralloc_free(entry); } } + offset += regs_written; } } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp index 7edbe19..817fc1f 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp @@ -202,19 +202,21 @@ fs_visitor::opt_cse_local(bblock_t *block) bool no_existing_temp = entry->tmp.file == BAD_FILE; if (no_existing_temp && !entry->generator->dst.is_null()) { int written = entry->generator->regs_written; + int dst_width = entry->generator->dst.width / 8; + assert(written % dst_width == 0); fs_reg orig_dst = entry->generator->dst; fs_reg tmp = fs_reg(GRF, virtual_grf_alloc(written), - orig_dst.type); + orig_dst.type, orig_dst.width); entry->tmp = tmp; entry->generator->dst = tmp; fs_inst *copy; - if (written > 1) { - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written); - for (int i = 0; i < written; i++) + if (written > dst_width) { + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width); + for (int i = 0; i < written / dst_width; i++) sources[i] = offset(tmp, i); - copy = LOAD_PAYLOAD(orig_dst, sources, written); + copy = LOAD_PAYLOAD(orig_dst, sources, written / dst_width); } else { copy = MOV(orig_dst, tmp); copy->force_writemask_all = @@ -226,16 +228,18 @@ fs_visitor::opt_cse_local(bblock_t *block) /* dest <- temp */ if (!inst->dst.is_null()) { int written = inst->regs_written; + int dst_width = 
inst->dst.width / 8; assert(written == entry->generator->regs_written); + assert(dst_width == entry->generator->dst.width / 8); assert(inst->dst.type == entry->tmp.type); fs_reg dst = inst->dst; fs_reg tmp = entry->tmp; fs_inst *copy; - if (written > 1) { - fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written); - for (int i = 0; i < written; i++) + if (written > dst_width) { + fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width); + for (int i = 0; i < written / dst_width; i++) sources[i] = offset(tmp, i); - copy = LOAD_PAYLOAD(dst, sources, written); + copy = LOAD_PAYLOAD(dst, sources, written / dst_width); } else { copy = MOV(dst, tmp); copy->force_writemask_all = inst->force_writemask_all; diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp index 122a43f..5bfc559 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp @@ -542,15 +542,8 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src dst = vec16(dst); } - if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) { - /* The send-from-GRF for SIMD16 texturing with a header has an extra - * hardware register allocated to it, which we need to skip over (since - * our coordinates in the payload are in the even-numbered registers, - * and the header comes right before the first one). 
- */ - assert(src.file == BRW_GENERAL_REGISTER_FILE); - src.nr++; - } + assert(brw->gen < 7 || !inst->header_present || + src.file == BRW_GENERAL_REGISTER_FILE); assert(sampler_index.type == BRW_REGISTER_TYPE_UD); diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp index a627b64..095b45c 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp @@ -31,11 +31,11 @@ #include "glsl/ir_optimization.h" static void -assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width) +assign_reg(int *reg_hw_locations, fs_reg *reg) { if (reg->file == GRF) { assert(reg->reg_offset >= 0); - reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset * reg_width; + reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset; reg->reg_offset = 0; } } @@ -51,14 +51,14 @@ fs_visitor::assign_regs_trivial() hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width); for (i = 1; i <= this->virtual_grf_count; i++) { hw_reg_mapping[i] = (hw_reg_mapping[i - 1] + - this->virtual_grf_sizes[i - 1] * reg_width); + this->virtual_grf_sizes[i - 1]); } this->grf_used = hw_reg_mapping[this->virtual_grf_count]; foreach_block_and_inst(block, fs_inst, inst, cfg) { - assign_reg(hw_reg_mapping, &inst->dst, reg_width); + assign_reg(hw_reg_mapping, &inst->dst); for (i = 0; i < inst->sources; i++) { - assign_reg(hw_reg_mapping, &inst->src[i], reg_width); + assign_reg(hw_reg_mapping, &inst->src[i]); } } @@ -75,7 +75,7 @@ static void brw_alloc_reg_set(struct intel_screen *screen, int reg_width) { const struct brw_device_info *devinfo = screen->devinfo; - int base_reg_count = BRW_MAX_GRF / reg_width; + int base_reg_count = BRW_MAX_GRF; int index = reg_width - 1; /* The registers used to make up almost all values handled in the compiler @@ -105,8 +105,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width) int class_sizes[BRW_MAX_MRF]; if (devinfo->gen >= 7) { - for (class_count 
= 0; class_count < MAX_SAMPLER_MESSAGE_SIZE; - class_count++) + for (class_count = 0; class_count < BRW_MAX_MRF; class_count++) class_sizes[class_count] = class_count + 1; } else { for (class_count = 0; class_count < 4; class_count++) @@ -117,7 +116,21 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width) /* Compute the total number of registers across all classes. */ int ra_reg_count = 0; for (int i = 0; i < class_count; i++) { - ra_reg_count += base_reg_count - (class_sizes[i] - 1); + if (devinfo->gen <= 5 && reg_width == 2) { + /* From the G45 PRM: + * + * In order to reduce the hardware complexity, the following + * rules and restrictions apply to the compressed instruction: + * ... + * * Operand Alignment Rule: With the exceptions listed below, a + * source/destination operand in general should be aligned to + * even 256-bit physical register with a region size equal to + * two 256-bit physical register + */ + ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2; + } else { + ra_reg_count += base_reg_count - (class_sizes[i] - 1); + } } uint8_t *ra_reg_to_grf = ralloc_array(screen, uint8_t, ra_reg_count); @@ -134,27 +147,48 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width) int pairs_base_reg = 0; int pairs_reg_count = 0; for (int i = 0; i < class_count; i++) { - int class_reg_count = base_reg_count - (class_sizes[i] - 1); + int class_reg_count; + if (devinfo->gen <= 5 && reg_width == 2) { + class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2; + } else { + class_reg_count = base_reg_count - (class_sizes[i] - 1); + } classes[i] = ra_alloc_reg_class(regs); /* Save this off for the aligned pair class at the end. 
*/ if (class_sizes[i] == 2) { - pairs_base_reg = reg; - pairs_reg_count = class_reg_count; + pairs_base_reg = reg; + pairs_reg_count = class_reg_count; } - for (int j = 0; j < class_reg_count; j++) { - ra_class_add_reg(regs, classes[i], reg); + if (devinfo->gen <= 5 && reg_width == 2) { + for (int j = 0; j < class_reg_count; j++) { + ra_class_add_reg(regs, classes[i], reg); - ra_reg_to_grf[reg] = j; + ra_reg_to_grf[reg] = j * 2; - for (int base_reg = j; - base_reg < j + class_sizes[i]; - base_reg++) { - ra_add_transitive_reg_conflict(regs, base_reg, reg); - } + for (int base_reg = j * 2; + base_reg < j * 2 + class_sizes[i]; + base_reg++) { + ra_add_transitive_reg_conflict(regs, base_reg, reg); + } + + reg++; + } + } else { + for (int j = 0; j < class_reg_count; j++) { + ra_class_add_reg(regs, classes[i], reg); + + ra_reg_to_grf[reg] = j; - reg++; + for (int base_reg = j; + base_reg < j + class_sizes[i]; + base_reg++) { + ra_add_transitive_reg_conflict(regs, base_reg, reg); + } + + reg++; + } } } assert(reg == ra_reg_count); @@ -162,7 +196,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width) /* Add a special class for aligned pairs, which we'll put delta_x/y * in on gen5 so that we can do PLN. 
*/ - if (devinfo->has_pln && reg_width == 1 && devinfo->gen < 6) { + if (devinfo->has_pln && devinfo->gen < 6) { aligned_pairs_class = ra_alloc_reg_class(regs); for (int i = 0; i < pairs_reg_count; i++) { @@ -236,7 +270,6 @@ fs_visitor::setup_payload_interference(struct ra_graph *g, int payload_node_count, int first_payload_node) { - int reg_width = dispatch_width / 8; int loop_depth = 0; int loop_end_ip = 0; @@ -276,7 +309,7 @@ fs_visitor::setup_payload_interference(struct ra_graph *g, for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file == HW_REG && inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { - int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width; + int node_nr = inst->src[i].fixed_hw_reg.nr; if (node_nr >= payload_node_count) continue; @@ -292,25 +325,26 @@ fs_visitor::setup_payload_interference(struct ra_graph *g, * sideband. It also really freaks out driver developers to see g0 * used in unusual places, so just always reserve it. */ - payload_last_use_ip[0 / reg_width] = use_ip; - payload_last_use_ip[1 / reg_width] = use_ip; + payload_last_use_ip[0] = use_ip; + payload_last_use_ip[1] = use_ip; break; case FS_OPCODE_LINTERP: - /* On gen6+ in SIMD16, there are 4 adjacent registers (so 2 nodes) - * used by PLN's sourcing of the deltas, while we list only the first - * two in the arguments (1 node). Pre-gen6, the deltas are computed - * in normal VGRFs. + /* On gen6+ in SIMD16, there are 4 adjacent registers used by + * PLN's sourcing of the deltas, while we list only the first one + * in the arguments. Pre-gen6, the deltas are computed in normal + * VGRFs. 
*/ if (brw->gen >= 6) { int delta_x_arg = 0; if (inst->src[delta_x_arg].file == HW_REG && inst->src[delta_x_arg].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { - int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr / - reg_width) + 1; - assert(sechalf_node < payload_node_count); - payload_last_use_ip[sechalf_node] = use_ip; + for (int i = 1; i < 4; ++i) { + int node = inst->src[delta_x_arg].fixed_hw_reg.nr + i; + assert(node < payload_node_count); + payload_last_use_ip[node] = use_ip; + } } } break; @@ -391,8 +425,6 @@ fs_visitor::get_used_mrfs(bool *mrf_used) void fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node) { - int reg_width = dispatch_width / 8; - bool mrf_used[BRW_MAX_MRF]; get_used_mrfs(mrf_used); @@ -402,8 +434,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node) * The alternative would be to have per-physical-register classes, which * would just be silly. */ - ra_set_node_reg(g, first_mrf_node + i, - (GEN7_MRF_HACK_START + i) / reg_width); + ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i); /* Since we don't have any live/dead analysis on the MRFs, just mark all * that are used as conflicting with all virtual GRFs. @@ -428,8 +459,7 @@ fs_visitor::assign_regs(bool allow_spilling) */ int reg_width = dispatch_width / 8; int hw_reg_mapping[this->virtual_grf_count]; - int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) / - reg_width); + int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width); int rsi = reg_width - 1; /* Which screen->wm_reg_sets[] to use */ calculate_live_intervals(); @@ -478,6 +508,30 @@ fs_visitor::assign_regs(bool allow_spilling) if (brw->gen >= 7) setup_mrf_hack_interference(g, first_mrf_hack_node); + if (dispatch_width > 8) { + /* In 16-wide dispatch we have an issue where a compressed + * instruction is actually two instructions executed simultaneously. 
+ * It's actually ok to have the source and destination registers be + * the same. In this case, each instruction over-writes its own + * source and there's no problem. The real problem here is if the + * source and destination registers are off by one. Then you can end + * up in a scenario where the first instruction over-writes the + * source of the second instruction. Since the compiler doesn't know + * about this level of granularity, we simply make the source and + * destination interfere. + */ + foreach_block_and_inst(block, fs_inst, inst, cfg) { + if (inst->dst.file != GRF) + continue; + + for (int i = 0; i < inst->sources; ++i) { + if (inst->src[i].file == GRF) { + ra_add_node_interference(g, inst->dst.reg, inst->src[i].reg); + } + } + } + } + /* Debug of register spilling: Go spill everything. */ if (0) { int reg = choose_spill_reg(g); @@ -511,20 +565,19 @@ fs_visitor::assign_regs(bool allow_spilling) * regs in the register classes back down to real hardware reg * numbers. */ - this->grf_used = payload_node_count * reg_width; + this->grf_used = payload_node_count; for (int i = 0; i < this->virtual_grf_count; i++) { int reg = ra_get_node_reg(g, i); - hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg] * reg_width; + hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg]; this->grf_used = MAX2(this->grf_used, - hw_reg_mapping[i] + this->virtual_grf_sizes[i] * - reg_width); + hw_reg_mapping[i] + this->virtual_grf_sizes[i]); } foreach_block_and_inst(block, fs_inst, inst, cfg) { - assign_reg(hw_reg_mapping, &inst->dst, reg_width); + assign_reg(hw_reg_mapping, &inst->dst); for (int i = 0; i < inst->sources; i++) { - assign_reg(hw_reg_mapping, &inst->src[i], reg_width); + assign_reg(hw_reg_mapping, &inst->src[i]); } } @@ -539,7 +592,11 @@ void fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst, uint32_t spill_offset, int count) { - for (int i = 0; i < count; i++) { + int reg_size = 1; + if (count % 2 == 0) + reg_size = 2; + + 
for (int i = 0; i < count / reg_size; i++) { /* The gen7 descriptor-based offset is 12 bits of HWORD units. */ bool gen7_read = brw->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE; @@ -558,8 +615,32 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst, } inst->insert_before(block, unspill_inst); - dst = offset(dst, 1); - spill_offset += dispatch_width * sizeof(float); + dst.reg_offset += reg_size; + spill_offset += reg_size * 8 * sizeof(float); + } +} + +void +fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src, + uint32_t spill_offset, int count) +{ + int spill_base_mrf = dispatch_width > 8 ? 13 : 14; + + int reg_size = 1; + if (count % 2 == 0) + reg_size = 2; + + for (int i = 0; i < count / reg_size; i++) { + fs_inst *spill_inst = + new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE, + reg_null_f, src); + src.reg_offset += reg_size; + spill_inst->offset = spill_offset + i * reg_size; + spill_inst->ir = inst->ir; + spill_inst->annotation = inst->annotation; + spill_inst->mlen = 1 + reg_size; /* header, value */ + spill_inst->base_mrf = spill_base_mrf; + inst->insert_after(block, spill_inst); } } @@ -712,18 +793,8 @@ fs_visitor::spill_reg(int spill_reg) inst->regs_written); } - for (int chan = 0; chan < inst->regs_written; chan++) { - fs_inst *spill_inst = - new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE, - reg_null_f, spill_src); - spill_src = offset(spill_src, 1); - spill_inst->offset = subset_spill_offset + chan * reg_size; - spill_inst->ir = inst->ir; - spill_inst->annotation = inst->annotation; - spill_inst->mlen = 1 + dispatch_width / 8; /* header, value */ - spill_inst->base_mrf = spill_base_mrf; - inst->insert_after(block, spill_inst); - } + emit_spill(block, inst, spill_src, subset_spill_offset, + inst->regs_written); } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp index 73f18f9..9546dcd 100644 --- 
a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp @@ -69,16 +69,12 @@ is_copy_payload(const fs_visitor *v, const fs_inst *inst) if (v->virtual_grf_sizes[inst->src[0].reg] != inst->regs_written) return false; - const int reg = inst->src[0].reg; - if (inst->src[0].reg_offset != 0) - return false; + fs_reg reg = inst->src[0]; - for (int i = 1; i < inst->sources; i++) { - if (inst->src[i].reg != reg || - inst->src[i].reg_offset != i) { + for (int i = 0; i < inst->sources; i++) + if (!inst->src[i].equals(offset(reg, i))) return false; - } - } + return true; } @@ -186,6 +182,7 @@ fs_visitor::register_coalesce() src_size = virtual_grf_sizes[inst->src[0].reg]; assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE); + assert(inst->src[0].width % 8 == 0); channels_remaining = src_size; memset(mov, 0, sizeof(mov)); @@ -200,12 +197,14 @@ fs_visitor::register_coalesce() reg_to_offset[i] = i; } mov[0] = inst; - channels_remaining -= inst->sources; + channels_remaining -= inst->regs_written; } else { const int offset = inst->src[0].reg_offset; reg_to_offset[offset] = inst->dst.reg_offset; + if (inst->src[0].width == 16) + reg_to_offset[offset + 1] = inst->dst.reg_offset + 1; mov[offset] = inst; - channels_remaining--; + channels_remaining -= inst->regs_written; } if (channels_remaining) diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index 72ffe1f..3b31e34 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -287,7 +287,8 @@ fs_visitor::try_emit_saturate(ir_expression *ir) * src, just set the saturate flag instead of emmitting a separate mov. 
*/ fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src); - if (modify && modify->regs_written == 1 && modify->can_do_saturate()) { + if (modify && modify->regs_written == modify->dst.width / 8 && + modify->can_do_saturate()) { modify->saturate = true; this->result = src; return true; @@ -1434,7 +1435,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate, inst->base_mrf = base_mrf; inst->mlen = mlen; inst->header_present = header_present; - inst->regs_written = 4; + inst->regs_written = 4 * reg_width; if (mlen > MAX_SAMPLER_MESSAGE_SIZE) { fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE) @@ -1480,7 +1481,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, * need to offset the Sampler State Pointer in the header. */ header_present = true; - sources[length] = reg_undef; + sources[0] = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD); length++; } @@ -1618,7 +1619,13 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, } } - fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(length), + int mlen; + if (reg_width == 2) + mlen = length * reg_width - header_present; + else + mlen = length * reg_width; + + fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(mlen), BRW_REGISTER_TYPE_F); emit(LOAD_PAYLOAD(src_payload, sources, length)); @@ -1645,12 +1652,9 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate, } fs_inst *inst = emit(opcode, dst, src_payload, sampler); inst->base_mrf = -1; - if (reg_width == 2) - inst->mlen = length * reg_width - header_present; - else - inst->mlen = length * reg_width; + inst->mlen = mlen; inst->header_present = header_present; - inst->regs_written = 4; + inst->regs_written = 4 * reg_width; if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) { fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE) @@ -1784,7 +1788,7 @@ fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler) { int 
reg_width = dispatch_width / 8; int length = ir->coordinate->type->vector_elements; - fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length), + fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length * reg_width), BRW_REGISTER_TYPE_F); fs_reg dest = fs_reg(this, glsl_type::uvec4_type); fs_reg *sources = ralloc_array(mem_ctx, fs_reg, length); @@ -1802,9 +1806,10 @@ fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler) inst->base_mrf = -1; inst->mlen = length * reg_width; inst->header_present = false; - inst->regs_written = 4; /* we only care about one reg of response, - * but the sampler always writes 4/8 - */ + inst->regs_written = 4 * reg_width; /* we only care about one reg of + * response, but the sampler always + * writes 4/8 + */ return dest; } @@ -1979,14 +1984,15 @@ fs_visitor::visit(ir_texture *ir) emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6)); fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written); - for (int i = 0; i < inst->regs_written; i++) { + int components = inst->regs_written / (dst.width / 8); + for (int i = 0; i < components; i++) { if (i == 2) { fixed_payload[i] = fixed_depth; } else { fixed_payload[i] = offset(dst, i); } } - emit(LOAD_PAYLOAD(dst, fixed_payload, inst->regs_written)); + emit(LOAD_PAYLOAD(dst, fixed_payload, components)); } } diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index b963bda..5e8c98a 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -795,7 +795,7 @@ fs_instruction_scheduler::calculate_deps() for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file == GRF) { if (post_reg_alloc) { - for (int r = 0; r < reg_width * inst->regs_read(v, i); r++) + for (int r = 0; r < inst->regs_read(v, i); r++) add_dep(last_grf_write[inst->src[i].reg + r], n); } else { for (int r = 0; r < inst->regs_read(v, i); 
r++) { @@ -847,7 +847,7 @@ fs_instruction_scheduler::calculate_deps() /* write-after-write deps. */ if (inst->dst.file == GRF) { if (post_reg_alloc) { - for (int r = 0; r < inst->regs_written * reg_width; r++) { + for (int r = 0; r < inst->regs_written; r++) { add_dep(last_grf_write[inst->dst.reg + r], n); last_grf_write[inst->dst.reg + r] = n; } @@ -923,7 +923,7 @@ fs_instruction_scheduler::calculate_deps() for (int i = 0; i < inst->sources; i++) { if (inst->src[i].file == GRF) { if (post_reg_alloc) { - for (int r = 0; r < reg_width * inst->regs_read(v, i); r++) + for (int r = 0; r < inst->regs_read(v, i); r++) add_dep(n, last_grf_write[inst->src[i].reg + r]); } else { for (int r = 0; r < inst->regs_read(v, i); r++) { @@ -977,7 +977,7 @@ fs_instruction_scheduler::calculate_deps() */ if (inst->dst.file == GRF) { if (post_reg_alloc) { - for (int r = 0; r < inst->regs_written * reg_width; r++) + for (int r = 0; r < inst->regs_written; r++) last_grf_write[inst->dst.reg + r] = n; } else { for (int r = 0; r < inst->regs_written; r++) { @@ -1300,7 +1300,8 @@ fs_instruction_scheduler::choose_instruction_to_schedule() * single-result send is probably actually reducing register * pressure. */ - if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) { + if (inst->regs_written <= inst->dst.width / 8 && + chosen_inst->regs_written > chosen_inst->dst.width / 8) { chosen = n; continue; } else if (inst->regs_written > chosen_inst->regs_written) { |