summary refs log tree commit diff stats
path: root/src/gallium/drivers/vc4
diff options
context:
space:
mode:
author Eric Anholt <eric@anholt.net> 2016-04-27 16:01:24 -0700
committer Eric Anholt <eric@anholt.net> 2016-07-12 17:42:41 -0700
commit f505f66cd5a266dc70ad12e2b015e6c631651aec (patch)
tree 2795dd95fcfc44ecda08712b36ede9cf26bef562 /src/gallium/drivers/vc4
parent ab1d40b84a9812d73411f3499274a6cbf3f25373 (diff)
download external_mesa3d-f505f66cd5a266dc70ad12e2b015e6c631651aec.zip
external_mesa3d-f505f66cd5a266dc70ad12e2b015e6c631651aec.tar.gz
external_mesa3d-f505f66cd5a266dc70ad12e2b015e6c631651aec.tar.bz2
vc4: Add support for storing to NIR registers in a non-SSA fashion.
Previously, there were occasionally NIR registers in our programs, but they were always actually used SSA-only. Now that we're trying to support control flow, we need to actually conditionally move to registers based on whether channels are active or not.
Diffstat (limited to 'src/gallium/drivers/vc4')
-rw-r--r-- src/gallium/drivers/vc4/vc4_program.c 217
-rw-r--r-- src/gallium/drivers/vc4/vc4_qir.h 12
2 files changed, 144 insertions, 85 deletions
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index dec1445..f87a9b2 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -151,6 +151,43 @@ ntq_init_ssa_def(struct vc4_compile *c, nir_ssa_def *def)
return qregs;
}
+/**
+ * Stores one channel of a result value into a NIR destination.
+ *
+ * For an SSA destination, the qreg array for the def is looked up in
+ * c->def_ht (and created with ntq_init_ssa_def() if no entry exists yet),
+ * and slot "chan" of that array is overwritten with "result".
+ *
+ * For a NIR register destination, a MOV from "result" into the register's
+ * existing qreg for "chan" is emitted instead.  When c->execute is set up
+ * (its file is not QFILE_NULL), the flags are set from c->execute and the
+ * MOV is made conditional.
+ *
+ * c:      compile context
+ * dest:   NIR destination being written
+ * chan:   channel index within the destination
+ * result: QIR value to store
+ */
+static void
+ntq_store_dest(struct vc4_compile *c, nir_dest *dest, int chan,
+ struct qreg result)
+{
+ if (dest->is_ssa) {
+ assert(chan < dest->ssa.num_components);
+
+ struct qreg *qregs;
+ struct hash_entry *entry =
+ _mesa_hash_table_search(c->def_ht, &dest->ssa);
+
+ /* Reuse the def's qreg array if some channel was already stored,
+ * otherwise set one up now.
+ */
+ if (entry)
+ qregs = entry->data;
+ else
+ qregs = ntq_init_ssa_def(c, &dest->ssa);
+
+ qregs[chan] = result;
+ } else {
+ nir_register *reg = dest->reg.reg;
+ /* Only direct writes to non-array registers are handled here. */
+ assert(dest->reg.base_offset == 0);
+ assert(reg->num_array_elems == 0);
+ /* NOTE(review): the lookup result is dereferenced without a NULL
+ * check; presumably every nir_register was inserted into def_ht
+ * by ntq_setup_registers() beforehand -- confirm.  Unlike the SSA
+ * path, chan is not range-checked against the register here.
+ */
+ struct hash_entry *entry =
+ _mesa_hash_table_search(c->def_ht, reg);
+ struct qreg *qregs = entry->data;
+
+ /* Conditionally move the result to the destination if the
+ * channel is active.
+ *
+ * NOTE(review): QPU_COND_ZS presumably selects channels where
+ * the flag value set from c->execute was zero -- confirm against
+ * the QPU condition-code definitions.
+ */
+ if (c->execute.file != QFILE_NULL) {
+ qir_SF(c, c->execute);
+ qir_MOV_cond(c, QPU_COND_ZS, qregs[chan], result);
+ } else {
+ qir_MOV_dest(c, qregs[chan], result);
+ }
+ }
+}
+
static struct qreg *
ntq_get_dest(struct vc4_compile *c, nir_dest *dest)
{
@@ -300,7 +337,7 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
struct qreg tex = qir_TEX_RESULT(c);
c->num_texture_samples++;
- struct qreg *dest = ntq_get_dest(c, &instr->dest);
+ struct qreg dest[4];
enum pipe_format format = c->key->tex[unit].format;
if (util_format_is_depth_or_stencil(format)) {
struct qreg scaled = ntq_scale_depth_texture(c, tex);
@@ -310,6 +347,9 @@ ntq_emit_txf(struct vc4_compile *c, nir_tex_instr *instr)
for (int i = 0; i < 4; i++)
dest[i] = qir_UNPACK_8_F(c, tex, i);
}
+
+ for (int i = 0; i < 4; i++)
+ ntq_store_dest(c, &instr->dest, i, dest[i]);
}
static void
@@ -731,10 +771,10 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
if (instr->src[0].swizzle[0] == instr->src[0].swizzle[1] &&
instr->src[0].swizzle[0] == instr->src[0].swizzle[2] &&
instr->src[0].swizzle[0] == instr->src[0].swizzle[3]) {
- struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
- *dest = qir_PACK_8888_F(c,
- ntq_get_src(c, instr->src[0].src,
- instr->src[0].swizzle[0]));
+ struct qreg rep = ntq_get_src(c,
+ instr->src[0].src,
+ instr->src[0].swizzle[0]);
+ ntq_store_dest(c, &instr->dest.dest, 0, qir_PACK_8888_F(c, rep));
return;
}
@@ -764,8 +804,7 @@ ntq_emit_pack_unorm_4x8(struct vc4_compile *c, nir_alu_instr *instr)
qir_PACK_8_F(c, result, src, i);
}
- struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
- *dest = result;
+ ntq_store_dest(c, &instr->dest.dest, 0, result);
}
/** Handles sign-extended bitfield extracts for 16 bits. */
@@ -901,6 +940,9 @@ out:
static void
ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
{
+ /* This should always be lowered to ALU operations for VC4. */
+ assert(!instr->dest.saturate);
+
/* Vectors are special in that they have non-scalarized writemasks,
* and just take the first swizzle channel for each argument in order
* into each writemask channel.
@@ -912,9 +954,8 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
srcs[i] = ntq_get_src(c, instr->src[i].src,
instr->src[i].swizzle[0]);
- struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++)
- dest[i] = srcs[i];
+ ntq_store_dest(c, &instr->dest.dest, i, srcs[i]);
return;
}
@@ -926,10 +967,10 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
if (instr->op == nir_op_unpack_unorm_4x8) {
struct qreg src = ntq_get_src(c, instr->src[0].src,
instr->src[0].swizzle[0]);
- struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
for (int i = 0; i < 4; i++) {
if (instr->dest.write_mask & (1 << i))
- dest[i] = qir_UNPACK_8_F(c, src, i);
+ ntq_store_dest(c, &instr->dest.dest, i,
+ qir_UNPACK_8_F(c, src, i));
}
return;
}
@@ -940,91 +981,87 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
src[i] = ntq_get_alu_src(c, instr, i);
}
- /* Pick the channel to store the output in. */
- assert(!instr->dest.saturate);
- struct qreg *dest = ntq_get_dest(c, &instr->dest.dest);
- assert(util_is_power_of_two(instr->dest.write_mask));
- dest += ffs(instr->dest.write_mask) - 1;
+ struct qreg result;
switch (instr->op) {
case nir_op_fmov:
case nir_op_imov:
- *dest = qir_MOV(c, src[0]);
+ result = qir_MOV(c, src[0]);
break;
case nir_op_fmul:
- *dest = qir_FMUL(c, src[0], src[1]);
+ result = qir_FMUL(c, src[0], src[1]);
break;
case nir_op_fadd:
- *dest = qir_FADD(c, src[0], src[1]);
+ result = qir_FADD(c, src[0], src[1]);
break;
case nir_op_fsub:
- *dest = qir_FSUB(c, src[0], src[1]);
+ result = qir_FSUB(c, src[0], src[1]);
break;
case nir_op_fmin:
- *dest = qir_FMIN(c, src[0], src[1]);
+ result = qir_FMIN(c, src[0], src[1]);
break;
case nir_op_fmax:
- *dest = qir_FMAX(c, src[0], src[1]);
+ result = qir_FMAX(c, src[0], src[1]);
break;
case nir_op_f2i:
case nir_op_f2u:
- *dest = qir_FTOI(c, src[0]);
+ result = qir_FTOI(c, src[0]);
break;
case nir_op_i2f:
case nir_op_u2f:
- *dest = qir_ITOF(c, src[0]);
+ result = qir_ITOF(c, src[0]);
break;
case nir_op_b2f:
- *dest = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
+ result = qir_AND(c, src[0], qir_uniform_f(c, 1.0));
break;
case nir_op_b2i:
- *dest = qir_AND(c, src[0], qir_uniform_ui(c, 1));
+ result = qir_AND(c, src[0], qir_uniform_ui(c, 1));
break;
case nir_op_i2b:
case nir_op_f2b:
qir_SF(c, src[0]);
- *dest = qir_SEL(c, QPU_COND_ZC,
- qir_uniform_ui(c, ~0),
- qir_uniform_ui(c, 0));
+ result = qir_SEL(c, QPU_COND_ZC,
+ qir_uniform_ui(c, ~0),
+ qir_uniform_ui(c, 0));
break;
case nir_op_iadd:
- *dest = qir_ADD(c, src[0], src[1]);
+ result = qir_ADD(c, src[0], src[1]);
break;
case nir_op_ushr:
- *dest = qir_SHR(c, src[0], src[1]);
+ result = qir_SHR(c, src[0], src[1]);
break;
case nir_op_isub:
- *dest = qir_SUB(c, src[0], src[1]);
+ result = qir_SUB(c, src[0], src[1]);
break;
case nir_op_ishr:
- *dest = qir_ASR(c, src[0], src[1]);
+ result = qir_ASR(c, src[0], src[1]);
break;
case nir_op_ishl:
- *dest = qir_SHL(c, src[0], src[1]);
+ result = qir_SHL(c, src[0], src[1]);
break;
case nir_op_imin:
- *dest = qir_MIN(c, src[0], src[1]);
+ result = qir_MIN(c, src[0], src[1]);
break;
case nir_op_imax:
- *dest = qir_MAX(c, src[0], src[1]);
+ result = qir_MAX(c, src[0], src[1]);
break;
case nir_op_iand:
- *dest = qir_AND(c, src[0], src[1]);
+ result = qir_AND(c, src[0], src[1]);
break;
case nir_op_ior:
- *dest = qir_OR(c, src[0], src[1]);
+ result = qir_OR(c, src[0], src[1]);
break;
case nir_op_ixor:
- *dest = qir_XOR(c, src[0], src[1]);
+ result = qir_XOR(c, src[0], src[1]);
break;
case nir_op_inot:
- *dest = qir_NOT(c, src[0]);
+ result = qir_NOT(c, src[0]);
break;
case nir_op_imul:
- *dest = ntq_umul(c, src[0], src[1]);
+ result = ntq_umul(c, src[0], src[1]);
break;
case nir_op_seq:
@@ -1040,90 +1077,90 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
case nir_op_ige:
case nir_op_uge:
case nir_op_ilt:
- if (!ntq_emit_comparison(c, dest, instr, instr)) {
+ if (!ntq_emit_comparison(c, &result, instr, instr)) {
fprintf(stderr, "Bad comparison instruction\n");
}
break;
case nir_op_bcsel:
- *dest = ntq_emit_bcsel(c, instr, src);
+ result = ntq_emit_bcsel(c, instr, src);
break;
case nir_op_fcsel:
qir_SF(c, src[0]);
- *dest = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
+ result = qir_SEL(c, QPU_COND_ZC, src[1], src[2]);
break;
case nir_op_frcp:
- *dest = ntq_rcp(c, src[0]);
+ result = ntq_rcp(c, src[0]);
break;
case nir_op_frsq:
- *dest = ntq_rsq(c, src[0]);
+ result = ntq_rsq(c, src[0]);
break;
case nir_op_fexp2:
- *dest = qir_EXP2(c, src[0]);
+ result = qir_EXP2(c, src[0]);
break;
case nir_op_flog2:
- *dest = qir_LOG2(c, src[0]);
+ result = qir_LOG2(c, src[0]);
break;
case nir_op_ftrunc:
- *dest = qir_ITOF(c, qir_FTOI(c, src[0]));
+ result = qir_ITOF(c, qir_FTOI(c, src[0]));
break;
case nir_op_fceil:
- *dest = ntq_fceil(c, src[0]);
+ result = ntq_fceil(c, src[0]);
break;
case nir_op_ffract:
- *dest = ntq_ffract(c, src[0]);
+ result = ntq_ffract(c, src[0]);
break;
case nir_op_ffloor:
- *dest = ntq_ffloor(c, src[0]);
+ result = ntq_ffloor(c, src[0]);
break;
case nir_op_fsin:
- *dest = ntq_fsin(c, src[0]);
+ result = ntq_fsin(c, src[0]);
break;
case nir_op_fcos:
- *dest = ntq_fcos(c, src[0]);
+ result = ntq_fcos(c, src[0]);
break;
case nir_op_fsign:
- *dest = ntq_fsign(c, src[0]);
+ result = ntq_fsign(c, src[0]);
break;
case nir_op_fabs:
- *dest = qir_FMAXABS(c, src[0], src[0]);
+ result = qir_FMAXABS(c, src[0], src[0]);
break;
case nir_op_iabs:
- *dest = qir_MAX(c, src[0],
+ result = qir_MAX(c, src[0],
qir_SUB(c, qir_uniform_ui(c, 0), src[0]));
break;
case nir_op_ibitfield_extract:
- *dest = ntq_emit_ibfe(c, src[0], src[1], src[2]);
+ result = ntq_emit_ibfe(c, src[0], src[1], src[2]);
break;
case nir_op_ubitfield_extract:
- *dest = ntq_emit_ubfe(c, src[0], src[1], src[2]);
+ result = ntq_emit_ubfe(c, src[0], src[1], src[2]);
break;
case nir_op_usadd_4x8:
- *dest = qir_V8ADDS(c, src[0], src[1]);
+ result = qir_V8ADDS(c, src[0], src[1]);
break;
case nir_op_ussub_4x8:
- *dest = qir_V8SUBS(c, src[0], src[1]);
+ result = qir_V8SUBS(c, src[0], src[1]);
break;
case nir_op_umin_4x8:
- *dest = qir_V8MIN(c, src[0], src[1]);
+ result = qir_V8MIN(c, src[0], src[1]);
break;
case nir_op_umax_4x8:
- *dest = qir_V8MAX(c, src[0], src[1]);
+ result = qir_V8MAX(c, src[0], src[1]);
break;
case nir_op_umul_unorm_4x8:
- *dest = qir_V8MULD(c, src[0], src[1]);
+ result = qir_V8MULD(c, src[0], src[1]);
break;
default:
@@ -1132,6 +1169,13 @@ ntq_emit_alu(struct vc4_compile *c, nir_alu_instr *instr)
fprintf(stderr, "\n");
abort();
}
+
+ /* We have a scalar result, so the instruction should only have a
+ * single channel written to.
+ */
+ assert(util_is_power_of_two(instr->dest.write_mask));
+ ntq_store_dest(c, &instr->dest.dest,
+ ffs(instr->dest.write_mask) - 1, result);
}
static void
@@ -1473,7 +1517,7 @@ ntq_setup_registers(struct vc4_compile *c, struct exec_list *list)
_mesa_hash_table_insert(c->def_ht, nir_reg, qregs);
for (int i = 0; i < array_len * nir_reg->num_components; i++)
- qregs[i] = qir_uniform_ui(c, 0);
+ qregs[i] = qir_get_temp(c);
}
}
@@ -1502,14 +1546,8 @@ ntq_emit_ssa_undef(struct vc4_compile *c, nir_ssa_undef_instr *instr)
static void
ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
{
- const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
nir_const_value *const_offset;
unsigned offset;
- struct qreg *dest = NULL;
-
- if (info->has_dest) {
- dest = ntq_get_dest(c, &instr->dest);
- }
switch (instr->intrinsic) {
case nir_intrinsic_load_uniform:
@@ -1521,36 +1559,43 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
/* We need dwords */
offset = offset / 4;
if (offset < VC4_NIR_STATE_UNIFORM_OFFSET) {
- *dest = qir_uniform(c, QUNIFORM_UNIFORM,
- offset);
+ ntq_store_dest(c, &instr->dest, 0,
+ qir_uniform(c, QUNIFORM_UNIFORM,
+ offset));
} else {
- *dest = qir_uniform(c, offset -
- VC4_NIR_STATE_UNIFORM_OFFSET,
- 0);
+ ntq_store_dest(c, &instr->dest, 0,
+ qir_uniform(c, offset -
+ VC4_NIR_STATE_UNIFORM_OFFSET,
+ 0));
}
} else {
- *dest = indirect_uniform_load(c, instr);
+ ntq_store_dest(c, &instr->dest, 0,
+ indirect_uniform_load(c, instr));
}
break;
case nir_intrinsic_load_user_clip_plane:
for (int i = 0; i < instr->num_components; i++) {
- dest[i] = qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
- instr->const_index[0] * 4 + i);
+ ntq_store_dest(c, &instr->dest, i,
+ qir_uniform(c, QUNIFORM_USER_CLIP_PLANE,
+ instr->const_index[0] * 4 +
+ i));
}
break;
case nir_intrinsic_load_sample_mask_in:
- *dest = qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0);
+ ntq_store_dest(c, &instr->dest, 0,
+ qir_uniform(c, QUNIFORM_SAMPLE_MASK, 0));
break;
case nir_intrinsic_load_front_face:
/* The register contains 0 (front) or 1 (back), and we need to
* turn it into a NIR bool where true means front.
*/
- *dest = qir_ADD(c,
- qir_uniform_ui(c, -1),
- qir_reg(QFILE_FRAG_REV_FLAG, 0));
+ ntq_store_dest(c, &instr->dest, 0,
+ qir_ADD(c,
+ qir_uniform_ui(c, -1),
+ qir_reg(QFILE_FRAG_REV_FLAG, 0)));
break;
case nir_intrinsic_load_input:
@@ -1570,10 +1615,12 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
qir_TLB_COLOR_READ(c);
}
}
- *dest = c->color_reads[sample_index];
+ ntq_store_dest(c, &instr->dest, 0,
+ c->color_reads[sample_index]);
} else {
offset = instr->const_index[0] + const_offset->u32[0];
- *dest = c->inputs[offset];
+ ntq_store_dest(c, &instr->dest, 0,
+ c->inputs[offset]);
}
break;
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index ad784bb..e284ed5 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -408,6 +408,11 @@ struct vc4_compile {
uint32_t num_ubo_ranges;
uint32_t next_ubo_dst_offset;
+ /* State for whether we're executing on each channel currently. 0 if
+ * yes, otherwise a block number + 1 that the channel jumped to.
+ */
+ struct qreg execute;
+
struct qreg line_x, point_x, point_y;
struct qreg discard;
struct qreg payload_FRAG_Z;
@@ -760,6 +765,13 @@ qir_LOAD_IMM(struct vc4_compile *c, uint32_t val)
qir_reg(QFILE_LOAD_IMM, val), c->undef));
}
+/**
+ * Emits a MOV of "src" into "dest" with qir_MOV_dest() and sets the
+ * condition field of the emitted instruction to "cond".
+ */
+static inline void
+qir_MOV_cond(struct vc4_compile *c, uint8_t cond,
+ struct qreg dest, struct qreg src)
+{
+ qir_MOV_dest(c, dest, src)->cond = cond;
+}
+
static inline struct qinst *
qir_BRANCH(struct vc4_compile *c, uint8_t cond)
{