/*
 * Copyright © 2015 Broadcom
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "vc4_qir.h"
#include "compiler/nir/nir_builder.h"
#include "util/u_format.h"

/**
 * Walks the NIR generated by TGSI-to-NIR to lower its io intrinsics into
 * something amenable to the VC4 architecture.
 *
 * Currently, it splits inputs, outputs, and uniforms into scalars, drops any
 * non-position outputs in coordinate shaders, and fixes up the addressing on
 * indirect uniform loads.
 */

/**
 * Replaces all uses of a vec4 intrinsic's destination with a vec4 built from
 * the four scalar values in comps[], then removes the intrinsic.
 *
 * The new vec4 is emitted at the builder's current cursor.
 */
static void
replace_intrinsic_with_vec4(nir_builder *b, nir_intrinsic_instr *intr,
                            nir_ssa_def **comps)
{
        /* Batch things back together into a vec4.  This will get split by
         * the later ALU scalarization pass.
         */
        nir_ssa_def *vec = nir_vec4(b, comps[0], comps[1], comps[2], comps[3]);

        /* Replace the old intrinsic with a reference to our reconstructed
         * vec4.
         */
        nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(vec));
        nir_instr_remove(&intr->instr);
}

/**
 * Returns the 8 bit field number chan of src, extracted with an unsigned
 * bitfield extract.
 */
static nir_ssa_def *
vc4_nir_unpack_8i(nir_builder *b, nir_ssa_def *src, unsigned chan)
{
        return nir_ubitfield_extract(b,
                                     src,
                                     nir_imm_int(b, 8 * chan),
                                     nir_imm_int(b, 8));
}

/** Returns the 16 bit field as a sign-extended 32-bit value. */
static nir_ssa_def *
vc4_nir_unpack_16i(nir_builder *b, nir_ssa_def *src, unsigned chan)
{
        return nir_ibitfield_extract(b,
                                     src,
                                     nir_imm_int(b, 16 * chan),
                                     nir_imm_int(b, 16));
}

/**
 * Returns the 16 bit field as an unsigned 32 bit value.
 *
 * chan == 0 masks off the low half; any other value shifts the high half
 * down, so callers are expected to pass 0 or 1.
 */
static nir_ssa_def *
vc4_nir_unpack_16u(nir_builder *b, nir_ssa_def *src, unsigned chan)
{
        if (chan == 0) {
                return nir_iand(b, src, nir_imm_int(b, 0xffff));
        } else {
                return nir_ushr(b, src, nir_imm_int(b, 16));
        }
}

/** Returns channel chan of src unpacked as four 8-bit unorm values. */
static nir_ssa_def *
vc4_nir_unpack_8f(nir_builder *b, nir_ssa_def *src, unsigned chan)
{
        return nir_channel(b, nir_unpack_unorm_4x8(b, src), chan);
}

/**
 * Builds the float value of one vertex attribute channel from the raw dword
 * VPM reads of that attribute.
 *
 * \param c          compile context (not used by this function)
 * \param b          builder positioned where the conversion should be emitted
 * \param vpm_reads  dword loads covering the attribute's data
 * \param swiz       format swizzle selecting the source channel
 * \param desc       description of the attribute's format
 *
 * \return the converted channel, or NULL if the channel's size/type
 *         combination is not handled here.
 */
static nir_ssa_def *
vc4_nir_get_vattr_channel_vpm(struct vc4_compile *c,
                              nir_builder *b,
                              nir_ssa_def **vpm_reads,
                              uint8_t swiz,
                              const struct util_format_description *desc)
{
        nir_ssa_def *temp;

        /* Swizzles beyond W don't name a channel of the format, so hand
         * them off before indexing desc->channel[] with them.
         */
        if (swiz > UTIL_FORMAT_SWIZZLE_W)
                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);

        const struct util_format_channel_description *chan =
                &desc->channel[swiz];

        if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_FLOAT) {
                return vc4_nir_get_swizzled_channel(b, vpm_reads, swiz);
        } else if (chan->size == 32 && chan->type == UTIL_FORMAT_TYPE_SIGNED) {
                if (chan->normalized) {
                        return nir_fmul(b,
                                        nir_i2f(b, vpm_reads[swiz]),
                                        nir_imm_float(b,
                                                      1.0 / 0x7fffffff));
                } else {
                        return nir_i2f(b, vpm_reads[swiz]);
                }
        } else if (chan->size == 8 &&
                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
                /* All of the 8-bit channels come from the first dword. */
                nir_ssa_def *vpm = vpm_reads[0];

                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
                        /* Flip the top bit of every byte so that the signed
                         * values become unsigned values biased by 128.  The
                         * bias is removed again after the unsigned unpack.
                         */
                        temp = nir_ixor(b, vpm, nir_imm_int(b, 0x80808080));
                        if (chan->normalized) {
                                return nir_fsub(b,
                                                nir_fmul(b,
                                                         vc4_nir_unpack_8f(b, temp, swiz),
                                                         nir_imm_float(b, 2.0)),
                                                nir_imm_float(b, 1.0));
                        } else {
                                return nir_fadd(b,
                                                nir_i2f(b,
                                                        vc4_nir_unpack_8i(b, temp,
                                                                          swiz)),
                                                nir_imm_float(b, -128.0));
                        }
                } else {
                        if (chan->normalized) {
                                return vc4_nir_unpack_8f(b, vpm, swiz);
                        } else {
                                return nir_i2f(b, vc4_nir_unpack_8i(b, vpm, swiz));
                        }
                }
        } else if (chan->size == 16 &&
                   (chan->type == UTIL_FORMAT_TYPE_UNSIGNED ||
                    chan->type == UTIL_FORMAT_TYPE_SIGNED)) {
                /* Two 16-bit channels are packed into each dword. */
                nir_ssa_def *vpm = vpm_reads[swiz / 2];

                /* Note that UNPACK_16F eats a half float, not ints, so we use
                 * UNPACK_16_I for all of these.
                 */
                if (chan->type == UTIL_FORMAT_TYPE_SIGNED) {
                        temp = nir_i2f(b, vc4_nir_unpack_16i(b, vpm, swiz & 1));
                        if (chan->normalized) {
                                return nir_fmul(b, temp,
                                                nir_imm_float(b, 1/32768.0f));
                        } else {
                                return temp;
                        }
                } else {
                        temp = nir_i2f(b, vc4_nir_unpack_16u(b, vpm, swiz & 1));
                        if (chan->normalized) {
                                return nir_fmul(b, temp,
                                                nir_imm_float(b, 1 / 65535.0));
                        } else {
                                return temp;
                        }
                }
        } else {
                return NULL;
        }
}

/**
 * Lowers a vec4 vertex attribute load into scalar dword loads plus the
 * conversion from the attribute's format to floats.
 *
 * Channels whose format is not handled are replaced with 0.0, and a warning
 * is printed to stderr once per lowered load.
 */
static void
vc4_nir_lower_vertex_attr(struct vc4_compile *c, nir_builder *b,
                          nir_intrinsic_instr *intr)
{
        b->cursor = nir_before_instr(&intr->instr);

        int attr = intr->const_index[0];
        enum pipe_format format = c->vs_key->attr_formats[attr];
        uint32_t attr_size = util_format_get_blocksize(format);

        /* All TGSI-to-NIR inputs are vec4. */
        assert(intr->num_components == 4);

        /* We only accept direct inputs and TGSI only ever gives them to us
         * with an offset value of 0.
         */
        assert(nir_src_as_const_value(intr->src[0]) &&
               nir_src_as_const_value(intr->src[0])->u32[0] == 0);

        /* Generate dword loads for the VPM values.  (Since these intrinsics
         * may be reordered, the actual reads will be generated at the top of
         * the shader by ntq_setup_inputs().)
         */
        nir_ssa_def *vpm_reads[4];
        const uint32_t num_vpm_reads = align(attr_size, 4) / 4;

        /* The loop below must not write past the end of vpm_reads[]. */
        assert(num_vpm_reads <= sizeof(vpm_reads) / sizeof(vpm_reads[0]));

        for (uint32_t i = 0; i < num_vpm_reads; i++) {
                nir_intrinsic_instr *intr_comp =
                        nir_intrinsic_instr_create(c->s,
                                                   nir_intrinsic_load_input);
                intr_comp->num_components = 1;
                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
                intr_comp->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest,
                                  1, 32, NULL);
                nir_builder_instr_insert(b, &intr_comp->instr);

                vpm_reads[i] = &intr_comp->dest.ssa;
        }

        bool format_warned = false;
        const struct util_format_description *desc =
                util_format_description(format);

        nir_ssa_def *dests[4];
        for (int i = 0; i < 4; i++) {
                uint8_t swiz = desc->swizzle[i];
                dests[i] = vc4_nir_get_vattr_channel_vpm(c, b, vpm_reads, swiz,
                                                         desc);

                if (!dests[i]) {
                        if (!format_warned) {
                                fprintf(stderr,
                                        "vtx element %d unsupported type: %s\n",
                                        attr, util_format_name(format));
                                format_warned = true;
                        }
                        dests[i] = nir_imm_float(b, 0.0);
                }
        }

        replace_intrinsic_with_vec4(b, intr, dests);
}

/**
 * Lowers a vec4 fragment shader input load into four scalar loads, with
 * fixups for the face input and for point sprite coordinates.
 *
 * Loads in the VC4_NIR_TLB_COLOR_READ_INPUT range are left untouched.
 */
static void
vc4_nir_lower_fs_input(struct vc4_compile *c, nir_builder *b,
                       nir_intrinsic_instr *intr)
{
        b->cursor = nir_before_instr(&intr->instr);

        if (intr->const_index[0] >= VC4_NIR_TLB_COLOR_READ_INPUT &&
            intr->const_index[0] < (VC4_NIR_TLB_COLOR_READ_INPUT +
                                    VC4_MAX_SAMPLES)) {
                /* This doesn't need any lowering. */
                return;
        }

        /* Find the shader variable this load refers to. */
        nir_variable *input_var = NULL;
        nir_foreach_variable(var, &c->s->inputs) {
                if (var->data.driver_location == intr->const_index[0]) {
                        input_var = var;
                        break;
                }
        }
        assert(input_var);

        /* All TGSI-to-NIR inputs are vec4. */
        assert(intr->num_components == 4);

        /* We only accept direct inputs and TGSI only ever gives them to us
         * with an offset value of 0.
         */
        assert(nir_src_as_const_value(intr->src[0]) &&
               nir_src_as_const_value(intr->src[0])->u32[0] == 0);

        /* Generate scalar loads equivalent to the original VEC4. */
        nir_ssa_def *dests[4];
        for (unsigned i = 0; i < intr->num_components; i++) {
                nir_intrinsic_instr *intr_comp =
                        nir_intrinsic_instr_create(c->s,
                                                   nir_intrinsic_load_input);
                intr_comp->num_components = 1;
                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;
                intr_comp->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest,
                                  1, 32, NULL);
                nir_builder_instr_insert(b, &intr_comp->instr);

                dests[i] = &intr_comp->dest.ssa;
        }

        if (input_var->data.location == VARYING_SLOT_FACE) {
                /* Produce (1.0 - 2.0 * x, 0.0, 0.0, 1.0), where x is the
                 * first loaded component converted from int to float.
                 */
                dests[0] = nir_fsub(b,
                                    nir_imm_float(b, 1.0),
                                    nir_fmul(b,
                                             nir_i2f(b, dests[0]),
                                             nir_imm_float(b, 2.0)));
                dests[1] = nir_imm_float(b, 0.0);
                dests[2] = nir_imm_float(b, 0.0);
                dests[3] = nir_imm_float(b, 1.0);
        } else if (input_var->data.location >= VARYING_SLOT_VAR0) {
                /* Use an unsigned constant so that the shift stays defined
                 * even when it reaches bit 31.
                 */
                if (c->fs_key->point_sprite_mask &
                    (1u << (input_var->data.location -
                            VARYING_SLOT_VAR0))) {
                        if (!c->fs_key->is_points) {
                                dests[0] = nir_imm_float(b, 0.0);
                                dests[1] = nir_imm_float(b, 0.0);
                        }
                        if (c->fs_key->point_coord_upper_left) {
                                dests[1] = nir_fsub(b,
                                                    nir_imm_float(b, 1.0),
                                                    dests[1]);
                        }
                        dests[2] = nir_imm_float(b, 0.0);
                        dests[3] = nir_imm_float(b, 1.0);
                }
        }

        replace_intrinsic_with_vec4(b, intr, dests);
}

/**
 * Lowers a vec4 output store into four scalar stores.
 *
 * In coordinate shaders, stores to anything other than position and point
 * size are removed.  Fragment shader color, data0 and sample mask stores are
 * kept as they are, apart from scaling their index by 4.
 */
static void
vc4_nir_lower_output(struct vc4_compile *c, nir_builder *b,
                     nir_intrinsic_instr *intr)
{
        /* Find the shader variable this store refers to. */
        nir_variable *output_var = NULL;
        nir_foreach_variable(var, &c->s->outputs) {
                if (var->data.driver_location == intr->const_index[0]) {
                        output_var = var;
                        break;
                }
        }
        assert(output_var);

        if (c->stage == QSTAGE_COORD &&
            output_var->data.location != VARYING_SLOT_POS &&
            output_var->data.location != VARYING_SLOT_PSIZ) {
                nir_instr_remove(&intr->instr);
                return;
        }

        /* Color output is lowered by vc4_nir_lower_blend(). */
        if (c->stage == QSTAGE_FRAG &&
            (output_var->data.location == FRAG_RESULT_COLOR ||
             output_var->data.location == FRAG_RESULT_DATA0 ||
             output_var->data.location == FRAG_RESULT_SAMPLE_MASK)) {
                intr->const_index[0] *= 4;
                return;
        }

        /* All TGSI-to-NIR outputs are VEC4. */
        assert(intr->num_components == 4);

        /* We only accept direct outputs and TGSI only ever gives them to us
         * with an offset value of 0.
         */
        assert(nir_src_as_const_value(intr->src[1]) &&
               nir_src_as_const_value(intr->src[1])->u32[0] == 0);

        /* The stored value is picked apart with nir_channel() below. */
        assert(intr->src[0].is_ssa);

        b->cursor = nir_before_instr(&intr->instr);

        for (unsigned i = 0; i < intr->num_components; i++) {
                nir_intrinsic_instr *intr_comp =
                        nir_intrinsic_instr_create(c->s,
                                                   nir_intrinsic_store_output);
                intr_comp->num_components = 1;
                intr_comp->const_index[0] = intr->const_index[0] * 4 + i;

                intr_comp->src[0] =
                        nir_src_for_ssa(nir_channel(b, intr->src[0].ssa, i));
                intr_comp->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
                nir_builder_instr_insert(b, &intr_comp->instr);
        }

        nir_instr_remove(&intr->instr);
}

/**
 * Lowers a vec4 uniform load into four scalar loads addressed in bytes.
 *
 * Loads that are already single-component are left alone.
 */
static void
vc4_nir_lower_uniform(struct vc4_compile *c, nir_builder *b,
                      nir_intrinsic_instr *intr)
{
        /* All TGSI-to-NIR uniform loads are vec4, but we need byte offsets
         * in the backend.
         */
        if (intr->num_components == 1)
                return;
        assert(intr->num_components == 4);

        b->cursor = nir_before_instr(&intr->instr);

        /* Generate scalar loads equivalent to the original VEC4. */
        nir_ssa_def *dests[4];
        for (unsigned i = 0; i < intr->num_components; i++) {
                nir_intrinsic_instr *intr_comp =
                        nir_intrinsic_instr_create(c->s, intr->intrinsic);
                intr_comp->num_components = 1;
                nir_ssa_dest_init(&intr_comp->instr, &intr_comp->dest,
                                  1, 32, NULL);

                /* Convert the uniform offset to bytes.  If it happens to be
                 * a constant, constant-folding will clean up the shift for
                 * us.
                 */
                intr_comp->const_index[0] = (intr->const_index[0] * 16 +
                                             i * 4);

                intr_comp->src[0] =
                        nir_src_for_ssa(nir_ishl(b, intr->src[0].ssa,
                                                 nir_imm_int(b, 4)));

                dests[i] = &intr_comp->dest.ssa;

                nir_builder_instr_insert(b, &intr_comp->instr);
        }

        replace_intrinsic_with_vec4(b, intr, dests);
}

/**
 * Dispatches one instruction to the matching lowering helper.
 *
 * Only input loads, output stores and uniform loads are lowered; every other
 * instruction is left untouched.
 */
static void
vc4_nir_lower_io_instr(struct vc4_compile *c, nir_builder *b,
                       struct nir_instr *instr)
{
        if (instr->type != nir_instr_type_intrinsic)
                return;
        nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);

        switch (intr->intrinsic) {
        case nir_intrinsic_load_input:
                /* Input loads outside the fragment stage are treated as
                 * vertex attribute fetches.
                 */
                if (c->stage == QSTAGE_FRAG)
                        vc4_nir_lower_fs_input(c, b, intr);
                else
                        vc4_nir_lower_vertex_attr(c, b, intr);
                break;

        case nir_intrinsic_store_output:
                vc4_nir_lower_output(c, b, intr);
                break;

        case nir_intrinsic_load_uniform:
                vc4_nir_lower_uniform(c, b, intr);
                break;

        case nir_intrinsic_load_user_clip_plane:
        default:
                break;
        }
}

/**
 * Block callback: lowers every instruction in the block.
 *
 * arg is the struct vc4_compile for the shader.  Always returns true.
 */
static bool
vc4_nir_lower_io_block(nir_block *block, void *arg)
{
        struct vc4_compile *c = arg;
        nir_function_impl *impl =
                nir_cf_node_get_function(&block->cf_node);

        nir_builder b;
        nir_builder_init(&b, impl);

        /* The _safe iterator is needed because lowering removes the
         * instruction being visited.
         */
        nir_foreach_instr_safe(block, instr)
                vc4_nir_lower_io_instr(c, &b, instr);

        return true;
}

/**
 * Runs the lowering over every block of one function implementation and
 * marks block indices and dominance as preserved.  Always returns true.
 */
static bool
vc4_nir_lower_io_impl(struct vc4_compile *c, nir_function_impl *impl)
{
        nir_foreach_block(impl, vc4_nir_lower_io_block, c);

        nir_metadata_preserve(impl, nir_metadata_block_index |
                              nir_metadata_dominance);

        return true;
}

/**
 * Entry point: lowers the io intrinsics of every function in the shader
 * that has an implementation.
 */
void
vc4_nir_lower_io(nir_shader *s, struct vc4_compile *c)
{
        nir_foreach_function(s, function) {
                if (function->impl)
                        vc4_nir_lower_io_impl(c, function->impl);
        }
}