diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/gallium/drivers/vc4/Makefile.sources | 1 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c | 17 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_opt_algebraic.c | 26 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_opt_small_immediates.c | 105 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qir.c | 13 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qir.h | 7 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu.c | 104 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu.h | 1 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_defines.h | 7 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_disasm.c | 4 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_emit.c | 35 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_schedule.c | 8 | ||||
-rw-r--r-- | src/gallium/drivers/vc4/vc4_qpu_validate.c | 1 |
13 files changed, 283 insertions, 46 deletions
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index 6bcb731..1f8e8c4 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -15,6 +15,7 @@ C_SOURCES := \ vc4_opt_copy_propagation.c \ vc4_opt_cse.c \ vc4_opt_dead_code.c \ + vc4_opt_small_immediates.c \ vc4_packet.h \ vc4_program.c \ vc4_qir.c \ diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c index f5e152b..48bc683 100644 --- a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c +++ b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c @@ -133,12 +133,18 @@ check_tmu_write(uint64_t inst, int tmu = waddr > QPU_W_TMU0_B; bool submit = is_tmu_submit(waddr); bool is_direct = submit && validation_state->tmu_write_count[tmu] == 0; + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); if (is_direct) { uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A); uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B); uint32_t clamp_offset = ~0; + if (sig == QPU_SIG_SMALL_IMM) { + DRM_ERROR("direct TMU read used small immediate\n"); + return false; + } + /* Make sure that this texture load is an add of the base * address of the UBO to a clamped offset within the UBO. */ @@ -180,7 +186,8 @@ check_tmu_write(uint64_t inst, validation_state->tmu_setup[tmu].is_direct = true; } else { - if (raddr_a == QPU_R_UNIF || raddr_b == QPU_R_UNIF) { + if (raddr_a == QPU_R_UNIF || (sig != QPU_SIG_SMALL_IMM && + raddr_b == QPU_R_UNIF)) { DRM_ERROR("uniform read in the same instruction as " "texture setup.\n"); return false; @@ -298,6 +305,7 @@ track_live_clamps(uint64_t inst, uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B); uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); bool is_b = inst & QPU_WS; uint32_t live_reg_index; @@ -305,7 +313,8 @@ track_live_clamps(uint64_t inst, return; if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) && - !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF)) { + !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF && + sig != QPU_SIG_SMALL_IMM)) { return; } @@ -344,9 +353,10 @@ check_instruction_reads(uint64_t inst, { uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); if (raddr_a == QPU_R_UNIF || - raddr_b == QPU_R_UNIF) { + (raddr_b == QPU_R_UNIF && sig != QPU_SIG_SMALL_IMM)) { /* This can't overflow the uint32_t, because we're reading 8 * bytes of instruction to increment by 4 here, so we'd * already be OOM. @@ -401,6 +411,7 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj, case QPU_SIG_LOAD_TMU0: case QPU_SIG_LOAD_TMU1: case QPU_SIG_PROG_END: + case QPU_SIG_SMALL_IMM: if (!check_instruction_writes(inst, validated_shader, &validation_state)) { DRM_ERROR("Bad write at ip %d\n", ip); diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c index 4376c7b..d36bb2d 100644 --- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c +++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c @@ -60,23 +60,33 @@ dump_to(struct vc4_compile *c, struct qinst *inst) } static bool +is_constant_value(struct vc4_compile *c, struct qinst **defs, struct qreg reg, + uint32_t val) +{ + if (reg.file == QFILE_UNIF && + c->uniform_contents[reg.index] == QUNIFORM_CONSTANT && + c->uniform_data[reg.index] == val) { + return true; + } + + if (reg.file == QFILE_SMALL_IMM && reg.index == val) + return true; + + return false; +} + +static bool is_zero(struct vc4_compile *c, struct qinst **defs, struct qreg reg) { reg = qir_follow_movs(defs, reg); - - return (reg.file == QFILE_UNIF && - c->uniform_contents[reg.index] == QUNIFORM_CONSTANT && - c->uniform_data[reg.index] == 0); + return is_constant_value(c, defs, reg, 0); } static bool is_1f(struct vc4_compile *c, struct qinst **defs, struct qreg reg) { reg = qir_follow_movs(defs, reg); - - return (reg.file == QFILE_UNIF && - c->uniform_contents[reg.index] == QUNIFORM_CONSTANT && - c->uniform_data[reg.index] == fui(1.0)); + return is_constant_value(c, defs, reg, fui(1.0)); } static void diff --git a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c new file mode 100644 index 0000000..8b98ce3 --- /dev/null +++ b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c @@ -0,0 +1,105 @@ +/* + * Copyright © 2014 Broadcom + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +/** + * @file vc4_opt_small_immediates.c + * + * Turns references to small constant uniform values into small immediates + * fields. + */ + +#include "vc4_qir.h" +#include "vc4_qpu.h" + +static bool debug; + +bool +qir_opt_small_immediates(struct vc4_compile *c) +{ + bool progress = false; + struct simple_node *node; + struct qinst *defs[c->num_temps]; + + foreach(node, &c->instructions) { + struct qinst *inst = (struct qinst *)node; + + if (inst->dst.file == QFILE_TEMP) + defs[inst->dst.index] = inst; + + /* The small immediate value sits in the raddr B field, so we + * can't have 2 small immediates in one instruction (unless + * they're the same value, but that should be optimized away + * elsewhere). + */ + bool uses_small_imm = false; + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + if (inst->src[i].file == QFILE_SMALL_IMM) + uses_small_imm = true; + } + if (uses_small_imm) + continue; + + for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { + struct qreg src = qir_follow_movs(defs, inst->src[i]); + + if (src.file != QFILE_UNIF || + c->uniform_contents[src.index] != + QUNIFORM_CONSTANT) { + continue; + } + + if (i == 1 && + (inst->op == QOP_TEX_S || + inst->op == QOP_TEX_T || + inst->op == QOP_TEX_R || + inst->op == QOP_TEX_B)) { + /* No turning the implicit uniform read into + * an immediate. + */ + continue; + } + + uint32_t imm = c->uniform_data[src.index]; + uint32_t small_imm = qpu_encode_small_immediate(imm); + if (small_imm == ~0) + continue; + + if (debug) { + fprintf(stderr, "opt_small_immediate() from: "); + qir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + inst->src[i].file = QFILE_SMALL_IMM; + inst->src[i].index = imm; + if (debug) { + fprintf(stderr, "to: "); + qir_dump_inst(c, inst); + fprintf(stderr, "\n"); + } + progress = true; + break; + } + } + + return progress; +} diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index d7251ab..8cb9826 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -204,16 +204,22 @@ qir_reads_r4(struct qinst *inst) static void qir_print_reg(struct vc4_compile *c, struct qreg reg) { - const char *files[] = { + static const char *files[] = { [QFILE_TEMP] = "t", [QFILE_VARY] = "v", [QFILE_UNIF] = "u", }; - if (reg.file == QFILE_NULL) + if (reg.file == QFILE_NULL) { fprintf(stderr, "null"); - else + } else if (reg.file == QFILE_SMALL_IMM) { + if ((int)reg.index >= -16 && (int)reg.index <= 15) + fprintf(stderr, "%d", reg.index); + else + fprintf(stderr, "%f", uif(reg.index)); + } else { fprintf(stderr, "%s%d", files[reg.file], reg.index); + } if (reg.file == QFILE_UNIF && c->uniform_contents[reg.index] == QUNIFORM_CONSTANT) { @@ -386,6 +392,7 @@ qir_optimize(struct vc4_compile *c) OPTPASS(qir_opt_cse); OPTPASS(qir_opt_copy_propagation); OPTPASS(qir_opt_dead_code); + OPTPASS(qir_opt_small_immediates); if (!progress) break; diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index 40c0d3d..db0a436 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -38,6 +38,12 @@ enum qfile { QFILE_TEMP, QFILE_VARY, QFILE_UNIF, + + /** + * Stores an immediate value in the index field that can be turned + * into a small immediate field by qpu_encode_small_immediate(). + */ + QFILE_SMALL_IMM, }; struct qreg { @@ -382,6 +388,7 @@ bool qir_opt_algebraic(struct vc4_compile *c); bool qir_opt_copy_propagation(struct vc4_compile *c); bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); +bool qir_opt_small_immediates(struct vc4_compile *c); void qpu_schedule_instructions(struct vc4_compile *c); diff --git a/src/gallium/drivers/vc4/vc4_qpu.c b/src/gallium/drivers/vc4/vc4_qpu.c index 52c06ae..7e38ede 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.c +++ b/src/gallium/drivers/vc4/vc4_qpu.c @@ -26,6 +26,9 @@ #include "vc4_qir.h" #include "vc4_qpu.h" +#define QPU_MUX(mux, muxfield) \ + QPU_SET_FIELD(mux != QPU_MUX_SMALL_IMM ? mux : QPU_MUX_B, muxfield) + static uint64_t set_src_raddr(uint64_t inst, struct qpu_reg src) { @@ -36,11 +39,23 @@ set_src_raddr(uint64_t inst, struct qpu_reg src) } if (src.mux == QPU_MUX_B) { - assert(QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_NOP || - QPU_GET_FIELD(inst, QPU_RADDR_B) == src.addr); + assert((QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_NOP || + QPU_GET_FIELD(inst, QPU_RADDR_B) == src.addr) && + QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM); return QPU_UPDATE_FIELD(inst, src.addr, QPU_RADDR_B); } + if (src.mux == QPU_MUX_SMALL_IMM) { + if (QPU_GET_FIELD(inst, QPU_SIG) == QPU_SIG_SMALL_IMM) { + assert(QPU_GET_FIELD(inst, QPU_RADDR_B) == src.addr); + } else { + inst = qpu_set_sig(inst, QPU_SIG_SMALL_IMM); + assert(QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_NOP); + } + return ((inst & ~QPU_RADDR_B_MASK) | + QPU_SET_FIELD(src.addr, QPU_RADDR_B)); + } + return inst; } @@ -101,15 +116,15 @@ qpu_a_MOV(struct qpu_reg dst, struct qpu_reg src) { uint64_t inst = 0; + inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG); inst |= QPU_SET_FIELD(QPU_A_OR, QPU_OP_ADD); inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A); inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B); inst |= qpu_a_dst(dst); inst |= QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_ADD); - inst |= QPU_SET_FIELD(src.mux, QPU_ADD_A); - inst |= QPU_SET_FIELD(src.mux, QPU_ADD_B); + inst |= QPU_MUX(src.mux, QPU_ADD_A); + inst |= QPU_MUX(src.mux, QPU_ADD_B); inst = set_src_raddr(inst, src); - inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG); inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL); return inst; @@ -120,15 +135,15 @@ qpu_m_MOV(struct qpu_reg dst, struct qpu_reg src) { uint64_t inst = 0; + inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG); inst |= QPU_SET_FIELD(QPU_M_V8MIN, QPU_OP_MUL); inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A); inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B); inst |= qpu_m_dst(dst); inst |= QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_MUL); - inst |= QPU_SET_FIELD(src.mux, QPU_MUL_A); - inst |= QPU_SET_FIELD(src.mux, QPU_MUL_B); + inst |= QPU_MUX(src.mux, QPU_MUL_A); + inst |= QPU_MUX(src.mux, QPU_MUL_B); inst = set_src_raddr(inst, src); - inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG); inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD); return inst; @@ -155,16 +170,16 @@ qpu_a_alu2(enum qpu_op_add op, { uint64_t inst = 0; + inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG); inst |= QPU_SET_FIELD(op, QPU_OP_ADD); inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A); inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B); inst |= qpu_a_dst(dst); inst |= QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_ADD); - inst |= QPU_SET_FIELD(src0.mux, QPU_ADD_A); + inst |= QPU_MUX(src0.mux, QPU_ADD_A); inst = set_src_raddr(inst, src0); - inst |= QPU_SET_FIELD(src1.mux, QPU_ADD_B); + inst |= QPU_MUX(src1.mux, QPU_ADD_B); inst = set_src_raddr(inst, src1); - inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG); inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_MUL); return inst; @@ -176,16 +191,16 @@ qpu_m_alu2(enum qpu_op_mul op, { uint64_t inst = 0; + inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG); inst |= QPU_SET_FIELD(op, QPU_OP_MUL); inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_A); inst |= QPU_SET_FIELD(QPU_R_NOP, QPU_RADDR_B); inst |= qpu_m_dst(dst); inst |= QPU_SET_FIELD(QPU_COND_ALWAYS, QPU_COND_MUL); - inst |= QPU_SET_FIELD(src0.mux, QPU_MUL_A); + inst |= QPU_MUX(src0.mux, QPU_MUL_A); inst = set_src_raddr(inst, src0); - inst |= QPU_SET_FIELD(src1.mux, QPU_MUL_B); + inst |= QPU_MUX(src1.mux, QPU_MUL_B); inst = set_src_raddr(inst, src1); - inst |= QPU_SET_FIELD(QPU_SIG_NONE, QPU_SIG); inst |= QPU_SET_FIELD(QPU_W_NOP, QPU_WADDR_ADD); return inst; @@ -243,7 +258,8 @@ qpu_num_sf_accesses(uint64_t inst) if (raddr_a == QPU_R_MUTEX_ACQUIRE) accesses++; - if (raddr_b == QPU_R_MUTEX_ACQUIRE) + if (raddr_b == QPU_R_MUTEX_ACQUIRE && + QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) accesses++; /* XXX: semaphore, combined color read/write? */ @@ -383,6 +399,8 @@ qpu_merge_inst(uint64_t a, uint64_t b) { uint64_t merge = a | b; bool ok = true; + uint32_t a_sig = QPU_GET_FIELD(a, QPU_SIG); + uint32_t b_sig = QPU_GET_FIELD(b, QPU_SIG); if (QPU_GET_FIELD(a, QPU_OP_ADD) != QPU_A_NOP && QPU_GET_FIELD(b, QPU_OP_ADD) != QPU_A_NOP) { @@ -402,8 +420,10 @@ qpu_merge_inst(uint64_t a, uint64_t b) if (qpu_num_sf_accesses(a) && qpu_num_sf_accesses(b)) return 0; - if (QPU_GET_FIELD(a, QPU_SIG) == QPU_SIG_LOAD_IMM || - QPU_GET_FIELD(b, QPU_SIG) == QPU_SIG_LOAD_IMM) { + if (a_sig == QPU_SIG_LOAD_IMM || + b_sig == QPU_SIG_LOAD_IMM || + a_sig == QPU_SIG_SMALL_IMM || + b_sig == QPU_SIG_SMALL_IMM) { return 0; } @@ -501,6 +521,56 @@ qpu_inst_is_tlb(uint64_t inst) sig == QPU_SIG_WAIT_FOR_SCOREBOARD); } +/** + * Returns the small immediate value to be encoded in to the raddr b field if + * the argument can be represented as one, or ~0 otherwise. + */ +uint32_t +qpu_encode_small_immediate(uint32_t i) +{ + if (i <= 15) + return i; + if ((int)i < 0 && (int)i >= -16) + return i + 32; + + switch (i) { + case 0x3f800000: + return 32; + case 0x40000000: + return 33; + case 0x40800000: + return 34; + case 0x41000000: + return 35; + case 0x41800000: + return 36; + case 0x42000000: + return 37; + case 0x42800000: + return 38; + case 0x43000000: + return 39; + case 0x3b800000: + return 40; + case 0x3c000000: + return 41; + case 0x3c800000: + return 42; + case 0x3d000000: + return 43; + case 0x3d800000: + return 44; + case 0x3e000000: + return 45; + case 0x3e800000: + return 46; + case 0x3f000000: + return 47; + } + + return ~0; +} + void qpu_serialize_one_inst(struct vc4_compile *c, uint64_t inst) { diff --git a/src/gallium/drivers/vc4/vc4_qpu.h b/src/gallium/drivers/vc4/vc4_qpu.h index e1307eb..c9ab634 100644 --- a/src/gallium/drivers/vc4/vc4_qpu.h +++ b/src/gallium/drivers/vc4/vc4_qpu.h @@ -134,6 +134,7 @@ uint64_t qpu_load_imm_ui(struct qpu_reg dst, uint32_t val); uint64_t qpu_set_sig(uint64_t inst, uint32_t sig); uint64_t qpu_set_cond_add(uint64_t inst, uint32_t cond); uint64_t qpu_set_cond_mul(uint64_t inst, uint32_t cond); +uint32_t qpu_encode_small_immediate(uint32_t i); bool qpu_waddr_is_tlb(uint32_t waddr); bool qpu_inst_is_tlb(uint64_t inst); diff --git a/src/gallium/drivers/vc4/vc4_qpu_defines.h b/src/gallium/drivers/vc4/vc4_qpu_defines.h index a965b96..eb3dfb3 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_defines.h +++ b/src/gallium/drivers/vc4/vc4_qpu_defines.h @@ -147,8 +147,11 @@ enum qpu_mux { QPU_MUX_A, QPU_MUX_B, - /* non-hardware mux values */ - QPU_MUX_IMM, + /** + * Non-hardware mux value, stores a small immediate field to be + * programmed into raddr_b in the qpu_reg.index. + */ + QPU_MUX_SMALL_IMM, }; enum qpu_cond { diff --git a/src/gallium/drivers/vc4/vc4_qpu_disasm.c b/src/gallium/drivers/vc4/vc4_qpu_disasm.c index b87205a..55e0e61 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_disasm.c +++ b/src/gallium/drivers/vc4/vc4_qpu_disasm.c @@ -291,9 +291,9 @@ print_alu_src(uint64_t inst, uint32_t mux) else if (si <= 39) fprintf(stderr, "%.1f", (float)(1 << (si - 32))); else if (si <= 47) - fprintf(stderr, "%f", 1.0f / (256 / (si - 39))); + fprintf(stderr, "%f", 1.0f / (1 << (48 - si))); else - fprintf(stderr, "???"); + fprintf(stderr, "<bad imm %d>", si); } else if (raddr <= 31) fprintf(stderr, "r%s%d", file, raddr); else { diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c index 530ec8b..35300ff 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_emit.c +++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c @@ -74,11 +74,15 @@ swap_file(struct qpu_reg *src) switch (src->addr) { case QPU_R_UNIF: case QPU_R_VARY: - if (src->mux == QPU_MUX_A) - src->mux = QPU_MUX_B; - else - src->mux = QPU_MUX_A; - return true; + if (src->mux == QPU_MUX_SMALL_IMM) { + return false; + } else { + if (src->mux == QPU_MUX_A) + src->mux = QPU_MUX_B; + else + src->mux = QPU_MUX_A; + return true; + } default: return false; @@ -100,16 +104,20 @@ fixup_raddr_conflict(struct vc4_compile *c, struct qpu_reg *src0, struct qpu_reg *src1, bool r3_live) { - if ((src0->mux != QPU_MUX_A && src0->mux != QPU_MUX_B) || - src0->mux != src1->mux || - src0->addr == src1->addr) { + uint32_t mux0 = src0->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src0->mux; + uint32_t mux1 = src1->mux == QPU_MUX_SMALL_IMM ? QPU_MUX_B : src1->mux; + + if (mux0 <= QPU_MUX_R5 || + mux0 != mux1 || + (src0->addr == src1->addr && + src0->mux == src1->mux)) { return false; } if (swap_file(src0) || swap_file(src1)) return false; - if (src0->mux == QPU_MUX_A) { + if (mux0 == QPU_MUX_A) { /* If we're conflicting over the A regfile, then we can just * use the reserved rb31. */ @@ -233,6 +241,14 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) case QFILE_VARY: src[i] = qpu_vary(); break; + case QFILE_SMALL_IMM: + src[i].mux = QPU_MUX_SMALL_IMM; + src[i].addr = qpu_encode_small_immediate(qinst->src[i].index); + /* This should only have returned a valid + * small immediate field, not ~0 for failure. + */ + assert(src[i].addr <= 47); + break; } } @@ -246,6 +262,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c) break; case QFILE_VARY: case QFILE_UNIF: + case QFILE_SMALL_IMM: assert(!"not reached"); break; } diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c index 0700b0d..f523b4c 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c +++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c @@ -224,7 +224,8 @@ reads_uniform(uint64_t inst) return false; return (QPU_GET_FIELD(inst, QPU_RADDR_A) == QPU_R_UNIF || - QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF || + (QPU_GET_FIELD(inst, QPU_RADDR_B) == QPU_R_UNIF && + QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM) || is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_ADD)) || is_tmu_write(QPU_GET_FIELD(inst, QPU_WADDR_MUL))); } @@ -343,7 +344,8 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n) if (sig != QPU_SIG_LOAD_IMM) { process_raddr_deps(state, n, raddr_a, true); - process_raddr_deps(state, n, raddr_b, false); + if (sig != QPU_SIG_SMALL_IMM) + process_raddr_deps(state, n, raddr_b, false); } if (add_op != QPU_A_NOP) { @@ -435,6 +437,7 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst) { uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A); uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B); + uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG); uint32_t src_muxes[] = { QPU_GET_FIELD(inst, QPU_ADD_A), QPU_GET_FIELD(inst, QPU_ADD_B), @@ -446,6 +449,7 @@ reads_too_soon_after_write(struct choose_scoreboard *scoreboard, uint64_t inst) raddr_a < 32 && scoreboard->last_waddr_a == raddr_a) || (src_muxes[i] == QPU_MUX_B && + sig != QPU_SIG_SMALL_IMM && raddr_b < 32 && scoreboard->last_waddr_b == raddr_b)) { return true; diff --git a/src/gallium/drivers/vc4/vc4_qpu_validate.c b/src/gallium/drivers/vc4/vc4_qpu_validate.c index ffd1b47..8471edb 100644 --- a/src/gallium/drivers/vc4/vc4_qpu_validate.c +++ b/src/gallium/drivers/vc4/vc4_qpu_validate.c @@ -49,6 +49,7 @@ _reads_reg(uint64_t inst, uint32_t r, bool ignore_a, bool ignore_b) return true; if (!ignore_b && + QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM && src_regs[i].mux == QPU_MUX_B && (QPU_GET_FIELD(inst, QPU_RADDR_B) == r)) return true; |