diff options
author | Rob Clark <robclark@freedesktop.org> | 2014-02-16 07:29:13 -0500 |
---|---|---|
committer | Rob Clark <robclark@freedesktop.org> | 2014-02-16 08:17:23 -0500 |
commit | e35747b8824b0f996c5ef5c7b58fcaa200cc231e (patch) | |
tree | 0f2af3fe817208036575bac52ab6e9fd8ee5992d /src | |
parent | 89dc2825819c0260511c6596497c8a350d9901a7 (diff) | |
download | external_mesa3d-e35747b8824b0f996c5ef5c7b58fcaa200cc231e.zip external_mesa3d-e35747b8824b0f996c5ef5c7b58fcaa200cc231e.tar.gz external_mesa3d-e35747b8824b0f996c5ef5c7b58fcaa200cc231e.tar.bz2 |
freedreno/a3xx/compiler: trans_cmp() sanity
Thanks to figuring out 32bit float render target, and adding regdump
test in fdre-a3xx, I can more easily play around with instructions to
figure out range of inputs/outputs/etc. And from this I can conclude
that cmps.f works more like expected and I can do something much more
simple in trans_cmp() (compared to before which was more closely
emulating the instruction sequence of the blob compiler).
And using sel.b32 (binary 0/1) often makes more sense than sel.f32
(+/- float) or sel.u32 (+/- uint) as it can use the output directly
from cmps.f without needing the 'add.s tmp0, tmp0, -1'.
Signed-off-by: Rob Clark <robclark@freedesktop.org>
Diffstat (limited to 'src')
-rw-r--r-- | src/gallium/drivers/freedreno/a3xx/fd3_compiler.c | 86 |
1 files changed, 35 insertions, 51 deletions
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c index da327c9..b1ed2e0 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_compiler.c @@ -1130,36 +1130,32 @@ trans_samp(const struct instr_translater *t, /* * SEQ(a,b) = (a == b) ? 1.0 : 0.0 - * cmps.f.eq tmp0, b, a + * cmps.f.eq tmp0, a, b * cov.u16f16 dst, tmp0 * * SNE(a,b) = (a != b) ? 1.0 : 0.0 - * cmps.f.eq tmp0, b, a - * add.s tmp0, tmp0, -1 - * sel.f16 dst, {0.0}, tmp0, {1.0} + * cmps.f.ne tmp0, a, b + * cov.u16f16 dst, tmp0 * * SGE(a,b) = (a >= b) ? 1.0 : 0.0 * cmps.f.ge tmp0, a, b * cov.u16f16 dst, tmp0 * * SLE(a,b) = (a <= b) ? 1.0 : 0.0 - * cmps.f.ge tmp0, b, a + * cmps.f.le tmp0, a, b * cov.u16f16 dst, tmp0 * * SGT(a,b) = (a > b) ? 1.0 : 0.0 - * cmps.f.ge tmp0, b, a - * add.s tmp0, tmp0, -1 - * sel.f16 dst, {0.0}, tmp0, {1.0} + * cmps.f.gt tmp0, a, b + * cov.u16f16 dst, tmp0 * * SLT(a,b) = (a < b) ? 1.0 : 0.0 - * cmps.f.ge tmp0, a, b - * add.s tmp0, tmp0, -1 - * sel.f16 dst, {0.0}, tmp0, {1.0} + * cmps.f.lt tmp0, a, b + * cov.u16f16 dst, tmp0 * * CMP(a,b,c) = (a < 0.0) ? b : c - * cmps.f.ge tmp0, a, {0.0} - * add.s tmp0, tmp0, -1 - * sel.f16 dst, c, tmp0, b + * cmps.f.lt tmp0, a, {0.0} + * sel.b16 dst, b, tmp0, c */ static void trans_cmp(const struct instr_translater *t, @@ -1169,38 +1165,41 @@ trans_cmp(const struct instr_translater *t, struct ir3_instruction *instr; struct tgsi_dst_register tmp_dst; struct tgsi_src_register *tmp_src; - struct tgsi_src_register constval0, constval1; + struct tgsi_src_register constval0; /* final instruction for CMP() uses orig src1 and src2: */ struct tgsi_dst_register *dst = get_dst(ctx, inst); - struct tgsi_src_register *a0, *a1; + struct tgsi_src_register *a0, *a1, *a2; unsigned condition; tmp_src = get_internal_temp(ctx, &tmp_dst); + a0 = &inst->Src[0].Register; /* a */ + a1 = &inst->Src[1].Register; /* b */ + switch (t->tgsi_opc) { case TGSI_OPCODE_SEQ: - case TGSI_OPCODE_SNE: - a0 = &inst->Src[1].Register; /* b */ - a1 = &inst->Src[0].Register; /* a */ condition = IR3_COND_EQ; break; + case TGSI_OPCODE_SNE: + condition = IR3_COND_NE; + break; case TGSI_OPCODE_SGE: - case TGSI_OPCODE_SLT: - a0 = &inst->Src[0].Register; /* a */ - a1 = &inst->Src[1].Register; /* b */ condition = IR3_COND_GE; break; + case TGSI_OPCODE_SLT: + condition = IR3_COND_LT; + break; case TGSI_OPCODE_SLE: + condition = IR3_COND_LE; + break; case TGSI_OPCODE_SGT: - a0 = &inst->Src[1].Register; /* b */ - a1 = &inst->Src[0].Register; /* a */ - condition = IR3_COND_GE; + condition = IR3_COND_GT; break; case TGSI_OPCODE_CMP: get_immediate(ctx, &constval0, fui(0.0)); a0 = &inst->Src[0].Register; /* a */ a1 = &constval0; /* {0.0} */ - condition = IR3_COND_GE; + condition = IR3_COND_LT; break; default: compile_assert(ctx, 0); @@ -1210,7 +1209,7 @@ trans_cmp(const struct instr_translater *t, if (is_const(a0) && is_const(a1)) a0 = get_unconst(ctx, a0); - /* cmps.f.ge tmp, a0, a1 */ + /* cmps.f.<cond> tmp, a0, a1 */ instr = instr_create(ctx, 2, OPC_CMPS_F); instr->cat2.condition = condition; vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0); @@ -1219,37 +1218,22 @@ trans_cmp(const struct instr_translater *t, case TGSI_OPCODE_SEQ: case TGSI_OPCODE_SGE: case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLT: /* cov.u16f16 dst, tmp0 */ instr = instr_create(ctx, 1, 0); instr->cat1.src_type = get_utype(ctx); instr->cat1.dst_type = get_ftype(ctx); vectorize(ctx, instr, dst, 1, tmp_src, 0); break; - case TGSI_OPCODE_SNE: - case TGSI_OPCODE_SGT: - case TGSI_OPCODE_SLT: case TGSI_OPCODE_CMP: - /* add.s tmp, tmp, -1 */ - instr = instr_create(ctx, 2, OPC_ADD_S); - vectorize(ctx, instr, &tmp_dst, 2, tmp_src, 0, -1, IR3_REG_IMMED); - - if (t->tgsi_opc == TGSI_OPCODE_CMP) { - /* sel.{f32,f16} dst, src2, tmp, src1 */ - instr = instr_create(ctx, 3, - ctx->so->half_precision ? OPC_SEL_F16 : OPC_SEL_F32); - vectorize(ctx, instr, dst, 3, - &inst->Src[2].Register, 0, - tmp_src, 0, - &inst->Src[1].Register, 0); - } else { - get_immediate(ctx, &constval0, fui(0.0)); - get_immediate(ctx, &constval1, fui(1.0)); - /* sel.{f32,f16} dst, {0.0}, tmp0, {1.0} */ - instr = instr_create(ctx, 3, - ctx->so->half_precision ? OPC_SEL_F16 : OPC_SEL_F32); - vectorize(ctx, instr, dst, 3, - &constval0, 0, tmp_src, 0, &constval1, 0); - } + a1 = &inst->Src[1].Register; + a2 = &inst->Src[2].Register; + /* sel.{b32,b16} dst, src2, tmp, src1 */ + instr = instr_create(ctx, 3, + ctx->so->half_precision ? OPC_SEL_B16 : OPC_SEL_B32); + vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0); break; } |