summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp')
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp1101
1 files changed, 1101 insertions, 0 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
new file mode 100644
index 0000000..56eaad3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -0,0 +1,1101 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+#include "codegen/nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+// nv50 doesn't support 32 bit integer multiplication
+//
+//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
+// -------------------
+//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
+// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
+//       al*bl
+//    ah*bl 00
+//
+// fffe0001 + fffe0001
+// Replace the integer multiply @mul by a sequence operating on half-width
+// pieces of its sources (see the schematic above).
+//
+// Source types S32/U32 are split into 16 bit halves and S64/U64 into 32 bit
+// halves; any other source type is rejected and false is returned without
+// emitting anything. When mul->subOp is NV50_IR_SUBOP_MUL_HIGH the upper half
+// of the product is written to mul's def, otherwise the lower half.
+// On success @mul is deleted and true is returned.
+static bool
+expandIntegerMUL(BuildUtil *bld, Instruction *mul)
+{
+ const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
+
+ DataType fTy = mul->sType; // full type
+ DataType hTy;
+ switch (fTy) {
+ case TYPE_S32: hTy = TYPE_S16; break;
+ case TYPE_U32: hTy = TYPE_U16; break;
+ case TYPE_U64: hTy = TYPE_U32; break;
+ case TYPE_S64: hTy = TYPE_S32; break;
+ default:
+ return false;
+ }
+ unsigned int fullSize = typeSizeof(fTy);
+ unsigned int halfSize = typeSizeof(hTy);
+
+ // emitted instructions, kept so flags and source types can be patched below
+ Instruction *i[9];
+
+ // NOTE(review): the 'true' argument presumably selects insertion after mul
+ bld->setPosition(mul, true);
+
+ Value *a[2], *b[2];
+ Value *c[2];
+ Value *t[4];
+ for (int j = 0; j < 4; ++j)
+ t[j] = bld->getSSA(fullSize);
+
+ // split sources into halves
+ i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
+ i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
+
+ // t[0] = a[0] * b[1]; t[1] = a[1] * b[0] + t[0] (sum of the cross products)
+ // t[2] = t[1] << half width; t[3] = a[0] * b[0] + t[2]
+ i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
+ i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+ i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
+ i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+
+ if (highResult) {
+ Value *r[3];
+ // NOTE(review): for the 64 bit types halfSize * 8 would be 32, which is
+ // not a valid shift count for a 32 bit int - confirm that case is handled.
+ Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
+ c[0] = bld->getSSA(1, FILE_FLAGS);
+ c[1] = bld->getSSA(1, FILE_FLAGS);
+ for (int j = 0; j < 3; ++j)
+ r[j] = bld->getSSA(fullSize);
+
+ // r[0] = t[1] >> half width; r[1] = r[0] + imm is predicated on the carry
+ // in c[0]; r[2] is the UNION of both variants and is added to a[1] * b[1].
+ i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
+ i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
+ bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
+ i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
+
+ // set carry defs / sources
+ i[3]->setFlagsDef(1, c[0]);
+ i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
+ i[6]->setPredicate(CC_C, c[0]);
+ i[5]->setFlagsSrc(3, c[1]);
+ } else {
+ // the low result is t[3]
+ bld->mkMov(mul->getDef(0), t[3]);
+ }
+ delete_Instruction(bld->getProgram(), mul);
+
+ // the multiplies read half-width sources: patch sType of i[2]..i[4], and of
+ // i[5] as well when the high result was requested
+ for (int j = 2; j <= (highResult ? 5 : 4); ++j)
+ if (i[j])
+ i[j]->sType = hTy;
+
+ return true;
+}
+
+// Per-lane operation selectors consumed by QUADOP(); each value fits in the
+// 2 bit field QUADOP() reserves for it.
+// NOTE(review): the names suggest add / reverse subtract / subtract / move of
+// the second operand - confirm against the quadop instruction encoding.
+#define QOP_ADD 0
+#define QOP_SUBR 1
+#define QOP_SUB 2
+#define QOP_MOV2 3
+
+// Pack four selectors (given without the QOP_ prefix) into one byte:
+// q goes to bits 7:6, r to bits 5:4, s to bits 3:2 and t to bits 1:0.
+// UL UR LL LR
+#define QUADOP(q, r, s, t) \
+ ((QOP_##q << 6) | (QOP_##r << 4) | \
+ (QOP_##s << 2) | (QOP_##t << 0))
+
+// Legalization pass that TargetNV50::runLegalizePass() runs for
+// CG_STAGE_POST_RA.
+class NV50LegalizePostRA : public Pass
+{
+private:
+ // creates r63 and applies the output writes stored in prog->targetPriv
+ virtual bool visit(Function *);
+ // removes no-ops, emulates PRERET, splits 64 bit ops, replaces zero immediates
+ virtual bool visit(BasicBlock *);
+
+ void handlePRERET(FlowInstruction *);
+ // replaces immediate sources whose value is 0 by r63
+ void replaceZero(Instruction *);
+
+ // GPR value with register id 63, (re)created in visit(Function *)
+ LValue *r63;
+};
+
+// Per-function setup: create the r63 value used by replaceZero() and apply
+// the output writes that were queued in the program's targetPriv list.
+// Always returns true.
+bool
+NV50LegalizePostRA::visit(Function *fn)
+{
+   typedef std::list<Instruction *> InsnList;
+
+   Program *const program = fn->getProgram();
+
+   r63 = new_LValue(fn, FILE_GPR);
+   r63->reg.data.id = 63;
+
+   // this is actually per-program, but we can do it all on visiting main()
+   InsnList *const pending = reinterpret_cast<InsnList *>(program->targetPriv);
+   if (!pending)
+      return true;
+
+   // Let the instruction that defines each stored value write the store's
+   // destination (source 0 of the queued store) directly.
+   for (InsnList::iterator it = pending->begin(); it != pending->end(); ++it) {
+      Instruction *const store = *it;
+      Instruction *const producer = store->getSrc(1)->defs.front()->getInsn();
+      producer->setDef(0, store->getSrc(0));
+   }
+   // instructions will be deleted on exit
+   pending->clear();
+
+   return true;
+}
+
+// Rewrite every immediate source of @i whose 64 bit payload is 0 so that it
+// reads r63 instead.
+void
+NV50LegalizePostRA::replaceZero(Instruction *i)
+{
+   int s = 0;
+   while (i->srcExists(s)) {
+      ImmediateValue *const imm = i->getSrc(s)->asImm();
+      const bool isZeroImm = imm && imm->reg.data.u64 == 0;
+      if (isZeroImm)
+         i->setSrc(s, r63);
+      ++s;
+   }
+}
+
+// Emulate PRERET: jump to the target and call to the origin from there
+//
+// WARNING: atm only works if BBs are affected by at most a single PRERET
+//
+// BB:0
+// preret BB:3
+// (...)
+// BB:3
+// (...)
+// --->
+// BB:0
+// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
+// (...)
+// BB:3
+// bra BB:3 + n1 (skip the call)
+// call BB:0 + n2 (skip bra at beginning of BB:0)
+// (...)
+//
+// All three instructions keep the opcode OP_PRERET and are told apart by
+// their subOp: NV50_IR_SUBOP_EMU_PRERET + 0 (the moved original), + 1 (skip)
+// and + 2 (call).
+void
+NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
+{
+ BasicBlock *bbE = pre->bb;
+ BasicBlock *bbT = pre->target.bb;
+
+ // move the original PRERET to the head of its own block
+ pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
+ bbE->remove(pre);
+ bbE->insertHead(pre);
+
+ Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
+ Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
+
+ // both are inserted at the head, so 'skip' ends up in front of 'call'
+ bbT->insertHead(call);
+ bbT->insertHead(skip);
+
+ // NOTE: maybe split blocks to prevent the instructions from moving ?
+
+ skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
+ call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
+}
+
+// Post-RA cleanup of one basic block.
+//
+// For every instruction of @bb:
+//  - no-ops are removed from the block,
+//  - OP_PRERET is emulated via handlePRERET() on chipsets below 0xa0,
+//  - otherwise instructions with an 8 byte dType are handed to
+//    BuildUtil::split64BitOpPostRA(), and zero immediates are replaced by r63
+//    unless the instruction is a MOV, PFETCH or BAR or defines a
+//    FILE_ADDRESS value.
+// Always returns true.
+bool
+NV50LegalizePostRA::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+
+   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
+   for (i = bb->getFirst(); i; i = next) {
+      // fetch the successor first: i may be removed or moved below
+      next = i->next;
+
+      if (i->isNop()) {
+         bb->remove(i);
+         continue;
+      }
+      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
+         handlePRERET(i->asFlow());
+         continue;
+      }
+
+      // TODO: We will want to do this before register allocation,
+      // since have to use a $c register for the carry flag.
+      if (typeSizeof(i->dType) == 8) {
+         Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
+         // continue with the newly created high part, if there is one
+         if (hi)
+            next = hi;
+      }
+
+      if (i->op != OP_MOV && i->op != OP_PFETCH &&
+          i->op != OP_BAR &&
+          (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
+         replaceZero(i);
+   }
+
+   return true;
+}
+
+// Legalization pass that TargetNV50::runLegalizePass() runs for
+// CG_STAGE_SSA: lowers integer DIV/MOD/MUL/MAD, fixes up instructions that
+// define FILE_ADDRESS values and collects output writes for the post-RA pass.
+class NV50LegalizeSSA : public Pass
+{
+public:
+ NV50LegalizeSSA(Program *);
+
+ virtual bool visit(BasicBlock *bb);
+
+private:
+ void propagateWriteToOutput(Instruction *);
+ void handleDIV(Instruction *);
+ void handleMOD(Instruction *);
+ void handleMUL(Instruction *);
+ void handleAddrDef(Instruction *);
+
+ inline bool isARL(const Instruction *) const;
+
+ BuildUtil bld;
+
+ // list stored in prog->targetPriv, or NULL when exports are not propagated
+ // (see the constructor)
+ std::list<Instruction *> *outWrites;
+};
+
+// Bind the builder to @prog and decide whether exports get propagated:
+// outWrites points at the list in prog->targetPriv only for geometry and
+// vertex programs compiled with optLevel >= 2, and is NULL otherwise.
+NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
+{
+   bld.setProgram(prog);
+
+   outWrites = NULL;
+   if (prog->optLevel >= 2) {
+      switch (prog->getType()) {
+      case Program::TYPE_GEOMETRY:
+      case Program::TYPE_VERTEX:
+         outWrites =
+            reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+         break;
+      default:
+         break;
+      }
+   }
+}
+
+// Queue the export @st so that, after register allocation, the instruction
+// producing its value can write the output directly.
+//
+// Nothing happens if the export is indirect, if the stored value has more
+// than one reference, or if the producing instruction is a pseudo op, a
+// texture op, has several defs or reads an immediate.
+void
+NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
+{
+   if (st->src(0).isIndirect(0))
+      return;
+   if (st->getSrc(1)->refCount() != 1)
+      return;
+
+   // check def instruction can store
+   Instruction *producer = st->getSrc(1)->defs.front()->getInsn();
+
+   // TODO: move exports (if beneficial) in common opt pass
+   const bool unsuitable = producer->isPseudo() ||
+                           isTextureOp(producer->op) ||
+                           producer->defCount(0xff, true) > 1;
+   if (unsuitable)
+      return;
+
+   int s = 0;
+   while (producer->srcExists(s)) {
+      if (producer->src(s).getFile() == FILE_IMMEDIATE)
+         return;
+      ++s;
+   }
+
+   // We cannot set defs to non-lvalues before register allocation, so
+   // save & remove (to save registers) the exports and replace later.
+   outWrites->push_back(st);
+   st->bb->remove(st);
+}
+
+// Tell whether @i is a SHL of a GPR source by an immediate for which
+// isInteger(0) holds.
+bool
+NV50LegalizeSSA::isARL(const Instruction *i) const
+{
+   ImmediateValue shift;
+
+   const bool shiftsGPR = i->op == OP_SHL && i->src(0).getFile() == FILE_GPR;
+   if (!shiftsGPR)
+      return false;
+
+   if (i->src(1).getImmediate(shift))
+      return shift.isInteger(0);
+   return false;
+}
+
+// Legalize an instruction @i whose def 0 lives in FILE_ADDRESS.
+//
+// The def is shrunk to 2 bytes. SHL(GPR, IMM) and ADD(ADDR, IMM) are kept
+// as they are. For everything else the FILE_ADDRESS sources are replaced by
+// other values and, unless @i is a SHL by an immediate, @i is made to write a
+// fresh value that a following SHL by 0 copies into the original def.
+void
+NV50LegalizeSSA::handleAddrDef(Instruction *i)
+{
+ Instruction *arl;
+
+ i->getDef(0)->reg.size = 2; // $aX are only 16 bit
+
+ // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
+ if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
+ if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
+ return;
+ if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
+ return;
+ }
+
+ // turn $a sources into $r sources (can't operate on $a)
+ for (int s = 0; i->srcExists(s); ++s) {
+ Value *a = i->getSrc(s);
+ Value *r;
+ if (a->reg.file == FILE_ADDRESS) {
+ if (a->getInsn() && isARL(a->getInsn())) {
+ // the address came from a SHL by 0: read that SHL's source directly
+ i->setSrc(s, a->getInsn()->getSrc(0));
+ } else {
+ // otherwise copy the address into a new value with a MOV
+ bld.setPosition(i, false);
+ r = bld.getSSA();
+ bld.mkMov(r, a);
+ i->setSrc(s, r);
+ }
+ }
+ }
+ if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
+ return;
+
+ // turn result back into $a
+ bld.setPosition(i, true);
+ arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
+ // i now defines the new value that the SHL reads
+ i->setDef(0, arl->getSrc(0));
+}
+
+// Lower an integer MUL or MAD through expandIntegerMUL().
+//
+// Float source types and source types of at most 2 bytes are left untouched.
+// A MAD is first split into a new MUL (given the MAD's subOp) and the
+// original instruction turned into an ADD; only the MUL is then expanded.
+// A predicate on @mul is removed for the expansion and afterwards attached
+// to the instruction that defines the original def.
+void
+NV50LegalizeSSA::handleMUL(Instruction *mul)
+{
+ if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
+ return;
+ Value *def = mul->getDef(0);
+ Value *pred = mul->getPredicate();
+ CondCode cc = mul->cc;
+ if (pred)
+ mul->setPredicate(CC_ALWAYS, NULL);
+
+ if (mul->op == OP_MAD) {
+ Instruction *add = mul;
+ bld.setPosition(add, false);
+ // the new MUL writes a shallow clone of the MAD's def
+ Value *res = cloneShallow(func, mul->getDef(0));
+ mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
+ // add = product + former third source; any further sources are dropped
+ add->op = OP_ADD;
+ add->setSrc(0, mul->getDef(0));
+ add->setSrc(1, add->getSrc(2));
+ for (int s = 2; add->srcExists(s); ++s)
+ add->setSrc(s, NULL);
+ mul->subOp = add->subOp;
+ add->subOp = 0;
+ }
+ expandIntegerMUL(&bld, mul);
+ if (pred)
+ def->getInsn()->setPredicate(cc, pred);
+}
+
+// Use f32 division: first compute an approximate result, use it to reduce
+// the dividend, which should then be representable as f32, divide the reduced
+// dividend, and add the quotients.
+//
+// Only U32 and S32 source types are handled; other types are left untouched.
+// The new instructions are built at @div's position and @div itself is
+// rewritten in place (to a SUB for unsigned, to a UNION for signed).
+void
+NV50LegalizeSSA::handleDIV(Instruction *div)
+{
+ const DataType ty = div->sType;
+
+ if (ty != TYPE_U32 && ty != TYPE_S32)
+ return;
+
+ Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
+
+ bld.setPosition(div, false);
+
+ Value *a, *af = bld.getSSA();
+ Value *b, *bf = bld.getSSA();
+
+ // af, bf: the operands converted to f32
+ bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
+ bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
+
+ if (isSignedType(ty)) {
+ // work on absolute values; the sign is fixed up at the end
+ af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+ bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+ a = bld.getSSA();
+ b = bld.getSSA();
+ bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
+ bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
+ } else {
+ a = div->getSrc(0);
+ b = div->getSrc(1);
+ }
+
+ // bf = 1 / bf, then an integer add of -2 on the result
+ // NOTE(review): presumably this lowers the reciprocal slightly so that the
+ // estimated quotient never exceeds the real one - confirm.
+ bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
+ bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
+
+ // q0 = first quotient estimate (a * 1/b, rounded towards zero)
+ bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
+ bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
+
+ // get error of 1st result
+ expandIntegerMUL(&bld,
+ bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
+ bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
+
+ bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
+
+ // qR = quotient estimate for the remaining dividend
+ bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
+ bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
+ ->rnd = ROUND_Z;
+ bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
+
+ // correction: if modulus >= divisor, add 1
+ // NOTE(review): the correction is applied as q - s, so the SET result s is
+ // presumably -1 when the comparison holds - confirm.
+ expandIntegerMUL(&bld,
+ bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
+ bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
+ bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
+ if (!isSignedType(ty)) {
+ // unsigned: the DIV itself becomes the correcting SUB
+ div->op = OP_SUB;
+ div->setSrc(0, q);
+ div->setSrc(1, s);
+ } else {
+ t = q;
+ bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
+ s = bld.getSSA();
+ t = bld.getSSA();
+ // fix the sign
+ // the XOR of the original operands only provides flags: s = -q is
+ // executed on CC_S, t = q on CC_NS, and the DIV becomes their UNION
+ bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
+ ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
+ bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
+ bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
+
+ div->op = OP_UNION;
+ div->setSrc(0, s);
+ div->setSrc(1, t);
+ }
+}
+
+// Lower a 32 bit integer MOD to  src0 - (src0 / src1) * src1.
+//
+// The division is emitted as an OP_DIV and lowered right away by handleDIV();
+// the product goes through expandIntegerMUL(), and @mod itself becomes the
+// final SUB. Other data types are left untouched.
+void
+NV50LegalizeSSA::handleMOD(Instruction *mod)
+{
+   const bool is32BitInt = mod->dType == TYPE_U32 || mod->dType == TYPE_S32;
+   if (!is32BitInt)
+      return;
+
+   bld.setPosition(mod, false);
+
+   Value *quotient = bld.getSSA();
+   Value *product = bld.getSSA();
+
+   bld.mkOp2(OP_DIV, mod->dType, quotient, mod->getSrc(0), mod->getSrc(1));
+   handleDIV(quotient->getInsn());
+
+   // handleDIV() moved the builder, so point it at mod again
+   bld.setPosition(mod, false);
+   Instruction *mul =
+      bld.mkOp2(OP_MUL, TYPE_U32, product, quotient, mod->getSrc(1));
+   expandIntegerMUL(&bld, mul);
+
+   mod->op = OP_SUB;
+   mod->setSrc(1, product);
+}
+
+// Walk @bb from its entry instruction and dispatch EXPORT, DIV, MOD, MAD and
+// MUL to their handlers; afterwards any instruction whose def 0 is in
+// FILE_ADDRESS is passed to handleAddrDef(). Always returns true.
+bool
+NV50LegalizeSSA::visit(BasicBlock *bb)
+{
+   // skipping PHIs (don't pass them to handleAddrDef) !
+   Instruction *cur = bb->getEntry();
+   while (cur) {
+      // the handlers insert and remove instructions, so remember the
+      // successor before dispatching
+      Instruction *const following = cur->next;
+
+      if (cur->op == OP_EXPORT) {
+         if (outWrites)
+            propagateWriteToOutput(cur);
+      } else if (cur->op == OP_DIV) {
+         handleDIV(cur);
+      } else if (cur->op == OP_MOD) {
+         handleMOD(cur);
+      } else if (cur->op == OP_MAD || cur->op == OP_MUL) {
+         handleMUL(cur);
+      }
+
+      // NOTE(review): for OP_MUL, handleMUL() hands cur to expandIntegerMUL(),
+      // which deletes the instruction it expands - confirm that reading cur
+      // here is still valid in that case.
+      if (cur->defExists(0) && cur->getDef(0)->reg.file == FILE_ADDRESS)
+         handleAddrDef(cur);
+
+      cur = following;
+   }
+   return true;
+}
+
+// Lowering pass that TargetNV50::runLegalizePass() runs for
+// CG_STAGE_PRE_SSA. visit(Instruction *) dispatches on the opcode to the
+// handle*() members below.
+class NV50LoweringPreSSA : public Pass
+{
+public:
+ NV50LoweringPreSSA(Program *);
+
+private:
+ virtual bool visit(Instruction *);
+ virtual bool visit(Function *);
+
+ bool handleRDSV(Instruction *);
+ bool handleWRSV(Instruction *);
+
+ bool handleEXPORT(Instruction *);
+
+ bool handleDIV(Instruction *);
+ bool handleSQRT(Instruction *);
+ bool handlePOW(Instruction *);
+
+ bool handleSET(Instruction *);
+ bool handleSLCT(CmpInstruction *);
+ bool handleSELP(Instruction *);
+
+ bool handleTEX(TexInstruction *);
+ bool handleTXB(TexInstruction *); // I really
+ bool handleTXL(TexInstruction *); // hate
+ bool handleTXD(TexInstruction *); // these 3
+
+ bool handleCALL(Instruction *);
+ bool handlePRECONT(Instruction *);
+ bool handleCONT(Instruction *);
+
+ // replaces a predicate that is not in FILE_FLAGS by a flags value
+ void checkPredicate(Instruction *);
+
+private:
+ // target of the program, taken from prog->getTarget() in the constructor
+ const Target *const targ;
+
+ BuildUtil bld;
+
+ // copy of the implicit thread id argument; only set up by
+ // visit(Function *) for compute programs, NULL otherwise
+ Value *tid;
+};
+
+// Bind the pass to @prog: cache its target and point the builder at it.
+// The thread id value starts out as NULL.
+NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog)
+   : targ(prog->getTarget()),
+     tid(NULL)
+{
+   bld.setProgram(prog);
+}
+
+// Per-function setup. For compute programs a GPR argument with register id 0
+// is appended to the function's inputs and copied, at the root block, into
+// the scratch value remembered in 'tid'. Always returns true.
+bool
+NV50LoweringPreSSA::visit(Function *f)
+{
+   BasicBlock *entry = BasicBlock::get(func->cfg.getRoot());
+
+   if (prog->getType() != Program::TYPE_COMPUTE)
+      return true;
+
+   // Add implicit "thread id" argument in $r0 to the function
+   Value *threadArg = new_LValue(func, FILE_GPR);
+   threadArg->reg.data.id = 0;
+   f->ins.push_back(threadArg);
+
+   bld.setPosition(entry, false);
+   Instruction *copy = bld.mkMov(bld.getScratch(), threadArg, TYPE_U32);
+   tid = copy->getDef(0);
+
+   return true;
+}
+
+// Common source fix-ups for texture instructions; also called first by
+// handleTXB(), handleTXL() and handleTXD().
+//
+//  - shadow TXB/TXL: swap the sources at index argCount and argCount + 1
+//  - array targets: convert the layer (source argCount - 1) from f32 to u32
+//    and clamp it with MIN against 511
+//  - cube arrays: run the first four sources through an OP_TEXPREP and
+//    continue as a 2D array (shadow) lookup on its three results
+// Always returns true.
+bool
+NV50LoweringPreSSA::handleTEX(TexInstruction *i)
+{
+ const int arg = i->tex.target.getArgCount();
+ const int dref = arg;
+ const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
+
+ // dref comes before bias/lod
+ if (i->tex.target.isShadow())
+ if (i->op == OP_TXB || i->op == OP_TXL)
+ i->swapSources(dref, lod);
+
+ // array index must be converted to u32
+ if (i->tex.target.isArray()) {
+ Value *layer = i->getSrc(arg - 1);
+ LValue *src = new_LValue(func, FILE_GPR);
+ bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
+ bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
+ i->setSrc(arg - 1, src);
+
+ if (i->tex.target.isCube()) {
+ std::vector<Value *> acube, a2d;
+ int c;
+
+ // acube: the four original sources, a2d: three new GPRs for the result
+ acube.resize(4);
+ for (c = 0; c < 4; ++c)
+ acube[c] = i->getSrc(c);
+ a2d.resize(4);
+ for (c = 0; c < 3; ++c)
+ a2d[c] = new_LValue(func, FILE_GPR);
+ a2d[3] = NULL;
+
+ bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
+ a2d, acube)->asTex()->tex.mask = 0x7;
+
+ // replace sources 0..2, drop source 3 and move later sources down by one
+ // NOTE(review): source 3 is cleared before the shift loop tests
+ // srcExists(4), and the last shifted source is not cleared from its old
+ // slot - confirm the resulting source list is the intended one.
+ for (c = 0; c < 3; ++c)
+ i->setSrc(c, a2d[c]);
+ i->setSrc(c, NULL);
+ for (; i->srcExists(c + 1); ++c)
+ i->setSrc(c, i->getSrc(c + 1));
+
+ i->tex.target = i->tex.target.isShadow() ?
+ TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
+ }
+ }
+
+ // texel offsets are 3 immediate fields in the instruction,
+ // nv50 cannot do textureGatherOffsets
+ assert(i->tex.useOffsets <= 1);
+
+ return true;
+}
+
+// Bias must be equal for all threads of a quad or lod calculation will fail.
+//
+// The lanes of a quad are grouped by the bit in the condition register they
+// have set, which is selected by differing bias values.
+// Move the input values for TEX into a new register set for each group and
+// execute TEX only for a specific group.
+// We always need to use 4 new registers for the inputs/outputs because the
+// implicitly calculated derivatives must be correct.
+//
+// TODO: move to SSA phase so we can easily determine whether bias is constant
+//
+// If the bias value reports isUniform() only handleTEX() is applied.
+// Otherwise @i is replaced by four predicated clones whose results are
+// merged with OP_UNION, and @i is deleted. Always returns true.
+bool
+NV50LoweringPreSSA::handleTXB(TexInstruction *i)
+{
+ // one condition code per lane group
+ const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
+ int l, d;
+
+ handleTEX(i);
+ Value *bias = i->getSrc(i->tex.target.getArgCount());
+ if (bias->isUniform())
+ return true;
+
+ // cond: UNION of the immediate 1 and the per-lane bits built below
+ Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
+ bld.loadImm(NULL, 1));
+ bld.setPosition(cond, false);
+
+ for (l = 1; l < 4; ++l) {
+ // compare against lane l's bias with a quadop that only writes flags,
+ // then produce the bit (1 << l) where the result was CC_EQ
+ const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+ Value *bit = bld.getSSA();
+ Value *pred = bld.getScratch(1, FILE_FLAGS);
+ Value *imm = bld.loadImm(NULL, (1 << l));
+ bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
+ bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
+ cond->setSrc(l, bit);
+ }
+ // convert the collected bits into a flags value used as predicate below
+ Value *flags = bld.getScratch(1, FILE_FLAGS);
+ bld.setPosition(cond, true);
+ bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+
+ // one clone of the texture instruction per group
+ Instruction *tex[4];
+ for (l = 0; l < 4; ++l) {
+ (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
+ bld.insert(tex[l]);
+ }
+
+ // res[l][d]: result d of group l; group 0 uses the clone's defs directly,
+ // the other groups get predicated copies
+ Value *res[4][4];
+ for (d = 0; i->defExists(d); ++d)
+ res[0][d] = tex[0]->getDef(d);
+ for (l = 1; l < 4; ++l) {
+ for (d = 0; tex[l]->defExists(d); ++d) {
+ res[l][d] = cloneShallow(func, res[0][d]);
+ bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
+ }
+ }
+
+ // merge the four variants of every result into the original defs
+ for (d = 0; i->defExists(d); ++d) {
+ Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
+ for (l = 0; l < 4; ++l)
+ dst->setSrc(l, res[l][d]);
+ }
+ delete_Instruction(prog, i);
+ return true;
+}
+
+// LOD must be equal for all threads of a quad.
+// Unlike with TXB, here we can just diverge since there's no LOD calculation
+// that would require all 4 threads' sources to be set up properly.
+//
+// If the lod value reports isUniform() only handleTEX() is applied.
+// Otherwise the block is split around @i and a chain of blocks is built, one
+// per lane, each branching to the texture block when the quadop comparison
+// with that lane's lod yields CC_EQ. Always returns true.
+bool
+NV50LoweringPreSSA::handleTXL(TexInstruction *i)
+{
+ handleTEX(i);
+ Value *lod = i->getSrc(i->tex.target.getArgCount());
+ if (lod->isUniform())
+ return true;
+
+ // texiBB holds the texture instruction, joinBB what followed it
+ BasicBlock *currBB = i->bb;
+ BasicBlock *texiBB = i->bb->splitBefore(i, false);
+ BasicBlock *joinBB = i->bb->splitAfter(i);
+
+ bld.setPosition(currBB, true);
+ currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+
+ for (int l = 0; l <= 3; ++l) {
+ const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+ Value *pred = bld.getScratch(1, FILE_FLAGS);
+ bld.setPosition(currBB, true);
+ // the quadop only writes flags; branch to the texture block on CC_EQ
+ bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
+ bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
+ currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
+ // lanes 0..2 get a successor block for the next lane's test
+ if (l <= 2) {
+ BasicBlock *laneBB = new BasicBlock(func);
+ currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
+ currBB = laneBB;
+ }
+ }
+ bld.setPosition(joinBB, false);
+ bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
+ return true;
+}
+
+// Lower TXD (texture lookup with explicit derivatives) to four plain TEX
+// operations, one per quad lane.
+//
+// For every lane l the coordinates of that lane are broadcast with a quadop,
+// dPdx and dPdy of lane l are applied using the per-lane operations from
+// qOps[l], a clone of the (now OP_TEX) instruction is executed, and its
+// results are copied by MOVs restricted to lane l. The per-lane copies are
+// finally merged into the original defs with OP_UNION and @i is removed from
+// its block. Always returns true.
+bool
+NV50LoweringPreSSA::handleTXD(TexInstruction *i)
+{
+ // qOps[l][0] is used with dPdx, qOps[l][1] with dPdy, for source lane l
+ static const uint8_t qOps[4][2] =
+ {
+ { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
+ { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
+ { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
+ { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
+ };
+ // def[c][l]: result component c as computed for lane l
+ Value *def[4][4];
+ Value *crd[3];
+ Instruction *tex;
+ Value *zero = bld.loadImm(bld.getSSA(), 0);
+ int l, c;
+ const int dim = i->tex.target.getDim();
+
+ handleTEX(i);
+ i->op = OP_TEX; // no need to clone dPdx/dPdy later
+
+ for (c = 0; c < dim; ++c)
+ crd[c] = bld.getScratch();
+
+ bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+ for (l = 0; l < 4; ++l) {
+ // mov coordinates from lane l to all lanes
+ for (c = 0; c < dim; ++c)
+ bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+ // add dPdx from lane l to lanes dx
+ for (c = 0; c < dim; ++c)
+ bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
+ // add dPdy from lane l to lanes dy
+ for (c = 0; c < dim; ++c)
+ bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+ // texture
+ bld.insert(tex = cloneForward(func, i));
+ for (c = 0; c < dim; ++c)
+ tex->setSrc(c, crd[c]);
+ // save results
+ for (c = 0; i->defExists(c); ++c) {
+ Instruction *mov;
+ def[c][l] = bld.getSSA();
+ mov = bld.mkMov(def[c][l], tex->getDef(c));
+ // the copy is marked fixed and limited to lane l
+ mov->fixed = 1;
+ mov->lanes = 1 << l;
+ }
+ }
+ bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
+
+ // merge the per-lane results into the original defs
+ for (c = 0; i->defExists(c); ++c) {
+ Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
+ for (l = 0; l < 4; ++l)
+ u->setSrc(l, def[c][l]);
+ }
+
+ i->bb->remove(i);
+ return true;
+}
+
+// A SET with an f32 destination is changed to produce u32; an ABS (as s32)
+// and a conversion from s32 to f32 are appended on the same def to form the
+// float result. Other destination types are left alone. Always returns true.
+bool
+NV50LoweringPreSSA::handleSET(Instruction *i)
+{
+   if (i->dType != TYPE_F32)
+      return true;
+
+   bld.setPosition(i, true);
+   i->dType = TYPE_U32;
+
+   Value *const result = i->getDef(0);
+   bld.mkOp1(OP_ABS, TYPE_S32, result, result);
+   bld.mkCvt(OP_CVT, TYPE_F32, result, TYPE_S32, result);
+
+   return true;
+}
+
+// Lower SLCT to a flag-setting SET followed by two predicated MOVs and a
+// UNION.
+//
+// @i itself becomes  SET(src2, 0)  writing only the new flags value; after it
+// source 0 is copied under CC_NE and source 1 under CC_EQ, and the UNION of
+// both copies defines the original result. Always returns true.
+bool
+NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
+{
+ Value *src0 = bld.getSSA();
+ Value *src1 = bld.getSSA();
+ Value *pred = bld.getScratch(1, FILE_FLAGS);
+
+ Value *v0 = i->getSrc(0);
+ Value *v1 = i->getSrc(1);
+ // XXX: these probably shouldn't be immediates in the first place ...
+ if (v0->asImm())
+ v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+ if (v1->asImm())
+ v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+ // the selecting MOVs and the UNION are built at i with 'true' ...
+ bld.setPosition(i, true);
+ bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
+ bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
+ bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+
+ // ... while the immediate 0 loaded for the SET is built at i with 'false'
+ bld.setPosition(i, false);
+ i->op = OP_SET;
+ i->setFlagsDef(0, pred);
+ i->dType = TYPE_U8;
+ i->setSrc(0, i->getSrc(2));
+ i->setSrc(2, NULL);
+ i->setSrc(1, bld.loadImm(NULL, 0));
+
+ return true;
+}
+
+// Lower SELP to two predicated MOVs and a UNION, using source 2 of @i as the
+// predicate: source 0 is copied under CC_NE, source 1 under CC_EQ.
+// Immediate operands are first moved into new values. @i is deleted.
+// Always returns true.
+bool
+NV50LoweringPreSSA::handleSELP(Instruction *i)
+{
+ Value *src0 = bld.getSSA();
+ Value *src1 = bld.getSSA();
+
+ Value *v0 = i->getSrc(0);
+ Value *v1 = i->getSrc(1);
+ if (v0->asImm())
+ v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+ if (v1->asImm())
+ v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+ bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
+ bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
+ // the UNION of both copies defines the original result
+ bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+ delete_Instruction(prog, i);
+ return true;
+}
+
+// Turn a system value write into an OP_EXPORT store to the shader output
+// address the target reports for the symbol. Returns false (leaving @i
+// untouched) if that address is 0x400 or above; otherwise @i is removed from
+// the builder's current block and true is returned.
+bool
+NV50LoweringPreSSA::handleWRSV(Instruction *i)
+{
+   Symbol *const sv = i->getSrc(0)->asSym();
+
+   // these are all shader outputs, $sreg are not writeable
+   const uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sv);
+   if (addr >= 0x400)
+      return false;
+
+   Symbol *const out = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
+   bld.mkStore(OP_EXPORT, i->dType, out, i->getIndirect(0, 0), i->getSrc(1));
+
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// In compute programs every call gets 'tid' appended as an additional
+// source; calls in other program types are left alone. Always returns true.
+bool
+NV50LoweringPreSSA::handleCALL(Instruction *i)
+{
+   const bool isCompute = prog->getType() == Program::TYPE_COMPUTE;
+
+   // Add implicit "thread id" argument in $r0 to the function
+   if (isCompute)
+      i->setSrc(i->srcCount(), tid);
+
+   return true;
+}
+
+// PRECONT is simply dropped: the instruction is deleted.
+// Always returns true.
+bool
+NV50LoweringPreSSA::handlePRECONT(Instruction *i)
+{
+ delete_Instruction(prog, i);
+ return true;
+}
+
+// CONT is turned into a plain branch by rewriting its opcode to OP_BRA.
+// Always returns true.
+bool
+NV50LoweringPreSSA::handleCONT(Instruction *i)
+{
+ i->op = OP_BRA;
+ return true;
+}
+
+// Lower a system value read.
+//
+// If the target reports an input address of 0x400 or above the instruction
+// is kept as it is. Otherwise a replacement sequence that depends on the
+// system value is built and @i is removed from the builder's current block.
+// Always returns true.
+bool
+NV50LoweringPreSSA::handleRDSV(Instruction *i)
+{
+ Symbol *sym = i->getSrc(0)->asSym();
+ uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
+ Value *def = i->getDef(0);
+ SVSemantic sv = sym->reg.data.sv.sv;
+ int idx = sym->reg.data.sv.index;
+
+ if (addr >= 0x400) // mov $sreg
+ return true;
+
+ switch (sv) {
+ case SV_POSITION:
+ // linearly interpolated input, fragment programs only
+ assert(prog->getType() == Program::TYPE_FRAGMENT);
+ bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+ break;
+ case SV_FACE:
+ bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
+ if (i->dType == TYPE_F32) {
+ // keep only the sign bit, then XOR with 0xbf800000 (the bits of -1.0f)
+ bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
+ bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
+ }
+ break;
+ case SV_NCTAID:
+ case SV_CTAID:
+ case SV_NTID:
+ // components beyond the supported ones read as constants (1 for the
+ // counts, 0 for the id); the others are 16 bit loads from shared memory
+ if ((sv == SV_NCTAID && idx >= 2) ||
+ (sv == SV_NTID && idx >= 3)) {
+ bld.mkMov(def, bld.mkImm(1));
+ } else if (sv == SV_CTAID && idx >= 2) {
+ bld.mkMov(def, bld.mkImm(0));
+ } else {
+ Value *x = bld.getSSA(2);
+ bld.mkOp1(OP_LOAD, TYPE_U16, x,
+ bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
+ bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
+ }
+ break;
+ case SV_TID:
+ // the components are extracted from 'tid': bits 15:0 for index 0,
+ // bits 25:16 for index 1, bits 31:26 for index 2, 0 otherwise
+ if (idx == 0) {
+ bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
+ } else if (idx == 1) {
+ bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
+ bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
+ } else if (idx == 2) {
+ bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
+ } else {
+ bld.mkMov(def, bld.mkImm(0));
+ }
+ break;
+ default:
+ // everything else is fetched from the shader input at addr
+ bld.mkFetch(i->getDef(0), i->dType,
+ FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
+ break;
+ }
+ bld.getBB()->remove(i);
+ return true;
+}
+
+// Float division becomes a multiplication by the reciprocal: an RCP of
+// source 1 is emitted and @i is rewritten to MUL(src0, rcp). Non-float
+// divisions are left alone at this stage. Always returns true.
+bool
+NV50LoweringPreSSA::handleDIV(Instruction *i)
+{
+   if (isFloatType(i->dType)) {
+      bld.setPosition(i, false);
+
+      Instruction *const recip =
+         bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+
+      i->op = OP_MUL;
+      i->setSrc(1, recip->getDef(0));
+   }
+   return true;
+}
+
+// SQRT is rewritten as  src0 * RSQ(src0): an f32 RSQ of source 0 is emitted
+// and @i becomes a MUL with the RSQ result as its second source.
+// Always returns true.
+bool
+NV50LoweringPreSSA::handleSQRT(Instruction *i)
+{
+   Instruction *const invRoot =
+      bld.mkOp1(OP_RSQ, TYPE_F32, bld.getSSA(), i->getSrc(0));
+
+   i->op = OP_MUL;
+   i->setSrc(1, invRoot->getDef(0));
+
+   return true;
+}
+
+// POW is rewritten as  EX2(PREEX2(src1 * LG2(src0))): the first three steps
+// are emitted into one scratch value (the MUL with dnz set) and @i becomes
+// the final single-source EX2. Always returns true.
+bool
+NV50LoweringPreSSA::handlePOW(Instruction *i)
+{
+   LValue *tmp = bld.getScratch();
+
+   bld.mkOp1(OP_LG2, TYPE_F32, tmp, i->getSrc(0));
+
+   Instruction *const scale = bld.mkOp2(OP_MUL, TYPE_F32, tmp, i->getSrc(1), tmp);
+   scale->dnz = 1;
+
+   bld.mkOp1(OP_PREEX2, TYPE_F32, tmp, tmp);
+
+   i->op = OP_EX2;
+   i->setSrc(0, tmp);
+   i->setSrc(1, NULL);
+
+   return true;
+}
+
+// In fragment programs an export becomes a final MOV into the GPR whose id
+// is the export's byte offset divided by 4, and prog->maxGPR is raised to
+// cover that register. Indirect exports are not supported there and make the
+// function return false. Exports of other program types are left untouched.
+bool
+NV50LoweringPreSSA::handleEXPORT(Instruction *i)
+{
+   if (prog->getType() != Program::TYPE_FRAGMENT)
+      return true;
+
+   // TODO: redirect to l[] here, load to GPRs at exit
+   if (i->getIndirect(0, 0))
+      return false;
+
+   const int reg = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
+
+   i->op = OP_MOV;
+   i->subOp = NV50_IR_SUBOP_MOV_FINAL;
+   // the exported value moves from source 1 to source 0
+   i->src(0).set(i->src(1));
+   i->setSrc(1, NULL);
+   i->setDef(0, new_LValue(func, FILE_GPR));
+   i->getDef(0)->reg.data.id = reg;
+
+   prog->maxGPR = MAX2(prog->maxGPR, reg);
+
+   return true;
+}
+
+// Make sure a predicated instruction reads its predicate from a flags value.
+// A predicate that is not in FILE_FLAGS is compared against 0 with
+// SET (CC_NEU) into a new flags value, which then replaces it; the
+// instruction's condition code is kept.
+void
+NV50LoweringPreSSA::checkPredicate(Instruction *insn)
+{
+   Value *const pred = insn->getPredicate();
+   if (!pred)
+      return;
+   if (pred->reg.file == FILE_FLAGS)
+      return;
+
+   Value *const flags = bld.getSSA(1, FILE_FLAGS);
+
+   bld.mkCmp(OP_SET, CC_NEU, insn->dType, flags, bld.loadImm(NULL, 0), pred);
+
+   insn->setPredicate(insn->cc, flags);
+}
+
+//
+// Per-instruction entry point of the pre-SSA lowering:
+// - add quadop dance for texturing
+// - put FP outputs in GPRs
+// - convert instruction sequences
+//
+// The builder is positioned at @i, predicates of conditionally executed
+// instructions are checked, and the opcode selects the handler whose result
+// is returned. Opcodes without a handler yield true.
+//
+bool
+NV50LoweringPreSSA::visit(Instruction *i)
+{
+   bld.setPosition(i, false);
+
+   if (i->cc != CC_ALWAYS)
+      checkPredicate(i);
+
+   bool ok = true;
+
+   switch (i->op) {
+   case OP_TEX:
+   case OP_TXF:
+   case OP_TXG:
+      ok = handleTEX(i->asTex());
+      break;
+   case OP_TXB:
+      ok = handleTXB(i->asTex());
+      break;
+   case OP_TXL:
+      ok = handleTXL(i->asTex());
+      break;
+   case OP_TXD:
+      ok = handleTXD(i->asTex());
+      break;
+   case OP_EX2:
+      // EX2 reads the PREEX2 of its original source, computed into its own def
+      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
+      i->setSrc(0, i->getDef(0));
+      break;
+   case OP_SET:
+      ok = handleSET(i);
+      break;
+   case OP_SLCT:
+      ok = handleSLCT(i->asCmp());
+      break;
+   case OP_SELP:
+      ok = handleSELP(i);
+      break;
+   case OP_POW:
+      ok = handlePOW(i);
+      break;
+   case OP_DIV:
+      ok = handleDIV(i);
+      break;
+   case OP_SQRT:
+      ok = handleSQRT(i);
+      break;
+   case OP_EXPORT:
+      ok = handleEXPORT(i);
+      break;
+   case OP_RDSV:
+      ok = handleRDSV(i);
+      break;
+   case OP_WRSV:
+      ok = handleWRSV(i);
+      break;
+   case OP_CALL:
+      ok = handleCALL(i);
+      break;
+   case OP_PRECONT:
+      ok = handlePRECONT(i);
+      break;
+   case OP_CONT:
+      ok = handleCONT(i);
+      break;
+   default:
+      break;
+   }
+   return ok;
+}
+
+// Run the nv50 legalization pass that belongs to @stage on @prog.
+//
+//  - CG_STAGE_PRE_SSA: NV50LoweringPreSSA
+//  - CG_STAGE_SSA:     NV50LegalizeSSA; the instruction list kept in
+//                      prog->targetPriv is allocated first if it is missing
+//  - CG_STAGE_POST_RA: NV50LegalizePostRA; afterwards the list in
+//                      prog->targetPriv is freed
+//
+// Returns the result of the pass' run(), or false for any other stage.
+bool
+TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
+{
+   typedef std::list<Instruction *> OutWriteList;
+
+   bool ret = false;
+
+   if (stage == CG_STAGE_PRE_SSA) {
+      NV50LoweringPreSSA pass(prog);
+      ret = pass.run(prog, false, true);
+   } else
+   if (stage == CG_STAGE_SSA) {
+      if (!prog->targetPriv)
+         prog->targetPriv = new OutWriteList();
+      NV50LegalizeSSA pass(prog);
+      ret = pass.run(prog, false, true);
+   } else
+   if (stage == CG_STAGE_POST_RA) {
+      NV50LegalizePostRA pass;
+      ret = pass.run(prog, false, true);
+      // Free the list and forget the pointer: the SSA stage above only
+      // allocates a new list when targetPriv is NULL, so a stale pointer
+      // would be reused after being freed. (delete on NULL is a no-op.)
+      delete reinterpret_cast<OutWriteList *>(prog->targetPriv);
+      prog->targetPriv = NULL;
+   }
+   return ret;
+}
+
+} // namespace nv50_ir