1 files changed, 1101 insertions, 0 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
new file mode 100644
index 0000000..56eaad3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -0,0 +1,1101 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+#include "codegen/nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+// nv50 doesn't support 32 bit integer multiplication
+//
+//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
+// -------------------
+//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
+// ah*bh 00 00                 (           carry1) << 16 + ( carry2)
+//       al*bl
+//    ah*bl 00
+// (ffff * ffff = fffe0001, so the cross term sum fffe0001 + fffe0001 can carry)
+// Returns false, leaving @mul alone, if its sType is not a 32/64 bit integer.
+static bool
+expandIntegerMUL(BuildUtil *bld, Instruction *mul)
+{
+   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
+   // select the half-width type that matches the full source type
+   DataType fTy = mul->sType; // full type
+   DataType hTy;
+   switch (fTy) {
+   case TYPE_S32: hTy = TYPE_S16; break;
+   case TYPE_U32: hTy = TYPE_U16; break;
+   case TYPE_U64: hTy = TYPE_U32; break;
+   case TYPE_S64: hTy = TYPE_S32; break;
+   default:
+      return false;
+   }
+   unsigned int fullSize = typeSizeof(fTy);
+   unsigned int halfSize = typeSizeof(hTy);
+
+   Instruction *i[9]; // emitted ops; [5], [6], [8] are only set if highResult
+
+   bld->setPosition(mul, true);
+
+   Value *a[2], *b[2]; // halves of source 0 / source 1, filled by mkSplit
+   Value *c[2]; // flags values, only used for the high result
+   Value *t[4]; // full-size temporaries
+   for (int j = 0; j < 4; ++j)
+      t[j] = bld->getSSA(fullSize);
+
+   // split sources into halves
+   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
+   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
+   // t0 = a0 * b1; t1 = a1 * b0 + t0; t2 = t1 << half bits; t3 = a0 * b0 + t2
+   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
+   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
+   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+
+   if (highResult) {
+      Value *r[3]; // NOTE(review): shift below is by 32 for 64 bit types - confirm
+      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
+      c[0] = bld->getSSA(1, FILE_FLAGS);
+      c[1] = bld->getSSA(1, FILE_FLAGS);
+      for (int j = 0; j < 3; ++j)
+         r[j] = bld->getSSA(fullSize);
+      // r0 = t1 >> half bits; r1 = r0 + imm (predicated below); r2 = union of both
+      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
+      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
+      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
+      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
+
+      // set carry defs / sources
+      i[3]->setFlagsDef(1, c[0]);
+      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
+      i[6]->setPredicate(CC_C, c[0]);
+      i[5]->setFlagsSrc(3, c[1]);
+   } else {
+      bld->mkMov(mul->getDef(0), t[3]); // low result: t3 is the product
+   }
+   delete_Instruction(bld->getProgram(), mul);
+   // the MUL/MADs emitted above take half-width sources
+   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
+      if (i[j])
+         i[j]->sType = hTy;
+
+   return true;
+}
+
+#define QOP_ADD  0 // 2 bit operation selectors, combined by QUADOP() below
+#define QOP_SUBR 1
+#define QOP_SUB  2
+#define QOP_MOV2 3
+// Pack four QOP_* selectors into one byte; the first argument lands in bits 7:6.
+//             UL UR LL LR   (presumably upper/lower left/right lane of a quad)
+#define QUADOP(q, r, s, t)            \
+   ((QOP_##q << 6) | (QOP_##r << 4) | \
+    (QOP_##s << 2) | (QOP_##t << 0))
+
+class NV50LegalizePostRA : public Pass // pass run for CG_STAGE_POST_RA
+{
+private:
+   virtual bool visit(Function *); // sets up r63, patches saved exports
+   virtual bool visit(BasicBlock *); // per-instruction cleanup
+
+   void handlePRERET(FlowInstruction *); // emulates PRERET (chipsets < 0xa0)
+   void replaceZero(Instruction *); // zero immediate sources -> r63
+
+   LValue *r63; // GPR value with id 63, created in visit(Function *)
+};
+
+bool
+NV50LegalizePostRA::visit(Function *fn)
+{
+   // GPR with id 63, substituted for zero immediates by replaceZero()
+   r63 = new_LValue(fn, FILE_GPR);
+   r63->reg.data.id = 63;
+   // this is actually per-program, but we can do it all on visiting main()
+   typedef std::list<Instruction *> InsnList;
+   InsnList *const saved =
+      reinterpret_cast<InsnList *>(fn->getProgram()->targetPriv);
+   if (!saved)
+      return true;
+
+   // make the instruction that defines each saved export's value write the
+   // export's destination instead; instructions will be deleted on exit
+   for (InsnList::iterator e = saved->begin(); e != saved->end(); ++e) {
+      Instruction *const st = *e;
+      st->getSrc(1)->defs.front()->getInsn()->setDef(0, st->getSrc(0));
+   }
+   saved->clear();
+   return true;
+}
+
+void
+NV50LegalizePostRA::replaceZero(Instruction *i)
+{
+   // every source that is an immediate with value 0 is redirected to r63
+   for (int idx = 0; i->srcExists(idx); ++idx)
+      if (ImmediateValue *imm = i->getSrc(idx)->asImm())
+         if (imm->reg.data.u64 == 0)
+            i->setSrc(idx, r63);
+}
+
+// Emulate PRERET: jump to the target and call to the origin from there
+//
+// WARNING: at the moment this only works if each BB is affected by at most
+// one PRERET.
+// BB:0
+// preret BB:3
+// (...)
+// BB:3
+// (...)
+//             --->
+// BB:0
+// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
+// (...)
+// BB:3
+// bra BB:3 + n1 (skip the call)
+// call BB:0 + n2 (skip bra at beginning of BB:0)
+// (...)
+void
+NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
+{
+   BasicBlock *bbE = pre->bb; // block containing the PRERET
+   BasicBlock *bbT = pre->target.bb; // block the PRERET targets
+   // the PRERET becomes variant +0 and is moved to the head of its block
+   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
+   bbE->remove(pre);
+   bbE->insertHead(pre);
+
+   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
+   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
+   // call is inserted at the head first, then skip in front of it
+   bbT->insertHead(call);
+   bbT->insertHead(skip);
+
+   // NOTE: maybe split blocks to prevent the instructions from moving ?
+   // all three keep opcode OP_PRERET and differ only in their subOp
+   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
+   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
+}
+
+// Post-RA cleanup of one block: drops no-ops, emulates PRERET on chipsets
+// below 0xa0, splits 64 bit operations, replaces zero immediates by r63.
+// Always returns true.
+bool
+NV50LegalizePostRA::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+
+   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
+   for (i = bb->getFirst(); i; i = next) {
+      next = i->next; // fetched up front, i may be unlinked below
+      if (i->isNop()) {
+         bb->remove(i);
+      } else if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
+         handlePRERET(i->asFlow());
+      } else {
+         // TODO: We will want to do this before register allocation,
+         // since have to use a $c register for the carry flag.
+         if (typeSizeof(i->dType) == 8) {
+            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
+            if (hi)
+               next = hi; // continue with the instruction the split returned
+         }
+
+         // MOV, PFETCH, BAR and address register defs keep zero immediates
+         if (i->op != OP_MOV && i->op != OP_PFETCH &&
+             i->op != OP_BAR &&
+             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
+            replaceZero(i);
+      }
+   }
+   return true;
+}
+
+class NV50LegalizeSSA : public Pass // pass run for CG_STAGE_SSA
+{
+public:
+   NV50LegalizeSSA(Program *);
+
+   virtual bool visit(BasicBlock *bb);
+
+private:
+   void propagateWriteToOutput(Instruction *); // queues an EXPORT in outWrites
+   void handleDIV(Instruction *); // 32 bit integer division via f32
+   void handleMOD(Instruction *); // a - (a / b) * b
+   void handleMUL(Instruction *); // integer MUL/MAD wider than 16 bit
+   void handleAddrDef(Instruction *); // fixes up defs in FILE_ADDRESS
+
+   inline bool isARL(const Instruction *) const; // SHL(GPR, immediate 0) ?
+
+   BuildUtil bld;
+
+   std::list<Instruction *> *outWrites; // prog->targetPriv, or NULL if unused
+};
+
+NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
+{
+   bld.setProgram(prog);
+
+   // exports are only collected for vertex/geometry programs at optLevel >= 2
+   const bool collect = prog->optLevel >= 2 &&
+      (prog->getType() == Program::TYPE_GEOMETRY ||
+       prog->getType() == Program::TYPE_VERTEX);
+
+   outWrites = collect ?
+      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv) : NULL;
+}
+
+void
+NV50LegalizeSSA::propagateWriteToOutput(Instruction *st) // st: an OP_EXPORT
+{
+   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
+      return;
+   // (indirect exports and values whose reference count is not 1 are skipped)
+   // check def instruction can store
+   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
+   // rejected: pseudo ops, texture ops, multiple defs, immediate sources
+   // TODO: move exports (if beneficial) in common opt pass
+   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
+      return;
+   for (int s = 0; di->srcExists(s); ++s)
+      if (di->src(s).getFile() == FILE_IMMEDIATE)
+         return;
+
+   // We cannot set defs to non-lvalues before register allocation, so
+   // save & remove (to save registers) the exports and replace later.
+   outWrites->push_back(st);
+   st->bb->remove(st);
+}
+
+bool
+NV50LegalizeSSA::isARL(const Instruction *i) const
+{
+   // an "ARL" here is a SHL of a GPR by an immediate shift count of 0
+   const bool shlOfGpr = i->op == OP_SHL && i->src(0).getFile() == FILE_GPR;
+   if (!shlOfGpr)
+      return false;
+
+   ImmediateValue shift;
+   return i->src(1).getImmediate(shift) && shift.isInteger(0);
+}
+
+void
+NV50LegalizeSSA::handleAddrDef(Instruction *i)
+{
+   Instruction *arl;
+   // i defines a value in FILE_ADDRESS (checked by the caller)
+   i->getDef(0)->reg.size = 2; // $aX are only 16 bit
+
+   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
+   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
+      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
+         return;
+      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
+         return;
+   }
+
+   // turn $a sources into $r sources (can't operate on $a)
+   for (int s = 0; i->srcExists(s); ++s) {
+      Value *a = i->getSrc(s);
+      Value *r;
+      if (a->reg.file == FILE_ADDRESS) {
+         if (a->getInsn() && isARL(a->getInsn())) {
+            i->setSrc(s, a->getInsn()->getSrc(0)); // use the SHL's GPR source
+         } else {
+            bld.setPosition(i, false);
+            r = bld.getSSA(); // otherwise copy $a into a new value first
+            bld.mkMov(r, a);
+            i->setSrc(s, r);
+         }
+      }
+   }
+   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
+      return;
+   // i is redirected to a fresh value; a SHL by 0 writes that to the old def
+   // turn result back into $a
+   bld.setPosition(i, true);
+   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
+   i->setDef(0, arl->getSrc(0));
+}
+
+void
+NV50LegalizeSSA::handleMUL(Instruction *mul)
+{
+   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
+      return; // float and <= 16 bit multiplies are left as they are
+   Value *def = mul->getDef(0);
+   Value *pred = mul->getPredicate();
+   CondCode cc = mul->cc;
+   if (pred)
+      mul->setPredicate(CC_ALWAYS, NULL); // re-applied to def's insn at the end
+
+   if (mul->op == OP_MAD) {
+      Instruction *add = mul; // the MAD is split into a new MUL plus this ADD
+      bld.setPosition(add, false);
+      Value *res = cloneShallow(func, mul->getDef(0));
+      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
+      add->op = OP_ADD;
+      add->setSrc(0, mul->getDef(0));
+      add->setSrc(1, add->getSrc(2));
+      for (int s = 2; add->srcExists(s); ++s)
+         add->setSrc(s, NULL);
+      mul->subOp = add->subOp; // e.g. MUL_HIGH moves over to the MUL
+      add->subOp = 0;
+   }
+   expandIntegerMUL(&bld, mul); // deletes mul when it expands it
+   if (pred)
+      def->getInsn()->setPredicate(cc, pred);
+}
+
+// Use f32 division: first compute an approximate result, use it to reduce
+// the dividend, which should then be representable as f32, divide the reduced
+// dividend, and add the quotients.
+void
+NV50LegalizeSSA::handleDIV(Instruction *div)
+{
+   const DataType ty = div->sType;
+
+   if (ty != TYPE_U32 && ty != TYPE_S32)
+      return; // only 32 bit integer division is lowered here
+
+   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
+
+   bld.setPosition(div, false);
+
+   Value *a, *af = bld.getSSA();
+   Value *b, *bf = bld.getSSA();
+   // af, bf: the operands converted to f32 (of their absolute values if signed)
+   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
+   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
+
+   if (isSignedType(ty)) {
+      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+      a = bld.getSSA();
+      b = bld.getSSA();
+      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
+      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
+   } else {
+      a = div->getSrc(0);
+      b = div->getSrc(1);
+   }
+   // bf = rcp(bf), followed by a U32 add of -2 on the result
+   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
+   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
+   // q0: first quotient estimate (both steps use ROUND_Z)
+   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
+   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
+
+   // get error of 1st result
+   expandIntegerMUL(&bld,
+      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
+   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
+   // NOTE: aRf is the integer remainder a - q0 * b, aR its f32 conversion
+   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
+
+   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
+   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
+      ->rnd = ROUND_Z;
+   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
+
+   // correction: if modulus >= divisor, add 1
+   expandIntegerMUL(&bld,
+      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
+   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
+   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
+   if (!isSignedType(ty)) {
+      div->op = OP_SUB; // q - s (presumably s is -1 when set, hence "add 1")
+      div->setSrc(0, q);
+      div->setSrc(1, s);
+   } else {
+      t = q;
+      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
+      s = bld.getSSA();
+      t = bld.getSSA();
+      // fix the sign
+      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
+         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
+      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
+      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
+      // div itself becomes the UNION of the negated and the plain quotient
+      div->op = OP_UNION;
+      div->setSrc(0, s);
+      div->setSrc(1, t);
+   }
+}
+
+void
+NV50LegalizeSSA::handleMOD(Instruction *mod)
+{
+   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
+      return; // only 32 bit integer modulo is lowered
+   bld.setPosition(mod, false);
+
+   Value *q = bld.getSSA(); // quotient src0 / src1
+   Value *m = bld.getSSA(); // q * src1
+   // emit a DIV and lower it right away with handleDIV()
+   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
+   handleDIV(q->getInsn());
+   // handleDIV() moved the builder position, so set it again
+   bld.setPosition(mod, false);
+   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
+   // mod itself becomes src0 - q * src1
+   mod->op = OP_SUB;
+   mod->setSrc(1, m);
+}
+
+bool
+NV50LegalizeSSA::visit(BasicBlock *bb)
+{
+   Instruction *insn, *next;
+   // skipping PHIs (don't pass them to handleAddrDef) !
+   for (insn = bb->getEntry(); insn; insn = next) {
+      next = insn->next;
+      // remember the def: handleMUL() may delete insn and define it elsewhere
+      Value *def = insn->defExists(0) ? insn->getDef(0) : NULL;
+      switch (insn->op) {
+      case OP_EXPORT:
+         if (outWrites)
+            propagateWriteToOutput(insn);
+         break;
+      case OP_DIV:
+         handleDIV(insn);
+         break;
+      case OP_MOD:
+         handleMOD(insn);
+         break;
+      case OP_MAD:
+      case OP_MUL:
+         handleMUL(insn);
+         break;
+      default:
+         break;
+      }
+
+      if (def && def->reg.file == FILE_ADDRESS && def->getInsn())
+         handleAddrDef(def->getInsn());
+   }
+   return true;
+}
+
+class NV50LoweringPreSSA : public Pass // pass run for CG_STAGE_PRE_SSA
+{
+public:
+   NV50LoweringPreSSA(Program *);
+
+private:
+   virtual bool visit(Instruction *); // dispatches on the opcode
+   virtual bool visit(Function *); // sets up 'tid' for compute programs
+
+   bool handleRDSV(Instruction *); // system value reads
+   bool handleWRSV(Instruction *); // system value writes -> EXPORT
+
+   bool handleEXPORT(Instruction *);
+
+   bool handleDIV(Instruction *); // float DIV -> MUL by RCP
+   bool handleSQRT(Instruction *);
+   bool handlePOW(Instruction *); // POW -> LG2, MUL, PREEX2, EX2
+
+   bool handleSET(Instruction *);
+   bool handleSLCT(CmpInstruction *);
+   bool handleSELP(Instruction *);
+
+   bool handleTEX(TexInstruction *); // common texture source fix-ups
+   bool handleTXB(TexInstruction *); // TEX with bias
+   bool handleTXL(TexInstruction *); // TEX with explicit lod
+   bool handleTXD(TexInstruction *); // TEX with explicit derivatives
+
+   bool handleCALL(Instruction *);
+   bool handlePRECONT(Instruction *);
+   bool handleCONT(Instruction *);
+
+   void checkPredicate(Instruction *); // non-flags predicate -> flags value
+
+private:
+   const Target *const targ; // prog->getTarget()
+
+   BuildUtil bld;
+
+   Value *tid; // copy of the implicit thread id argument (compute only)
+};
+
+NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
+   targ(prog->getTarget()), tid(NULL) // tid is only set for compute programs
+{
+   bld.setProgram(prog); // the builder emits into this program
+}
+
+bool
+NV50LoweringPreSSA::visit(Function *f)
+{
+   BasicBlock *const entry = BasicBlock::get(func->cfg.getRoot());
+   // only compute programs get the implicit argument set up below
+   if (prog->getType() != Program::TYPE_COMPUTE)
+      return true;
+
+   // Add implicit "thread id" argument in $r0 to the function
+   Value *threadId = new_LValue(func, FILE_GPR);
+   threadId->reg.data.id = 0;
+   f->ins.push_back(threadId);
+   // keep a copy, made at the root block, in 'tid' for RDSV/CALL lowering
+   bld.setPosition(entry, false);
+   tid = bld.mkMov(bld.getScratch(), threadId, TYPE_U32)->getDef(0);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleTEX(TexInstruction *i)
+{
+   const int arg = i->tex.target.getArgCount();
+   const int dref = arg; // source index right after the target's arguments
+   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
+
+   // dref comes before bias/lod
+   if (i->tex.target.isShadow())
+      if (i->op == OP_TXB || i->op == OP_TXL)
+         i->swapSources(dref, lod);
+
+   // array index must be converted to u32
+   if (i->tex.target.isArray()) {
+      Value *layer = i->getSrc(arg - 1); // last target argument
+      LValue *src = new_LValue(func, FILE_GPR);
+      bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
+      bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511)); // clamp
+      i->setSrc(arg - 1, src);
+      // cube arrays: TEXPREP maps the first 4 sources to 3, sampled as 2D array
+      if (i->tex.target.isCube()) {
+         std::vector<Value *> acube, a2d;
+         int c;
+
+         acube.resize(4);
+         for (c = 0; c < 4; ++c)
+            acube[c] = i->getSrc(c);
+         a2d.resize(4);
+         for (c = 0; c < 3; ++c)
+            a2d[c] = new_LValue(func, FILE_GPR);
+         a2d[3] = NULL;
+
+         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
+                   a2d, acube)->asTex()->tex.mask = 0x7;
+         // use the 3 results as sources 0..2 and shift later sources down by one
+         for (c = 0; c < 3; ++c)
+            i->setSrc(c, a2d[c]);
+         i->setSrc(c, NULL);
+         for (; i->srcExists(c + 1); ++c)
+            i->setSrc(c, i->getSrc(c + 1));
+
+         i->tex.target = i->tex.target.isShadow() ?
+            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
+      }
+   }
+
+   // texel offsets are 3 immediate fields in the instruction,
+   // nv50 cannot do textureGatherOffsets
+   assert(i->tex.useOffsets <= 1);
+
+   return true;
+}
+
+// Bias must be equal for all threads of a quad or lod calculation will fail.
+//
+// The lanes of a quad are grouped by the bit in the condition register they
+// have set, which is selected by differing bias values.
+// Move the input values for TEX into a new register set for each group and
+// execute TEX only for a specific group.
+// We always need to use 4 new registers for the inputs/outputs because the
+// implicitly calculated derivatives must be correct.
+//
+// TODO: move to SSA phase so we can easily determine whether bias is constant
+bool
+NV50LoweringPreSSA::handleTXB(TexInstruction *i)
+{
+   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O }; // one per TEX clone
+   int l, d;
+
+   handleTEX(i);
+   Value *bias = i->getSrc(i->tex.target.getArgCount());
+   if (bias->isUniform())
+      return true; // nothing to do if the bias is uniform
+   // cond: UNION with source 0 = 1; sources 1..3 are the bits built below
+   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
+                                 bld.loadImm(NULL, 1));
+   bld.setPosition(cond, false);
+
+   for (l = 1; l < 4; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *bit = bld.getSSA();
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      Value *imm = bld.loadImm(NULL, (1 << l));
+      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
+      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
+      cond->setSrc(l, bit);
+   }
+   Value *flags = bld.getScratch(1, FILE_FLAGS);
+   bld.setPosition(cond, true);
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+   // one clone of the TEX per condition code, each predicated on flags
+   Instruction *tex[4];
+   for (l = 0; l < 4; ++l) {
+      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
+      bld.insert(tex[l]);
+   }
+   // res[l][d]: def d of clone l (clones 1..3 via a predicated mov)
+   Value *res[4][4];
+   for (d = 0; i->defExists(d); ++d)
+      res[0][d] = tex[0]->getDef(d);
+   for (l = 1; l < 4; ++l) {
+      for (d = 0; tex[l]->defExists(d); ++d) {
+         res[l][d] = cloneShallow(func, res[0][d]);
+         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
+      }
+   }
+   // the original defs become UNIONs of the four variants, then i is deleted
+   for (d = 0; i->defExists(d); ++d) {
+      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
+      for (l = 0; l < 4; ++l)
+         dst->setSrc(l, res[l][d]);
+   }
+   delete_Instruction(prog, i);
+   return true;
+}
+
+// LOD must be equal for all threads of a quad.
+// Unlike with TXB, here we can just diverge since there's no LOD calculation
+// that would require all 4 threads' sources to be set up properly.
+bool
+NV50LoweringPreSSA::handleTXL(TexInstruction *i)
+{
+   handleTEX(i);
+   Value *lod = i->getSrc(i->tex.target.getArgCount());
+   if (lod->isUniform())
+      return true; // nothing to do if the lod is uniform
+   // NOTE(review): presumably texiBB ends up holding i - confirm splitBefore
+   BasicBlock *currBB = i->bb;
+   BasicBlock *texiBB = i->bb->splitBefore(i, false);
+   BasicBlock *joinBB = i->bb->splitAfter(i);
+
+   bld.setPosition(currBB, true);
+   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+   // per lane l: quadop on lod, fixed BRA to texiBB on CC_EQ, then a new block
+   for (int l = 0; l <= 3; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      bld.setPosition(currBB, true);
+      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
+      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
+      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
+      if (l <= 2) { // no extra block after the last lane
+         BasicBlock *laneBB = new BasicBlock(func);
+         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
+         currBB = laneBB;
+      }
+   }
+   bld.setPosition(joinBB, false);
+   bld.mkOp(OP_JOIN, TYPE_NONE, NULL); // JOIN emitted in joinBB
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleTXD(TexInstruction *i)
+{
+   static const uint8_t qOps[4][2] = // [lane][0: dPdx step, 1: dPdy step]
+   {
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
+      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
+   };
+   Value *def[4][4]; // [def index][lane]
+   Value *crd[3]; // scratch coordinates, one per dimension
+   Instruction *tex;
+   Value *zero = bld.loadImm(bld.getSSA(), 0);
+   int l, c;
+   const int dim = i->tex.target.getDim();
+
+   handleTEX(i);
+   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+
+   for (c = 0; c < dim; ++c)
+      crd[c] = bld.getScratch();
+
+   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+   for (l = 0; l < 4; ++l) {
+      // mov coordinates from lane l to all lanes
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+      // add dPdx from lane l to lanes dx
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
+      // add dPdy from lane l to lanes dy
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // texture
+      bld.insert(tex = cloneForward(func, i));
+      for (c = 0; c < dim; ++c)
+         tex->setSrc(c, crd[c]);
+      // save results
+      for (c = 0; i->defExists(c); ++c) {
+         Instruction *mov;
+         def[c][l] = bld.getSSA();
+         mov = bld.mkMov(def[c][l], tex->getDef(c));
+         mov->fixed = 1;
+         mov->lanes = 1 << l; // the mov is restricted to lane l
+      }
+   }
+   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
+   // each original def becomes the UNION of its four per-lane copies
+   for (c = 0; i->defExists(c); ++c) {
+      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
+      for (l = 0; l < 4; ++l)
+         u->setSrc(l, def[c][l]);
+   }
+
+   i->bb->remove(i); // the original TXD is unlinked, not deleted
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleSET(Instruction *i)
+{
+   if (i->dType == TYPE_F32) { // f32 result: U32 SET, then ABS and S32 -> F32
+      bld.setPosition(i, true);
+      i->dType = TYPE_U32;
+      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
+      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
+   }
+   return true; // other result types are left unchanged
+}
+
+bool
+NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
+{
+   Value *src0 = bld.getSSA();
+   Value *src1 = bld.getSSA();
+   Value *pred = bld.getScratch(1, FILE_FLAGS);
+   // v0/v1: the two candidate values; immediates are moved into values first
+   Value *v0 = i->getSrc(0);
+   Value *v1 = i->getSrc(1);
+   // XXX: these probably shouldn't be immediates in the first place ...
+   if (v0->asImm())
+      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+   if (v1->asImm())
+      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+   // two movs predicated on pred (CC_NE / CC_EQ), merged by a UNION
+   bld.setPosition(i, true);
+   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
+   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
+   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+   // i itself becomes a SET of its third source against 0 that writes pred
+   bld.setPosition(i, false);
+   i->op = OP_SET;
+   i->setFlagsDef(0, pred);
+   i->dType = TYPE_U8;
+   i->setSrc(0, i->getSrc(2));
+   i->setSrc(2, NULL);
+   i->setSrc(1, bld.loadImm(NULL, 0));
+
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleSELP(Instruction *i)
+{
+   Value *src0 = bld.getSSA();
+   Value *src1 = bld.getSSA();
+   // v0/v1: the two candidate values; immediates are moved into values first
+   Value *v0 = i->getSrc(0);
+   Value *v1 = i->getSrc(1);
+   if (v0->asImm())
+      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+   if (v1->asImm())
+      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+   // movs predicated on source 2 (CC_NE / CC_EQ), merged by a UNION into def 0
+   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
+   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
+   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+   delete_Instruction(prog, i); // the SELP itself is no longer needed
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleWRSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+   // returns false if the target reports an address >= 0x400 for the value
+   // these are all shader outputs, $sreg are not writeable
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
+   if (addr >= 0x400)
+      return false;
+   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
+   // the write becomes an EXPORT of source 1 to that output address
+   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
+   // the WRSV is unlinked from its block (not deleted)
+   bld.getBB()->remove(i);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleCALL(Instruction *i)
+{
+   // Add implicit "thread id" argument in $r0 to the function
+   // (appended as a new last source, for compute programs only)
+   if (prog->getType() == Program::TYPE_COMPUTE)
+      i->setSrc(i->srcCount(), tid);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handlePRECONT(Instruction *i)
+{
+   delete_Instruction(prog, i); // PRECONT is simply dropped
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleCONT(Instruction *i)
+{
+   i->op = OP_BRA; // CONT is turned into a plain branch
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleRDSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
+   Value *def = i->getDef(0);
+   SVSemantic sv = sym->reg.data.sv.sv;
+   int idx = sym->reg.data.sv.index;
+   // addresses >= 0x400 are left as they are (the RDSV stays in place)
+   if (addr >= 0x400) // mov $sreg
+      return true;
+   // everything else is replaced by explicit code and the RDSV is removed
+   switch (sv) {
+   case SV_POSITION:
+      assert(prog->getType() == Program::TYPE_FRAGMENT);
+      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+      break;
+   case SV_FACE:
+      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
+      if (i->dType == TYPE_F32) { // sign bit -> 0x3f800000 / 0xbf800000 (+-1.0f)
+         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
+         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
+      }
+      break;
+   case SV_NCTAID:
+   case SV_CTAID:
+   case SV_NTID:
+      if ((sv == SV_NCTAID && idx >= 2) ||
+          (sv == SV_NTID && idx >= 3)) {
+         bld.mkMov(def, bld.mkImm(1)); // out of range sizes read as 1
+      } else if (sv == SV_CTAID && idx >= 2) {
+         bld.mkMov(def, bld.mkImm(0)); // out of range ids read as 0
+      } else {
+         Value *x = bld.getSSA(2); // 16 bit load from shared memory at addr
+         bld.mkOp1(OP_LOAD, TYPE_U16, x,
+                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
+         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
+      }
+      break;
+   case SV_TID: // extracted from tid: bits 0-15, bits 16-25, bits 26 and up
+      if (idx == 0) {
+         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
+      } else if (idx == 1) {
+         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
+         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
+      } else if (idx == 2) {
+         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
+      } else {
+         bld.mkMov(def, bld.mkImm(0));
+      }
+      break;
+   default: // any other system value is fetched from the shader inputs
+      bld.mkFetch(i->getDef(0), i->dType,
+                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
+      break;
+   }
+   bld.getBB()->remove(i);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleDIV(Instruction *i)
+{
+   if (!isFloatType(i->dType))
+      return true; // non-float DIVs are not touched by this pass
+   bld.setPosition(i, false);
+   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+   i->op = OP_MUL; // a / b becomes a * rcp(b)
+   i->setSrc(1, rcp->getDef(0));
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleSQRT(Instruction *i)
+{
+   // sqrt(x) = rcp(rsq(x)); x * rsq(x) would give 0 * inf = NaN for x = 0.
+   // The RCP emitted after i overwrites i's def (not in SSA form yet).
+   bld.setPosition(i, true);
+   i->op = OP_RSQ;
+   bld.mkOp1(OP_RCP, TYPE_F32, i->getDef(0), i->getDef(0));
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handlePOW(Instruction *i)
+{
+   LValue *val = bld.getScratch();
+   // pow(x, y) is emitted as ex2(preex2(y * lg2(x)))
+   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
+   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
+   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
+   // i itself becomes the final EX2 with val as its only source
+   i->op = OP_EX2;
+   i->setSrc(0, val);
+   i->setSrc(1, NULL);
+
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleEXPORT(Instruction *i)
+{
+   if (prog->getType() == Program::TYPE_FRAGMENT) { // others: left unchanged
+      if (i->getIndirect(0, 0)) {
+         // TODO: redirect to l[] here, load to GPRs at exit
+         return false;
+      } else {
+         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
+         // the export becomes a MOV of its value into the GPR with that id
+         i->op = OP_MOV;
+         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
+         i->src(0).set(i->src(1));
+         i->setSrc(1, NULL);
+         i->setDef(0, new_LValue(func, FILE_GPR));
+         i->getDef(0)->reg.data.id = id;
+
+         prog->maxGPR = MAX2(prog->maxGPR, id);
+      }
+   }
+   return true;
+}
+
+// Set flags according to predicate and make the instruction read $cX.
+void
+NV50LoweringPreSSA::checkPredicate(Instruction *insn)
+{
+   Value *pred = insn->getPredicate();
+   Value *cdst;
+   // nothing to do without a predicate or if it already is a flags value
+   if (!pred || pred->reg.file == FILE_FLAGS)
+      return;
+   cdst = bld.getSSA(1, FILE_FLAGS);
+   // cdst = SET(0 CC_NEU pred), emitted at the builder's current position
+   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, bld.loadImm(NULL, 0), pred);
+   // same condition code as before, now reading the new flags value
+   insn->setPredicate(insn->cc, cdst);
+}
+
+// Per-instruction entry point of the pre-SSA lowering:
+// - add quadop dance for texturing
+// - put FP outputs in GPRs
+// - convert instruction sequences
+// Returns the handler's result, or true if the opcode has no handler.
+bool
+NV50LoweringPreSSA::visit(Instruction *i)
+{
+   bld.setPosition(i, false);
+   // predicated instructions first get their predicate turned into flags
+   if (i->cc != CC_ALWAYS)
+      checkPredicate(i);
+   // dispatch on the opcode
+   switch (i->op) {
+   case OP_TEX:
+   case OP_TXF:
+   case OP_TXG:
+      return handleTEX(i->asTex());
+   case OP_TXB:
+      return handleTXB(i->asTex());
+   case OP_TXL:
+      return handleTXL(i->asTex());
+   case OP_TXD:
+      return handleTXD(i->asTex());
+   case OP_EX2: // insert a PREEX2 and let the EX2 read its result
+      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
+      i->setSrc(0, i->getDef(0));
+      break;
+   case OP_SET:
+      return handleSET(i);
+   case OP_SLCT:
+      return handleSLCT(i->asCmp());
+   case OP_SELP:
+      return handleSELP(i);
+   case OP_POW:
+      return handlePOW(i);
+   case OP_DIV:
+      return handleDIV(i);
+   case OP_SQRT:
+      return handleSQRT(i);
+   case OP_EXPORT:
+      return handleEXPORT(i);
+   case OP_RDSV:
+      return handleRDSV(i);
+   case OP_WRSV:
+      return handleWRSV(i);
+   case OP_CALL:
+      return handleCALL(i);
+   case OP_PRECONT:
+      return handlePRECONT(i);
+   case OP_CONT:
+      return handleCONT(i);
+   default:
+      break;
+   }
+   return true;
+}
+
+// Runs the pass for @stage; returns its result (false if no pass applies).
+bool
+TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
+{
+   bool ret = false;
+   if (stage == CG_STAGE_PRE_SSA) {
+      NV50LoweringPreSSA pass(prog);
+      ret = pass.run(prog, false, true);
+   } else if (stage == CG_STAGE_SSA) {
+      // list of exports removed by NV50LegalizeSSA, consumed post-RA
+      if (!prog->targetPriv)
+         prog->targetPriv = new std::list<Instruction *>();
+      NV50LegalizeSSA pass(prog);
+      ret = pass.run(prog, false, true);
+   } else if (stage == CG_STAGE_POST_RA) {
+      NV50LegalizePostRA pass;
+      ret = pass.run(prog, false, true);
+      // delete accepts NULL; reset so no later stage sees the freed list
+      delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+      prog->targetPriv = NULL;
+   }
+   return ret;
+}
+
+} // namespace nv50_ir