diff options
Diffstat (limited to 'src/gallium/drivers/nouveau/codegen')
32 files changed, 27861 insertions, 0 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp new file mode 100644 index 0000000..90fb51c --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp @@ -0,0 +1,1231 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "codegen/nv50_ir.h" +#include "codegen/nv50_ir_target.h" +#include "codegen/nv50_ir_driver.h" + +extern "C" { +#include "nv50/nv50_program.h" +#include "nv50/nv50_debug.h" +} + +namespace nv50_ir { + +Modifier::Modifier(operation op) +{ + switch (op) { + case OP_NEG: bits = NV50_IR_MOD_NEG; break; + case OP_ABS: bits = NV50_IR_MOD_ABS; break; + case OP_SAT: bits = NV50_IR_MOD_SAT; break; + case OP_NOT: bits = NV50_IR_MOD_NOT; break; + default: + bits = 0; + break; + } +} + +Modifier Modifier::operator*(const Modifier m) const +{ + unsigned int a, b, c; + + b = m.bits; + if (this->bits & NV50_IR_MOD_ABS) + b &= ~NV50_IR_MOD_NEG; + + a = (this->bits ^ b) & (NV50_IR_MOD_NOT | NV50_IR_MOD_NEG); + c = (this->bits | m.bits) & (NV50_IR_MOD_ABS | NV50_IR_MOD_SAT); + + return Modifier(a | c); +} + +ValueRef::ValueRef(Value *v) : value(NULL), insn(NULL) +{ + indirect[0] = -1; + indirect[1] = -1; + usedAsPtr = false; + set(v); +} + +ValueRef::ValueRef(const ValueRef& ref) : value(NULL), insn(ref.insn) +{ + set(ref); + usedAsPtr = ref.usedAsPtr; +} + +ValueRef::~ValueRef() +{ + this->set(NULL); +} + +bool ValueRef::getImmediate(ImmediateValue &imm) const +{ + const ValueRef *src = this; + Modifier m; + DataType type = src->insn->sType; + + while (src) { + if (src->mod) { + if (src->insn->sType != type) + break; + m *= src->mod; + } + if (src->getFile() == FILE_IMMEDIATE) { + imm = *(src->value->asImm()); + // The immediate's type isn't required to match its use, it's + // more of a hint; applying a modifier makes use of that hint. 
+ imm.reg.type = type; + m.applyTo(imm); + return true; + } + + Instruction *insn = src->value->getUniqueInsn(); + + if (insn && insn->op == OP_MOV) { + src = &insn->src(0); + if (src->mod) + WARN("OP_MOV with modifier encountered !\n"); + } else { + src = NULL; + } + } + return false; +} + +ValueDef::ValueDef(Value *v) : value(NULL), insn(NULL) +{ + set(v); +} + +ValueDef::ValueDef(const ValueDef& def) : value(NULL), insn(NULL) +{ + set(def.get()); +} + +ValueDef::~ValueDef() +{ + this->set(NULL); +} + +void +ValueRef::set(const ValueRef &ref) +{ + this->set(ref.get()); + mod = ref.mod; + indirect[0] = ref.indirect[0]; + indirect[1] = ref.indirect[1]; +} + +void +ValueRef::set(Value *refVal) +{ + if (value == refVal) + return; + if (value) + value->uses.remove(this); + if (refVal) + refVal->uses.push_back(this); + + value = refVal; +} + +void +ValueDef::set(Value *defVal) +{ + if (value == defVal) + return; + if (value) + value->defs.remove(this); + if (defVal) + defVal->defs.push_back(this); + + value = defVal; +} + +// Check if we can replace this definition's value by the value in @rep, +// including the source modifiers, i.e. make sure that all uses support +// @rep.mod. +bool +ValueDef::mayReplace(const ValueRef &rep) +{ + if (!rep.mod) + return true; + + if (!insn || !insn->bb) // Unbound instruction ? + return false; + + const Target *target = insn->bb->getProgram()->getTarget(); + + for (Value::UseIterator it = value->uses.begin(); it != value->uses.end(); + ++it) { + Instruction *insn = (*it)->getInsn(); + int s = -1; + + for (int i = 0; insn->srcExists(i); ++i) { + if (insn->src(i).get() == value) { + // If there are multiple references to us we'd have to check if the + // combination of mods is still supported, but just bail for now. 
+ if (&insn->src(i) != (*it)) + return false; + s = i; + } + } + assert(s >= 0); // integrity of uses list + + if (!target->isModSupported(insn, s, rep.mod)) + return false; + } + return true; +} + +void +ValueDef::replace(const ValueRef &repVal, bool doSet) +{ + assert(mayReplace(repVal)); + + if (value == repVal.get()) + return; + + while (!value->uses.empty()) { + ValueRef *ref = value->uses.front(); + ref->set(repVal.get()); + ref->mod *= repVal.mod; + } + + if (doSet) + set(repVal.get()); +} + +Value::Value() +{ + join = this; + memset(®, 0, sizeof(reg)); + reg.size = 4; +} + +LValue::LValue(Function *fn, DataFile file) +{ + reg.file = file; + reg.size = (file != FILE_PREDICATE) ? 4 : 1; + reg.data.id = -1; + + compMask = 0; + compound = 0; + ssa = 0; + fixedReg = 0; + noSpill = 0; + + fn->add(this, this->id); +} + +LValue::LValue(Function *fn, LValue *lval) +{ + assert(lval); + + reg.file = lval->reg.file; + reg.size = lval->reg.size; + reg.data.id = -1; + + compMask = 0; + compound = 0; + ssa = 0; + fixedReg = 0; + noSpill = 0; + + fn->add(this, this->id); +} + +LValue * +LValue::clone(ClonePolicy<Function>& pol) const +{ + LValue *that = new_LValue(pol.context(), reg.file); + + pol.set<Value>(this, that); + + that->reg.size = this->reg.size; + that->reg.type = this->reg.type; + that->reg.data = this->reg.data; + + return that; +} + +bool +LValue::isUniform() const +{ + if (defs.size() > 1) + return false; + Instruction *insn = getInsn(); + // let's not try too hard here for now ... 
+ return !insn->srcExists(1) && insn->getSrc(0)->isUniform(); +} + +Symbol::Symbol(Program *prog, DataFile f, ubyte fidx) +{ + baseSym = NULL; + + reg.file = f; + reg.fileIndex = fidx; + reg.data.offset = 0; + + prog->add(this, this->id); +} + +Symbol * +Symbol::clone(ClonePolicy<Function>& pol) const +{ + Program *prog = pol.context()->getProgram(); + + Symbol *that = new_Symbol(prog, reg.file, reg.fileIndex); + + pol.set<Value>(this, that); + + that->reg.size = this->reg.size; + that->reg.type = this->reg.type; + that->reg.data = this->reg.data; + + that->baseSym = this->baseSym; + + return that; +} + +bool +Symbol::isUniform() const +{ + return + reg.file != FILE_SYSTEM_VALUE && + reg.file != FILE_MEMORY_LOCAL && + reg.file != FILE_SHADER_INPUT; +} + +ImmediateValue::ImmediateValue(Program *prog, uint32_t uval) +{ + memset(®, 0, sizeof(reg)); + + reg.file = FILE_IMMEDIATE; + reg.size = 4; + reg.type = TYPE_U32; + + reg.data.u32 = uval; + + prog->add(this, this->id); +} + +ImmediateValue::ImmediateValue(Program *prog, float fval) +{ + memset(®, 0, sizeof(reg)); + + reg.file = FILE_IMMEDIATE; + reg.size = 4; + reg.type = TYPE_F32; + + reg.data.f32 = fval; + + prog->add(this, this->id); +} + +ImmediateValue::ImmediateValue(Program *prog, double dval) +{ + memset(®, 0, sizeof(reg)); + + reg.file = FILE_IMMEDIATE; + reg.size = 8; + reg.type = TYPE_F64; + + reg.data.f64 = dval; + + prog->add(this, this->id); +} + +ImmediateValue::ImmediateValue(const ImmediateValue *proto, DataType ty) +{ + reg = proto->reg; + + reg.type = ty; + reg.size = typeSizeof(ty); +} + +ImmediateValue * +ImmediateValue::clone(ClonePolicy<Function>& pol) const +{ + Program *prog = pol.context()->getProgram(); + ImmediateValue *that = new_ImmediateValue(prog, 0u); + + pol.set<Value>(this, that); + + that->reg.size = this->reg.size; + that->reg.type = this->reg.type; + that->reg.data = this->reg.data; + + return that; +} + +bool +ImmediateValue::isInteger(const int i) const +{ + switch (reg.type) 
{ + case TYPE_S8: + return reg.data.s8 == i; + case TYPE_U8: + return reg.data.u8 == i; + case TYPE_S16: + return reg.data.s16 == i; + case TYPE_U16: + return reg.data.u16 == i; + case TYPE_S32: + case TYPE_U32: + return reg.data.s32 == i; // as if ... + case TYPE_F32: + return reg.data.f32 == static_cast<float>(i); + case TYPE_F64: + return reg.data.f64 == static_cast<double>(i); + default: + return false; + } +} + +bool +ImmediateValue::isNegative() const +{ + switch (reg.type) { + case TYPE_S8: return reg.data.s8 < 0; + case TYPE_S16: return reg.data.s16 < 0; + case TYPE_S32: + case TYPE_U32: return reg.data.s32 < 0; + case TYPE_F32: return reg.data.u32 & (1 << 31); + case TYPE_F64: return reg.data.u64 & (1ULL << 63); + default: + return false; + } +} + +bool +ImmediateValue::isPow2() const +{ + switch (reg.type) { + case TYPE_U8: + case TYPE_U16: + case TYPE_U32: return util_is_power_of_two(reg.data.u32); + default: + return false; + } +} + +void +ImmediateValue::applyLog2() +{ + switch (reg.type) { + case TYPE_S8: + case TYPE_S16: + case TYPE_S32: + assert(!this->isNegative()); + // fall through + case TYPE_U8: + case TYPE_U16: + case TYPE_U32: + reg.data.u32 = util_logbase2(reg.data.u32); + break; + case TYPE_F32: + reg.data.f32 = log2f(reg.data.f32); + break; + case TYPE_F64: + reg.data.f64 = log2(reg.data.f64); + break; + default: + assert(0); + break; + } +} + +bool +ImmediateValue::compare(CondCode cc, float fval) const +{ + if (reg.type != TYPE_F32) + ERROR("immediate value is not of type f32"); + + switch (static_cast<CondCode>(cc & 7)) { + case CC_TR: return true; + case CC_FL: return false; + case CC_LT: return reg.data.f32 < fval; + case CC_LE: return reg.data.f32 <= fval; + case CC_GT: return reg.data.f32 > fval; + case CC_GE: return reg.data.f32 >= fval; + case CC_EQ: return reg.data.f32 == fval; + case CC_NE: return reg.data.f32 != fval; + default: + assert(0); + return false; + } +} + +ImmediateValue& +ImmediateValue::operator=(const 
ImmediateValue &that) +{ + this->reg = that.reg; + return (*this); +} + +bool +Value::interfers(const Value *that) const +{ + uint32_t idA, idB; + + if (that->reg.file != reg.file || that->reg.fileIndex != reg.fileIndex) + return false; + if (this->asImm()) + return false; + + if (this->asSym()) { + idA = this->join->reg.data.offset; + idB = that->join->reg.data.offset; + } else { + idA = this->join->reg.data.id * MIN2(this->reg.size, 4); + idB = that->join->reg.data.id * MIN2(that->reg.size, 4); + } + + if (idA < idB) + return (idA + this->reg.size > idB); + else + if (idA > idB) + return (idB + that->reg.size > idA); + else + return (idA == idB); +} + +bool +Value::equals(const Value *that, bool strict) const +{ + if (strict) + return this == that; + + if (that->reg.file != reg.file || that->reg.fileIndex != reg.fileIndex) + return false; + if (that->reg.size != this->reg.size) + return false; + + if (that->reg.data.id != this->reg.data.id) + return false; + + return true; +} + +bool +ImmediateValue::equals(const Value *that, bool strict) const +{ + const ImmediateValue *imm = that->asImm(); + if (!imm) + return false; + return reg.data.u64 == imm->reg.data.u64; +} + +bool +Symbol::equals(const Value *that, bool strict) const +{ + if (reg.file != that->reg.file || reg.fileIndex != that->reg.fileIndex) + return false; + assert(that->asSym()); + + if (this->baseSym != that->asSym()->baseSym) + return false; + + if (reg.file == FILE_SYSTEM_VALUE) + return (this->reg.data.sv.sv == that->reg.data.sv.sv && + this->reg.data.sv.index == that->reg.data.sv.index); + return this->reg.data.offset == that->reg.data.offset; +} + +void Instruction::init() +{ + next = prev = 0; + + cc = CC_ALWAYS; + rnd = ROUND_N; + cache = CACHE_CA; + subOp = 0; + + saturate = 0; + join = 0; + exit = 0; + terminator = 0; + ftz = 0; + dnz = 0; + perPatch = 0; + fixed = 0; + encSize = 0; + ipa = 0; + mask = 0; + + lanes = 0xf; + + postFactor = 0; + + predSrc = -1; + flagsDef = -1; + flagsSrc = 
-1; +} + +Instruction::Instruction() +{ + init(); + + op = OP_NOP; + dType = sType = TYPE_F32; + + id = -1; + bb = 0; +} + +Instruction::Instruction(Function *fn, operation opr, DataType ty) +{ + init(); + + op = opr; + dType = sType = ty; + + fn->add(this, id); +} + +Instruction::~Instruction() +{ + if (bb) { + Function *fn = bb->getFunction(); + bb->remove(this); + fn->allInsns.remove(id); + } + + for (int s = 0; srcExists(s); ++s) + setSrc(s, NULL); + // must unlink defs too since the list pointers will get deallocated + for (int d = 0; defExists(d); ++d) + setDef(d, NULL); +} + +void +Instruction::setDef(int i, Value *val) +{ + int size = defs.size(); + if (i >= size) { + defs.resize(i + 1); + while (size <= i) + defs[size++].setInsn(this); + } + defs[i].set(val); +} + +void +Instruction::setSrc(int s, Value *val) +{ + int size = srcs.size(); + if (s >= size) { + srcs.resize(s + 1); + while (size <= s) + srcs[size++].setInsn(this); + } + srcs[s].set(val); +} + +void +Instruction::setSrc(int s, const ValueRef& ref) +{ + setSrc(s, ref.get()); + srcs[s].mod = ref.mod; +} + +void +Instruction::swapSources(int a, int b) +{ + Value *value = srcs[a].get(); + Modifier m = srcs[a].mod; + + setSrc(a, srcs[b]); + + srcs[b].set(value); + srcs[b].mod = m; +} + +static inline void moveSourcesAdjustIndex(int8_t &index, int s, int delta) +{ + if (index >= s) + index += delta; + else + if ((delta < 0) && (index >= (s + delta))) + index = -1; +} + +// Moves sources [@s,last_source] by @delta. +// If @delta < 0, sources [@s - abs(@delta), @s) are erased. 
+void +Instruction::moveSources(const int s, const int delta) +{ + if (delta == 0) + return; + assert(s + delta >= 0); + + int k; + + for (k = 0; srcExists(k); ++k) { + for (int i = 0; i < 2; ++i) + moveSourcesAdjustIndex(src(k).indirect[i], s, delta); + } + moveSourcesAdjustIndex(predSrc, s, delta); + moveSourcesAdjustIndex(flagsSrc, s, delta); + if (asTex()) { + TexInstruction *tex = asTex(); + moveSourcesAdjustIndex(tex->tex.rIndirectSrc, s, delta); + moveSourcesAdjustIndex(tex->tex.sIndirectSrc, s, delta); + } + + if (delta > 0) { + --k; + for (int p = k + delta; k >= s; --k, --p) + setSrc(p, src(k)); + } else { + int p; + for (p = s; p < k; ++p) + setSrc(p + delta, src(p)); + for (; (p + delta) < k; ++p) + setSrc(p + delta, NULL); + } +} + +void +Instruction::takeExtraSources(int s, Value *values[3]) +{ + values[0] = getIndirect(s, 0); + if (values[0]) + setIndirect(s, 0, NULL); + + values[1] = getIndirect(s, 1); + if (values[1]) + setIndirect(s, 1, NULL); + + values[2] = getPredicate(); + if (values[2]) + setPredicate(cc, NULL); +} + +void +Instruction::putExtraSources(int s, Value *values[3]) +{ + if (values[0]) + setIndirect(s, 0, values[0]); + if (values[1]) + setIndirect(s, 1, values[1]); + if (values[2]) + setPredicate(cc, values[2]); +} + +Instruction * +Instruction::clone(ClonePolicy<Function>& pol, Instruction *i) const +{ + if (!i) + i = new_Instruction(pol.context(), op, dType); +#ifndef NDEBUG // non-conformant assert, so this is required + assert(typeid(*i) == typeid(*this)); +#endif + + pol.set<Instruction>(this, i); + + i->sType = sType; + + i->rnd = rnd; + i->cache = cache; + i->subOp = subOp; + + i->saturate = saturate; + i->join = join; + i->exit = exit; + i->mask = mask; + i->ftz = ftz; + i->dnz = dnz; + i->ipa = ipa; + i->lanes = lanes; + i->perPatch = perPatch; + + i->postFactor = postFactor; + + for (int d = 0; defExists(d); ++d) + i->setDef(d, pol.get(getDef(d))); + + for (int s = 0; srcExists(s); ++s) { + i->setSrc(s, 
pol.get(getSrc(s))); + i->src(s).mod = src(s).mod; + } + + i->cc = cc; + i->predSrc = predSrc; + i->flagsDef = flagsDef; + i->flagsSrc = flagsSrc; + + return i; +} + +unsigned int +Instruction::defCount(unsigned int mask, bool singleFile) const +{ + unsigned int i, n; + + if (singleFile) { + unsigned int d = ffs(mask); + if (!d) + return 0; + for (i = d--; defExists(i); ++i) + if (getDef(i)->reg.file != getDef(d)->reg.file) + mask &= ~(1 << i); + } + + for (n = 0, i = 0; this->defExists(i); ++i, mask >>= 1) + n += mask & 1; + return n; +} + +unsigned int +Instruction::srcCount(unsigned int mask, bool singleFile) const +{ + unsigned int i, n; + + if (singleFile) { + unsigned int s = ffs(mask); + if (!s) + return 0; + for (i = s--; srcExists(i); ++i) + if (getSrc(i)->reg.file != getSrc(s)->reg.file) + mask &= ~(1 << i); + } + + for (n = 0, i = 0; this->srcExists(i); ++i, mask >>= 1) + n += mask & 1; + return n; +} + +bool +Instruction::setIndirect(int s, int dim, Value *value) +{ + assert(this->srcExists(s)); + + int p = srcs[s].indirect[dim]; + if (p < 0) { + if (!value) + return true; + p = srcs.size(); + while (p > 0 && !srcExists(p - 1)) + --p; + } + setSrc(p, value); + srcs[p].usedAsPtr = (value != 0); + srcs[s].indirect[dim] = value ? 
p : -1; + return true; +} + +bool +Instruction::setPredicate(CondCode ccode, Value *value) +{ + cc = ccode; + + if (!value) { + if (predSrc >= 0) { + srcs[predSrc].set(NULL); + predSrc = -1; + } + return true; + } + + if (predSrc < 0) { + predSrc = srcs.size(); + while (predSrc > 0 && !srcExists(predSrc - 1)) + --predSrc; + } + + setSrc(predSrc, value); + return true; +} + +bool +Instruction::writesPredicate() const +{ + for (int d = 0; defExists(d); ++d) + if (getDef(d)->inFile(FILE_PREDICATE) || getDef(d)->inFile(FILE_FLAGS)) + return true; + return false; +} + +static bool +insnCheckCommutationDefSrc(const Instruction *a, const Instruction *b) +{ + for (int d = 0; a->defExists(d); ++d) + for (int s = 0; b->srcExists(s); ++s) + if (a->getDef(d)->interfers(b->getSrc(s))) + return false; + return true; +} + +static bool +insnCheckCommutationDefDef(const Instruction *a, const Instruction *b) +{ + for (int d = 0; a->defExists(d); ++d) + for (int c = 0; b->defExists(c); ++c) + if (a->getDef(d)->interfers(b->getDef(c))) + return false; + return true; +} + +bool +Instruction::isCommutationLegal(const Instruction *i) const +{ + bool ret = insnCheckCommutationDefDef(this, i); + ret = ret && insnCheckCommutationDefSrc(this, i); + ret = ret && insnCheckCommutationDefSrc(i, this); + return ret; +} + +TexInstruction::TexInstruction(Function *fn, operation op) + : Instruction(fn, op, TYPE_F32) +{ + memset(&tex, 0, sizeof(tex)); + + tex.rIndirectSrc = -1; + tex.sIndirectSrc = -1; +} + +TexInstruction::~TexInstruction() +{ + for (int c = 0; c < 3; ++c) { + dPdx[c].set(NULL); + dPdy[c].set(NULL); + } +} + +TexInstruction * +TexInstruction::clone(ClonePolicy<Function>& pol, Instruction *i) const +{ + TexInstruction *tex = (i ? 
static_cast<TexInstruction *>(i) : + new_TexInstruction(pol.context(), op)); + + Instruction::clone(pol, tex); + + tex->tex = this->tex; + + if (op == OP_TXD) { + for (unsigned int c = 0; c < tex->tex.target.getDim(); ++c) { + tex->dPdx[c].set(dPdx[c]); + tex->dPdy[c].set(dPdy[c]); + } + } + + return tex; +} + +const struct TexInstruction::Target::Desc TexInstruction::Target::descTable[] = +{ + { "1D", 1, 1, false, false, false }, + { "2D", 2, 2, false, false, false }, + { "2D_MS", 2, 3, false, false, false }, + { "3D", 3, 3, false, false, false }, + { "CUBE", 2, 3, false, true, false }, + { "1D_SHADOW", 1, 1, false, false, true }, + { "2D_SHADOW", 2, 2, false, false, true }, + { "CUBE_SHADOW", 2, 3, false, true, true }, + { "1D_ARRAY", 1, 2, true, false, false }, + { "2D_ARRAY", 2, 3, true, false, false }, + { "2D_MS_ARRAY", 2, 4, true, false, false }, + { "CUBE_ARRAY", 2, 4, true, true, false }, + { "1D_ARRAY_SHADOW", 1, 2, true, false, true }, + { "2D_ARRAY_SHADOW", 2, 3, true, false, true }, + { "RECT", 2, 2, false, false, false }, + { "RECT_SHADOW", 2, 2, false, false, true }, + { "CUBE_ARRAY_SHADOW", 2, 4, true, true, true }, + { "BUFFER", 1, 1, false, false, false }, +}; + +void +TexInstruction::setIndirectR(Value *v) +{ + int p = ((tex.rIndirectSrc < 0) && v) ? srcs.size() : tex.rIndirectSrc; + if (p >= 0) { + tex.rIndirectSrc = p; + setSrc(p, v); + srcs[p].usedAsPtr = !!v; + } +} + +void +TexInstruction::setIndirectS(Value *v) +{ + int p = ((tex.sIndirectSrc < 0) && v) ? srcs.size() : tex.sIndirectSrc; + if (p >= 0) { + tex.sIndirectSrc = p; + setSrc(p, v); + srcs[p].usedAsPtr = !!v; + } +} + +CmpInstruction::CmpInstruction(Function *fn, operation op) + : Instruction(fn, op, TYPE_F32) +{ + setCond = CC_ALWAYS; +} + +CmpInstruction * +CmpInstruction::clone(ClonePolicy<Function>& pol, Instruction *i) const +{ + CmpInstruction *cmp = (i ? 
static_cast<CmpInstruction *>(i) : + new_CmpInstruction(pol.context(), op)); + cmp->dType = dType; + Instruction::clone(pol, cmp); + cmp->setCond = setCond; + return cmp; +} + +FlowInstruction::FlowInstruction(Function *fn, operation op, void *targ) + : Instruction(fn, op, TYPE_NONE) +{ + if (op == OP_CALL) + target.fn = reinterpret_cast<Function *>(targ); + else + target.bb = reinterpret_cast<BasicBlock *>(targ); + + if (op == OP_BRA || + op == OP_CONT || op == OP_BREAK || + op == OP_RET || op == OP_EXIT) + terminator = 1; + else + if (op == OP_JOIN) + terminator = targ ? 1 : 0; + + allWarp = absolute = limit = builtin = indirect = 0; +} + +FlowInstruction * +FlowInstruction::clone(ClonePolicy<Function>& pol, Instruction *i) const +{ + FlowInstruction *flow = (i ? static_cast<FlowInstruction *>(i) : + new_FlowInstruction(pol.context(), op, NULL)); + + Instruction::clone(pol, flow); + flow->allWarp = allWarp; + flow->absolute = absolute; + flow->limit = limit; + flow->builtin = builtin; + + if (builtin) + flow->target.builtin = target.builtin; + else + if (op == OP_CALL) + flow->target.fn = target.fn; + else + if (target.bb) + flow->target.bb = pol.get<BasicBlock>(target.bb); + + return flow; +} + +Program::Program(Type type, Target *arch) + : progType(type), + target(arch), + mem_Instruction(sizeof(Instruction), 6), + mem_CmpInstruction(sizeof(CmpInstruction), 4), + mem_TexInstruction(sizeof(TexInstruction), 4), + mem_FlowInstruction(sizeof(FlowInstruction), 4), + mem_LValue(sizeof(LValue), 8), + mem_Symbol(sizeof(Symbol), 7), + mem_ImmediateValue(sizeof(ImmediateValue), 7) +{ + code = NULL; + binSize = 0; + + maxGPR = -1; + + main = new Function(this, "MAIN", ~0); + calls.insert(&main->call); + + dbgFlags = 0; + optLevel = 0; + + targetPriv = NULL; +} + +Program::~Program() +{ + for (ArrayList::Iterator it = allFuncs.iterator(); !it.end(); it.next()) + delete reinterpret_cast<Function *>(it.get()); + + for (ArrayList::Iterator it = allRValues.iterator(); 
!it.end(); it.next()) + releaseValue(reinterpret_cast<Value *>(it.get())); +} + +void Program::releaseInstruction(Instruction *insn) +{ + // TODO: make this not suck so much + + insn->~Instruction(); + + if (insn->asCmp()) + mem_CmpInstruction.release(insn); + else + if (insn->asTex()) + mem_TexInstruction.release(insn); + else + if (insn->asFlow()) + mem_FlowInstruction.release(insn); + else + mem_Instruction.release(insn); +} + +void Program::releaseValue(Value *value) +{ + value->~Value(); + + if (value->asLValue()) + mem_LValue.release(value); + else + if (value->asImm()) + mem_ImmediateValue.release(value); + else + if (value->asSym()) + mem_Symbol.release(value); +} + + +} // namespace nv50_ir + +extern "C" { + +static void +nv50_ir_init_prog_info(struct nv50_ir_prog_info *info) +{ +#if defined(PIPE_SHADER_HULL) && defined(PIPE_SHADER_DOMAIN) + if (info->type == PIPE_SHADER_HULL || info->type == PIPE_SHADER_DOMAIN) { + info->prop.tp.domain = PIPE_PRIM_MAX; + info->prop.tp.outputPrim = PIPE_PRIM_MAX; + } +#endif + if (info->type == PIPE_SHADER_GEOMETRY) { + info->prop.gp.instanceCount = 1; + info->prop.gp.maxVertices = 1; + } + info->io.clipDistance = 0xff; + info->io.pointSize = 0xff; + info->io.instanceId = 0xff; + info->io.vertexId = 0xff; + info->io.edgeFlagIn = 0xff; + info->io.edgeFlagOut = 0xff; + info->io.fragDepth = 0xff; + info->io.sampleMask = 0xff; + info->io.backFaceColor[0] = info->io.backFaceColor[1] = 0xff; +} + +int +nv50_ir_generate_code(struct nv50_ir_prog_info *info) +{ + int ret = 0; + + nv50_ir::Program::Type type; + + nv50_ir_init_prog_info(info); + +#define PROG_TYPE_CASE(a, b) \ + case PIPE_SHADER_##a: type = nv50_ir::Program::TYPE_##b; break + + switch (info->type) { + PROG_TYPE_CASE(VERTEX, VERTEX); +// PROG_TYPE_CASE(HULL, TESSELLATION_CONTROL); +// PROG_TYPE_CASE(DOMAIN, TESSELLATION_EVAL); + PROG_TYPE_CASE(GEOMETRY, GEOMETRY); + PROG_TYPE_CASE(FRAGMENT, FRAGMENT); + PROG_TYPE_CASE(COMPUTE, COMPUTE); + default: + type = 
nv50_ir::Program::TYPE_COMPUTE; + break; + } + INFO_DBG(info->dbgFlags, VERBOSE, "translating program of type %u\n", type); + + nv50_ir::Target *targ = nv50_ir::Target::create(info->target); + if (!targ) + return -1; + + nv50_ir::Program *prog = new nv50_ir::Program(type, targ); + if (!prog) + return -1; + prog->driver = info; + prog->dbgFlags = info->dbgFlags; + prog->optLevel = info->optLevel; + + switch (info->bin.sourceRep) { +#if 0 + case PIPE_IR_LLVM: + case PIPE_IR_GLSL: + return -1; + case PIPE_IR_SM4: + ret = prog->makeFromSM4(info) ? 0 : -2; + break; + case PIPE_IR_TGSI: +#endif + default: + ret = prog->makeFromTGSI(info) ? 0 : -2; + break; + } + if (ret < 0) + goto out; + if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) + prog->print(); + + targ->parseDriverInfo(info); + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA); + + prog->convertToSSA(); + + if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE) + prog->print(); + + prog->optimizeSSA(info->optLevel); + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA); + + if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) + prog->print(); + + if (!prog->registerAllocation()) { + ret = -4; + goto out; + } + prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA); + + prog->optimizePostRA(info->optLevel); + + if (!prog->emitBinary(info)) { + ret = -5; + goto out; + } + +out: + INFO_DBG(prog->dbgFlags, VERBOSE, "nv50_ir_generate_code: ret = %i\n", ret); + + info->bin.maxGPR = prog->maxGPR; + info->bin.code = prog->code; + info->bin.codeSize = prog->binSize; + info->bin.tlsSpace = prog->tlsSize; + + delete prog; + nv50_ir::Target::destroy(targ); + + return ret; +} + +} // extern "C" diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h new file mode 100644 index 0000000..68c76e5 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h @@ -0,0 +1,1197 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, 
free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef __NV50_IR_H__ +#define __NV50_IR_H__ + +#include <stdio.h> +#include <stdlib.h> +#include <stdint.h> +#include <deque> +#include <list> +#include <vector> + +#include "codegen/nv50_ir_util.h" +#include "codegen/nv50_ir_graph.h" + +#include "codegen/nv50_ir_driver.h" + +namespace nv50_ir { + +enum operation +{ + OP_NOP = 0, + OP_PHI, + OP_UNION, // unify a new definition and several source values + OP_SPLIT, // $r0d -> { $r0, $r1 } ($r0d and $r0/$r1 will be coalesced) + OP_MERGE, // opposite of split, e.g. 
combine 2 32 bit into a 64 bit value + OP_CONSTRAINT, // copy values into consecutive registers + OP_MOV, // simple copy, no modifiers allowed + OP_LOAD, + OP_STORE, + OP_ADD, // NOTE: add u64 + u32 is legal for targets w/o 64-bit integer adds + OP_SUB, + OP_MUL, + OP_DIV, + OP_MOD, + OP_MAD, + OP_FMA, + OP_SAD, // abs(src0 - src1) + src2 + OP_ABS, + OP_NEG, + OP_NOT, + OP_AND, + OP_OR, + OP_XOR, + OP_SHL, + OP_SHR, + OP_MAX, + OP_MIN, + OP_SAT, // CLAMP(f32, 0.0, 1.0) + OP_CEIL, + OP_FLOOR, + OP_TRUNC, + OP_CVT, + OP_SET_AND, // dst = (src0 CMP src1) & src2 + OP_SET_OR, + OP_SET_XOR, + OP_SET, + OP_SELP, // dst = src2 ? src0 : src1 + OP_SLCT, // dst = (src2 CMP 0) ? src0 : src1 + OP_RCP, + OP_RSQ, + OP_LG2, + OP_SIN, + OP_COS, + OP_EX2, + OP_EXP, // exponential (base M_E) + OP_LOG, // natural logarithm + OP_PRESIN, + OP_PREEX2, + OP_SQRT, + OP_POW, + OP_BRA, + OP_CALL, + OP_RET, + OP_CONT, + OP_BREAK, + OP_PRERET, + OP_PRECONT, + OP_PREBREAK, + OP_BRKPT, // breakpoint (not related to loops) + OP_JOINAT, // push control flow convergence point + OP_JOIN, // converge + OP_DISCARD, + OP_EXIT, + OP_MEMBAR, // memory barrier (mfence, lfence, sfence) + OP_VFETCH, // indirection 0 in attribute space, indirection 1 is vertex base + OP_PFETCH, // fetch base address of vertex src0 (immediate) [+ src1] + OP_EXPORT, + OP_LINTERP, + OP_PINTERP, + OP_EMIT, // emit vertex + OP_RESTART, // restart primitive + OP_TEX, + OP_TXB, // texture bias + OP_TXL, // texure lod + OP_TXF, // texel fetch + OP_TXQ, // texture size query + OP_TXD, // texture derivatives + OP_TXG, // texture gather + OP_TEXCSAA, // texture op for coverage sampling + OP_TEXPREP, // turn cube map array into 2d array coordinates + OP_SULDB, // surface load (raw) + OP_SULDP, // surface load (formatted) + OP_SUSTB, // surface store (raw) + OP_SUSTP, // surface store (formatted) + OP_SUREDB, + OP_SUREDP, // surface reduction (atomic op) + OP_SULEA, // surface load effective address + OP_SUBFM, // surface bitfield 
manipulation + OP_SUCLAMP, // clamp surface coordinates + OP_SUEAU, // surface effective address + OP_MADSP, // special integer multiply-add + OP_TEXBAR, // texture dependency barrier + OP_DFDX, + OP_DFDY, + OP_RDSV, // read system value + OP_WRSV, // write system value + OP_QUADOP, + OP_QUADON, + OP_QUADPOP, + OP_POPCNT, // bitcount(src0 & src1) + OP_INSBF, // insert first src1[8:15] bits of src0 into src2 at src1[0:7] + OP_EXTBF, // place bits [K,K+N) of src0 into dst, src1 = 0xNNKK + OP_PERMT, // dst = bytes from src2,src0 selected by src1 (nvc0's src order) + OP_ATOM, + OP_BAR, // execution barrier, sources = { id, thread count, predicate } + OP_VADD, // byte/word vector operations + OP_VAVG, + OP_VMIN, + OP_VMAX, + OP_VSAD, + OP_VSET, + OP_VSHR, + OP_VSHL, + OP_VSEL, + OP_CCTL, // cache control + OP_LAST +}; + +// various instruction-specific modifier definitions Instruction::subOp +// MOV_FINAL marks a MOV originating from an EXPORT (used for placing TEXBARs) +#define NV50_IR_SUBOP_MUL_HIGH 1 +#define NV50_IR_SUBOP_EMIT_RESTART 1 +#define NV50_IR_SUBOP_LDC_IL 1 +#define NV50_IR_SUBOP_LDC_IS 2 +#define NV50_IR_SUBOP_LDC_ISL 3 +#define NV50_IR_SUBOP_SHIFT_WRAP 1 +#define NV50_IR_SUBOP_EMU_PRERET 1 +#define NV50_IR_SUBOP_TEXBAR(n) n +#define NV50_IR_SUBOP_MOV_FINAL 1 +#define NV50_IR_SUBOP_EXTBF_REV 1 +#define NV50_IR_SUBOP_PERMT_F4E 1 +#define NV50_IR_SUBOP_PERMT_B4E 2 +#define NV50_IR_SUBOP_PERMT_RC8 3 +#define NV50_IR_SUBOP_PERMT_ECL 4 +#define NV50_IR_SUBOP_PERMT_ECR 5 +#define NV50_IR_SUBOP_PERMT_RC16 6 +#define NV50_IR_SUBOP_BAR_SYNC 0 +#define NV50_IR_SUBOP_BAR_ARRIVE 1 +#define NV50_IR_SUBOP_BAR_RED_AND 2 +#define NV50_IR_SUBOP_BAR_RED_OR 3 +#define NV50_IR_SUBOP_BAR_RED_POPC 4 +#define NV50_IR_SUBOP_MEMBAR_L 1 +#define NV50_IR_SUBOP_MEMBAR_S 2 +#define NV50_IR_SUBOP_MEMBAR_M 3 +#define NV50_IR_SUBOP_MEMBAR_CTA (0 << 2) +#define NV50_IR_SUBOP_MEMBAR_GL (1 << 2) +#define NV50_IR_SUBOP_MEMBAR_SYS (2 << 2) +#define NV50_IR_SUBOP_MEMBAR_DIR(m) ((m) & 0x3) 
+#define NV50_IR_SUBOP_MEMBAR_SCOPE(m) ((m) & ~0x3) +#define NV50_IR_SUBOP_MEMBAR(d,s) \ + (NV50_IR_SUBOP_MEMBAR_##d | NV50_IR_SUBOP_MEMBAR_##s) +#define NV50_IR_SUBOP_ATOM_ADD 0 +#define NV50_IR_SUBOP_ATOM_MIN 1 +#define NV50_IR_SUBOP_ATOM_MAX 2 +#define NV50_IR_SUBOP_ATOM_INC 3 +#define NV50_IR_SUBOP_ATOM_DEC 4 +#define NV50_IR_SUBOP_ATOM_AND 5 +#define NV50_IR_SUBOP_ATOM_OR 6 +#define NV50_IR_SUBOP_ATOM_XOR 7 +#define NV50_IR_SUBOP_ATOM_CAS 8 +#define NV50_IR_SUBOP_ATOM_EXCH 9 +#define NV50_IR_SUBOP_CCTL_IV 5 +#define NV50_IR_SUBOP_CCTL_IVALL 6 +#define NV50_IR_SUBOP_SUST_IGN 0 +#define NV50_IR_SUBOP_SUST_TRAP 1 +#define NV50_IR_SUBOP_SUST_SDCL 3 +#define NV50_IR_SUBOP_SULD_ZERO 0 +#define NV50_IR_SUBOP_SULD_TRAP 1 +#define NV50_IR_SUBOP_SULD_SDCL 3 +#define NV50_IR_SUBOP_SUBFM_3D 1 +#define NV50_IR_SUBOP_SUCLAMP_2D 0x10 +#define NV50_IR_SUBOP_SUCLAMP_SD(r, d) (( 0 + (r)) | ((d == 2) ? 0x10 : 0)) +#define NV50_IR_SUBOP_SUCLAMP_PL(r, d) (( 5 + (r)) | ((d == 2) ? 0x10 : 0)) +#define NV50_IR_SUBOP_SUCLAMP_BL(r, d) ((10 + (r)) | ((d == 2) ? 0x10 : 0)) +#define NV50_IR_SUBOP_MADSP_SD 0xffff +// Yes, we could represent those with DataType. +// Or put the type into operation and have a couple 1000 values in that enum. +// This will have to do for now. +// The bitfields are supposed to correspond to nve4 ISA. 
+#define NV50_IR_SUBOP_MADSP(a,b,c) (((c) << 8) | ((b) << 4) | (a)) +#define NV50_IR_SUBOP_V1(d,a,b) (((d) << 10) | ((b) << 5) | (a) | 0x0000) +#define NV50_IR_SUBOP_V2(d,a,b) (((d) << 10) | ((b) << 5) | (a) | 0x4000) +#define NV50_IR_SUBOP_V4(d,a,b) (((d) << 10) | ((b) << 5) | (a) | 0x8000) +#define NV50_IR_SUBOP_Vn(n) ((n) >> 14) + +enum DataType +{ + TYPE_NONE, + TYPE_U8, + TYPE_S8, + TYPE_U16, + TYPE_S16, + TYPE_U32, + TYPE_S32, + TYPE_U64, // 64 bit operations are only lowered after register allocation + TYPE_S64, + TYPE_F16, + TYPE_F32, + TYPE_F64, + TYPE_B96, + TYPE_B128 +}; + +enum CondCode +{ + CC_FL = 0, + CC_NEVER = CC_FL, // when used with FILE_FLAGS + CC_LT = 1, + CC_EQ = 2, + CC_NOT_P = CC_EQ, // when used with FILE_PREDICATE + CC_LE = 3, + CC_GT = 4, + CC_NE = 5, + CC_P = CC_NE, + CC_GE = 6, + CC_TR = 7, + CC_ALWAYS = CC_TR, + CC_U = 8, + CC_LTU = 9, + CC_EQU = 10, + CC_LEU = 11, + CC_GTU = 12, + CC_NEU = 13, + CC_GEU = 14, + CC_NO = 0x10, + CC_NC = 0x11, + CC_NS = 0x12, + CC_NA = 0x13, + CC_A = 0x14, + CC_S = 0x15, + CC_C = 0x16, + CC_O = 0x17 +}; + +enum RoundMode +{ + ROUND_N, // nearest + ROUND_M, // towards -inf + ROUND_Z, // towards 0 + ROUND_P, // towards +inf + ROUND_NI, // nearest integer + ROUND_MI, // to integer towards -inf + ROUND_ZI, // to integer towards 0 + ROUND_PI, // to integer towards +inf +}; + +enum CacheMode +{ + CACHE_CA, // cache at all levels + CACHE_WB = CACHE_CA, // cache write back + CACHE_CG, // cache at global level + CACHE_CS, // cache streaming + CACHE_CV, // cache as volatile + CACHE_WT = CACHE_CV // cache write-through +}; + +enum DataFile +{ + FILE_NULL = 0, + FILE_GPR, + FILE_PREDICATE, // boolean predicate + FILE_FLAGS, // zero/sign/carry/overflow bits + FILE_ADDRESS, + LAST_REGISTER_FILE = FILE_ADDRESS, + FILE_IMMEDIATE, + FILE_MEMORY_CONST, + FILE_SHADER_INPUT, + FILE_SHADER_OUTPUT, + FILE_MEMORY_GLOBAL, + FILE_MEMORY_SHARED, + FILE_MEMORY_LOCAL, + FILE_SYSTEM_VALUE, + DATA_FILE_COUNT +}; + +enum TexTarget +{ + 
TEX_TARGET_1D, + TEX_TARGET_2D, + TEX_TARGET_2D_MS, + TEX_TARGET_3D, + TEX_TARGET_CUBE, + TEX_TARGET_1D_SHADOW, + TEX_TARGET_2D_SHADOW, + TEX_TARGET_CUBE_SHADOW, + TEX_TARGET_1D_ARRAY, + TEX_TARGET_2D_ARRAY, + TEX_TARGET_2D_MS_ARRAY, + TEX_TARGET_CUBE_ARRAY, + TEX_TARGET_1D_ARRAY_SHADOW, + TEX_TARGET_2D_ARRAY_SHADOW, + TEX_TARGET_RECT, + TEX_TARGET_RECT_SHADOW, + TEX_TARGET_CUBE_ARRAY_SHADOW, + TEX_TARGET_BUFFER, + TEX_TARGET_COUNT +}; + +enum SVSemantic +{ + SV_POSITION, // WPOS + SV_VERTEX_ID, + SV_INSTANCE_ID, + SV_INVOCATION_ID, + SV_PRIMITIVE_ID, + SV_VERTEX_COUNT, // gl_PatchVerticesIn + SV_LAYER, + SV_VIEWPORT_INDEX, + SV_YDIR, + SV_FACE, + SV_POINT_SIZE, + SV_POINT_COORD, + SV_CLIP_DISTANCE, + SV_SAMPLE_INDEX, + SV_TESS_FACTOR, + SV_TESS_COORD, + SV_TID, + SV_CTAID, + SV_NTID, + SV_GRIDID, + SV_NCTAID, + SV_LANEID, + SV_PHYSID, + SV_NPHYSID, + SV_CLOCK, + SV_LBASE, + SV_SBASE, + SV_UNDEFINED, + SV_LAST +}; + +class Program; +class Function; +class BasicBlock; + +class Target; + +class Instruction; +class CmpInstruction; +class TexInstruction; +class FlowInstruction; + +class Value; +class LValue; +class Symbol; +class ImmediateValue; + +struct Storage +{ + DataFile file; + int8_t fileIndex; // signed, may be indirect for CONST[] + uint8_t size; // this should match the Instruction type's size + DataType type; // mainly for pretty printing + union { + uint64_t u64; // immediate values + uint32_t u32; + uint16_t u16; + uint8_t u8; + int64_t s64; + int32_t s32; + int16_t s16; + int8_t s8; + float f32; + double f64; + int32_t offset; // offset from 0 (base of address space) + int32_t id; // register id (< 0 if virtual/unassigned, in units <= 4) + struct { + SVSemantic sv; + int index; + } sv; + } data; +}; + +// precedence: NOT after SAT after NEG after ABS +#define NV50_IR_MOD_ABS (1 << 0) +#define NV50_IR_MOD_NEG (1 << 1) +#define NV50_IR_MOD_SAT (1 << 2) +#define NV50_IR_MOD_NOT (1 << 3) +#define NV50_IR_MOD_NEG_ABS (NV50_IR_MOD_NEG | NV50_IR_MOD_ABS) + 
+#define NV50_IR_INTERP_MODE_MASK 0x3 +#define NV50_IR_INTERP_LINEAR (0 << 0) +#define NV50_IR_INTERP_PERSPECTIVE (1 << 0) +#define NV50_IR_INTERP_FLAT (2 << 0) +#define NV50_IR_INTERP_SC (3 << 0) // what exactly is that ? +#define NV50_IR_INTERP_SAMPLE_MASK 0xc +#define NV50_IR_INTERP_DEFAULT (0 << 2) +#define NV50_IR_INTERP_CENTROID (1 << 2) +#define NV50_IR_INTERP_OFFSET (2 << 2) +#define NV50_IR_INTERP_SAMPLEID (3 << 2) + +// do we really want this to be a class ? +class Modifier +{ +public: + Modifier() : bits(0) { } + Modifier(unsigned int m) : bits(m) { } + Modifier(operation op); + + // @return new Modifier applying a after b (asserts if unrepresentable) + Modifier operator*(const Modifier) const; + Modifier operator*=(const Modifier m) { *this = *this * m; return *this; } + Modifier operator==(const Modifier m) const { return m.bits == bits; } + Modifier operator!=(const Modifier m) const { return m.bits != bits; } + + inline Modifier operator&(const Modifier m) const { return bits & m.bits; } + inline Modifier operator|(const Modifier m) const { return bits | m.bits; } + inline Modifier operator^(const Modifier m) const { return bits ^ m.bits; } + + operation getOp() const; + + inline int neg() const { return (bits & NV50_IR_MOD_NEG) ? 1 : 0; } + inline int abs() const { return (bits & NV50_IR_MOD_ABS) ? 1 : 0; } + + inline operator bool() const { return bits ? 
true : false; } + + void applyTo(ImmediateValue &imm) const; + + int print(char *buf, size_t size) const; + +private: + uint8_t bits; +}; + +class ValueRef +{ +public: + ValueRef(Value * = NULL); + ValueRef(const ValueRef&); + ~ValueRef(); + + inline bool exists() const { return value != NULL; } + + void set(Value *); + void set(const ValueRef&); + inline Value *get() const { return value; } + inline Value *rep() const; + + inline Instruction *getInsn() const { return insn; } + inline void setInsn(Instruction *inst) { insn = inst; } + + inline bool isIndirect(int dim) const { return indirect[dim] >= 0; } + inline const ValueRef *getIndirect(int dim) const; + + inline DataFile getFile() const; + inline unsigned getSize() const; + + // SSA: return eventual (traverse MOVs) literal value, if it exists + bool getImmediate(ImmediateValue&) const; + +public: + Modifier mod; + int8_t indirect[2]; // >= 0 if relative to lvalue in insn->src(indirect[i]) + uint8_t swizzle; + + bool usedAsPtr; // for printing + +private: + Value *value; + Instruction *insn; +}; + +class ValueDef +{ +public: + ValueDef(Value * = NULL); + ValueDef(const ValueDef&); + ~ValueDef(); + + inline bool exists() const { return value != NULL; } + + inline Value *get() const { return value; } + inline Value *rep() const; + void set(Value *); + bool mayReplace(const ValueRef &); + void replace(const ValueRef &, bool doSet); // replace all uses of the old value + + inline Instruction *getInsn() const { return insn; } + inline void setInsn(Instruction *inst) { insn = inst; } + + inline DataFile getFile() const; + inline unsigned getSize() const; + + inline void setSSA(LValue *); + inline const LValue *preSSA() const; + +private: + Value *value; // should make this LValue * ... 
+ LValue *origin; // pre SSA value + Instruction *insn; +}; + +class Value +{ +public: + Value(); + virtual ~Value() { } + + virtual Value *clone(ClonePolicy<Function>&) const = 0; + + virtual int print(char *, size_t, DataType ty = TYPE_NONE) const = 0; + + virtual bool equals(const Value *, bool strict = false) const; + virtual bool interfers(const Value *) const; + virtual bool isUniform() const { return true; } + + inline Value *rep() const { return join; } + + inline Instruction *getUniqueInsn() const; + inline Instruction *getInsn() const; // use when uniqueness is certain + + inline int refCount() { return uses.size(); } + + inline LValue *asLValue(); + inline Symbol *asSym(); + inline ImmediateValue *asImm(); + inline const Symbol *asSym() const; + inline const ImmediateValue *asImm() const; + + inline bool inFile(DataFile f) { return reg.file == f; } + + static inline Value *get(Iterator&); + + std::list<ValueRef *> uses; + std::list<ValueDef *> defs; + typedef std::list<ValueRef *>::iterator UseIterator; + typedef std::list<ValueRef *>::const_iterator UseCIterator; + typedef std::list<ValueDef *>::iterator DefIterator; + typedef std::list<ValueDef *>::const_iterator DefCIterator; + + int id; + Storage reg; + + // TODO: these should be in LValue: + Interval livei; + Value *join; +}; + +class LValue : public Value +{ +public: + LValue(Function *, DataFile file); + LValue(Function *, LValue *); + ~LValue() { } + + virtual bool isUniform() const; + + virtual LValue *clone(ClonePolicy<Function>&) const; + + virtual int print(char *, size_t, DataType ty = TYPE_NONE) const; + +public: + unsigned compMask : 8; // compound/component mask + unsigned compound : 1; // used by RA, value involved in split/merge + unsigned ssa : 1; + unsigned fixedReg : 1; // set & used by RA, earlier just use (id < 0) + unsigned noSpill : 1; // do not spill (e.g. 
if spill temporary already) +}; + +class Symbol : public Value +{ +public: + Symbol(Program *, DataFile file = FILE_MEMORY_CONST, ubyte fileIdx = 0); + ~Symbol() { } + + virtual Symbol *clone(ClonePolicy<Function>&) const; + + virtual bool equals(const Value *that, bool strict) const; + + virtual bool isUniform() const; + + virtual int print(char *, size_t, DataType ty = TYPE_NONE) const; + + // print with indirect values + int print(char *, size_t, Value *, Value *, DataType ty = TYPE_NONE) const; + + inline void setFile(DataFile file, ubyte fileIndex = 0) + { + reg.file = file; + reg.fileIndex = fileIndex; + } + + inline void setOffset(int32_t offset); + inline void setAddress(Symbol *base, int32_t offset); + inline void setSV(SVSemantic sv, uint32_t idx = 0); + + inline const Symbol *getBase() const { return baseSym; } + +private: + Symbol *baseSym; // array base for Symbols representing array elements +}; + +class ImmediateValue : public Value +{ +public: + ImmediateValue() { } + ImmediateValue(Program *, uint32_t); + ImmediateValue(Program *, float); + ImmediateValue(Program *, double); + // NOTE: not added to program with + ImmediateValue(const ImmediateValue *, DataType ty); + ~ImmediateValue() { }; + + virtual ImmediateValue *clone(ClonePolicy<Function>&) const; + + virtual bool equals(const Value *that, bool strict) const; + + // these only work if 'type' is valid (we mostly use untyped literals): + bool isInteger(const int ival) const; // ival is cast to this' type + bool isNegative() const; + bool isPow2() const; + + void applyLog2(); + + // for constant folding: + ImmediateValue operator+(const ImmediateValue&) const; + ImmediateValue operator-(const ImmediateValue&) const; + ImmediateValue operator*(const ImmediateValue&) const; + ImmediateValue operator/(const ImmediateValue&) const; + + ImmediateValue& operator=(const ImmediateValue&); // only sets value ! 
+ + bool compare(CondCode cc, float fval) const; + + virtual int print(char *, size_t, DataType ty = TYPE_NONE) const; +}; + +class Instruction +{ +public: + Instruction(); + Instruction(Function *, operation, DataType); + virtual ~Instruction(); + + virtual Instruction *clone(ClonePolicy<Function>&, + Instruction * = NULL) const; + + void setDef(int i, Value *); + void setSrc(int s, Value *); + void setSrc(int s, const ValueRef&); + void swapSources(int a, int b); + void moveSources(int s, int delta); + bool setIndirect(int s, int dim, Value *); + + inline ValueRef& src(int s) { return srcs[s]; } + inline ValueDef& def(int s) { return defs[s]; } + inline const ValueRef& src(int s) const { return srcs[s]; } + inline const ValueDef& def(int s) const { return defs[s]; } + + inline Value *getDef(int d) const { return defs[d].get(); } + inline Value *getSrc(int s) const { return srcs[s].get(); } + inline Value *getIndirect(int s, int dim) const; + + inline bool defExists(unsigned d) const + { + return d < defs.size() && defs[d].exists(); + } + inline bool srcExists(unsigned s) const + { + return s < srcs.size() && srcs[s].exists(); + } + + inline bool constrainedDefs() const; + + bool setPredicate(CondCode ccode, Value *); + inline Value *getPredicate() const; + bool writesPredicate() const; + inline bool isPredicated() const { return predSrc >= 0; } + + inline void setFlagsSrc(int s, Value *); + inline void setFlagsDef(int d, Value *); + inline bool usesFlags() const { return flagsSrc >= 0; } + + unsigned int defCount() const { return defs.size(); }; + unsigned int defCount(unsigned int mask, bool singleFile = false) const; + unsigned int srcCount() const { return srcs.size(); }; + unsigned int srcCount(unsigned int mask, bool singleFile = false) const; + + // save & remove / set indirect[0,1] and predicate source + void takeExtraSources(int s, Value *[3]); + void putExtraSources(int s, Value *[3]); + + inline void setType(DataType type) { dType = sType = type; } + + 
inline void setType(DataType dtype, DataType stype) + { + dType = dtype; + sType = stype; + } + + inline bool isPseudo() const { return op < OP_MOV; } + bool isDead() const; + bool isNop() const; + bool isCommutationLegal(const Instruction *) const; // must be adjacent ! + bool isActionEqual(const Instruction *) const; + bool isResultEqual(const Instruction *) const; + + void print() const; + + inline CmpInstruction *asCmp(); + inline TexInstruction *asTex(); + inline FlowInstruction *asFlow(); + inline const TexInstruction *asTex() const; + inline const CmpInstruction *asCmp() const; + inline const FlowInstruction *asFlow() const; + +public: + Instruction *next; + Instruction *prev; + int id; + int serial; // CFG order + + operation op; + DataType dType; // destination or defining type + DataType sType; // source or secondary type + CondCode cc; + RoundMode rnd; + CacheMode cache; + + uint16_t subOp; // quadop, 1 for mul-high, etc. + + unsigned encSize : 4; // encoding size in bytes + unsigned saturate : 1; // to [0.0f, 1.0f] + unsigned join : 1; // converge control flow (use OP_JOIN until end) + unsigned fixed : 1; // prevent dead code elimination + unsigned terminator : 1; // end of basic block + unsigned ftz : 1; // flush denormal to zero + unsigned dnz : 1; // denormals, NaN are zero + unsigned ipa : 4; // interpolation mode + unsigned lanes : 4; + unsigned perPatch : 1; + unsigned exit : 1; // terminate program after insn + unsigned mask : 4; // for vector ops + + int8_t postFactor; // MUL/DIV(if < 0) by 1 << postFactor + + int8_t predSrc; + int8_t flagsDef; + int8_t flagsSrc; + + uint8_t sched; // scheduling data (NOTE: maybe move to separate storage) + + BasicBlock *bb; + +protected: + std::deque<ValueDef> defs; // no gaps ! + std::deque<ValueRef> srcs; // no gaps ! 
+ + // instruction specific methods: + // (don't want to subclass, would need more constructors and memory pools) +public: + inline void setInterpolate(unsigned int mode) { ipa = mode; } + + unsigned int getInterpMode() const { return ipa & 0x3; } + unsigned int getSampleMode() const { return ipa & 0xc; } + +private: + void init(); +}; + +enum TexQuery +{ + TXQ_DIMS, + TXQ_TYPE, + TXQ_SAMPLE_POSITION, + TXQ_FILTER, + TXQ_LOD, + TXQ_WRAP, + TXQ_BORDER_COLOUR +}; + +class TexInstruction : public Instruction +{ +public: + class Target + { + public: + Target(TexTarget targ = TEX_TARGET_2D) : target(targ) { } + + const char *getName() const { return descTable[target].name; } + unsigned int getArgCount() const { return descTable[target].argc; } + unsigned int getDim() const { return descTable[target].dim; } + int isArray() const { return descTable[target].array ? 1 : 0; } + int isCube() const { return descTable[target].cube ? 1 : 0; } + int isShadow() const { return descTable[target].shadow ? 
1 : 0; } + int isMS() const { + return target == TEX_TARGET_2D_MS || target == TEX_TARGET_2D_MS_ARRAY; } + + Target& operator=(TexTarget targ) + { + assert(targ < TEX_TARGET_COUNT); + target = targ; + return *this; + } + + inline bool operator==(TexTarget targ) const { return target == targ; } + inline bool operator!=(TexTarget targ) const { return target != targ; } + + enum TexTarget getEnum() const { return target; } + + private: + struct Desc + { + char name[19]; + uint8_t dim; + uint8_t argc; + bool array; + bool cube; + bool shadow; + }; + + static const struct Desc descTable[TEX_TARGET_COUNT]; + + private: + enum TexTarget target; + }; + +public: + TexInstruction(Function *, operation); + virtual ~TexInstruction(); + + virtual TexInstruction *clone(ClonePolicy<Function>&, + Instruction * = NULL) const; + + inline void setTexture(Target targ, uint8_t r, uint8_t s) + { + tex.r = r; + tex.s = s; + tex.target = targ; + } + + void setIndirectR(Value *); + void setIndirectS(Value *); + inline Value *getIndirectR() const; + inline Value *getIndirectS() const; + +public: + struct { + Target target; + + uint16_t r; + uint16_t s; + int8_t rIndirectSrc; + int8_t sIndirectSrc; + + uint8_t mask; + uint8_t gatherComp; + + bool liveOnly; // only execute on live pixels of a quad (optimization) + bool levelZero; + bool derivAll; + + int8_t useOffsets; // 0, 1, or 4 for textureGatherOffsets + int8_t offset[4][3]; + + enum TexQuery query; + } tex; + + ValueRef dPdx[3]; + ValueRef dPdy[3]; +}; + +class CmpInstruction : public Instruction +{ +public: + CmpInstruction(Function *, operation); + + virtual CmpInstruction *clone(ClonePolicy<Function>&, + Instruction * = NULL) const; + + void setCondition(CondCode cond) { setCond = cond; } + CondCode getCondition() const { return setCond; } + +public: + CondCode setCond; +}; + +class FlowInstruction : public Instruction +{ +public: + FlowInstruction(Function *, operation, void *target); + + virtual FlowInstruction 
*clone(ClonePolicy<Function>&, + Instruction * = NULL) const; + +public: + unsigned allWarp : 1; + unsigned absolute : 1; + unsigned limit : 1; + unsigned builtin : 1; // true for calls to emulation code + unsigned indirect : 1; // target in src(0) + + union { + BasicBlock *bb; + int builtin; + Function *fn; + } target; +}; + +class BasicBlock +{ +public: + BasicBlock(Function *); + ~BasicBlock(); + + BasicBlock *clone(ClonePolicy<Function>&) const; + + inline int getId() const { return id; } + inline unsigned int getInsnCount() const { return numInsns; } + inline bool isTerminated() const { return exit && exit->terminator; } + + bool dominatedBy(BasicBlock *bb); + inline bool reachableBy(const BasicBlock *by, const BasicBlock *term); + + // returns mask of conditional out blocks + // e.g. 3 for IF { .. } ELSE { .. } ENDIF, 1 for IF { .. } ENDIF + unsigned int initiatesSimpleConditional() const; + +public: + Function *getFunction() const { return func; } + Program *getProgram() const { return program; } + + Instruction *getEntry() const { return entry; } // first non-phi instruction + Instruction *getPhi() const { return phi; } + Instruction *getFirst() const { return phi ? 
phi : entry; } + Instruction *getExit() const { return exit; } + + void insertHead(Instruction *); + void insertTail(Instruction *); + void insertBefore(Instruction *, Instruction *); + void insertAfter(Instruction *, Instruction *); + void remove(Instruction *); + void permuteAdjacent(Instruction *, Instruction *); + + BasicBlock *idom() const; + + // NOTE: currently does not rebuild the dominator tree + BasicBlock *splitBefore(Instruction *, bool attach = true); + BasicBlock *splitAfter(Instruction *, bool attach = true); + + DLList& getDF() { return df; } + DLList::Iterator iterDF() { return df.iterator(); } + + static inline BasicBlock *get(Iterator&); + static inline BasicBlock *get(Graph::Node *); + +public: + Graph::Node cfg; // first edge is branch *taken* (the ELSE branch) + Graph::Node dom; + + BitSet liveSet; + BitSet defSet; + + uint32_t binPos; + uint32_t binSize; + + Instruction *joinAt; // for quick reference + + bool explicitCont; // loop headers: true if loop contains continue stmts + +private: + int id; + DLList df; + + Instruction *phi; + Instruction *entry; + Instruction *exit; + + unsigned int numInsns; + +private: + Function *func; + Program *program; + + void splitCommon(Instruction *, BasicBlock *, bool attach); +}; + +class Function +{ +public: + Function(Program *, const char *name, uint32_t label); + ~Function(); + + static inline Function *get(Graph::Node *node); + + inline Program *getProgram() const { return prog; } + inline const char *getName() const { return name; } + inline int getId() const { return id; } + inline uint32_t getLabel() const { return label; } + + void print(); + void printLiveIntervals() const; + void printCFGraph(const char *filePath); + + bool setEntry(BasicBlock *); + bool setExit(BasicBlock *); + + unsigned int orderInstructions(ArrayList&); + + inline void add(BasicBlock *bb, int& id) { allBBlocks.insert(bb, id); } + inline void add(Instruction *insn, int& id) { allInsns.insert(insn, id); } + inline void 
add(LValue *lval, int& id) { allLValues.insert(lval, id); } + + inline LValue *getLValue(int id); + + void buildLiveSets(); + void buildDefSets(); + bool convertToSSA(); + +public: + std::deque<ValueDef> ins; + std::deque<ValueRef> outs; + std::deque<Value *> clobbers; + + Graph cfg; + Graph::Node *cfgExit; + Graph *domTree; + Graph::Node call; // node in the call graph + + BasicBlock **bbArray; // BBs in emission order + int bbCount; + + unsigned int loopNestingBound; + int regClobberMax; + + uint32_t binPos; + uint32_t binSize; + + Value *stackPtr; + + uint32_t tlsBase; // base address for l[] space (if no stack pointer is used) + uint32_t tlsSize; + + ArrayList allBBlocks; + ArrayList allInsns; + ArrayList allLValues; + +private: + void buildLiveSetsPreSSA(BasicBlock *, const int sequence); + void buildDefSetsPreSSA(BasicBlock *bb, const int seq); + +private: + uint32_t label; + int id; + const char *const name; + Program *prog; +}; + +enum CGStage +{ + CG_STAGE_PRE_SSA, + CG_STAGE_SSA, // expected directly before register allocation + CG_STAGE_POST_RA +}; + +class Program +{ +public: + enum Type + { + TYPE_VERTEX, + TYPE_TESSELLATION_CONTROL, + TYPE_TESSELLATION_EVAL, + TYPE_GEOMETRY, + TYPE_FRAGMENT, + TYPE_COMPUTE + }; + + Program(Type type, Target *targ); + ~Program(); + + void print(); + + Type getType() const { return progType; } + + inline void add(Function *fn, int& id) { allFuncs.insert(fn, id); } + inline void del(Function *fn, int& id) { allFuncs.remove(id); } + inline void add(Value *rval, int& id) { allRValues.insert(rval, id); } + + bool makeFromTGSI(struct nv50_ir_prog_info *); + bool makeFromSM4(struct nv50_ir_prog_info *); + bool convertToSSA(); + bool optimizeSSA(int level); + bool optimizePostRA(int level); + bool registerAllocation(); + bool emitBinary(struct nv50_ir_prog_info *); + + const Target *getTarget() const { return target; } + +private: + void emitSymbolTable(struct nv50_ir_prog_info *); + + Type progType; + Target *target; + 
+public: + Function *main; + Graph calls; + + ArrayList allFuncs; + ArrayList allRValues; + + uint32_t *code; + uint32_t binSize; + uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL + + int maxGPR; + + MemoryPool mem_Instruction; + MemoryPool mem_CmpInstruction; + MemoryPool mem_TexInstruction; + MemoryPool mem_FlowInstruction; + MemoryPool mem_LValue; + MemoryPool mem_Symbol; + MemoryPool mem_ImmediateValue; + + uint32_t dbgFlags; + uint8_t optLevel; + + void *targetPriv; // e.g. to carry information between passes + + const struct nv50_ir_prog_info *driver; // for driver configuration + + void releaseInstruction(Instruction *); + void releaseValue(Value *); +}; + +// TODO: add const version +class Pass +{ +public: + bool run(Program *, bool ordered = false, bool skipPhi = false); + bool run(Function *, bool ordered = false, bool skipPhi = false); + +private: + // return false to continue with next entity on next higher level + virtual bool visit(Function *) { return true; } + virtual bool visit(BasicBlock *) { return true; } + virtual bool visit(Instruction *) { return false; } + + bool doRun(Program *, bool ordered, bool skipPhi); + bool doRun(Function *, bool ordered, bool skipPhi); + +protected: + bool err; + Function *func; + Program *prog; +}; + +// ============================================================================= + +#include "codegen/nv50_ir_inlines.h" + +} // namespace nv50_ir + +#endif // __NV50_IR_H__ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp new file mode 100644 index 0000000..51b9225 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp @@ -0,0 +1,550 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights 
to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+
+namespace nv50_ir {
+
+// Construct a Function belonging to Program @p. All pointers and counters
+// start out NULL/zero; the function registers itself with the program via
+// prog->add(), which stores it in allFuncs and writes back our 'id'.
+Function::Function(Program *p, const char *fnName, uint32_t label)
+   : call(this),
+     label(label),
+     name(fnName),
+     prog(p)
+{
+   cfgExit = NULL;
+   domTree = NULL;
+
+   bbArray = NULL;
+   bbCount = 0;
+   loopNestingBound = 0;
+   regClobberMax = 0;
+
+   binPos = 0;
+   binSize = 0;
+
+   stackPtr = NULL;
+   tlsBase = 0;
+   tlsSize = 0;
+
+   prog->add(this, id);
+}
+
+// Tear down the function: unregister from the program, then release the
+// dominator tree, the emission-order BB array, and every instruction,
+// LValue and basic block this function ever allocated.
+Function::~Function()
+{
+   prog->del(this, id);
+
+   if (domTree)
+      delete domTree;
+   if (bbArray)
+      delete[] bbArray;
+
+   // clear value refs and defs
+   ins.clear();
+   outs.clear();
+
+   // instructions and values are pool-allocated, so go through the
+   // program's delete_* helpers rather than plain 'delete'
+   for (ArrayList::Iterator it = allInsns.iterator(); !it.end(); it.next())
+      delete_Instruction(prog, reinterpret_cast<Instruction *>(it.get()));
+
+   for (ArrayList::Iterator it = allLValues.iterator(); !it.end(); it.next())
+      delete_Value(prog, reinterpret_cast<LValue *>(it.get()));
+
+   for (ArrayList::Iterator BBs = allBBlocks.iterator(); !BBs.end(); BBs.next())
+      delete reinterpret_cast<BasicBlock *>(BBs.get());
+}
+
+// Construct an empty basic block in function @fn and register it there
+// (func->add assigns this->id). phi/entry/exit start NULL: no instructions.
+BasicBlock::BasicBlock(Function *fn) : cfg(this), dom(this), func(fn)
+{
+   program = func->getProgram();
+
+   joinAt = phi = entry
= exit = NULL;
+
+   numInsns = 0;
+   binPos = 0;
+   binSize = 0;
+
+   explicitCont = false;
+
+   func->add(this, this->id);
+}
+
+BasicBlock::~BasicBlock()
+{
+   // nothing yet
+}
+
+// Deep-copy this block: clone every instruction into a new block in the
+// policy's target function, then duplicate all outgoing CFG edges, mapping
+// successor blocks through the clone policy.
+BasicBlock *
+BasicBlock::clone(ClonePolicy<Function>& pol) const
+{
+   BasicBlock *bb = new BasicBlock(pol.context());
+
+   pol.set(this, bb);
+
+   for (Instruction *i = getFirst(); i; i = i->next)
+      bb->insertTail(i->clone(pol));
+
+   pol.context()->cfg.insert(&bb->cfg);
+
+   for (Graph::EdgeIterator it = cfg.outgoing(); !it.end(); it.next()) {
+      BasicBlock *obb = BasicBlock::get(it.getNode());
+      bb->cfg.attach(&pol.get(obb)->cfg, it.getType());
+   }
+
+   return bb;
+}
+
+// Immediate dominator = our parent in the dominator tree (NULL at the root).
+BasicBlock *
+BasicBlock::idom() const
+{
+   Graph::Node *dn = dom.parent();
+   return dn ? BasicBlock::get(dn) : NULL;
+}
+
+// Insert @inst at the top of the block while keeping the list invariant:
+// all PHIs come first ('phi' points at the first one), 'entry' is the first
+// non-PHI instruction, 'exit' the last instruction overall.
+void
+BasicBlock::insertHead(Instruction *inst)
+{
+   assert(inst->next == 0 && inst->prev == 0);
+
+   if (inst->op == OP_PHI) {
+      if (phi) {
+         insertBefore(phi, inst);
+      } else {
+         if (entry) {
+            // no PHIs yet: new PHI goes in front of the first non-PHI
+            insertBefore(entry, inst);
+         } else {
+            // block is empty
+            assert(!exit);
+            phi = exit = inst;
+            inst->bb = this;
+            ++numInsns;
+         }
+      }
+   } else {
+      if (entry) {
+         insertBefore(entry, inst);
+      } else {
+         if (phi) {
+            insertAfter(exit, inst); // after last phi
+         } else {
+            // block is empty
+            assert(!exit);
+            entry = exit = inst;
+            inst->bb = this;
+            ++numInsns;
+         }
+      }
+   }
+}
+
+// Append @inst to the block; a PHI is still routed to the end of the PHI
+// prefix (before 'entry') so the PHI-first invariant holds.
+void
+BasicBlock::insertTail(Instruction *inst)
+{
+   assert(inst->next == 0 && inst->prev == 0);
+
+   if (inst->op == OP_PHI) {
+      if (entry) {
+         insertBefore(entry, inst);
+      } else
+      if (exit) {
+         // only PHIs in the block so far: append after the last one
+         assert(phi);
+         insertAfter(exit, inst);
+      } else {
+         // block is empty
+         assert(!phi);
+         phi = exit = inst;
+         inst->bb = this;
+         ++numInsns;
+      }
+   } else {
+      if (exit) {
+         insertAfter(exit, inst);
+      } else {
+         assert(!phi);
+         entry = exit = inst;
+         inst->bb = this;
+         ++numInsns;
+      }
+   }
+}
+
+// Link @p immediately before @q, updating the phi/entry markers when @q was
+// one of them. @p must not already be linked anywhere.
+void
+BasicBlock::insertBefore(Instruction *q, Instruction *p)
+{
+   assert(p && q);
+
+   assert(p->next == 0 && p->prev == 0);
+
+   if (q == entry) {
+      if (p->op == OP_PHI) {
+         // a PHI in front of the first non-PHI becomes the first PHI
+         // (unless there already is one earlier in the list)
+         if (!phi)
+            phi = p;
+      } else {
+         entry = p;
+      }
+   } else
+   if (q == phi) {
+      assert(p->op == OP_PHI);
+      phi = p;
+   }
+
+   p->next = q;
+   p->prev = q->prev;
+   if (p->prev)
+      p->prev->next = p;
+   q->prev = p;
+
+   p->bb = this;
+   ++numInsns;
+}
+
+// Link @q immediately after @p. A PHI may only be appended after another
+// PHI; the first non-PHI appended after a PHI becomes the new 'entry'.
+void
+BasicBlock::insertAfter(Instruction *p, Instruction *q)
+{
+   assert(p && q);
+   assert(q->op != OP_PHI || p->op == OP_PHI);
+
+   assert(q->next == 0 && q->prev == 0);
+
+   if (p == exit)
+      exit = q;
+   if (p->op == OP_PHI && q->op != OP_PHI)
+      entry = q;
+
+   q->prev = p;
+   q->next = p->next;
+   if (q->next)
+      q->next->prev = q;
+   p->next = q;
+
+   q->bb = this;
+   ++numInsns;
+}
+
+// Unlink @insn from this block, repairing the phi/entry/exit markers if
+// @insn was any of them. The instruction itself is not freed.
+void
+BasicBlock::remove(Instruction *insn)
+{
+   assert(insn->bb == this);
+
+   if (insn->prev)
+      insn->prev->next = insn->next;
+
+   if (insn->next)
+      insn->next->prev = insn->prev;
+   else
+      exit = insn->prev;
+
+   if (insn == entry) {
+      // prefer the successor; fall back to a non-PHI predecessor
+      if (insn->next)
+         entry = insn->next;
+      else
+      if (insn->prev && insn->prev->op != OP_PHI)
+         entry = insn->prev;
+      else
+         entry = NULL;
+   }
+
+   if (insn == phi)
+      phi = (insn->next && insn->next->op == OP_PHI) ?
insn->next : 0; + + --numInsns; + insn->bb = NULL; + insn->next = + insn->prev = NULL; +} + +void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b) +{ + assert(a->bb == b->bb); + + if (a->next != b) { + Instruction *i = a; + a = b; + b = i; + } + assert(a->next == b); + assert(a->op != OP_PHI && b->op != OP_PHI); + + if (b == exit) + exit = a; + if (a == entry) + entry = b; + + b->prev = a->prev; + a->next = b->next; + b->next = a; + a->prev = b; + + if (b->prev) + b->prev->next = b; + if (a->prev) + a->next->prev = a; +} + +void +BasicBlock::splitCommon(Instruction *insn, BasicBlock *bb, bool attach) +{ + bb->entry = insn; + + if (insn) { + exit = insn->prev; + insn->prev = NULL; + } + + if (exit) + exit->next = NULL; + else + entry = NULL; + + while (!cfg.outgoing(true).end()) { + Graph::Edge *e = cfg.outgoing(true).getEdge(); + bb->cfg.attach(e->getTarget(), e->getType()); + this->cfg.detach(e->getTarget()); + } + + for (; insn; insn = insn->next) { + this->numInsns--; + bb->numInsns++; + insn->bb = bb; + bb->exit = insn; + } + if (attach) + this->cfg.attach(&bb->cfg, Graph::Edge::TREE); +} + +BasicBlock * +BasicBlock::splitBefore(Instruction *insn, bool attach) +{ + BasicBlock *bb = new BasicBlock(func); + assert(!insn || insn->op != OP_PHI); + + splitCommon(insn, bb, attach); + return bb; +} + +BasicBlock * +BasicBlock::splitAfter(Instruction *insn, bool attach) +{ + BasicBlock *bb = new BasicBlock(func); + assert(!insn || insn->op != OP_PHI); + + bb->joinAt = joinAt; + joinAt = NULL; + + splitCommon(insn ? 
insn->next : NULL, bb, attach); + return bb; +} + +bool +BasicBlock::dominatedBy(BasicBlock *that) +{ + Graph::Node *bn = &that->dom; + Graph::Node *dn = &this->dom; + + while (dn && dn != bn) + dn = dn->parent(); + + return dn != NULL; +} + +unsigned int +BasicBlock::initiatesSimpleConditional() const +{ + Graph::Node *out[2]; + int n; + Graph::Edge::Type eR; + + if (cfg.outgoingCount() != 2) // -> if and -> else/endif + return false; + + n = 0; + for (Graph::EdgeIterator ei = cfg.outgoing(); !ei.end(); ei.next()) + out[n++] = ei.getNode(); + eR = out[1]->outgoing().getType(); + + // IF block is out edge to the right + if (eR == Graph::Edge::CROSS || eR == Graph::Edge::BACK) + return 0x2; + + if (out[1]->outgoingCount() != 1) // 0 is IF { RET; }, >1 is more divergence + return 0x0; + // do they reconverge immediately ? + if (out[1]->outgoing().getNode() == out[0]) + return 0x1; + if (out[0]->outgoingCount() == 1) + if (out[0]->outgoing().getNode() == out[1]->outgoing().getNode()) + return 0x3; + + return 0x0; +} + +bool +Function::setEntry(BasicBlock *bb) +{ + if (cfg.getRoot()) + return false; + cfg.insert(&bb->cfg); + return true; +} + +bool +Function::setExit(BasicBlock *bb) +{ + if (cfgExit) + return false; + cfgExit = &bb->cfg; + return true; +} + +unsigned int +Function::orderInstructions(ArrayList &result) +{ + result.clear(); + + for (IteratorRef it = cfg.iteratorCFG(); !it->end(); it->next()) { + BasicBlock *bb = + BasicBlock::get(reinterpret_cast<Graph::Node *>(it->get())); + + for (Instruction *insn = bb->getFirst(); insn; insn = insn->next) + result.insert(insn, insn->serial); + } + + return result.getSize(); +} + +void +Function::buildLiveSets() +{ + for (unsigned i = 0; i <= loopNestingBound; ++i) + buildLiveSetsPreSSA(BasicBlock::get(cfg.getRoot()), cfg.nextSequence()); + + for (ArrayList::Iterator bi = allBBlocks.iterator(); !bi.end(); bi.next()) + BasicBlock::get(bi)->liveSet.marker = false; +} + +void +Function::buildDefSets() +{ + for (unsigned 
i = 0; i <= loopNestingBound; ++i) + buildDefSetsPreSSA(BasicBlock::get(cfgExit), cfg.nextSequence()); + + for (ArrayList::Iterator bi = allBBlocks.iterator(); !bi.end(); bi.next()) + BasicBlock::get(bi)->liveSet.marker = false; +} + +bool +Pass::run(Program *prog, bool ordered, bool skipPhi) +{ + this->prog = prog; + err = false; + return doRun(prog, ordered, skipPhi); +} + +bool +Pass::doRun(Program *prog, bool ordered, bool skipPhi) +{ + for (IteratorRef it = prog->calls.iteratorDFS(false); + !it->end(); it->next()) { + Graph::Node *n = reinterpret_cast<Graph::Node *>(it->get()); + if (!doRun(Function::get(n), ordered, skipPhi)) + return false; + } + return !err; +} + +bool +Pass::run(Function *func, bool ordered, bool skipPhi) +{ + prog = func->getProgram(); + err = false; + return doRun(func, ordered, skipPhi); +} + +bool +Pass::doRun(Function *func, bool ordered, bool skipPhi) +{ + IteratorRef bbIter; + BasicBlock *bb; + Instruction *insn, *next; + + this->func = func; + if (!visit(func)) + return false; + + bbIter = ordered ? func->cfg.iteratorCFG() : func->cfg.iteratorDFS(); + + for (; !bbIter->end(); bbIter->next()) { + bb = BasicBlock::get(reinterpret_cast<Graph::Node *>(bbIter->get())); + if (!visit(bb)) + break; + for (insn = skipPhi ? 
bb->getEntry() : bb->getFirst(); insn != NULL; + insn = next) { + next = insn->next; + if (!visit(insn)) + break; + } + } + + return !err; +} + +void +Function::printCFGraph(const char *filePath) +{ + FILE *out = fopen(filePath, "a"); + if (!out) { + ERROR("failed to open file: %s\n", filePath); + return; + } + INFO("printing control flow graph to: %s\n", filePath); + + fprintf(out, "digraph G {\n"); + + for (IteratorRef it = cfg.iteratorDFS(); !it->end(); it->next()) { + BasicBlock *bb = BasicBlock::get( + reinterpret_cast<Graph::Node *>(it->get())); + int idA = bb->getId(); + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + int idB = BasicBlock::get(ei.getNode())->getId(); + switch (ei.getType()) { + case Graph::Edge::TREE: + fprintf(out, "\t%i -> %i;\n", idA, idB); + break; + case Graph::Edge::FORWARD: + fprintf(out, "\t%i -> %i [color=green];\n", idA, idB); + break; + case Graph::Edge::CROSS: + fprintf(out, "\t%i -> %i [color=red];\n", idA, idB); + break; + case Graph::Edge::BACK: + fprintf(out, "\t%i -> %i;\n", idA, idB); + break; + case Graph::Edge::DUMMY: + fprintf(out, "\t%i -> %i [style=dotted];\n", idA, idB); + break; + default: + assert(0); + break; + } + } + } + + fprintf(out, "}\n"); + fclose(out); +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp new file mode 100644 index 0000000..70e5e22 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -0,0 +1,614 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * 
Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

namespace nv50_ir {

BuildUtil::BuildUtil()
{
   init(NULL);
}

BuildUtil::BuildUtil(Program *prog)
{
   init(prog);
}

// Reset all builder state (position, function, immediate cache).
void
BuildUtil::init(Program *prog)
{
   this->prog = prog;

   func = NULL;
   bb = NULL;
   pos = NULL;

   memset(imms, 0, sizeof(imms));
   immCount = 0;
}

// Add @imm to the open-addressed immediate cache. Insertion stops once the
// table is 3/4 full so mkImm's linear probe always finds an empty slot.
void
BuildUtil::addImmediate(ImmediateValue *imm)
{
   if (immCount > (NV50_IR_BUILD_IMM_HT_SIZE * 3) / 4)
      return;

   unsigned int pos = u32Hash(imm->reg.data.u32);

   while (imms[pos])
      pos = (pos + 1) % NV50_IR_BUILD_IMM_HT_SIZE;
   imms[pos] = imm;
   immCount++;
}

// Build and insert a 1-source instruction at the current position.
Instruction *
BuildUtil::mkOp1(operation op, DataType ty, Value *dst, Value *src)
{
   Instruction *insn = new_Instruction(func, op, ty);

   insn->setDef(0, dst);
   insn->setSrc(0, src);

   insert(insn);
   return insn;
}

// Build and insert a 2-source instruction at the current position.
Instruction *
BuildUtil::mkOp2(operation op, DataType ty, Value *dst,
                 Value *src0, Value *src1)
{
   Instruction *insn = new_Instruction(func, op, ty);

   insn->setDef(0, dst);
   insn->setSrc(0, src0);
   insn->setSrc(1, src1);

   insert(insn);
   return insn;
}

// Build and insert a 3-source instruction at the current position.
Instruction *
BuildUtil::mkOp3(operation op, DataType ty, Value *dst,
                 Value *src0, Value *src1, Value *src2)
{
   Instruction *insn = new_Instruction(func, op, ty);

   insn->setDef(0, dst);
   insn->setSrc(0, src0);
   insn->setSrc(1, src1);
   insn->setSrc(2, src2);

   insert(insn);
   return insn;
}

// Load from memory symbol @mem (optionally indirect through @ptr) into @dst.
Instruction *
BuildUtil::mkLoad(DataType ty, Value *dst, Symbol *mem, Value *ptr)
{
   Instruction *insn = new_Instruction(func, OP_LOAD, ty);

   insn->setDef(0, dst);
   insn->setSrc(0, mem);
   if (ptr)
      insn->setIndirect(0, 0, ptr);

   insert(insn);
   return insn;
}

// Store @stVal to memory symbol @mem (optionally indirect through @ptr).
Instruction *
BuildUtil::mkStore(operation op, DataType ty, Symbol *mem, Value *ptr,
                   Value *stVal)
{
   Instruction *insn = new_Instruction(func, op, ty);

   insn->setSrc(0, mem);
   insn->setSrc(1, stVal);
   if (ptr)
      insn->setIndirect(0, 0, ptr);

   insert(insn);
   return insn;
}

// Vertex-fetch from @file at @offset, relative to @attrRel / @primRel.
Instruction *
BuildUtil::mkFetch(Value *dst, DataType ty, DataFile file, int32_t offset,
                   Value *attrRel, Value *primRel)
{
   Symbol *sym = mkSymbol(file, 0, ty, offset);

   Instruction *insn = mkOp1(OP_VFETCH, ty, dst, sym);

   insn->setIndirect(0, 0, attrRel);
   insn->setIndirect(0, 1, primRel);

   // already inserted
   return insn;
}

// Emit an input interpolation: flat inputs become U32 LINTERP, perspective
// inputs PINTERP, otherwise linear LINTERP.
Instruction *
BuildUtil::mkInterp(unsigned mode, Value *dst, int32_t offset, Value *rel)
{
   operation op = OP_LINTERP;
   DataType ty = TYPE_F32;

   if ((mode & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_FLAT)
      ty = TYPE_U32;
   else
   if ((mode & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_PERSPECTIVE)
      op = OP_PINTERP;

   Symbol *sym = mkSymbol(FILE_SHADER_INPUT, 0, ty, offset);

   Instruction *insn = mkOp1(op, ty, dst, sym);
   insn->setIndirect(0, 0, rel);
   return insn;
}

Instruction *
BuildUtil::mkMov(Value *dst, Value *src, DataType ty)
{
   Instruction *insn = new_Instruction(func, OP_MOV, ty);

   insn->setDef(0, dst);
   insn->setSrc(0, src);

   insert(insn);
   return insn;
}

// Move @src into the fixed GPR with hardware register index @id.
Instruction *
BuildUtil::mkMovToReg(int id, Value *src)
{
   Instruction *insn = new_Instruction(func, OP_MOV, typeOfSize(src->reg.size));

   insn->setDef(0, new_LValue(func, FILE_GPR));
   insn->getDef(0)->reg.data.id = id;
   insn->setSrc(0, src);

   insert(insn);
   return insn;
}

// Move the fixed GPR with hardware register index @id into @dst.
Instruction *
BuildUtil::mkMovFromReg(Value *dst, int id)
{
   Instruction *insn = new_Instruction(func, OP_MOV, typeOfSize(dst->reg.size));

   insn->setDef(0, dst);
   insn->setSrc(0, new_LValue(func, FILE_GPR));
   insn->getSrc(0)->reg.data.id = id;

   insert(insn);
   return insn;
}

// Conversion with explicit destination and source types.
Instruction *
BuildUtil::mkCvt(operation op,
                 DataType dstTy, Value *dst, DataType srcTy, Value *src)
{
   Instruction *insn = new_Instruction(func, op, dstTy);

   insn->setType(dstTy, srcTy);
   insn->setDef(0, dst);
   insn->setSrc(0, src);

   insert(insn);
   return insn;
}

// Comparison producing a predicate/flags/typed result. When the result lives
// in FILE_PREDICATE or FILE_FLAGS the destination type is forced to U8.
CmpInstruction *
BuildUtil::mkCmp(operation op, CondCode cc, DataType ty, Value *dst,
                 Value *src0, Value *src1, Value *src2)
{
   CmpInstruction *insn = new_CmpInstruction(func, op);

   insn->setType((dst->reg.file == FILE_PREDICATE ||
                  dst->reg.file == FILE_FLAGS) ? TYPE_U8 : ty, ty);
   insn->setCondition(cc);
   insn->setDef(0, dst);
   insn->setSrc(0, src0);
   insn->setSrc(1, src1);
   if (src2)
      insn->setSrc(2, src2);

   if (dst->reg.file == FILE_FLAGS)
      insn->flagsDef = 0;

   insert(insn);
   return insn;
}

// Texture instruction with texture/sampler indices; NULL entries in
// @def/@src terminate the respective list early.
TexInstruction *
BuildUtil::mkTex(operation op, TexTarget targ,
                 uint16_t tic, uint16_t tsc,
                 const std::vector<Value *> &def,
                 const std::vector<Value *> &src)
{
   TexInstruction *tex = new_TexInstruction(func, op);

   for (size_t d = 0; d < def.size() && def[d]; ++d)
      tex->setDef(d, def[d]);
   for (size_t s = 0; s < src.size() && src[s]; ++s)
      tex->setSrc(s, src[s]);

   tex->setTexture(targ, tic, tsc);

   insert(tex);
   return tex;
}

// Quad operation with sub-op @q over lane mask @l.
Instruction *
BuildUtil::mkQuadop(uint8_t q, Value *def, uint8_t l, Value *src0, Value *src1)
{
   Instruction *quadop = mkOp2(OP_QUADOP, TYPE_F32, def, src0, src1);
   quadop->subOp = q;
   quadop->lanes = l;
   return quadop;
}

// Conditional select: two predicated moves joined by OP_UNION.
Instruction *
BuildUtil::mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc)
{
   LValue *def0 = getSSA();
   LValue *def1 = getSSA();

   mkMov(def0, trSrc)->setPredicate(CC_P, pred);
   mkMov(def1, flSrc)->setPredicate(CC_NOT_P, pred);

   return mkOp2(OP_UNION, typeOfSize(dst->reg.size), dst, def0, def1);
}

// Split @val into two halves of @halfSize bytes each (h[0] = low half).
// Memory-file values are split by offsetting a clone; register values use
// OP_SPLIT. Returns the split instruction, or NULL for the memory case.
Instruction *
BuildUtil::mkSplit(Value *h[2], uint8_t halfSize, Value *val)
{
   Instruction *insn = NULL;

   const DataType fTy = typeOfSize(halfSize * 2);

   // immediates must be materialized in a register first
   if (val->reg.file == FILE_IMMEDIATE)
      val = mkMov(getSSA(halfSize * 2), val, fTy)->getDef(0);

   if (isMemoryFile(val->reg.file)) {
      h[0] = cloneShallow(getFunction(), val);
      h[1] = cloneShallow(getFunction(), val);
      h[0]->reg.size = halfSize;
      h[1]->reg.size = halfSize;
      h[1]->reg.data.offset += halfSize;
   } else {
      h[0] = getSSA(halfSize, val->reg.file);
      h[1] = getSSA(halfSize, val->reg.file);
      insn = mkOp1(OP_SPLIT, fTy, h[0], val);
      insn->setDef(1, h[1]);
   }
   return insn;
}

// Control-flow instruction targeting @targ, optionally predicated.
FlowInstruction *
BuildUtil::mkFlow(operation op, void *targ, CondCode cc, Value *pred)
{
   FlowInstruction *insn = new_FlowInstruction(func, op, targ);

   if (pred)
      insn->setPredicate(cc, pred);

   insert(insn);
   return insn;
}

// Emit NOPs that define the registers in @rMask so the register allocator
// treats them as clobbered. Each 4-bit nibble of the mask is described by a
// packed (base,size) pair table covering up to two contiguous runs.
void
BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit)
{
   static const uint16_t baseSize2[16] =
   {
      0x0000, 0x0010, 0x0011, 0x0020, 0x0012, 0x1210, 0x1211, 0x1220,
      0x0013, 0x1310, 0x1311, 0x1320, 0x0022, 0x2210, 0x2211, 0x0040,
   };

   int base = 0;

   for (; rMask; rMask >>= 4, base += 4) {
      const uint32_t mask = rMask & 0xf;
      if (!mask)
         continue;
      int base1 = (baseSize2[mask] >> 0) & 0xf;
      int size1 = (baseSize2[mask] >> 4) & 0xf;
      int base2 = (baseSize2[mask] >> 8) & 0xf;
      int size2 = (baseSize2[mask] >> 12) & 0xf;
      Instruction *insn = mkOp(OP_NOP, TYPE_NONE, NULL);
      if (1) { // size1 can't be 0
         LValue *reg = new_LValue(func, f);
         reg->reg.size = size1 << unit;
         reg->reg.data.id = base + base1;
         insn->setDef(0, reg);
      }
      if (size2) {
         LValue *reg = new_LValue(func, f);
         reg->reg.size = size2 << unit;
         reg->reg.data.id = base + base2;
         insn->setDef(1, reg);
      }
   }
}

// Look up or create a cached 32-bit immediate.
ImmediateValue *
BuildUtil::mkImm(uint32_t u)
{
   unsigned int pos = u32Hash(u);

   // linear probe; addImmediate guarantees a free slot exists
   while (imms[pos] && imms[pos]->reg.data.u32 != u)
      pos = (pos + 1) % NV50_IR_BUILD_IMM_HT_SIZE;

   ImmediateValue *imm = imms[pos];
   if (!imm) {
      imm = new_ImmediateValue(prog, u);
      addImmediate(imm);
   }
   return imm;
}

// 64-bit immediates are not cached.
ImmediateValue *
BuildUtil::mkImm(uint64_t u)
{
   ImmediateValue *imm = new_ImmediateValue(prog, (uint32_t)0);

   imm->reg.size = 8;
   imm->reg.type = TYPE_U64;
   imm->reg.data.u64 = u;

   return imm;
}

// Float immediates share the 32-bit cache via bit reinterpretation.
ImmediateValue *
BuildUtil::mkImm(float f)
{
   union {
      float f32;
      uint32_t u32;
   } u;
   u.f32 = f;
   return mkImm(u.u32);
}

Value *
BuildUtil::loadImm(Value *dst, float f)
{
   return mkOp1v(OP_MOV, TYPE_F32, dst ? dst : getScratch(), mkImm(f));
}

Value *
BuildUtil::loadImm(Value *dst, uint32_t u)
{
   return mkOp1v(OP_MOV, TYPE_U32, dst ? dst : getScratch(), mkImm(u));
}

Value *
BuildUtil::loadImm(Value *dst, uint64_t u)
{
   return mkOp1v(OP_MOV, TYPE_U64, dst ? dst : getScratch(8), mkImm(u));
}

// Create a memory symbol at @baseAddr in (@file, @fileIndex).
Symbol *
BuildUtil::mkSymbol(DataFile file, int8_t fileIndex, DataType ty,
                    uint32_t baseAddr)
{
   Symbol *sym = new_Symbol(prog, file, fileIndex);

   sym->setOffset(baseAddr);
   sym->reg.type = ty;
   sym->reg.size = typeSizeof(ty);

   return sym;
}

// Create a system-value symbol; the register type depends on the semantic.
Symbol *
BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
{
   Symbol *sym = new_Symbol(prog, FILE_SYSTEM_VALUE, 0);

   // only clip distances and tess factors may use indices >= 4
   assert(svIndex < 4 ||
          (svName == SV_CLIP_DISTANCE || svName == SV_TESS_FACTOR));

   switch (svName) {
   case SV_POSITION:
   case SV_FACE:
   case SV_YDIR:
   case SV_POINT_SIZE:
   case SV_POINT_COORD:
   case SV_CLIP_DISTANCE:
   case SV_TESS_FACTOR:
      sym->reg.type = TYPE_F32;
      break;
   default:
      sym->reg.type = TYPE_U32;
      break;
   }
   sym->reg.size = typeSizeof(sym->reg.type);

   sym->reg.data.sv.sv = svName;
   sym->reg.data.sv.index = svIndex;

   return sym;
}

// Configure a DataArray view: either a register-backed array (non-memory
// file) or a memory region anchored at @base.
void
BuildUtil::DataArray::setup(unsigned array, unsigned arrayIdx,
                            uint32_t base, int len, int vecDim, int eltSize,
                            DataFile file, int8_t fileIdx)
{
   this->array = array;
   this->arrayIdx = arrayIdx;
   this->baseAddr = base;
   this->arrayLen = len;
   this->vecDim = vecDim;
   this->eltSize = eltSize;
   this->file = file;
   this->regOnly = !isMemoryFile(file);

   if (!regOnly) {
      baseSym = new_Symbol(up->getProgram(), file, fileIdx);
      baseSym->setOffset(baseAddr);
      baseSym->reg.size = eltSize;
   } else {
      baseSym = NULL;
   }
}

// Get a value suitable for writing element (i, c); register-backed arrays
// reuse the mapped LValue, memory-backed ones hand out a scratch register.
Value *
BuildUtil::DataArray::acquire(ValueMap &m, int i, int c)
{
   if (regOnly) {
      Value *v = lookup(m, i, c);
      if (!v)
         v = insert(m, i, c, new_LValue(up->getFunction(), file));

      return v;
   } else {
      return up->getScratch();
   }
}

// Read element (i, c): the mapped LValue for register arrays, or an emitted
// load through the cached symbol for memory arrays.
Value *
BuildUtil::DataArray::load(ValueMap &m, int i, int c, Value *ptr)
{
   if (regOnly) {
      Value *v = lookup(m, i, c);
      if (!v)
         v = insert(m, i, c, new_LValue(up->getFunction(), file));

      return v;
   } else {
      Value *sym = lookup(m, i, c);
      if (!sym)
         sym = insert(m, i, c, mkSymbol(i, c));

      return up->mkLoadv(typeOfSize(eltSize), static_cast<Symbol *>(sym), ptr);
   }
}

// Write element (i, c): register arrays just record the value (SSA-style,
// no indirect stores), memory arrays emit an OP_STORE.
void
BuildUtil::DataArray::store(ValueMap &m, int i, int c, Value *ptr, Value *value)
{
   if (regOnly) {
      assert(!ptr);
      if (!lookup(m, i, c))
         insert(m, i, c, value);

      assert(lookup(m, i, c) == value);
   } else {
      Value *sym = lookup(m, i, c);
      if (!sym)
         sym = insert(m, i, c, mkSymbol(i, c));

      const DataType stTy = typeOfSize(value->reg.size);

      up->mkStore(OP_STORE, stTy, static_cast<Symbol *>(sym), ptr, value);
   }
}

// Symbol for element (i, c), addressed relative to baseSym.
Symbol *
BuildUtil::DataArray::mkSymbol(int i, int c)
{
   const unsigned int idx = i * vecDim + c;
   Symbol *sym = new_Symbol(up->getProgram(), file, 0);

   assert(baseSym || (idx < arrayLen && c < vecDim));

   sym->reg.size = eltSize;
   sym->reg.type = typeOfSize(eltSize);
   sym->setAddress(baseSym, baseAddr + idx * eltSize);
   return sym;
}


// Rewrite a 64-bit MOV/ADD/SUB (after register allocation) into two 32-bit
// halves; @i becomes the low half and the returned clone the high half.
// @zero substitutes for missing high source words, @carry links ADD/SUB.
Instruction *
BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
                              Value *zero,
                              Value *carry)
{
   DataType hTy;
   int srcNr;

   switch (i->dType) {
   case TYPE_U64: hTy = TYPE_U32; break;
   case TYPE_S64: hTy = TYPE_S32; break;
   default:
      return NULL;
   }

   switch (i->op) {
   case OP_MOV: srcNr = 1; break;
   case OP_ADD:
   case OP_SUB:
      if (!carry)
         return NULL;
      srcNr = 2;
      break;
   default:
      // TODO when needed
      return NULL;
   }

   i->setType(hTy);
   i->setDef(0, cloneShallow(fn, i->getDef(0)));
   i->getDef(0)->reg.size = 4;
   Instruction *lo = i;
   Instruction *hi = cloneForward(fn, i);
   lo->bb->insertAfter(lo, hi);

   // the high half writes the next consecutive register
   hi->getDef(0)->reg.data.id++;

   for (int s = 0; s < srcNr; ++s) {
      if (lo->getSrc(s)->reg.size < 8) {
         hi->setSrc(s, zero);
      } else {
         if (lo->getSrc(s)->refCount() > 1)
            lo->setSrc(s, cloneShallow(fn, lo->getSrc(s)));
         lo->getSrc(s)->reg.size /= 2;
         hi->setSrc(s, cloneShallow(fn, lo->getSrc(s)));

         // address the upper 32 bits of the source per file kind
         switch (hi->src(s).getFile()) {
         case FILE_IMMEDIATE:
            hi->getSrc(s)->reg.data.u64 >>= 32;
            break;
         case FILE_MEMORY_CONST:
         case FILE_MEMORY_SHARED:
         case FILE_SHADER_INPUT:
            hi->getSrc(s)->reg.data.offset += 4;
            break;
         default:
            assert(hi->src(s).getFile() == FILE_GPR);
            hi->getSrc(s)->reg.data.id++;
            break;
         }
      }
   }
   if (srcNr == 2) {
      // chain the carry from the low to the high half
      lo->setDef(1, carry);
      hi->setFlagsSrc(hi->srcCount(), carry);
   }
   return hi;
}

} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
new file mode 100644
index 0000000..2305a27
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
@@ -0,0 +1,324 @@
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef __NV50_IR_BUILD_UTIL__
#define __NV50_IR_BUILD_UTIL__

namespace nv50_ir {

// Helper for constructing nv50 IR: tracks an insertion position inside a
// BasicBlock and provides factory methods (mkOp*, mkLoad, mkCmp, ...) that
// create and insert instructions, plus a small cache of 32-bit immediates.
class BuildUtil
{
public:
   BuildUtil();
   BuildUtil(Program *);

   inline void setProgram(Program *);
   inline Program *getProgram() const { return prog; }
   inline Function *getFunction() const { return func; }

   // keeps inserting at head/tail of block
   inline void setPosition(BasicBlock *, bool tail);
   // position advances only if @after is true
   inline void setPosition(Instruction *, bool after);

   inline BasicBlock *getBB() { return bb; }

   inline void insert(Instruction *);
   inline void remove(Instruction *i) { assert(i->bb == bb); bb->remove(i); }

   inline LValue *getScratch(int size = 4, DataFile = FILE_GPR);
   // scratch value for a single assignment:
   inline LValue *getSSA(int size = 4, DataFile = FILE_GPR);

   inline Instruction *mkOp(operation, DataType, Value *);
   Instruction *mkOp1(operation, DataType, Value *, Value *);
   Instruction *mkOp2(operation, DataType, Value *, Value *, Value *);
   Instruction *mkOp3(operation, DataType, Value *, Value *, Value *, Value *);

   // *v variants return the destination LValue instead of the instruction
   LValue *mkOp1v(operation, DataType, Value *, Value *);
   LValue *mkOp2v(operation, DataType, Value *, Value *, Value *);
   LValue *mkOp3v(operation, DataType, Value *, Value *, Value *, Value *);

   Instruction *mkLoad(DataType, Value *dst, Symbol *, Value *ptr);
   Instruction *mkStore(operation, DataType, Symbol *, Value *ptr, Value *val);

   LValue *mkLoadv(DataType, Symbol *, Value *ptr);

   Instruction *mkMov(Value *, Value *, DataType = TYPE_U32);
   Instruction *mkMovToReg(int id, Value *);
   Instruction *mkMovFromReg(Value *, int id);

   Instruction *mkInterp(unsigned mode, Value *, int32_t offset, Value *rel);
   Instruction *mkFetch(Value *, DataType, DataFile, int32_t offset,
                        Value *attrRel, Value *primRel);

   Instruction *mkCvt(operation, DataType, Value *, DataType, Value *);
   CmpInstruction *mkCmp(operation, CondCode, DataType,
                         Value *,
                         Value *, Value *, Value * = NULL);
   TexInstruction *mkTex(operation, TexTarget,
                         uint16_t tic, uint16_t tsc,
                         const std::vector<Value *> &def,
                         const std::vector<Value *> &src);
   Instruction *mkQuadop(uint8_t qop, Value *, uint8_t l, Value *, Value *);

   FlowInstruction *mkFlow(operation, void *target, CondCode, Value *pred);

   Instruction *mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc);

   Instruction *mkSplit(Value *half[2], uint8_t halfSize, Value *);

   void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);

   ImmediateValue *mkImm(float);
   ImmediateValue *mkImm(uint32_t);
   ImmediateValue *mkImm(uint64_t);

   ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }

   Value *loadImm(Value *dst, float);
   Value *loadImm(Value *dst, uint32_t);
   Value *loadImm(Value *dst, uint64_t);

   Value *loadImm(Value *dst, int i) { return loadImm(dst, (uint32_t)i); }

   // returns high part of the operation
   static Instruction *split64BitOpPostRA(Function *, Instruction *,
                                          Value *zero, Value *carry);

   // Key identifying one scalar element of a (possibly indexed) array:
   // (array, arrayIdx, element i, component c); totally ordered for map use.
   struct Location
   {
      Location(unsigned array, unsigned arrayIdx, unsigned i, unsigned c)
        : array(array), arrayIdx(arrayIdx), i(i), c(c) { }
      Location(const Location &l)
        : array(l.array), arrayIdx(l.arrayIdx), i(l.i), c(l.c) { }

      bool operator==(const Location &l) const
      {
         return
            array == l.array && arrayIdx == l.arrayIdx && i == l.i && c == l.c;
      }

      bool operator<(const Location &l) const
      {
         // lexicographic order over (array, arrayIdx, i, c)
         return array != l.array ? array < l.array :
            arrayIdx != l.arrayIdx ? arrayIdx < l.arrayIdx :
            i != l.i ? i < l.i :
            c != l.c ? c < l.c :
            false;
      }

      unsigned array, arrayIdx, i, c;
   };

   typedef bimap<Location, Value *> ValueMap;

   // Uniform accessor for an array that lives either in registers (one
   // LValue per element) or in a memory file (symbols + load/store).
   class DataArray
   {
   public:
      DataArray(BuildUtil *bld) : up(bld) { }

      void setup(unsigned array, unsigned arrayIdx,
                 uint32_t base, int len, int vecDim, int eltSize,
                 DataFile file, int8_t fileIdx);

      inline bool exists(ValueMap&, unsigned int i, unsigned int c);

      Value *load(ValueMap&, int i, int c, Value *ptr);
      void store(ValueMap&, int i, int c, Value *ptr, Value *value);
      Value *acquire(ValueMap&, int i, int c);

   private:
      inline Value *lookup(ValueMap&, unsigned i, unsigned c);
      inline Value *insert(ValueMap&, unsigned i, unsigned c, Value *v);

      Symbol *mkSymbol(int i, int c);

   private:
      BuildUtil *up;
      unsigned array, arrayIdx;

      uint32_t baseAddr;
      uint32_t arrayLen;
      Symbol *baseSym;

      uint8_t vecDim;
      uint8_t eltSize; // in bytes

      DataFile file;
      bool regOnly;
   };

   Symbol *mkSymbol(DataFile file, int8_t fileIndex,
                    DataType ty, uint32_t baseAddress);

   Symbol *mkSysVal(SVSemantic svName, uint32_t svIndex);

private:
   void init(Program *);
   void addImmediate(ImmediateValue *);
   inline unsigned int u32Hash(uint32_t);

protected:
   Program *prog;
   Function *func;
   Instruction *pos;
   BasicBlock *bb;
   bool tail;

#define NV50_IR_BUILD_IMM_HT_SIZE 256

   // open-addressed cache of 32-bit immediates, filled to at most 3/4
   ImmediateValue *imms[NV50_IR_BUILD_IMM_HT_SIZE];
   unsigned int immCount;
};

unsigned int BuildUtil::u32Hash(uint32_t u)
{
   return (u % 273) % NV50_IR_BUILD_IMM_HT_SIZE;
}

void BuildUtil::setProgram(Program *program)
{
   prog = program;
}

void
BuildUtil::setPosition(BasicBlock *block, bool atTail)
{
   bb = block;
   prog = bb->getProgram();
   func = bb->getFunction();
   pos = NULL;
   tail = atTail;
}

void
BuildUtil::setPosition(Instruction *i, bool after)
{
   bb = i->bb;
   prog = bb->getProgram();
   func = bb->getFunction();
   pos = i;
   tail = after;
   assert(bb);
}

LValue *
BuildUtil::getScratch(int size, DataFile f)
{
   LValue *lval = new_LValue(func, f);
   lval->reg.size = size;
   return lval;
}

LValue *
BuildUtil::getSSA(int size, DataFile f)
{
   LValue *lval = new_LValue(func, f);
   lval->ssa = 1;
   lval->reg.size = size;
   return lval;
}

// Insert @i at the current position (head/tail of bb, or before/after pos).
void BuildUtil::insert(Instruction *i)
{
   if (!pos) {
      tail ? bb->insertTail(i) : bb->insertHead(i);
   } else {
      if (tail) {
         bb->insertAfter(pos, i);
         pos = i; // advance so subsequent inserts stay in program order
      } else {
         bb->insertBefore(pos, i);
      }
   }
}

Instruction *
BuildUtil::mkOp(operation op, DataType ty, Value *dst)
{
   Instruction *insn = new_Instruction(func, op, ty);
   insn->setDef(0, dst);
   insert(insn);
   // these ops have side effects and must never be eliminated
   if (op == OP_DISCARD || op == OP_EXIT ||
       op == OP_JOIN ||
       op == OP_QUADON || op == OP_QUADPOP ||
       op == OP_EMIT || op == OP_RESTART)
      insn->fixed = 1;
   return insn;
}

inline LValue *
BuildUtil::mkOp1v(operation op, DataType ty, Value *dst, Value *src)
{
   mkOp1(op, ty, dst, src);
   return dst->asLValue();
}

inline LValue *
BuildUtil::mkOp2v(operation op, DataType ty, Value *dst,
                  Value *src0, Value *src1)
{
   mkOp2(op, ty, dst, src0, src1);
   return dst->asLValue();
}

inline LValue *
BuildUtil::mkOp3v(operation op, DataType ty, Value *dst,
                  Value *src0, Value *src1, Value *src2)
{
   mkOp3(op, ty, dst, src0, src1, src2);
   return dst->asLValue();
}

inline LValue *
BuildUtil::mkLoadv(DataType ty, Symbol *mem, Value *ptr)
{
   LValue *dst = getScratch();
   mkLoad(ty, dst, mem, ptr);
   return dst;
}

bool
BuildUtil::DataArray::exists(ValueMap &m, unsigned int i, unsigned int c)
{
   assert(i < arrayLen && c < vecDim);
   return !regOnly || m.r.count(Location(array, arrayIdx, i, c));
}

Value *
BuildUtil::DataArray::lookup(ValueMap &m, unsigned i, unsigned c)
{
   ValueMap::r_iterator it = m.r.find(Location(array, arrayIdx, i, c));
   return it != m.r.end() ? it->second : NULL;
}

Value *
BuildUtil::DataArray::insert(ValueMap &m, unsigned i, unsigned c, Value *v)
{
   m.insert(Location(array, arrayIdx, i, c), v);
   return v;
}

} // namespace nv50_ir

#endif // __NV50_IR_BUILD_UTIL__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
new file mode 100644
index 0000000..752bad3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -0,0 +1,220 @@
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef __NV50_IR_DRIVER_H__
#define __NV50_IR_DRIVER_H__

#include "pipe/p_shader_tokens.h"

#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_scan.h"

/*
 * This struct constitutes linkage information in TGSI terminology.
 *
 * It is created by the code generator and handed to the pipe driver
 * for input/output slot assignment.
 */
struct nv50_ir_varying
{
   uint8_t slot[4]; /* native slots for xyzw (addresses in 32-bit words) */

   unsigned mask : 4; /* vec4 mask */
   unsigned linear : 1; /* linearly interpolated if true (and not flat) */
   unsigned flat : 1;
   unsigned sc : 1; /* special colour interpolation mode (SHADE_MODEL) */
   unsigned centroid : 1;
   unsigned patch : 1; /* patch constant value */
   unsigned regular : 1; /* driver-specific meaning (e.g. input in sreg) */
   unsigned input : 1; /* indicates direction of system values */
   unsigned oread : 1; /* true if output is read from parallel TCP */

   ubyte id; /* TGSI register index */
   ubyte sn; /* TGSI semantic name */
   ubyte si; /* TGSI semantic index */
};

/* source IR of the program handed to the code generator */
#define NV50_PROGRAM_IR_TGSI 0
#define NV50_PROGRAM_IR_SM4 1
#define NV50_PROGRAM_IR_GLSL 2
#define NV50_PROGRAM_IR_LLVM 3

#ifdef DEBUG
# define NV50_IR_DEBUG_BASIC (1 << 0)
# define NV50_IR_DEBUG_VERBOSE (2 << 0)
# define NV50_IR_DEBUG_REG_ALLOC (1 << 2)
#else
# define NV50_IR_DEBUG_BASIC 0
# define NV50_IR_DEBUG_VERBOSE 0
# define NV50_IR_DEBUG_REG_ALLOC 0
#endif

/* driver-internal semantics appended after the TGSI-defined ones */
#define NV50_SEMANTIC_CLIPDISTANCE (TGSI_SEMANTIC_COUNT + 0)
#define NV50_SEMANTIC_VIEWPORTINDEX (TGSI_SEMANTIC_COUNT + 4)
#define NV50_SEMANTIC_LAYER (TGSI_SEMANTIC_COUNT + 5)
#define NV50_SEMANTIC_INVOCATIONID (TGSI_SEMANTIC_COUNT + 6)
#define NV50_SEMANTIC_TESSFACTOR (TGSI_SEMANTIC_COUNT + 7)
#define NV50_SEMANTIC_TESSCOORD (TGSI_SEMANTIC_COUNT + 8)
#define NV50_SEMANTIC_SAMPLEMASK (TGSI_SEMANTIC_COUNT + 9)
#define NV50_SEMANTIC_COUNT (TGSI_SEMANTIC_COUNT + 10)

/* tessellation partitioning modes */
#define NV50_TESS_PART_FRACT_ODD 0
#define NV50_TESS_PART_FRACT_EVEN 1
#define NV50_TESS_PART_POW2 2
#define NV50_TESS_PART_INTEGER 3

#define NV50_PRIM_PATCHES PIPE_PRIM_MAX

/* maps a code label to its byte offset in the emitted binary */
struct nv50_ir_prog_symbol
{
   uint32_t label;
   uint32_t offset;
};

/* chipset identifiers for ISA selection */
#define NVISA_GF100_CHIPSET_C0 0xc0
#define NVISA_GF100_CHIPSET_D0 0xd0
#define NVISA_GK104_CHIPSET 0xe0
#define NVISA_GK110_CHIPSET 0xf0

/*
 * Everything the code generator consumes and produces for one program:
 * filled in by the driver (target, type, source), populated with the
 * compiled binary and I/O layout by nv50_ir_generate_code().
 */
struct nv50_ir_prog_info
{
   uint16_t target; /* chipset (0x50, 0x84, 0xc0, ...) */

   uint8_t type; /* PIPE_SHADER */

   uint8_t optLevel; /* optimization level (0 to 3) */
   uint8_t dbgFlags;

   struct {
      int16_t maxGPR; /* may be -1 if none used */
      int16_t maxOutput;
      uint32_t tlsSpace; /* required local memory per thread */
      uint32_t *code;
      uint32_t codeSize;
      uint8_t sourceRep; /* NV50_PROGRAM_IR */
      const void *source;
      void *relocData;
      struct nv50_ir_prog_symbol *syms;
      uint16_t numSyms;
   } bin;

   struct nv50_ir_varying sv[PIPE_MAX_SHADER_INPUTS];
   struct nv50_ir_varying in[PIPE_MAX_SHADER_INPUTS];
   struct nv50_ir_varying out[PIPE_MAX_SHADER_OUTPUTS];
   uint8_t numInputs;
   uint8_t numOutputs;
   uint8_t numPatchConstants; /* also included in numInputs/numOutputs */
   uint8_t numSysVals;

   struct {
      uint32_t *buf; /* for IMMEDIATE_ARRAY */
      uint16_t bufSize; /* size of immediate array */
      uint16_t count; /* count of inline immediates */
      uint32_t *data; /* inline immediate data */
      uint8_t *type; /* for each vec4 (128 bit) */
   } immd;

   /* per-shader-stage properties; active member selected by @type */
   union {
      struct {
         uint32_t inputMask[4]; /* mask of attributes read (1 bit per scalar) */
      } vp;
      struct {
         uint8_t inputPatchSize;
         uint8_t outputPatchSize;
         uint8_t partitioning; /* PIPE_TESS_PART */
         int8_t winding; /* +1 (clockwise) / -1 (counter-clockwise) */
         uint8_t domain; /* PIPE_PRIM_{QUADS,TRIANGLES,LINES} */
         uint8_t outputPrim; /* PIPE_PRIM_{TRIANGLES,LINES,POINTS} */
      } tp;
      struct {
         uint8_t inputPrim;
         uint8_t outputPrim;
         unsigned instanceCount;
         unsigned maxVertices;
      } gp;
      struct {
         unsigned numColourResults;
         boolean writesDepth;
         boolean earlyFragTests;
         boolean separateFragData;
         boolean usesDiscard;
      } fp;
      struct {
         uint32_t inputOffset; /* base address for user args */
         uint32_t sharedOffset; /* reserved space in s[] */
         uint32_t gridInfoBase; /* base address for NTID,NCTAID */
      } cp;
   } prop;

   uint8_t numBarriers;

   struct {
      uint8_t clipDistance; /* index of first clip distance output */
      uint8_t clipDistanceMask; /* mask of clip distances defined */
      uint8_t cullDistanceMask; /* clip distance mode (1 bit per output) */
      int8_t genUserClip; /* request user clip planes for ClipVertex */
      uint16_t ucpBase; /* base address for UCPs */
      uint8_t ucpCBSlot; /* constant buffer index of UCP data */
      uint8_t pointSize; /* output index for PointSize */
      uint8_t instanceId; /* system value index of InstanceID */
      uint8_t vertexId; /* system value index of VertexID */
      uint8_t edgeFlagIn;
      uint8_t edgeFlagOut;
      uint8_t fragDepth; /* output index of FragDepth */
      uint8_t sampleMask; /* output index of SampleMask */
      uint8_t backFaceColor[2]; /* input/output indices of back face colour */
      uint8_t globalAccess; /* 1 for read, 2 for wr, 3 for rw */
      boolean nv50styleSurfaces; /* generate gX[] access for raw buffers */
      uint8_t resInfoCBSlot; /* cX[] used for tex handles, surface info */
      uint16_t texBindBase; /* base address for tex handles (nve4) */
      uint16_t suInfoBase; /* base address for surface info (nve4) */
      uint8_t msInfoCBSlot; /* cX[] used for multisample info */
      uint16_t msInfoBase; /* base address for multisample info */
   } io;

   /* driver callback to assign input/output locations */
   int (*assignSlots)(struct nv50_ir_prog_info *);

   void *driverPriv;
};

#ifdef __cplusplus
extern "C" {
#endif

extern int nv50_ir_generate_code(struct nv50_ir_prog_info *);

extern void nv50_ir_relocate_code(void *relocData, uint32_t *code,
                                  uint32_t codePos,
                                  uint32_t libPos,
                                  uint32_t dataPos);

/* obtain code that will be shared among programs */
extern void nv50_ir_get_target_library(uint32_t chipset,
                                       const uint32_t **code, uint32_t *size);

#ifdef __cplusplus
}
#endif

#endif // __NV50_IR_DRIVER_H__
diff --git
a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp new file mode 100644 index 0000000..ac59187 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -0,0 +1,1682 @@ +/* + * Copyright 2012 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "codegen/nv50_ir_target_nvc0.h" + +// CodeEmitter for GK110 encoding of the Fermi/Kepler ISA. 
+ +namespace nv50_ir { + +class CodeEmitterGK110 : public CodeEmitter +{ +public: + CodeEmitterGK110(const TargetNVC0 *); + + virtual bool emitInstruction(Instruction *); + virtual uint32_t getMinEncodingSize(const Instruction *) const; + virtual void prepareEmission(Function *); + + inline void setProgramType(Program::Type pType) { progType = pType; } + +private: + const TargetNVC0 *targNVC0; + + Program::Type progType; + + const bool writeIssueDelays; + +private: + void emitForm_21(const Instruction *, uint32_t opc2, uint32_t opc1); + void emitForm_C(const Instruction *, uint32_t opc, uint8_t ctg); + void emitForm_L(const Instruction *, uint32_t opc, uint8_t ctg, Modifier); + + void emitPredicate(const Instruction *); + + void setCAddress14(const ValueRef&); + void setShortImmediate(const Instruction *, const int s); + void setImmediate32(const Instruction *, const int s, Modifier); + + void modNegAbsF32_3b(const Instruction *, const int s); + + void emitCondCode(CondCode cc, int pos, uint8_t mask); + void emitInterpMode(const Instruction *); + void emitLoadStoreType(DataType ty, const int pos); + void emitCachingMode(CacheMode c, const int pos); + + inline uint8_t getSRegEncoding(const ValueRef&); + + void emitRoundMode(RoundMode, const int pos, const int rintPos); + void emitRoundModeF(RoundMode, const int pos); + void emitRoundModeI(RoundMode, const int pos); + + void emitNegAbs12(const Instruction *); + + void emitNOP(const Instruction *); + + void emitLOAD(const Instruction *); + void emitSTORE(const Instruction *); + void emitMOV(const Instruction *); + + void emitINTERP(const Instruction *); + void emitPFETCH(const Instruction *); + void emitVFETCH(const Instruction *); + void emitEXPORT(const Instruction *); + void emitOUT(const Instruction *); + + void emitUADD(const Instruction *); + void emitFADD(const Instruction *); + void emitIMUL(const Instruction *); + void emitFMUL(const Instruction *); + void emitIMAD(const Instruction *); + void emitISAD(const 
Instruction *); + void emitFMAD(const Instruction *); + + void emitNOT(const Instruction *); + void emitLogicOp(const Instruction *, uint8_t subOp); + void emitPOPC(const Instruction *); + void emitINSBF(const Instruction *); + void emitShift(const Instruction *); + + void emitSFnOp(const Instruction *, uint8_t subOp); + + void emitCVT(const Instruction *); + void emitMINMAX(const Instruction *); + void emitPreOp(const Instruction *); + + void emitSET(const CmpInstruction *); + void emitSLCT(const CmpInstruction *); + void emitSELP(const Instruction *); + + void emitTEXBAR(const Instruction *); + void emitTEX(const TexInstruction *); + void emitTEXCSAA(const TexInstruction *); + void emitTXQ(const TexInstruction *); + + void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask); + + void emitFlow(const Instruction *); + + inline void defId(const ValueDef&, const int pos); + inline void srcId(const ValueRef&, const int pos); + inline void srcId(const ValueRef *, const int pos); + inline void srcId(const Instruction *, int s, const int pos); + + inline void srcAddr32(const ValueRef&, const int pos); // address / 4 + + inline bool isLIMM(const ValueRef&, DataType ty, bool mod = false); +}; + +#define GK110_GPR_ZERO 255 + +#define NEG_(b, s) \ + if (i->src(s).mod.neg()) code[(0x##b) / 32] |= 1 << ((0x##b) % 32) +#define ABS_(b, s) \ + if (i->src(s).mod.abs()) code[(0x##b) / 32] |= 1 << ((0x##b) % 32) + +#define NOT_(b, s) if (i->src(s).mod & Modifier(NV50_IR_MOD_NOT)) \ + code[(0x##b) / 32] |= 1 << ((0x##b) % 32) + +#define FTZ_(b) if (i->ftz) code[(0x##b) / 32] |= 1 << ((0x##b) % 32) + +#define SAT_(b) if (i->saturate) code[(0x##b) / 32] |= 1 << ((0x##b) % 32) + +#define RND_(b, t) emitRoundMode##t(i->rnd, 0x##b) + +#define SDATA(a) ((a).rep()->reg.data) +#define DDATA(a) ((a).rep()->reg.data) + +void CodeEmitterGK110::srcId(const ValueRef& src, const int pos) +{ + code[pos / 32] |= (src.get() ? 
SDATA(src).id : GK110_GPR_ZERO) << (pos % 32); +} + +void CodeEmitterGK110::srcId(const ValueRef *src, const int pos) +{ + code[pos / 32] |= (src ? SDATA(*src).id : GK110_GPR_ZERO) << (pos % 32); +} + +void CodeEmitterGK110::srcId(const Instruction *insn, int s, int pos) +{ + int r = insn->srcExists(s) ? SDATA(insn->src(s)).id : GK110_GPR_ZERO; + code[pos / 32] |= r << (pos % 32); +} + +void CodeEmitterGK110::srcAddr32(const ValueRef& src, const int pos) +{ + code[pos / 32] |= (SDATA(src).offset >> 2) << (pos % 32); +} + +void CodeEmitterGK110::defId(const ValueDef& def, const int pos) +{ + code[pos / 32] |= (def.get() ? DDATA(def).id : GK110_GPR_ZERO) << (pos % 32); +} + +bool CodeEmitterGK110::isLIMM(const ValueRef& ref, DataType ty, bool mod) +{ + const ImmediateValue *imm = ref.get()->asImm(); + + return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000)); +} + +void +CodeEmitterGK110::emitRoundMode(RoundMode rnd, const int pos, const int rintPos) +{ + bool rint = false; + uint8_t n; + + switch (rnd) { + case ROUND_MI: rint = true; /* fall through */ case ROUND_M: n = 1; break; + case ROUND_PI: rint = true; /* fall through */ case ROUND_P: n = 2; break; + case ROUND_ZI: rint = true; /* fall through */ case ROUND_Z: n = 3; break; + default: + rint = rnd == ROUND_NI; + n = 0; + assert(rnd == ROUND_N || rnd == ROUND_NI); + break; + } + code[pos / 32] |= n << (pos % 32); + if (rint && rintPos >= 0) + code[rintPos / 32] |= 1 << (rintPos % 32); +} + +void +CodeEmitterGK110::emitRoundModeF(RoundMode rnd, const int pos) +{ + uint8_t n; + + switch (rnd) { + case ROUND_M: n = 1; break; + case ROUND_P: n = 2; break; + case ROUND_Z: n = 3; break; + default: + n = 0; + assert(rnd == ROUND_N); + break; + } + code[pos / 32] |= n << (pos % 32); +} + +void +CodeEmitterGK110::emitRoundModeI(RoundMode rnd, const int pos) +{ + uint8_t n; + + switch (rnd) { + case ROUND_MI: n = 1; break; + case ROUND_PI: n = 2; break; + case ROUND_ZI: n = 3; break; + default: + n = 
0; + assert(rnd == ROUND_NI); + break; + } + code[pos / 32] |= n << (pos % 32); +} + +void CodeEmitterGK110::emitCondCode(CondCode cc, int pos, uint8_t mask) +{ + uint8_t n; + + switch (cc) { + case CC_FL: n = 0x00; break; + case CC_LT: n = 0x01; break; + case CC_EQ: n = 0x02; break; + case CC_LE: n = 0x03; break; + case CC_GT: n = 0x04; break; + case CC_NE: n = 0x05; break; + case CC_GE: n = 0x06; break; + case CC_LTU: n = 0x09; break; + case CC_EQU: n = 0x0a; break; + case CC_LEU: n = 0x0b; break; + case CC_GTU: n = 0x0c; break; + case CC_NEU: n = 0x0d; break; + case CC_GEU: n = 0x0e; break; + case CC_TR: n = 0x0f; break; + case CC_NO: n = 0x10; break; + case CC_NC: n = 0x11; break; + case CC_NS: n = 0x12; break; + case CC_NA: n = 0x13; break; + case CC_A: n = 0x14; break; + case CC_S: n = 0x15; break; + case CC_C: n = 0x16; break; + case CC_O: n = 0x17; break; + default: + n = 0; + assert(!"invalid condition code"); + break; + } + code[pos / 32] |= (n & mask) << (pos % 32); +} + +void +CodeEmitterGK110::emitPredicate(const Instruction *i) +{ + if (i->predSrc >= 0) { + srcId(i->src(i->predSrc), 18); + if (i->cc == CC_NOT_P) + code[0] |= 8 << 18; // negate + assert(i->getPredicate()->reg.file == FILE_PREDICATE); + } else { + code[0] |= 7 << 18; + } +} + +void +CodeEmitterGK110::setCAddress14(const ValueRef& src) +{ + const int32_t addr = src.get()->asSym()->reg.data.offset / 4; + + code[0] |= (addr & 0x01ff) << 23; + code[1] |= (addr & 0x3e00) >> 9; +} + +void +CodeEmitterGK110::setShortImmediate(const Instruction *i, const int s) +{ + const uint32_t u32 = i->getSrc(s)->asImm()->reg.data.u32; + const uint64_t u64 = i->getSrc(s)->asImm()->reg.data.u64; + + if (i->sType == TYPE_F32) { + assert(!(u32 & 0x00000fff)); + code[0] |= ((u32 & 0x001ff000) >> 12) << 23; + code[1] |= ((u32 & 0x7fe00000) >> 21); + code[1] |= ((u32 & 0x80000000) >> 4); + } else + if (i->sType == TYPE_F64) { + assert(!(u64 & 0x00000fffffffffffULL)); + code[0] |= ((u64 & 0x001ff00000000000ULL) >> 
44) << 23; + code[1] |= ((u64 & 0x7fe0000000000000ULL) >> 53); + code[1] |= ((u64 & 0x8000000000000000ULL) >> 36); + } else { + assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000); + code[0] |= (u32 & 0x001ff) << 23; + code[1] |= (u32 & 0x7fe00) >> 9; + code[1] |= (u32 & 0x80000) << 8; + } +} + +void +CodeEmitterGK110::setImmediate32(const Instruction *i, const int s, + Modifier mod) +{ + uint32_t u32 = i->getSrc(s)->asImm()->reg.data.u32; + + if (mod) { + ImmediateValue imm(i->getSrc(s)->asImm(), i->sType); + mod.applyTo(imm); + u32 = imm.reg.data.u32; + } + + code[0] |= u32 << 23; + code[1] |= u32 >> 9; +} + +void +CodeEmitterGK110::emitForm_L(const Instruction *i, uint32_t opc, uint8_t ctg, + Modifier mod) +{ + code[0] = ctg; + code[1] = opc << 20; + + emitPredicate(i); + + defId(i->def(0), 2); + + for (int s = 0; s < 3 && i->srcExists(s); ++s) { + switch (i->src(s).getFile()) { + case FILE_GPR: + srcId(i->src(s), s ? 42 : 10); + break; + case FILE_IMMEDIATE: + setImmediate32(i, s, mod); + break; + default: + break; + } + } +} + + +void +CodeEmitterGK110::emitForm_C(const Instruction *i, uint32_t opc, uint8_t ctg) +{ + code[0] = ctg; + code[1] = opc << 20; + + emitPredicate(i); + + defId(i->def(0), 2); + + switch (i->src(0).getFile()) { + case FILE_MEMORY_CONST: + code[1] |= 0x4 << 28; + setCAddress14(i->src(0)); + break; + case FILE_GPR: + code[1] |= 0xc << 28; + srcId(i->src(0), 23); + break; + default: + assert(0); + break; + } +} + +// 0x2 for GPR, c[] and 0x1 for short immediate +void +CodeEmitterGK110::emitForm_21(const Instruction *i, uint32_t opc2, + uint32_t opc1) +{ + const bool imm = i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE; + + int s1 = 23; + if (i->srcExists(2) && i->src(2).getFile() == FILE_MEMORY_CONST) + s1 = 42; + + if (imm) { + code[0] = 0x1; + code[1] = opc1 << 20; + } else { + code[0] = 0x2; + code[1] = (0xc << 28) | (opc2 << 20); + } + + emitPredicate(i); + + defId(i->def(0), 2); + + for (int s = 0; s < 3 && 
i->srcExists(s); ++s) { + switch (i->src(s).getFile()) { + case FILE_MEMORY_CONST: + code[1] &= (s == 2) ? ~(0x4 << 28) : ~(0x8 << 28); + setCAddress14(i->src(s)); + code[1] |= i->getSrc(s)->reg.fileIndex << 5; + break; + case FILE_IMMEDIATE: + setShortImmediate(i, s); + break; + case FILE_GPR: + srcId(i->src(s), s ? ((s == 2) ? 42 : s1) : 10); + break; + default: + // ignore here, can be predicate or flags, but must not be address + break; + } + } + // 0x0 = invalid + // 0xc = rrr + // 0x8 = rrc + // 0x4 = rcr + assert(imm || (code[1] & (0xc << 28))); +} + +inline void +CodeEmitterGK110::modNegAbsF32_3b(const Instruction *i, const int s) +{ + if (i->src(s).mod.abs()) code[1] &= ~(1 << 27); + if (i->src(s).mod.neg()) code[1] ^= (1 << 27); +} + +void +CodeEmitterGK110::emitNOP(const Instruction *i) +{ + code[0] = 0x00003c02; + code[1] = 0x85800000; + + if (i) + emitPredicate(i); + else + code[0] = 0x001c3c02; +} + +void +CodeEmitterGK110::emitFMAD(const Instruction *i) +{ + assert(!isLIMM(i->src(1), TYPE_F32)); + + emitForm_21(i, 0x0c0, 0x940); + + NEG_(34, 2); + SAT_(35); + RND_(36, F); + FTZ_(38); + + bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg(); + + if (code[0] & 0x1) { + if (neg1) + code[1] ^= 1 << 27; + } else + if (neg1) { + code[1] |= 1 << 19; + } +} + +void +CodeEmitterGK110::emitFMUL(const Instruction *i) +{ + bool neg = (i->src(0).mod ^ i->src(1).mod).neg(); + + assert(i->postFactor >= -3 && i->postFactor <= 3); + + if (isLIMM(i->src(1), TYPE_F32)) { + emitForm_L(i, 0x200, 0x2, Modifier(0)); + + FTZ_(38); + SAT_(3a); + if (neg) + code[1] ^= 1 << 22; + + assert(i->postFactor == 0); + } else { + emitForm_21(i, 0x234, 0xc34); + + RND_(2a, F); + FTZ_(2f); + SAT_(35); + + if (code[0] & 0x1) { + if (neg) + code[1] ^= 1 << 27; + } else + if (neg) { + code[1] |= 1 << 19; + } + } +} + +void +CodeEmitterGK110::emitIMUL(const Instruction *i) +{ + assert(!i->src(0).mod.neg() && !i->src(1).mod.neg()); + assert(!i->src(0).mod.abs() && !i->src(1).mod.abs()); + + if 
(isLIMM(i->src(1), TYPE_S32)) { + emitForm_L(i, 0x280, 2, Modifier(0)); + + assert(i->subOp != NV50_IR_SUBOP_MUL_HIGH); + + if (i->sType == TYPE_S32) + code[1] |= 3 << 25; + } else { + emitForm_21(i, 0x21c, 0xc1c); + + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[1] |= 1 << 10; + if (i->sType == TYPE_S32) + code[1] |= 3 << 11; + } +} + +void +CodeEmitterGK110::emitFADD(const Instruction *i) +{ + if (isLIMM(i->src(1), TYPE_F32)) { + assert(i->rnd == ROUND_N); + assert(!i->saturate); + + emitForm_L(i, 0x400, 0, i->src(1).mod); + + FTZ_(3a); + NEG_(3b, 0); + ABS_(39, 0); + } else { + emitForm_21(i, 0x22c, 0xc2c); + + FTZ_(2f); + RND_(2a, F); + ABS_(31, 0); + NEG_(33, 0); + + if (code[0] & 0x1) { + modNegAbsF32_3b(i, 1); + } else { + ABS_(34, 1); + NEG_(30, 1); + } + } +} + +void +CodeEmitterGK110::emitUADD(const Instruction *i) +{ + uint8_t addOp = (i->src(0).mod.neg() << 1) | i->src(1).mod.neg(); + + if (i->op == OP_SUB) + addOp ^= 1; + + assert(!i->src(0).mod.abs() && !i->src(1).mod.abs()); + + if (isLIMM(i->src(1), TYPE_S32)) { + emitForm_L(i, 0x400, 1, Modifier((addOp & 1) ? 
NV50_IR_MOD_NEG : 0)); + + if (addOp & 2) + code[1] |= 1 << 27; + + assert(!i->defExists(1)); + assert(i->flagsSrc < 0); + + SAT_(39); + } else { + emitForm_21(i, 0x208, 0xc08); + + assert(addOp != 3); // would be add-plus-one + + code[1] |= addOp << 19; + + if (i->defExists(1)) + code[1] |= 1 << 18; // write carry + if (i->flagsSrc >= 0) + code[1] |= 1 << 14; // add carry + + SAT_(35); + } +} + +// TODO: shl-add +void +CodeEmitterGK110::emitIMAD(const Instruction *i) +{ + uint8_t addOp = + (i->src(2).mod.neg() << 1) | (i->src(0).mod.neg() ^ i->src(1).mod.neg()); + + emitForm_21(i, 0x100, 0xa00); + + assert(addOp != 3); + code[1] |= addOp << 26; + + if (i->sType == TYPE_S32) + code[1] |= (1 << 19) | (1 << 24); + + if (code[0] & 0x1) { + assert(!i->subOp); + SAT_(39); + } else { + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[1] |= 1 << 25; + SAT_(35); + } +} + +void +CodeEmitterGK110::emitISAD(const Instruction *i) +{ + assert(i->dType == TYPE_S32 || i->dType == TYPE_U32); + + emitForm_21(i, 0x1fc, 0xb74); + + if (i->dType == TYPE_S32) + code[1] |= 1 << 19; +} + +void +CodeEmitterGK110::emitNOT(const Instruction *i) +{ + code[0] = 0x0003fc02; // logop(mov2) dst, 0, not src + code[1] = 0x22003800; + + emitPredicate(i); + + defId(i->def(0), 2); + + switch (i->src(0).getFile()) { + case FILE_GPR: + code[1] |= 0xc << 28; + srcId(i->src(0), 23); + break; + case FILE_MEMORY_CONST: + code[1] |= 0x4 << 28; + setCAddress14(i->src(1)); + break; + default: + assert(0); + break; + } +} + +void +CodeEmitterGK110::emitLogicOp(const Instruction *i, uint8_t subOp) +{ + assert(!(i->src(0).mod & Modifier(NV50_IR_MOD_NOT))); // XXX: find me + + if (isLIMM(i->src(1), TYPE_S32)) { + emitForm_L(i, 0x200, 0, i->src(1).mod); + code[1] |= subOp << 24; + } else { + emitForm_21(i, 0x220, 0xc20); + code[1] |= subOp << 12; + NOT_(2b, 1); + } + assert(!(code[0] & 0x1) || !(i->src(1).mod & Modifier(NV50_IR_MOD_NOT))); +} + +void +CodeEmitterGK110::emitPOPC(const Instruction *i) +{ + 
assert(!isLIMM(i->src(1), TYPE_S32, true)); + + emitForm_21(i, 0x204, 0xc04); + + NOT_(2a, 0); + if (!(code[0] & 0x1)) + NOT_(2b, 1); +} + +void +CodeEmitterGK110::emitINSBF(const Instruction *i) +{ + emitForm_21(i, 0x1f8, 0xb78); +} + +void +CodeEmitterGK110::emitShift(const Instruction *i) +{ + const bool sar = i->op == OP_SHR && isSignedType(i->sType); + + if (sar) { + emitForm_21(i, 0x214, 0x014); + code[1] |= 1 << 19; + } else + if (i->op == OP_SHR) { + // this is actually RSHF + emitForm_21(i, 0x27c, 0x87c); + code[1] |= GK110_GPR_ZERO << 10; + } else { + // this is actually LSHF + emitForm_21(i, 0x1fc, 0xb7c); + code[1] |= GK110_GPR_ZERO << 10; + } + + if (i->subOp == NV50_IR_SUBOP_SHIFT_WRAP) { + if (!sar) + code[1] |= 1 << 21; + // XXX: find wrap modifier for SHR S32 + } +} + +void +CodeEmitterGK110::emitPreOp(const Instruction *i) +{ + emitForm_21(i, 0x248, -1); + + if (i->op == OP_PREEX2) + code[1] |= 1 << 10; + + NEG_(30, 0); + ABS_(34, 0); +} + +void +CodeEmitterGK110::emitSFnOp(const Instruction *i, uint8_t subOp) +{ + code[0] = 0x00000002 | (subOp << 23); + code[1] = 0x84000000; + + emitPredicate(i); + + defId(i->def(0), 2); + srcId(i->src(0), 10); + + NEG_(33, 0); + ABS_(31, 0); + + // XXX: find saturate +} + +void +CodeEmitterGK110::emitMINMAX(const Instruction *i) +{ + uint32_t op2, op1; + + switch (i->dType) { + case TYPE_U32: + case TYPE_S32: + op2 = 0x210; + op1 = 0xc10; + break; + case TYPE_F32: + op2 = 0x230; + op1 = 0xc30; + break; + case TYPE_F64: + op2 = 0x228; + op1 = 0xc28; + break; + default: + assert(0); + op2 = 0; + op1 = 0; + break; + } + emitForm_21(i, op2, op1); + + if (i->dType == TYPE_S32) + code[1] |= 1 << 19; + code[1] |= (i->op == OP_MIN) ? 
0x1c00 : 0x3c00; // [!]pt + + FTZ_(2f); + ABS_(31, 0); + NEG_(33, 0); + if (code[0] & 0x1) { + modNegAbsF32_3b(i, 1); + } else { + ABS_(34, 1); + NEG_(30, 1); + } +} + +void +CodeEmitterGK110::emitCVT(const Instruction *i) +{ + const bool f2f = isFloatType(i->dType) && isFloatType(i->sType); + const bool f2i = !isFloatType(i->dType) && isFloatType(i->sType); + const bool i2f = isFloatType(i->dType) && !isFloatType(i->sType); + + bool sat = i->saturate; + bool abs = i->src(0).mod.abs(); + bool neg = i->src(0).mod.neg(); + + RoundMode rnd = i->rnd; + + switch (i->op) { + case OP_CEIL: rnd = f2f ? ROUND_PI : ROUND_P; break; + case OP_FLOOR: rnd = f2f ? ROUND_MI : ROUND_M; break; + case OP_TRUNC: rnd = f2f ? ROUND_ZI : ROUND_Z; break; + case OP_SAT: sat = true; break; + case OP_NEG: neg = !neg; break; + case OP_ABS: abs = true; neg = false; break; + default: + break; + } + + uint32_t op; + + if (f2f) op = 0x254; + else if (f2i) op = 0x258; + else if (i2f) op = 0x25c; + else op = 0x260; + + emitForm_C(i, op, 0x2); + + FTZ_(2f); + if (neg) code[1] |= 1 << 16; + if (abs) code[1] |= 1 << 20; + if (sat) code[1] |= 1 << 21; + + emitRoundMode(rnd, 32 + 10, f2f ? 
(32 + 13) : -1); + + code[0] |= typeSizeofLog2(i->dType) << 10; + code[0] |= typeSizeofLog2(i->sType) << 12; + + if (isSignedIntType(i->dType)) + code[0] |= 0x4000; + if (isSignedIntType(i->sType)) + code[0] |= 0x8000; +} + +void +CodeEmitterGK110::emitSET(const CmpInstruction *i) +{ + uint16_t op1, op2; + + if (i->def(0).getFile() == FILE_PREDICATE) { + switch (i->sType) { + case TYPE_F32: op2 = 0x1d8; op1 = 0xb58; break; + case TYPE_F64: op2 = 0x1c0; op1 = 0xb40; break; + default: + op2 = 0x1b0; + op1 = 0xb30; + break; + } + emitForm_21(i, op2, op1); + + NEG_(2e, 0); + ABS_(9, 0); + if (!(code[0] & 0x1)) { + NEG_(8, 1); + ABS_(2f, 1); + } else { + modNegAbsF32_3b(i, 1); + } + FTZ_(32); + + // normal DST field is negated predicate result + code[0] = (code[0] & ~0xfc) | ((code[0] << 3) & 0xe0); + if (i->defExists(1)) + defId(i->def(1), 2); + else + code[0] |= 0x1c; + } else { + switch (i->sType) { + case TYPE_F32: op2 = 0x000; op1 = 0x820; break; + case TYPE_F64: op2 = 0x080; op1 = 0x900; break; + default: + op2 = 0x1a8; + op1 = 0xb28; + break; + } + emitForm_21(i, op2, op1); + + NEG_(2e, 0); + ABS_(39, 0); + if (!(code[0] & 0x1)) { + NEG_(38, 1); + ABS_(2f, 1); + } else { + modNegAbsF32_3b(i, 1); + } + FTZ_(3a); + } + if (i->sType == TYPE_S32) + code[1] |= 1 << 19; + + if (i->op != OP_SET) { + switch (i->op) { + case OP_SET_AND: code[1] |= 0x0 << 16; break; + case OP_SET_OR: code[1] |= 0x1 << 16; break; + case OP_SET_XOR: code[1] |= 0x2 << 16; break; + default: + assert(0); + break; + } + srcId(i->src(2), 0x2a); + } else { + code[1] |= 0x7 << 10; + } + emitCondCode(i->setCond, + isFloatType(i->sType) ? 0x33 : 0x34, + isFloatType(i->sType) ? 
0xf : 0x7); +} + +void +CodeEmitterGK110::emitSLCT(const CmpInstruction *i) +{ + CondCode cc = i->setCond; + if (i->src(2).mod.neg()) + cc = reverseCondCode(cc); + + if (i->dType == TYPE_F32) { + emitForm_21(i, 0x1d0, 0xb50); + FTZ_(32); + emitCondCode(cc, 0x33, 0xf); + } else { + emitForm_21(i, 0x1a4, 0xb20); + emitCondCode(cc, 0x34, 0x7); + } +} + +void CodeEmitterGK110::emitSELP(const Instruction *i) +{ + emitForm_21(i, 0x250, 0x050); + + if ((i->cc == CC_NOT_P) ^ (bool)(i->src(2).mod & Modifier(NV50_IR_MOD_NOT))) + code[1] |= 1 << 13; +} + +void CodeEmitterGK110::emitTEXBAR(const Instruction *i) +{ + code[0] = 0x00000002 | (i->subOp << 23); + code[1] = 0x77000000; + + emitPredicate(i); +} + +void CodeEmitterGK110::emitTEXCSAA(const TexInstruction *i) +{ + emitNOP(i); // TODO +} + +static inline bool +isNextIndependentTex(const TexInstruction *i) +{ + if (!i->next || !isTextureOp(i->next->op)) + return false; + if (i->getDef(0)->interfers(i->next->getSrc(0))) + return false; + return !i->next->srcExists(1) || !i->getDef(0)->interfers(i->next->getSrc(1)); +} + +void +CodeEmitterGK110::emitTEX(const TexInstruction *i) +{ + const bool ind = i->tex.rIndirectSrc >= 0; + + if (ind) { + code[0] = 0x00000002; + switch (i->op) { + case OP_TXD: + code[1] = 0x7e000000; + break; + default: + code[1] = 0x7d800000; + break; + } + } else { + switch (i->op) { + case OP_TXD: + code[0] = 0x00000002; + code[1] = 0x76000000; + break; + default: + code[0] = 0x00000001; + code[1] = 0x60000000; + break; + } + code[1] |= i->tex.r << 15; + } + + code[1] |= isNextIndependentTex(i) ? 0x1 : 0x2; // t : p mode + + // if (i->tex.liveOnly) + // ? 
+ + switch (i->op) { + case OP_TEX: break; + case OP_TXB: code[1] |= 0x2000; break; + case OP_TXL: code[1] |= 0x3000; break; + case OP_TXF: break; // XXX + case OP_TXG: break; // XXX + case OP_TXD: break; + default: + assert(!"invalid texture op"); + break; + } + /* + if (i->op == OP_TXF) { + if (!i->tex.levelZero) + code[1] |= 0x02000000; + } else */ + if (i->tex.levelZero) { + code[1] |= 0x1000; + } + + // if (i->op != OP_TXD && i->tex.derivAll) + // code[1] |= 1 << 13; + + emitPredicate(i); + + code[1] |= i->tex.mask << 2; + + const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2) + + defId(i->def(0), 2); + srcId(i->src(0), 10); + srcId(i, src1, 23); + + // if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5; + + // texture target: + code[1] |= (i->tex.target.isCube() ? 3 : (i->tex.target.getDim() - 1)) << 7; + if (i->tex.target.isArray()) + code[1] |= 0x40; + // if (i->tex.target.isShadow()) + // ? + // if (i->tex.target == TEX_TARGET_2D_MS || + // i->tex.target == TEX_TARGET_2D_MS_ARRAY) + // ? + + if (i->srcExists(src1) && i->src(src1).getFile() == FILE_IMMEDIATE) { + // ? + } + + // if (i->tex.useOffsets) + // ? +} + +void +CodeEmitterGK110::emitTXQ(const TexInstruction *i) +{ + emitNOP(i); // TODO +} + +void +CodeEmitterGK110::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask) +{ + emitNOP(i); // TODO +} + +void +CodeEmitterGK110::emitFlow(const Instruction *i) +{ + const FlowInstruction *f = i->asFlow(); + + unsigned mask; // bit 0: predicate, bit 1: target + + code[0] = 0x00000000; + + switch (i->op) { + case OP_BRA: + code[1] = f->absolute ? 0x00000 : 0x12000000; // XXX + // if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST) + // code[0] |= 0x4000; + mask = 3; + break; + case OP_CALL: + code[1] = f->absolute ? 
0x00000 : 0x13000000; // XXX + // if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST) + // code[0] |= 0x4000; + mask = 2; + break; + + case OP_EXIT: code[1] = 0x18000000; mask = 1; break; + case OP_RET: code[1] = 0x19000000; mask = 1; break; + case OP_DISCARD: code[1] = 0x19800000; mask = 1; break; // XXX: guess + case OP_BREAK: code[1] = 0x1a800000; mask = 1; break; // XXX: guess + case OP_CONT: code[1] = 0x1b000000; mask = 1; break; // XXX: guess + + case OP_JOINAT: code[1] = 0x14800000; mask = 2; break; + case OP_PREBREAK: code[1] = 0x15000000; mask = 2; break; // XXX: guess + case OP_PRECONT: code[1] = 0x15800000; mask = 2; break; // XXX: guess + case OP_PRERET: code[1] = 0x16000000; mask = 2; break; // XXX: guess + + case OP_QUADON: code[1] = 0x1c000000; mask = 0; break; // XXX: guess + case OP_QUADPOP: code[1] = 0x1c800000; mask = 0; break; // XXX: guess + case OP_BRKPT: code[1] = 0x1d000000; mask = 0; break; // XXX: guess + default: + assert(!"invalid flow operation"); + return; + } + + if (mask & 1) { + emitPredicate(i); + if (i->flagsSrc < 0) + code[0] |= 0x3c; + } + + if (!f) + return; + + // TODO + /* + if (f->allWarp) + code[0] |= 1 << 15; + if (f->limit) + code[0] |= 1 << 16; + */ + + if (f->op == OP_CALL) { + if (f->builtin) { + assert(f->absolute); + uint32_t pcAbs = targNVC0->getBuiltinOffset(f->target.builtin); + addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xff800000, 23); + addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x007fffff, -9); + } else { + assert(!f->absolute); + int32_t pcRel = f->target.fn->binPos - (codeSize + 8); + code[0] |= (pcRel & 0x1ff) << 23; + code[1] |= (pcRel >> 9) & 0x7fff; + } + } else + if (mask & 2) { + int32_t pcRel = f->target.bb->binPos - (codeSize + 8); + // currently we don't want absolute branches + assert(!f->absolute); + code[0] |= (pcRel & 0x1ff) << 23; + code[1] |= (pcRel >> 9) & 0x7fff; + } +} + +void +CodeEmitterGK110::emitPFETCH(const Instruction *i) +{ + emitNOP(i); // TODO +} + +void 
CodeEmitterGK110::emitVFETCH(const Instruction *i)
{
   // Fetch a per-vertex input: the attribute byte offset is split across the
   // two 32-bit opcode words (9 bits into code[0], remainder into code[1]).
   uint32_t offset = i->src(0).get()->reg.data.offset;

   code[0] = 0x00000002 | (offset << 23);
   code[1] = 0x7ec00000 | (offset >> 9);

#if 0
   if (i->perPatch)
      code[0] |= 0x100;
   if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
      code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads
#endif

   emitPredicate(i);

   defId(i->def(0), 2);
   srcId(i->src(0).getIndirect(0), 10);          // attribute address register
   srcId(i->src(0).getIndirect(1), 32 + 10); // vertex address
}

void
CodeEmitterGK110::emitEXPORT(const Instruction *i)
{
   // Store to a shader output slot; mirrors emitVFETCH's offset split.
   uint32_t offset = i->src(0).get()->reg.data.offset;

   code[0] = 0x00000002 | (offset << 23);
   code[1] = 0x7f000000 | (offset >> 9);

#if 0
   if (i->perPatch)
      code[0] |= 0x100;
#endif

   emitPredicate(i);

   assert(i->src(1).getFile() == FILE_GPR);

   srcId(i->src(0).getIndirect(0), 10);
   srcId(i->src(0).getIndirect(1), 32 + 10); // vertex base address
   srcId(i->src(1), 2);                      // value to export
}

void
CodeEmitterGK110::emitOUT(const Instruction *i)
{
   // Geometry EMIT/RESTART not implemented for GK110 yet.
   emitNOP(i); // TODO
}

void
CodeEmitterGK110::emitInterpMode(const Instruction *i)
{
   // Interpolation mode bits (flat/centroid/etc.) from Instruction::ipa.
   code[1] |= i->ipa << 21; // TODO: INTERP_SAMPLEID
}

void
CodeEmitterGK110::emitINTERP(const Instruction *i)
{
   // Emit IPA for LINTERP/PINTERP; `base` is the varying's byte offset.
   const uint32_t base = i->getSrc(0)->reg.data.offset;

   code[0] = 0x00000002 | (base << 31);
   code[1] = 0x74800000 | (base >> 1);

   if (i->saturate)
      code[1] |= 1 << 18;

   if (i->op == OP_PINTERP)
      srcId(i->src(1), 23);     // perspective divisor (1/w)
   else
      code[0] |= 0xff << 23;    // no interpolant source: register field = 0xff

   srcId(i->src(0).getIndirect(0), 10);
   emitInterpMode(i);

   emitPredicate(i);
   defId(i->def(0), 2);

   if (i->getSampleMode() == NV50_IR_INTERP_OFFSET)
      srcId(i->src(i->op == OP_PINTERP ? 2 : 1), 32 + 10);
   else
      code[1] |= 0xff << 10;    // 0xff marks "no offset register"
}

void
CodeEmitterGK110::emitLoadStoreType(DataType ty, const int pos)
{
   // Encode the 3-bit ld/st size/type field at bit position `pos`
   // (pos >= 32 addresses code[1]).
   uint8_t n;

   switch (ty) {
   case TYPE_U8:
      n = 0;
      break;
   case TYPE_S8:
      n = 1;
      break;
   case TYPE_U16:
      n = 2;
      break;
   case TYPE_S16:
      n = 3;
      break;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      n = 4;
      break;
   case TYPE_F64:
   case TYPE_U64:
   case TYPE_S64:
      n = 5;
      break;
   case TYPE_B128:
      n = 6;
      break;
   default:
      n = 0;
      assert(!"invalid ld/st type");
      break;
   }
   code[pos / 32] |= n << (pos % 32);
}

void
CodeEmitterGK110::emitCachingMode(CacheMode c, const int pos)
{
   // Encode the 2-bit caching-mode field at bit position `pos`.
   uint8_t n;

   switch (c) {
   case CACHE_CA:
// case CACHE_WB:
      n = 0;
      break;
   case CACHE_CG:
      n = 1;
      break;
   case CACHE_CS:
      n = 2;
      break;
   case CACHE_CV:
// case CACHE_WT:
      n = 3;
      break;
   default:
      n = 0;
      assert(!"invalid caching mode");
      break;
   }
   code[pos / 32] |= n << (pos % 32);
}

void
CodeEmitterGK110::emitSTORE(const Instruction *i)
{
   // Store src(1) to global/local/shared memory addressed by src(0).
   int32_t offset = SDATA(i->src(0)).offset;

   switch (i->src(0).getFile()) {
   case FILE_MEMORY_GLOBAL: code[1] = 0xe0000000; code[0] = 0x00000000; break;
   case FILE_MEMORY_LOCAL: code[1] = 0x7a800000; code[0] = 0x00000002; break;
   case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break;
   default:
      assert(!"invalid memory file");
      break;
   }

   // local/shared use a 24-bit offset field (global keeps the full 32 bits)
   if (i->src(0).getFile() != FILE_MEMORY_GLOBAL)
      offset &= 0xffffff;

   // bit 1 of code[0] distinguishes the two encoding layouts chosen above
   if (code[0] & 0x2) {
      emitLoadStoreType(i->dType, 0x33);
      if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
         emitCachingMode(i->cache, 0x2f);
   } else {
      emitLoadStoreType(i->dType, 0x38);
      emitCachingMode(i->cache, 0x3b);
   }
   code[0] |= offset << 23;
   code[1] |= offset >> 9;

   emitPredicate(i);

   srcId(i->src(1), 2);                    // stored value
   srcId(i->src(0).getIndirect(0), 10);    // address register
}

void
CodeEmitterGK110::emitLOAD(const Instruction *i)
{
   // Load def(0) from global/local/shared/const memory addressed by src(0).
   int32_t offset = SDATA(i->src(0)).offset;

   switch (i->src(0).getFile()) {
   case FILE_MEMORY_GLOBAL: code[1] = 0xc0000000; code[0] = 0x00000000; break;
   case FILE_MEMORY_LOCAL: code[1] = 0x7a000000; code[0] = 0x00000002; break;
   case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break;
   case FILE_MEMORY_CONST:
      // a direct 32-bit c[] read is expressible as a plain MOV
      if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) {
         emitMOV(i);
         return;
      }
      offset &= 0xffff;
      code[0] = 0x00000002;
      code[1] = 0x7c800000 | (i->src(0).get()->reg.fileIndex << 7);
      break;
   default:
      assert(!"invalid memory file");
      break;
   }

   if (code[0] & 0x2) {
      offset &= 0xffffff;
      emitLoadStoreType(i->dType, 0x33);
      if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
         emitCachingMode(i->cache, 0x2f);
   } else {
      emitLoadStoreType(i->dType, 0x38);
      emitCachingMode(i->cache, 0x3b);
   }
   code[0] |= offset << 23;
   code[1] |= offset >> 9;

   emitPredicate(i);

   defId(i->def(0), 2);
   srcId(i->src(0).getIndirect(0), 10);
}

uint8_t
CodeEmitterGK110::getSRegEncoding(const ValueRef& ref)
{
   // Map an SV_* system value to its special-register index; .index selects
   // the component for the vector-valued ones (tid/ctaid/...).
   switch (SDATA(ref).sv.sv) {
   case SV_LANEID: return 0x00;
   case SV_PHYSID: return 0x03;
   case SV_VERTEX_COUNT: return 0x10;
   case SV_INVOCATION_ID: return 0x11;
   case SV_YDIR: return 0x12;
   case SV_TID: return 0x21 + SDATA(ref).sv.index;
   case SV_CTAID: return 0x25 + SDATA(ref).sv.index;
   case SV_NTID: return 0x29 + SDATA(ref).sv.index;
   case SV_GRIDID: return 0x2c;
   case SV_NCTAID: return 0x2d + SDATA(ref).sv.index;
   case SV_LBASE: return 0x34;
   case SV_SBASE: return 0x30;
   case SV_CLOCK: return 0x50 + SDATA(ref).sv.index;
   default:
      assert(!"no sreg for system value");
      return 0;
   }
}

void
CodeEmitterGK110::emitMOV(const Instruction *i)
{
   // MOV / RDSV: pick the encoding by source file.
   if (i->src(0).getFile() == FILE_SYSTEM_VALUE) {
      code[0] = 0x00000002 | (getSRegEncoding(i->src(0)) << 23);
      code[1] = 0x86400000;   // S2R
      emitPredicate(i);
      defId(i->def(0), 2);
   } else
   if (i->src(0).getFile() == FILE_IMMEDIATE) {
      code[0] = 0x00000002 | (i->lanes << 14);
      code[1] = 0x74000000;   // MOV32I
      emitPredicate(i);
      defId(i->def(0),
2); + setImmediate32(i, 0, Modifier(0)); + } else + if (i->src(0).getFile() == FILE_PREDICATE) { + // TODO + } else { + emitForm_C(i, 0x24c, 2); + code[1] |= i->lanes << 10; + } +} + +bool +CodeEmitterGK110::emitInstruction(Instruction *insn) +{ + const unsigned int size = (writeIssueDelays && !(codeSize & 0x3f)) ? 16 : 8; + + if (insn->encSize != 8) { + ERROR("skipping unencodable instruction: "); + insn->print(); + return false; + } else + if (codeSize + size > codeSizeLimit) { + ERROR("code emitter output buffer too small\n"); + return false; + } + + if (writeIssueDelays) { + int id = (codeSize & 0x3f) / 8 - 1; + if (id < 0) { + id += 1; + code[0] = 0x00000000; // cf issue delay "instruction" + code[1] = 0x08000000; + code += 2; + codeSize += 8; + } + uint32_t *data = code - (id * 2 + 2); + + switch (id) { + case 0: data[0] |= insn->sched << 2; break; + case 1: data[0] |= insn->sched << 10; break; + case 2: data[0] |= insn->sched << 18; break; + case 3: data[0] |= insn->sched << 26; data[1] |= insn->sched >> 6; break; + case 4: data[1] |= insn->sched << 2; + case 5: data[1] |= insn->sched << 10; break; + case 6: data[1] |= insn->sched << 18; break; + default: + assert(0); + break; + } + } + + // assert that instructions with multiple defs don't corrupt registers + for (int d = 0; insn->defExists(d); ++d) + assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0); + + switch (insn->op) { + case OP_MOV: + case OP_RDSV: + emitMOV(insn); + break; + case OP_NOP: + break; + case OP_LOAD: + emitLOAD(insn); + break; + case OP_STORE: + emitSTORE(insn); + break; + case OP_LINTERP: + case OP_PINTERP: + emitINTERP(insn); + break; + case OP_VFETCH: + emitVFETCH(insn); + break; + case OP_EXPORT: + emitEXPORT(insn); + break; + case OP_PFETCH: + emitPFETCH(insn); + break; + case OP_EMIT: + case OP_RESTART: + emitOUT(insn); + break; + case OP_ADD: + case OP_SUB: + if (isFloatType(insn->dType)) + emitFADD(insn); + else + emitUADD(insn); + break; + case OP_MUL: + if 
(isFloatType(insn->dType)) + emitFMUL(insn); + else + emitIMUL(insn); + break; + case OP_MAD: + case OP_FMA: + if (isFloatType(insn->dType)) + emitFMAD(insn); + else + emitIMAD(insn); + break; + case OP_SAD: + emitISAD(insn); + break; + case OP_NOT: + emitNOT(insn); + break; + case OP_AND: + emitLogicOp(insn, 0); + break; + case OP_OR: + emitLogicOp(insn, 1); + break; + case OP_XOR: + emitLogicOp(insn, 2); + break; + case OP_SHL: + case OP_SHR: + emitShift(insn); + break; + case OP_SET: + case OP_SET_AND: + case OP_SET_OR: + case OP_SET_XOR: + emitSET(insn->asCmp()); + break; + case OP_SELP: + emitSELP(insn); + break; + case OP_SLCT: + emitSLCT(insn->asCmp()); + break; + case OP_MIN: + case OP_MAX: + emitMINMAX(insn); + break; + case OP_ABS: + case OP_NEG: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_CVT: + case OP_SAT: + emitCVT(insn); + break; + case OP_RSQ: + emitSFnOp(insn, 5); + break; + case OP_RCP: + emitSFnOp(insn, 4); + break; + case OP_LG2: + emitSFnOp(insn, 3); + break; + case OP_EX2: + emitSFnOp(insn, 2); + break; + case OP_SIN: + emitSFnOp(insn, 1); + break; + case OP_COS: + emitSFnOp(insn, 0); + break; + case OP_PRESIN: + case OP_PREEX2: + emitPreOp(insn); + break; + case OP_TEX: + case OP_TXB: + case OP_TXL: + case OP_TXD: + case OP_TXF: + emitTEX(insn->asTex()); + break; + case OP_TXQ: + emitTXQ(insn->asTex()); + break; + case OP_TEXBAR: + emitTEXBAR(insn); + break; + case OP_BRA: + case OP_CALL: + case OP_PRERET: + case OP_RET: + case OP_DISCARD: + case OP_EXIT: + case OP_PRECONT: + case OP_CONT: + case OP_PREBREAK: + case OP_BREAK: + case OP_JOINAT: + case OP_BRKPT: + case OP_QUADON: + case OP_QUADPOP: + emitFlow(insn); + break; + case OP_QUADOP: + emitQUADOP(insn, insn->subOp, insn->lanes); + break; + case OP_DFDX: + emitQUADOP(insn, insn->src(0).mod.neg() ? 0x66 : 0x99, 0x4); + break; + case OP_DFDY: + emitQUADOP(insn, insn->src(0).mod.neg() ? 
0x5a : 0xa5, 0x5); + break; + case OP_POPCNT: + emitPOPC(insn); + break; + case OP_JOIN: + emitNOP(insn); + insn->join = 1; + break; + case OP_PHI: + case OP_UNION: + case OP_CONSTRAINT: + ERROR("operation should have been eliminated"); + return false; + case OP_EXP: + case OP_LOG: + case OP_SQRT: + case OP_POW: + ERROR("operation should have been lowered\n"); + return false; + default: + ERROR("unknow op\n"); + return false; + } + + if (insn->join) + code[0] |= 1 << 22; + + code += 2; + codeSize += 8; + return true; +} + +uint32_t +CodeEmitterGK110::getMinEncodingSize(const Instruction *i) const +{ + // No more short instruction encodings. + return 8; +} + +void +CodeEmitterGK110::prepareEmission(Function *func) +{ + const Target *targ = func->getProgram()->getTarget(); + + CodeEmitter::prepareEmission(func); + + if (targ->hasSWSched) + calculateSchedDataNVC0(targ, func); +} + +CodeEmitterGK110::CodeEmitterGK110(const TargetNVC0 *target) + : CodeEmitter(target), + targNVC0(target), + writeIssueDelays(target->hasSWSched) +{ + code = NULL; + codeSize = codeSizeLimit = 0; + relocInfo = NULL; +} + +CodeEmitter * +TargetNVC0::createCodeEmitterGK110(Program::Type type) +{ + CodeEmitterGK110 *emit = new CodeEmitterGK110(this); + emit->setProgramType(type); + return emit; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp new file mode 100644 index 0000000..3eca27d --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp @@ -0,0 +1,1962 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit 
persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_target_nv50.h"

namespace nv50_ir {

// source-operand encoding variants, see setSrcFileBits()
#define NV50_OP_ENC_LONG 0
#define NV50_OP_ENC_SHORT 1
#define NV50_OP_ENC_IMM 2
#define NV50_OP_ENC_LONG_ALT 3

// Machine-code emitter for the NV50 (Tesla) ISA.  Instructions are encoded
// into code[0]/code[1]; bit positions >= 32 address the second word.
class CodeEmitterNV50 : public CodeEmitter
{
public:
   CodeEmitterNV50(const TargetNV50 *);

   virtual bool emitInstruction(Instruction *);

   virtual uint32_t getMinEncodingSize(const Instruction *) const;

   inline void setProgramType(Program::Type pType) { progType = pType; }

   virtual void prepareEmission(Function *);

private:
   Program::Type progType;

   const TargetNV50 *targNV50;

private:
   // register-id / address-field setters
   inline void defId(const ValueDef&, const int pos);
   inline void srcId(const ValueRef&, const int pos);
   inline void srcId(const ValueRef *, const int pos);

   inline void srcAddr16(const ValueRef&, bool adj, const int pos);
   inline void srcAddr8(const ValueRef&, const int pos);

   // condition-code ($cX) read/write fields
   void emitFlagsRd(const Instruction *);
   void emitFlagsWr(const Instruction *);

   void emitCondCode(CondCode cc, DataType ty, int pos);

   inline void setARegBits(unsigned int);

   void setAReg16(const Instruction *, int s);
   void setImmediate(const Instruction *, int s);

   void setDst(const Value *);
   void setDst(const Instruction *, int d);
   void setSrcFileBits(const Instruction *, int enc);
   void setSrc(const Instruction *, unsigned int s, int slot);

   // shared encoding skeletons used by the per-op emitters below
   void emitForm_MAD(const Instruction *);
   void emitForm_ADD(const Instruction *);
   void emitForm_MUL(const Instruction *);
   void emitForm_IMM(const Instruction *);

   void emitLoadStoreSizeLG(DataType ty, int pos);
   void emitLoadStoreSizeCS(DataType ty);

   void roundMode_MAD(const Instruction *);
   void roundMode_CVT(RoundMode);

   void emitMNeg12(const Instruction *);

   void emitLOAD(const Instruction *);
   void emitSTORE(const Instruction *);
   void emitMOV(const Instruction *);
   void emitNOP();
   void emitINTERP(const Instruction *);
   void emitPFETCH(const Instruction *);
   void emitOUT(const Instruction *);

   void emitUADD(const Instruction *);
   void emitAADD(const Instruction *);
   void emitFADD(const Instruction *);
   void emitIMUL(const Instruction *);
   void emitFMUL(const Instruction *);
   void emitFMAD(const Instruction *);
   void emitIMAD(const Instruction *);
   void emitISAD(const Instruction *);

   void emitMINMAX(const Instruction *);

   void emitPreOp(const Instruction *);
   void emitSFnOp(const Instruction *, uint8_t subOp);

   void emitShift(const Instruction *);
   void emitARL(const Instruction *, unsigned int shl);
   void emitLogicOp(const Instruction *);
   void emitNOT(const Instruction *);

   void emitCVT(const Instruction *);
   void emitSET(const Instruction *);

   void emitTEX(const TexInstruction *);
   void emitTXQ(const TexInstruction *);
   void emitTEXPREP(const TexInstruction *);

   void emitQUADOP(const Instruction *, uint8_t lane, uint8_t quOp);

   void emitFlow(const Instruction *, uint8_t flowOp);
   void emitPRERETEmu(const FlowInstruction *);
   void emitBAR(const Instruction *);

   void emitATOM(const Instruction *);
};

// storage of the (coalesced) representative of a ref/def
#define SDATA(a) ((a).rep()->reg.data)
#define DDATA(a) ((a).rep()->reg.data)

void CodeEmitterNV50::srcId(const ValueRef& src, const int pos)
{
   assert(src.get());
   code[pos / 32] |= SDATA(src).id << (pos % 32);
}

void CodeEmitterNV50::srcId(const ValueRef *src, const int pos)
{
   assert(src->get());
   code[pos / 32] |= SDATA(*src).id << (pos % 32);
}

// Insert a signed 16-bit address; with `adj` the byte offset is first
// divided by the register size (for units addressing in elements).
void CodeEmitterNV50::srcAddr16(const ValueRef& src, bool adj, const int pos)
{
   assert(src.get());

   int32_t offset = SDATA(src).offset;

   assert(!adj || src.get()->reg.size <= 4);
   if (adj)
      offset /= src.get()->reg.size;

   assert(offset <= 0x7fff && offset >= (int32_t)-0x8000 && (pos % 32) <= 16);

   // clamp negative offsets to the (possibly narrowed) field width
   if (offset < 0)
      offset &= adj ? (0xffff >> (src.get()->reg.size >> 1)) : 0xffff;

   code[pos / 32] |= offset << (pos % 32);
}

// Insert a word-aligned 8-bit address field (offset / 4).
void CodeEmitterNV50::srcAddr8(const ValueRef& src, const int pos)
{
   assert(src.get());

   uint32_t offset = SDATA(src).offset;

   assert((offset <= 0x1fc || offset == 0x3fc) && !(offset & 0x3));

   code[pos / 32] |= (offset >> 2) << (pos % 32);
}

void CodeEmitterNV50::defId(const ValueDef& def, const int pos)
{
   assert(def.get() && def.getFile() != FILE_SHADER_OUTPUT);

   code[pos / 32] |= DDATA(def).id << (pos % 32);
}

// Rounding-mode bits (22:23 of code[1]) for MAD-form float ops.
void
CodeEmitterNV50::roundMode_MAD(const Instruction *insn)
{
   switch (insn->rnd) {
   case ROUND_M: code[1] |= 1 << 22; break;
   case ROUND_P: code[1] |= 2 << 22; break;
   case ROUND_Z: code[1] |= 3 << 22; break;
   default:
      assert(insn->rnd == ROUND_N);
      break;
   }
}

// Source negation bits at code[1] bits 26/27.
void
CodeEmitterNV50::emitMNeg12(const Instruction *i)
{
   code[1] |= i->src(0).mod.neg() << 26;
   code[1] |= i->src(1).mod.neg() << 27;
}

void CodeEmitterNV50::emitCondCode(CondCode cc, DataType ty, int pos)
{
   uint8_t enc;

   assert(pos >= 32 || pos <= 27);

   switch (cc) {
   case CC_LT: enc = 0x1; break;
   case CC_LTU: enc = 0x9; break;
   case CC_EQ: enc = 0x2; break;
   case CC_EQU: enc = 0xa; break;
   case CC_LE: enc = 0x3; break;
   case CC_LEU: enc = 0xb; break;
   case CC_GT: enc = 0x4; break;
   case CC_GTU: enc = 0xc; break;
   case CC_NE: enc = 0x5; break;
   case CC_NEU: enc = 0xd; break;
   case CC_GE: enc = 0x6; break;
   case CC_GEU: enc = 0xe; break;
   case CC_TR: enc = 0xf; break;
   case CC_FL: enc = 0x0; break;

   case CC_O: enc = 0x10; break;
   case CC_C: enc = 0x11; break;
   case CC_A: enc = 0x12; break;
   case CC_S: enc = 0x13; break;
   case CC_NS: enc = 0x1c; break;
   case CC_NA: enc = 0x1d; break;
   case CC_NC: enc = 0x1e; break;
   case CC_NO: enc = 0x1f; break;

   default:
      enc = 0;
      assert(!"invalid condition code");
      break;
   }
   if (ty != TYPE_NONE && !isFloatType(ty))
      enc &= ~0x8; // unordered only exists for float types

   code[pos / 32] |= enc << (pos % 32);
}

// Read of a $cX condition register: either a real predicate/flags source,
// or the "always" encoding (0x0780) when none is present.
void
CodeEmitterNV50::emitFlagsRd(const Instruction *i)
{
   int s = (i->flagsSrc >= 0) ? i->flagsSrc : i->predSrc;

   assert(!(code[1] & 0x00003f80));

   if (s >= 0) {
      assert(i->getSrc(s)->reg.file == FILE_FLAGS);
      emitCondCode(i->cc, TYPE_NONE, 32 + 7);
      srcId(i->src(s), 32 + 12);
   } else {
      code[1] |= 0x0780;
   }
}

// Write of a $cX condition register, if the instruction defines one.
void
CodeEmitterNV50::emitFlagsWr(const Instruction *i)
{
   assert(!(code[1] & 0x70));

   int flagsDef = i->flagsDef;

   // find flags definition and check that it is the last def
   if (flagsDef < 0) {
      for (int d = 0; i->defExists(d); ++d)
         if (i->def(d).getFile() == FILE_FLAGS)
            flagsDef = d;
      if (flagsDef >= 0 && 0) // TODO: enforce use of flagsDef at some point
         WARN("Instruction::flagsDef was not set properly\n");
   }
   if (flagsDef == 0 && i->defExists(1))
      WARN("flags def should not be the primary definition\n");

   if (flagsDef >= 0)
      code[1] |= (DDATA(i->def(flagsDef)).id << 4) | 0x40;

}

// Address register ($aX) selector; 0 means "no address register".
void
CodeEmitterNV50::setARegBits(unsigned int u)
{
   code[0] |= (u & 3) << 26;
   code[1] |= (u & 4);
}

// Set the address register used by source s's indirect access, if any.
void
CodeEmitterNV50::setAReg16(const Instruction *i, int s)
{
   if (i->srcExists(s)) {
      s = i->src(s).indirect[0];
      if (s >= 0)
         setARegBits(SDATA(i->src(s)).id + 1);
   }
}

// Place a 32-bit immediate: low 6 bits in code[0], the rest in code[1].
void
CodeEmitterNV50::setImmediate(const Instruction *i, int s)
{
   const ImmediateValue *imm = i->src(s).get()->asImm();
   assert(imm);

   uint32_t u = imm->reg.data.u32;

   if (i->src(s).mod & Modifier(NV50_IR_MOD_NOT))
      u = ~u;

   code[1] |= 3;
   code[0] |= (u & 0x3f) << 16;
   code[1] |= (u >> 6) << 2;
}

void
CodeEmitterNV50::setDst(const Value *dst)
{
   const Storage *reg = &dst->join->reg;

   assert(reg->file != FILE_ADDRESS);

   if (reg->data.id < 0 || reg->file == FILE_FLAGS) {
      // unallocated / flags-only result -> bit bucket ($r127 + out bit)
      code[0] |= (127 << 2) | 1;
      code[1] |= 8;
   } else {
      int id;
      if (reg->file == FILE_SHADER_OUTPUT) {
         code[1] |= 8;
         id = reg->data.offset / 4;
      } else {
         id = reg->data.id;
      }
      code[0] |= id << 2;
   }
}

void
CodeEmitterNV50::setDst(const Instruction *i, int d)
{
   if (i->defExists(d)) {
      setDst(i->getDef(d));
   } else
   if (!d) {
      code[0] |= 0x01fc; // bit bucket
      code[1] |= 0x0008;
   }
}

// 3 * 2 bits:
// 0: r
// 1: a/s
// 2: c
// 3: i
void
CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
{
   uint8_t mode = 0;

   for (unsigned int s = 0; s < Target::operationSrcNr[i->op]; ++s) {
      switch (i->src(s).getFile()) {
      case FILE_GPR:
         break;
      case FILE_MEMORY_SHARED:
      case FILE_SHADER_INPUT:
         mode |= 1 << (s * 2);
         break;
      case FILE_MEMORY_CONST:
         mode |= 2 << (s * 2);
         break;
      case FILE_IMMEDIATE:
         mode |= 3 << (s * 2);
         break;
      default:
         ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
         assert(0);
         break;
      }
   }
   switch (mode) {
   case 0x00: // rrr
      break;
   case 0x01: // arr/grr
      if (progType == Program::TYPE_GEOMETRY) {
         code[0] |= 0x01800000;
         if (enc == NV50_OP_ENC_LONG || enc == NV50_OP_ENC_LONG_ALT)
            code[1] |= 0x00200000;
      } else {
         if (enc == NV50_OP_ENC_SHORT)
            code[0] |= 0x01000000;
         else
            code[1] |= 0x00200000;
      }
      break;
   case 0x03: // irr
      assert(i->op == OP_MOV);
      return;
   case 0x0c: // rir
      break;
   case 0x0d: // gir
      code[0] |= 0x01000000;
      assert(progType == Program::TYPE_GEOMETRY ||
             progType == Program::TYPE_COMPUTE);
      break;
   case 0x08: // rcr
      code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000;
      code[1] |= (i->getSrc(1)->reg.fileIndex << 22);
      break;
   case 0x09: // acr/gcr
      if (progType == Program::TYPE_GEOMETRY) {
         code[0] |= 0x01800000;
      } else {
         code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000;
         code[1] |= 0x00200000;
      }
      code[1] |= (i->getSrc(1)->reg.fileIndex << 22);
      break;
   case 0x20: // rrc
      code[0] |= 0x01000000;
      code[1] |= (i->getSrc(2)->reg.fileIndex << 22);
      break;
   case 0x21: // arc
      code[0] |= 0x01000000;
      code[1] |= 0x00200000 | (i->getSrc(2)->reg.fileIndex << 22);
      assert(progType != Program::TYPE_GEOMETRY);
      break;
   default:
      ERROR("not encodable: %x\n", mode);
      assert(0);
      break;
   }
   if (progType != Program::TYPE_COMPUTE)
      return;

   // compute programs additionally encode the s[]-access data type
   if ((mode & 3) == 1) {
      const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;

      switch (i->getSrc(0)->reg.type) {
      case TYPE_U8:
         break;
      case TYPE_U16:
         code[0] |= 1 << pos;
         break;
      case TYPE_S16:
         code[0] |= 2 << pos;
         break;
      default:
         code[0] |= 3 << pos;
         assert(i->getSrc(0)->reg.size == 4);
         break;
      }
   }
}

// Place source s's register id / memory offset into operand slot 0..2.
void
CodeEmitterNV50::setSrc(const Instruction *i, unsigned int s, int slot)
{
   if (Target::operationSrcNr[i->op] <= s)
      return;
   const Storage *reg = &i->src(s).rep()->reg;

   unsigned int id = (reg->file == FILE_GPR) ?
      reg->data.id :
      reg->data.offset >> (reg->size >> 1); // no > 4 byte sources here

   switch (slot) {
   case 0: code[0] |= id << 9; break;
   case 1: code[0] |= id << 16; break;
   case 2: code[1] |= id << 14; break;
   default:
      assert(0);
      break;
   }
}

// the default form:
// - long instruction
// - 1 to 3 sources in slots 0, 1, 2 (rrr, arr, rcr, acr, rrc, arc, gcr, grr)
// - address & flags
void
CodeEmitterNV50::emitForm_MAD(const Instruction *i)
{
   assert(i->encSize == 8);
   code[0] |= 1;

   emitFlagsRd(i);
   emitFlagsWr(i);

   setDst(i, 0);

   setSrcFileBits(i, NV50_OP_ENC_LONG);
   setSrc(i, 0, 0);
   setSrc(i, 1, 1);
   setSrc(i, 2, 2);

   setAReg16(i, 1);
}

// like default form, but 2nd source in slot 2, and no 3rd source
void
CodeEmitterNV50::emitForm_ADD(const Instruction *i)
{
   assert(i->encSize == 8);
   code[0] |= 1;

   emitFlagsRd(i);
   emitFlagsWr(i);

   setDst(i, 0);

   setSrcFileBits(i, NV50_OP_ENC_LONG_ALT);
   setSrc(i, 0, 0);
   setSrc(i, 1, 2);

   setAReg16(i, 1);
}

// default short form (rr, ar, rc, gr)
void
CodeEmitterNV50::emitForm_MUL(const Instruction *i)
{
   assert(i->encSize == 4 && !(code[0] & 1));
   assert(i->defExists(0));
   assert(!i->getPredicate());

   setDst(i, 0);

   setSrcFileBits(i, NV50_OP_ENC_SHORT);
   setSrc(i, 0, 0);
   setSrc(i, 1, 1);
}

// usual immediate form
// - 1 to 3 sources where last is immediate (rir, gir)
// - no address or predicate possible
void
CodeEmitterNV50::emitForm_IMM(const Instruction *i)
{
   assert(i->encSize == 8);
   code[0] |= 1;

   assert(i->defExists(0) && i->srcExists(0));

   setDst(i, 0);

   setSrcFileBits(i, NV50_OP_ENC_IMM);
   if (Target::operationSrcNr[i->op] > 1) {
      setSrc(i, 0, 0);
      setImmediate(i, 1);
      setSrc(i, 2, 1);
   } else {
      setImmediate(i, 0);
   }
}

// Size/type field for local/global loads and stores.
void
CodeEmitterNV50::emitLoadStoreSizeLG(DataType ty, int pos)
{
   uint8_t enc;

   switch (ty) {
   case TYPE_F32: // fall through
   case TYPE_S32: // fall through
   case TYPE_U32: enc
= 0x6; break;
   case TYPE_B128: enc = 0x5; break;
   case TYPE_F64: // fall through
   case TYPE_S64: // fall through
   case TYPE_U64: enc = 0x4; break;
   case TYPE_S16: enc = 0x3; break;
   case TYPE_U16: enc = 0x2; break;
   case TYPE_S8: enc = 0x1; break;
   case TYPE_U8: enc = 0x0; break;
   default:
      enc = 0;
      assert(!"invalid load/store type");
      break;
   }
   code[pos / 32] |= enc << (pos % 32);
}

// Size/type field for const/shared accesses (bits 14:15 of code[1]).
void
CodeEmitterNV50::emitLoadStoreSizeCS(DataType ty)
{
   switch (ty) {
   case TYPE_U8: break;
   case TYPE_U16: code[1] |= 0x4000; break;
   case TYPE_S16: code[1] |= 0x8000; break;
   case TYPE_F32:
   case TYPE_S32:
   case TYPE_U32: code[1] |= 0xc000; break;
   default:
      assert(0);
      break;
   }
}

// Load def(0) from an input / shared / const / local / global location.
void
CodeEmitterNV50::emitLOAD(const Instruction *i)
{
   DataFile sf = i->src(0).getFile();
   int32_t offset = i->getSrc(0)->reg.data.offset;

   switch (sf) {
   case FILE_SHADER_INPUT:
      // use 'mov' where we can
      code[0] = i->src(0).isIndirect(0) ? 0x00000001 : 0x10000001;
      code[1] = 0x00200000 | (i->lanes << 14);
      if (typeSizeof(i->dType) == 4)
         code[1] |= 0x04000000;
      break;
   case FILE_MEMORY_SHARED:
      if (targ->getChipset() >= 0x84) {
         // NV84+ has a dedicated s[] load encoding
         assert(offset <= (int32_t)(0x3fff * typeSizeof(i->sType)));
         code[0] = 0x10000001;
         code[1] = 0x40000000;

         if (typeSizeof(i->dType) == 4)
            code[1] |= 0x04000000;

         emitLoadStoreSizeCS(i->sType);
      } else {
         assert(offset <= (int32_t)(0x1f * typeSizeof(i->sType)));
         code[0] = 0x10000001;
         code[1] = 0x00200000 | (i->lanes << 14);
         emitLoadStoreSizeCS(i->sType);
      }
      break;
   case FILE_MEMORY_CONST:
      code[0] = 0x10000001;
      code[1] = 0x20000000 | (i->getSrc(0)->reg.fileIndex << 22);
      if (typeSizeof(i->dType) == 4)
         code[1] |= 0x04000000;
      emitLoadStoreSizeCS(i->sType);
      break;
   case FILE_MEMORY_LOCAL:
      code[0] = 0xd0000001;
      code[1] = 0x40000000;
      break;
   case FILE_MEMORY_GLOBAL:
      code[0] = 0xd0000001 | (i->getSrc(0)->reg.fileIndex << 16);
      code[1] = 0x80000000;
      break;
   default:
      assert(!"invalid load source file");
      break;
   }
   if (sf == FILE_MEMORY_LOCAL ||
       sf == FILE_MEMORY_GLOBAL)
      emitLoadStoreSizeLG(i->sType, 21 + 32);

   setDst(i, 0);

   emitFlagsRd(i);
   emitFlagsWr(i);

   if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
      srcId(*i->src(0).getIndirect(0), 9);    // global: address from register
   } else {
      setAReg16(i, 0);
      srcAddr16(i->src(0), i->src(0).getFile() != FILE_MEMORY_LOCAL, 9);
   }
}

// Store src(1) to an output / global / local / shared location given by
// src(0).
void
CodeEmitterNV50::emitSTORE(const Instruction *i)
{
   DataFile f = i->getSrc(0)->reg.file;
   int32_t offset = i->getSrc(0)->reg.data.offset;

   switch (f) {
   case FILE_SHADER_OUTPUT:
      code[0] = 0x00000001 | ((offset >> 2) << 9);
      code[1] = 0x80c00000;
      srcId(i->src(1), 32 + 14);
      break;
   case FILE_MEMORY_GLOBAL:
      code[0] = 0xd0000001 | (i->getSrc(0)->reg.fileIndex << 16);
      code[1] = 0xa0000000;
      emitLoadStoreSizeLG(i->dType, 21 + 32);
      srcId(i->src(1), 2);
      break;
   case FILE_MEMORY_LOCAL:
      code[0] = 0xd0000001;
      code[1] = 0x60000000;
      emitLoadStoreSizeLG(i->dType, 21 + 32);
      srcId(i->src(1), 2);
      break;
   case FILE_MEMORY_SHARED:
      code[0] = 0x00000001;
      code[1] = 0xe0000000;
      // shared stores scale the offset by the access size
      switch (typeSizeof(i->dType)) {
      case 1:
         code[0] |= offset << 9;
         code[1] |= 0x00400000;
         break;
      case 2:
         code[0] |= (offset >> 1) << 9;
         break;
      case 4:
         code[0] |= (offset >> 2) << 9;
         code[1] |= 0x04200000;
         break;
      default:
         assert(0);
         break;
      }
      srcId(i->src(1), 32 + 14);
      break;
   default:
      assert(!"invalid store destination file");
      break;
   }

   if (f == FILE_MEMORY_GLOBAL)
      srcId(*i->src(0).getIndirect(0), 9);
   else
      setAReg16(i, 0);

   if (f == FILE_MEMORY_LOCAL)
      srcAddr16(i->src(0), false, 9);

   emitFlagsRd(i);
}

// MOV between GPRs, flags, address regs, immediates and shader outputs.
void
CodeEmitterNV50::emitMOV(const Instruction *i)
{
   DataFile sf = i->getSrc(0)->reg.file;
   DataFile df = i->getDef(0)->reg.file;

   assert(sf == FILE_GPR || df == FILE_GPR);

   if (sf == FILE_FLAGS) {
      code[0] = 0x00000001;
      code[1] = 0x20000000;
      defId(i->def(0), 2);
      srcId(i->src(0), 12);
      emitFlagsRd(i);
   } else
   if (sf == FILE_ADDRESS) {
      code[0] = 0x00000001;
      code[1] = 0x40000000;
      defId(i->def(0), 2);
      setARegBits(SDATA(i->src(0)).id + 1);
      emitFlagsRd(i);
   } else
   if (df == FILE_FLAGS) {
      code[0] = 0x00000001;
      code[1] = 0xa0000000;
      defId(i->def(0), 4);
      srcId(i->src(0), 9);
      emitFlagsRd(i);
   } else
   if (sf == FILE_IMMEDIATE) {
      code[0] = 0x10008001;
      code[1] = 0x00000003;
      emitForm_IMM(i);
   } else {
      if (i->encSize == 4) {
         code[0] = 0x10008000;
      } else {
         code[0] = 0x10000001;
         code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
         code[1] |= (i->lanes << 14);
         emitFlagsRd(i);
      }
      defId(i->def(0), 2);
      srcId(i->src(0), 9);
   }
   if (df == FILE_SHADER_OUTPUT) {
      assert(i->encSize == 8);
      code[1] |= 0x8;
   }
}

void
CodeEmitterNV50::emitNOP()
{
   code[0] = 0xf0000001;
   code[1] = 0xe0000000;
}

// Cross-quad operation; `lane` selects participating lanes, `quOp` the
// per-lane add/sub pattern.
void
CodeEmitterNV50::emitQUADOP(const Instruction *i, uint8_t lane, uint8_t quOp)
{
   code[0] = 0xc0000000 | (lane << 16);
   code[1] = 0x80000000;

   code[0] |= (quOp & 0x03) << 20;
   code[1] |= (quOp & 0xfc) << 20;

   emitForm_ADD(i);

   if (!i->srcExists(1))
      srcId(i->src(0), 32 + 14);   // duplicate src0 into the 2nd slot
}

// Fetch from the geometry primitive input map.
void
CodeEmitterNV50::emitPFETCH(const Instruction *i)
{
   code[0] = 0x11800001;
   code[1] = 0x04200000 | (0xf << 14);

   defId(i->def(0), 2);
   srcAddr8(i->src(0), 9);
   setAReg16(i, 0);
}

// Fragment input interpolation (flat / linear / perspective / centroid).
void
CodeEmitterNV50::emitINTERP(const Instruction *i)
{
   code[0] = 0x80000000;

   defId(i->def(0), 2);
   srcAddr8(i->src(0), 16);

   if (i->getInterpMode() == NV50_IR_INTERP_FLAT) {
      code[0] |= 1 << 8;
   } else {
      if (i->op == OP_PINTERP) {
         code[0] |= 1 << 25;
         srcId(i->src(1), 9);   // 1/w for perspective correction
      }
      if (i->getSampleMode() == NV50_IR_INTERP_CENTROID)
         code[0] |= 1 << 24;
   }

   if (i->encSize == 8) {
      // long form: relocate the mode bits into code[1]
      code[1] =
         (code[0] & (3 << 24)) >> (24 - 16) |
         (code[0] & (1 << 8)) << (18 - 8);
      code[0] &= ~0x03000100;
      code[0] |= 1;
      emitFlagsRd(i);
   }
}

void
CodeEmitterNV50::emitMINMAX(const Instruction *i)
{
   if (i->dType == TYPE_F64) {
      code[0] = 0xe0000000;
      code[1] = (i->op == OP_MIN) ? 0xa0000000 : 0xc0000000;
   } else {
      code[0] = 0x30000000;
      code[1] = 0x80000000;
      if (i->op == OP_MIN)
         code[1] |= 0x20000000;

      switch (i->dType) {
      case TYPE_F32: code[0] |= 0x80000000; break;
      case TYPE_S32: code[1] |= 0x8c000000; break;
      case TYPE_U32: code[1] |= 0x84000000; break;
      case TYPE_S16: code[1] |= 0x80000000; break;
      case TYPE_U16: break;
      default:
         assert(0);
         break;
      }
      code[1] |= i->src(0).mod.abs() << 20;
      code[1] |= i->src(1).mod.abs() << 19;
   }
   emitForm_MAD(i);
}

void
CodeEmitterNV50::emitFMAD(const Instruction *i)
{
   // negation of the product is src0.neg XOR src1.neg
   const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
   const int neg_add = i->src(2).mod.neg();

   code[0] = 0xe0000000;

   if (i->encSize == 4) {
      emitForm_MUL(i);
      assert(!neg_mul && !neg_add);
   } else {
      code[1] = neg_mul << 26;
      code[1] |= neg_add << 27;
      if (i->saturate)
         code[1] |= 1 << 29;
      emitForm_MAD(i);
   }
}

void
CodeEmitterNV50::emitFADD(const Instruction *i)
{
   const int neg0 = i->src(0).mod.neg();
   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);

   code[0] = 0xb0000000;

   assert(!(i->src(0).mod | i->src(1).mod).abs());

   if (i->src(1).getFile() == FILE_IMMEDIATE) {
      code[1] = 0;
      emitForm_IMM(i);
      code[0] |= neg0 << 15;
      code[0] |= neg1 << 22;
      if (i->saturate)
         code[0] |= 1 << 8;
   } else
   if (i->encSize == 8) {
      code[1] = 0;
      emitForm_ADD(i);
      code[1] |= neg0 << 26;
      code[1] |= neg1 << 27;
      if (i->saturate)
         code[1] |= 1 << 29;
   } else {
      emitForm_MUL(i);
      code[0] |= neg0 << 15;
      code[0] |= neg1 << 22;
      if (i->saturate)
         code[0] |= 1 << 8;
   }
}

void
CodeEmitterNV50::emitUADD(const Instruction *i)
{
   const int neg0 = i->src(0).mod.neg();
   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);

   code[0] = 0x20008000;

   if (i->src(1).getFile() == FILE_IMMEDIATE) {
      code[1] = 0;
      emitForm_IMM(i);
   } else
   if (i->encSize == 8) {
      code[0] = 0x20000000;
      code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
      emitForm_ADD(i);
   } else {
      emitForm_MUL(i);
   }
   assert(!(neg0 && neg1));
   code[0] |= neg0 << 28;
   code[0] |= neg1 << 22;

   if (i->flagsSrc >= 0) {
      // addc == sub | subr
      assert(!(code[0] & 0x10400000) && !i->getPredicate());
      code[0] |= 0x10400000;
      srcId(i->src(i->flagsSrc), 32 + 12);
   }
}

// Add to an address register ($aX); also used as a-reg MOV (s == 0).
void
CodeEmitterNV50::emitAADD(const Instruction *i)
{
   const int s = (i->op == OP_MOV) ? 0 : 1;

   code[0] = 0xd0000001 | (i->getSrc(s)->reg.data.u16 << 9);
   code[1] = 0x20000000;

   code[0] |= (DDATA(i->def(0)).id + 1) << 2;

   emitFlagsRd(i);

   if (s && i->srcExists(0))
      setARegBits(SDATA(i->src(0)).id + 1);
}

void
CodeEmitterNV50::emitIMUL(const Instruction *i)
{
   code[0] = 0x40000000;

   if (i->encSize == 8) {
      code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
      emitForm_MAD(i);
   } else {
      if (i->sType == TYPE_S16)
         code[0] |= 0x8100;
      emitForm_MUL(i);
   }
}

void
CodeEmitterNV50::emitFMUL(const Instruction *i)
{
   const int neg = (i->src(0).mod ^ i->src(1).mod).neg();

   code[0] = 0xc0000000;

   if (i->src(1).getFile() == FILE_IMMEDIATE) {
      code[1] = 0;
      emitForm_IMM(i);
      if (neg)
         code[0] |= 0x8000;
   } else
   if (i->encSize == 8) {
      code[1] = i->rnd == ROUND_Z ? 0x0000c000 : 0;
      if (neg)
         code[1] |= 0x08000000;
      emitForm_MAD(i);
   } else {
      emitForm_MUL(i);
      if (neg)
         code[0] |= 0x8000;
   }
}

void
CodeEmitterNV50::emitIMAD(const Instruction *i)
{
   code[0] = 0x60000000;
   if (isSignedType(i->sType))
      code[1] = i->saturate ? 0x40000000 : 0x20000000;
   else
      code[1] = 0x00000000;

   int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg();
   int neg2 = i->src(2).mod.neg();

   assert(!(neg1 & neg2));
   code[1] |= neg1 << 27;
   code[1] |= neg2 << 26;

   emitForm_MAD(i);

   if (i->flagsSrc >= 0) {
      // add with carry from $cX
      assert(!(code[1] & 0x0c000000) && !i->getPredicate());
      code[1] |= 0xc << 24;
      srcId(i->src(i->flagsSrc), 32 + 12);
   }
}

// Sum of absolute differences.
void
CodeEmitterNV50::emitISAD(const Instruction *i)
{
   if (i->encSize == 8) {
      code[0] = 0x50000000;
      switch (i->sType) {
      case TYPE_U32: code[1] = 0x04000000; break;
      case TYPE_S32: code[1] = 0x0c000000; break;
      case TYPE_U16: code[1] = 0x00000000; break;
      case TYPE_S16: code[1] = 0x08000000; break;
      default:
         assert(0);
         break;
      }
      emitForm_MAD(i);
   } else {
      switch (i->sType) {
      case TYPE_U32: code[0] = 0x50008000; break;
      case TYPE_S32: code[0] = 0x50008100; break;
      case TYPE_U16: code[0] = 0x50000000; break;
      case TYPE_S16: code[0] = 0x50000100; break;
      default:
         assert(0);
         break;
      }
      emitForm_MUL(i);
   }
}

void
CodeEmitterNV50::emitSET(const Instruction *i)
{
   code[0] = 0x30000000;
   code[1] = 0x60000000;

   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);

   switch (i->sType) {
   case TYPE_F32: code[0] |= 0x80000000; break;
   case TYPE_S32: code[1] |= 0x0c000000; break;
   case TYPE_U32: code[1] |= 0x04000000; break;
   case TYPE_S16: code[1] |= 0x08000000; break;
   case TYPE_U16: break;
   default:
      assert(0);
      break;
   }
   // NOTE(review): the neg bits below reuse 0x04000000/0x08000000, which
   // also serve as the integer sType bits above — verify these modifier
   // encodings against the hardware documentation.
   if (i->src(0).mod.neg()) code[1] |= 0x04000000;
   if (i->src(1).mod.neg()) code[1] |= 0x08000000;
   if (i->src(0).mod.abs()) code[1] |= 0x00100000;
   if (i->src(1).mod.abs()) code[1] |= 0x00080000;

   emitForm_MAD(i);
}

// Rounding-mode bits for CVT; *_I variants round to integer (f2f).
void
CodeEmitterNV50::roundMode_CVT(RoundMode rnd)
{
   switch (rnd) {
   case ROUND_NI: code[1] |= 0x08000000; break;
   case ROUND_M: code[1] |= 0x00020000; break;
   case ROUND_MI: code[1] |= 0x08020000; break;
   case ROUND_P: code[1] |= 0x00040000; break;
   case ROUND_PI: code[1] |= 0x08040000; break;
   case ROUND_Z: code[1] |= 0x00060000; break;
   case ROUND_ZI: code[1] |= 0x08060000; break;
   default:
      assert(rnd == ROUND_N);
      break;
   }
}

// Type conversion (and ABS/NEG/SAT/CEIL/FLOOR/TRUNC via modifiers).
void
CodeEmitterNV50::emitCVT(const Instruction *i)
{
   const bool f2f = isFloatType(i->dType) && isFloatType(i->sType);
   RoundMode rnd;

   switch (i->op) {
   case OP_CEIL: rnd = f2f ? ROUND_PI : ROUND_P; break;
   case OP_FLOOR: rnd = f2f ? ROUND_MI : ROUND_M; break;
   case OP_TRUNC: rnd = f2f ? ROUND_ZI : ROUND_Z; break;
   default:
      rnd = i->rnd;
      break;
   }

   code[0] = 0xa0000000;

   // dType x sType -> opcode word 1
   switch (i->dType) {
   case TYPE_F64:
      switch (i->sType) {
      case TYPE_F64: code[1] = 0xc4404000; break;
      case TYPE_S64: code[1] = 0x44414000; break;
      case TYPE_U64: code[1] = 0x44404000; break;
      case TYPE_F32: code[1] = 0xc4400000; break;
      case TYPE_S32: code[1] = 0x44410000; break;
      case TYPE_U32: code[1] = 0x44400000; break;
      default:
         assert(0);
         break;
      }
      break;
   case TYPE_S64:
      switch (i->sType) {
      case TYPE_F64: code[1] = 0x8c404000; break;
      case TYPE_F32: code[1] = 0x8c400000; break;
      default:
         assert(0);
         break;
      }
      break;
   case TYPE_U64:
      switch (i->sType) {
      case TYPE_F64: code[1] = 0x84404000; break;
      case TYPE_F32: code[1] = 0x84400000; break;
      default:
         assert(0);
         break;
      }
      break;
   case TYPE_F32:
      switch (i->sType) {
      case TYPE_F64: code[1] = 0xc0404000; break;
      case TYPE_S64: code[1] = 0x40414000; break;
      case TYPE_U64: code[1] = 0x40404000; break;
      case TYPE_F32: code[1] = 0xc4004000; break;
      case TYPE_S32: code[1] = 0x44014000; break;
      case TYPE_U32: code[1] = 0x44004000; break;
      case TYPE_F16: code[1] = 0xc4000000; break;
      default:
         assert(0);
         break;
      }
      break;
   case TYPE_S32:
      switch (i->sType) {
      case TYPE_F64: code[1] = 0x88404000; break;
      case TYPE_F32: code[1] = 0x8c004000; break;
      case TYPE_S32: code[1] = 0x0c014000; break;
      case TYPE_U32: code[1] = 0x0c004000; break;
      case TYPE_F16: code[1] = 0x8c000000; break;
      case TYPE_S16: code[1] = 0x0c010000; break;
      case TYPE_U16: code[1] = 0x0c000000; break;
      case TYPE_S8: code[1] = 0x0c018000; break;
      case TYPE_U8: code[1] = 0x0c008000; break;
      default:
         assert(0);
         break;
      }
      break;
   case TYPE_U32:
      switch (i->sType) {
      case TYPE_F64: code[1] = 0x80404000; break;
      case TYPE_F32: code[1] = 0x84004000; break;
      case TYPE_S32: code[1] = 0x04014000; break;
      case TYPE_U32: code[1] = 0x04004000; break;
      case TYPE_F16: code[1] = 0x84000000; break;
      case TYPE_S16: code[1] = 0x04010000; break;
      case TYPE_U16: code[1] = 0x04000000; break;
      case TYPE_S8: code[1] = 0x04018000; break;
      case TYPE_U8: code[1] = 0x04008000; break;
      default:
         assert(0);
         break;
      }
      break;
   case TYPE_S16:
   case TYPE_U16:
   case TYPE_S8:
   case TYPE_U8:
   default:
      assert(0);
      break;
   }
   if (typeSizeof(i->sType) == 1 && i->getSrc(0)->reg.size == 4)
      code[1] |= 0x00004000;

   roundMode_CVT(rnd);

   switch (i->op) {
   case OP_ABS: code[1] |= 1 << 20; break;
   case OP_SAT: code[1] |= 1 << 19; break;
   case OP_NEG: code[1] |= 1 << 29; break;
   default:
      break;
   }
   // source modifiers combine with the op bits (XOR for double negation)
   code[1] ^= i->src(0).mod.neg() << 29;
   code[1] |= i->src(0).mod.abs() << 20;
   if (i->saturate)
      code[1] |= 1 << 19;

   assert(i->op != OP_ABS || !i->src(0).mod.neg());

   emitForm_MAD(i);
}

// Argument-reduction op for SIN/EX2 (PRESIN/PREEX2).
void
CodeEmitterNV50::emitPreOp(const Instruction *i)
{
   code[0] = 0xb0000000;
   code[1] = (i->op == OP_PREEX2) ?
0xc0004000 : 0xc0000000; + + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(0).mod.neg() << 26; + + emitForm_MAD(i); +} + +void +CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp) +{ + code[0] = 0x90000000; + + if (i->encSize == 4) { + assert(i->op == OP_RCP); + code[0] |= i->src(0).mod.abs() << 15; + code[0] |= i->src(0).mod.neg() << 22; + emitForm_MUL(i); + } else { + code[1] = subOp << 29; + code[1] |= i->src(0).mod.abs() << 20; + code[1] |= i->src(0).mod.neg() << 26; + emitForm_MAD(i); + } +} + +void +CodeEmitterNV50::emitNOT(const Instruction *i) +{ + code[0] = 0xd0000000; + code[1] = 0x0002c000; + + switch (i->sType) { + case TYPE_U32: + case TYPE_S32: + code[1] |= 0x04000000; + break; + default: + break; + } + emitForm_MAD(i); + setSrc(i, 0, 1); +} + +void +CodeEmitterNV50::emitLogicOp(const Instruction *i) +{ + code[0] = 0xd0000000; + code[1] = 0; + + if (i->src(1).getFile() == FILE_IMMEDIATE) { + switch (i->op) { + case OP_OR: code[0] |= 0x0100; break; + case OP_XOR: code[0] |= 0x8000; break; + default: + assert(i->op == OP_AND); + break; + } + if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) + code[0] |= 1 << 22; + + emitForm_IMM(i); + } else { + switch (i->op) { + case OP_AND: code[1] = 0x04000000; break; + case OP_OR: code[1] = 0x04004000; break; + case OP_XOR: code[1] = 0x04008000; break; + default: + assert(0); + break; + } + if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 16; + if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 17; + + emitForm_MAD(i); + } +} + +void +CodeEmitterNV50::emitARL(const Instruction *i, unsigned int shl) +{ + code[0] = 0x00000001 | (shl << 16); + code[1] = 0xc0000000; + + code[0] |= (DDATA(i->def(0)).id + 1) << 2; + + setSrcFileBits(i, NV50_OP_ENC_IMM); + setSrc(i, 0, 0); + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitShift(const Instruction *i) +{ + if (i->def(0).getFile() == FILE_ADDRESS) { + assert(i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE); + 
emitARL(i, i->getSrc(1)->reg.data.u32 & 0x3f); + } else { + code[0] = 0x30000001; + code[1] = (i->op == OP_SHR) ? 0xe4000000 : 0xc4000000; + if (i->op == OP_SHR && isSignedType(i->sType)) + code[1] |= 1 << 27; + + if (i->src(1).getFile() == FILE_IMMEDIATE) { + code[1] |= 1 << 20; + code[0] |= (i->getSrc(1)->reg.data.u32 & 0x7f) << 16; + defId(i->def(0), 2); + srcId(i->src(0), 9); + emitFlagsRd(i); + } else { + emitForm_MAD(i); + } + } +} + +void +CodeEmitterNV50::emitOUT(const Instruction *i) +{ + code[0] = (i->op == OP_EMIT) ? 0xf0000200 : 0xf0000400; + code[1] = 0xc0000001; + + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitTEX(const TexInstruction *i) +{ + code[0] = 0xf0000001; + code[1] = 0x00000000; + + switch (i->op) { + case OP_TXB: + code[1] = 0x20000000; + break; + case OP_TXL: + code[1] = 0x40000000; + break; + case OP_TXF: + code[0] |= 0x01000000; + break; + case OP_TXG: + code[0] = 0x01000000; + code[1] = 0x80000000; + break; + default: + assert(i->op == OP_TEX); + break; + } + + code[0] |= i->tex.r << 9; + code[0] |= i->tex.s << 17; + + int argc = i->tex.target.getArgCount(); + + if (i->op == OP_TXB || i->op == OP_TXL || i->op == OP_TXF) + argc += 1; + if (i->tex.target.isShadow()) + argc += 1; + assert(argc <= 4); + + code[0] |= (argc - 1) << 22; + + if (i->tex.target.isCube()) { + code[0] |= 0x08000000; + } else + if (i->tex.useOffsets) { + code[1] |= (i->tex.offset[0][0] & 0xf) << 24; + code[1] |= (i->tex.offset[0][1] & 0xf) << 20; + code[1] |= (i->tex.offset[0][2] & 0xf) << 16; + } + + code[0] |= (i->tex.mask & 0x3) << 25; + code[1] |= (i->tex.mask & 0xc) << 12; + + if (i->tex.liveOnly) + code[1] |= 4; + + defId(i->def(0), 2); + + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitTXQ(const TexInstruction *i) +{ + assert(i->tex.query == TXQ_DIMS); + + code[0] = 0xf0000001; + code[1] = 0x60000000; + + code[0] |= i->tex.r << 9; + code[0] |= i->tex.s << 17; + + code[0] |= (i->tex.mask & 0x3) << 25; + code[1] |= (i->tex.mask & 0xc) << 12; + + 
defId(i->def(0), 2); + + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitTEXPREP(const TexInstruction *i) +{ + code[0] = 0xf8000001 | (3 << 22) | (i->tex.s << 17) | (i->tex.r << 9); + code[1] = 0x60010000; + + code[0] |= (i->tex.mask & 0x3) << 25; + code[1] |= (i->tex.mask & 0xc) << 12; + defId(i->def(0), 2); + + emitFlagsRd(i); +} + +void +CodeEmitterNV50::emitPRERETEmu(const FlowInstruction *i) +{ + uint32_t pos = i->target.bb->binPos + 8; // +8 to skip an op */ + + code[0] = 0x10000003; // bra + code[1] = 0x00000780; // always + + switch (i->subOp) { + case NV50_IR_SUBOP_EMU_PRERET + 0: // bra to the call + break; + case NV50_IR_SUBOP_EMU_PRERET + 1: // bra to skip the call + pos += 8; + break; + default: + assert(i->subOp == (NV50_IR_SUBOP_EMU_PRERET + 2)); + code[0] = 0x20000003; // call + code[1] = 0x00000000; // no predicate + break; + } + addReloc(RelocEntry::TYPE_CODE, 0, pos, 0x07fff800, 9); + addReloc(RelocEntry::TYPE_CODE, 1, pos, 0x000fc000, -4); +} + +void +CodeEmitterNV50::emitFlow(const Instruction *i, uint8_t flowOp) +{ + const FlowInstruction *f = i->asFlow(); + bool hasPred = false; + bool hasTarg = false; + + code[0] = 0x00000003 | (flowOp << 28); + code[1] = 0x00000000; + + switch (i->op) { + case OP_BRA: + hasPred = true; + hasTarg = true; + break; + case OP_BREAK: + case OP_BRKPT: + case OP_DISCARD: + case OP_RET: + hasPred = true; + break; + case OP_CALL: + case OP_PREBREAK: + case OP_JOINAT: + hasTarg = true; + break; + case OP_PRERET: + hasTarg = true; + if (i->subOp >= NV50_IR_SUBOP_EMU_PRERET) { + emitPRERETEmu(f); + return; + } + break; + default: + break; + } + + if (hasPred) + emitFlagsRd(i); + + if (hasTarg && f) { + uint32_t pos; + + if (f->op == OP_CALL) { + if (f->builtin) { + pos = targNV50->getBuiltinOffset(f->target.builtin); + } else { + pos = f->target.fn->binPos; + } + } else { + pos = f->target.bb->binPos; + } + + code[0] |= ((pos >> 2) & 0xffff) << 11; + code[1] |= ((pos >> 18) & 0x003f) << 14; + + RelocEntry::Type 
relocTy; + + relocTy = f->builtin ? RelocEntry::TYPE_BUILTIN : RelocEntry::TYPE_CODE; + + addReloc(relocTy, 0, pos, 0x07fff800, 9); + addReloc(relocTy, 1, pos, 0x000fc000, -4); + } +} + +void +CodeEmitterNV50::emitBAR(const Instruction *i) +{ + ImmediateValue *barId = i->getSrc(0)->asImm(); + assert(barId); + + code[0] = 0x82000003 | (barId->reg.data.u32 << 21); + code[1] = 0x00004000; + + if (i->subOp == NV50_IR_SUBOP_BAR_SYNC) + code[0] |= 1 << 26; +} + +void +CodeEmitterNV50::emitATOM(const Instruction *i) +{ + uint8_t subOp; + switch (i->subOp) { + case NV50_IR_SUBOP_ATOM_ADD: subOp = 0x0; break; + case NV50_IR_SUBOP_ATOM_MIN: subOp = 0x7; break; + case NV50_IR_SUBOP_ATOM_MAX: subOp = 0x6; break; + case NV50_IR_SUBOP_ATOM_INC: subOp = 0x4; break; + case NV50_IR_SUBOP_ATOM_DEC: subOp = 0x5; break; + case NV50_IR_SUBOP_ATOM_AND: subOp = 0xa; break; + case NV50_IR_SUBOP_ATOM_OR: subOp = 0xb; break; + case NV50_IR_SUBOP_ATOM_XOR: subOp = 0xc; break; + case NV50_IR_SUBOP_ATOM_CAS: subOp = 0x2; break; + case NV50_IR_SUBOP_ATOM_EXCH: subOp = 0x1; break; + default: + assert(!"invalid subop"); + return; + } + code[0] = 0xd0000001; + code[1] = 0xe0c00000 | (subOp << 2); + if (isSignedType(i->dType)) + code[1] |= 1 << 21; + + // args + emitFlagsRd(i); + setDst(i, 0); + setSrc(i, 1, 1); + if (i->subOp == NV50_IR_SUBOP_ATOM_CAS) + setSrc(i, 2, 2); + + // g[] pointer + code[0] |= i->getSrc(0)->reg.fileIndex << 23; + srcId(i->getIndirect(0, 0), 9); +} + +bool +CodeEmitterNV50::emitInstruction(Instruction *insn) +{ + if (!insn->encSize) { + ERROR("skipping unencodable instruction: "); insn->print(); + return false; + } else + if (codeSize + insn->encSize > codeSizeLimit) { + ERROR("code emitter output buffer too small\n"); + return false; + } + + if (insn->bb->getProgram()->dbgFlags & NV50_IR_DEBUG_BASIC) { + INFO("EMIT: "); insn->print(); + } + + switch (insn->op) { + case OP_MOV: + emitMOV(insn); + break; + case OP_EXIT: + case OP_NOP: + case OP_JOIN: + emitNOP(); + break; + 
case OP_VFETCH: + case OP_LOAD: + emitLOAD(insn); + break; + case OP_EXPORT: + case OP_STORE: + emitSTORE(insn); + break; + case OP_PFETCH: + emitPFETCH(insn); + break; + case OP_LINTERP: + case OP_PINTERP: + emitINTERP(insn); + break; + case OP_ADD: + case OP_SUB: + if (isFloatType(insn->dType)) + emitFADD(insn); + else if (insn->getDef(0)->reg.file == FILE_ADDRESS) + emitAADD(insn); + else + emitUADD(insn); + break; + case OP_MUL: + if (isFloatType(insn->dType)) + emitFMUL(insn); + else + emitIMUL(insn); + break; + case OP_MAD: + case OP_FMA: + if (isFloatType(insn->dType)) + emitFMAD(insn); + else + emitIMAD(insn); + break; + case OP_SAD: + emitISAD(insn); + break; + case OP_NOT: + emitNOT(insn); + break; + case OP_AND: + case OP_OR: + case OP_XOR: + emitLogicOp(insn); + break; + case OP_SHL: + case OP_SHR: + emitShift(insn); + break; + case OP_SET: + emitSET(insn); + break; + case OP_MIN: + case OP_MAX: + emitMINMAX(insn); + break; + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_ABS: + case OP_NEG: + case OP_SAT: + emitCVT(insn); + break; + case OP_CVT: + if (insn->def(0).getFile() == FILE_ADDRESS) + emitARL(insn, 0); + else + if (insn->def(0).getFile() == FILE_FLAGS || + insn->src(0).getFile() == FILE_FLAGS || + insn->src(0).getFile() == FILE_ADDRESS) + emitMOV(insn); + else + emitCVT(insn); + break; + case OP_RCP: + emitSFnOp(insn, 0); + break; + case OP_RSQ: + emitSFnOp(insn, 2); + break; + case OP_LG2: + emitSFnOp(insn, 3); + break; + case OP_SIN: + emitSFnOp(insn, 4); + break; + case OP_COS: + emitSFnOp(insn, 5); + break; + case OP_EX2: + emitSFnOp(insn, 6); + break; + case OP_PRESIN: + case OP_PREEX2: + emitPreOp(insn); + break; + case OP_TEX: + case OP_TXB: + case OP_TXL: + case OP_TXF: + emitTEX(insn->asTex()); + break; + case OP_TXQ: + emitTXQ(insn->asTex()); + break; + case OP_TEXPREP: + emitTEXPREP(insn->asTex()); + break; + case OP_EMIT: + case OP_RESTART: + emitOUT(insn); + break; + case OP_DISCARD: + emitFlow(insn, 0x0); + break; + 
case OP_BRA: + emitFlow(insn, 0x1); + break; + case OP_CALL: + emitFlow(insn, 0x2); + break; + case OP_RET: + emitFlow(insn, 0x3); + break; + case OP_PREBREAK: + emitFlow(insn, 0x4); + break; + case OP_BREAK: + emitFlow(insn, 0x5); + break; + case OP_QUADON: + emitFlow(insn, 0x6); + break; + case OP_QUADPOP: + emitFlow(insn, 0x7); + break; + case OP_JOINAT: + emitFlow(insn, 0xa); + break; + case OP_PRERET: + emitFlow(insn, 0xd); + break; + case OP_QUADOP: + emitQUADOP(insn, insn->lanes, insn->subOp); + break; + case OP_DFDX: + emitQUADOP(insn, 4, insn->src(0).mod.neg() ? 0x66 : 0x99); + break; + case OP_DFDY: + emitQUADOP(insn, 5, insn->src(0).mod.neg() ? 0x5a : 0xa5); + break; + case OP_ATOM: + emitATOM(insn); + break; + case OP_BAR: + emitBAR(insn); + break; + case OP_PHI: + case OP_UNION: + case OP_CONSTRAINT: + ERROR("operation should have been eliminated\n"); + return false; + case OP_EXP: + case OP_LOG: + case OP_SQRT: + case OP_POW: + case OP_SELP: + case OP_SLCT: + case OP_TXD: + case OP_PRECONT: + case OP_CONT: + case OP_POPCNT: + case OP_INSBF: + case OP_EXTBF: + ERROR("operation should have been lowered\n"); + return false; + default: + ERROR("unknown op: %u\n", insn->op); + return false; + } + if (insn->join || insn->op == OP_JOIN) + code[1] |= 0x2; + else + if (insn->exit || insn->op == OP_EXIT) + code[1] |= 0x1; + + assert((insn->encSize == 8) == (code[0] & 1)); + + code += insn->encSize / 4; + codeSize += insn->encSize; + return true; +} + +uint32_t +CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const +{ + const Target::OpInfo &info = targ->getOpInfo(i); + + if (info.minEncSize > 4) + return 8; + + // check constraints on dst and src operands + for (int d = 0; i->defExists(d); ++d) { + if (i->def(d).rep()->reg.data.id > 63 || + i->def(d).rep()->reg.file != FILE_GPR) + return 8; + } + + for (int s = 0; i->srcExists(s); ++s) { + DataFile sf = i->src(s).getFile(); + if (sf != FILE_GPR) + if (sf != FILE_SHADER_INPUT || progType != 
Program::TYPE_FRAGMENT) + return 8; + if (i->src(s).rep()->reg.data.id > 63) + return 8; + } + + // check modifiers & rounding + if (i->join || i->lanes != 0xf || i->exit) + return 8; + if (i->op == OP_MUL && i->rnd != ROUND_N) + return 8; + + if (i->asTex()) + return 8; // TODO: short tex encoding + + // check constraints on short MAD + if (info.srcNr >= 2 && i->srcExists(2)) { + if (i->saturate || i->src(2).mod) + return 8; + if ((i->src(0).mod ^ i->src(1).mod) || + (i->src(0).mod | i->src(1).mod).abs()) + return 8; + if (!i->defExists(0) || + i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id) + return 8; + } + + return info.minEncSize; +} + +// Change the encoding size of an instruction after BBs have been scheduled. +static void +makeInstructionLong(Instruction *insn) +{ + if (insn->encSize == 8) + return; + Function *fn = insn->bb->getFunction(); + int n = 0; + int adj = 4; + + for (Instruction *i = insn->next; i && i->encSize == 4; ++n, i = i->next); + + if (n & 1) { + adj = 8; + insn->next->encSize = 8; + } else + if (insn->prev && insn->prev->encSize == 4) { + adj = 8; + insn->prev->encSize = 8; + } + insn->encSize = 8; + + for (int i = fn->bbCount - 1; i >= 0 && fn->bbArray[i] != insn->bb; --i) { + fn->bbArray[i]->binPos += 4; + } + fn->binSize += adj; + insn->bb->binSize += adj; +} + +static bool +trySetExitModifier(Instruction *insn) +{ + if (insn->op == OP_DISCARD || + insn->op == OP_QUADON || + insn->op == OP_QUADPOP) + return false; + for (int s = 0; insn->srcExists(s); ++s) + if (insn->src(s).getFile() == FILE_IMMEDIATE) + return false; + if (insn->asFlow()) { + if (insn->op == OP_CALL) // side effects ! + return false; + if (insn->getPredicate()) // cannot do conditional exit (or can we ?) 
+ return false; + insn->op = OP_EXIT; + } + insn->exit = 1; + makeInstructionLong(insn); + return true; +} + +static void +replaceExitWithModifier(Function *func) +{ + BasicBlock *epilogue = BasicBlock::get(func->cfgExit); + + if (!epilogue->getExit() || + epilogue->getExit()->op != OP_EXIT) // only main will use OP_EXIT + return; + + if (epilogue->getEntry()->op != OP_EXIT) { + Instruction *insn = epilogue->getExit()->prev; + if (!insn || !trySetExitModifier(insn)) + return; + insn->exit = 1; + } else { + for (Graph::EdgeIterator ei = func->cfgExit->incident(); + !ei.end(); ei.next()) { + BasicBlock *bb = BasicBlock::get(ei.getNode()); + Instruction *i = bb->getExit(); + + if (!i || !trySetExitModifier(i)) + return; + } + } + epilogue->binSize -= 8; + func->binSize -= 8; + delete_Instruction(func->getProgram(), epilogue->getExit()); +} + +void +CodeEmitterNV50::prepareEmission(Function *func) +{ + CodeEmitter::prepareEmission(func); + + replaceExitWithModifier(func); +} + +CodeEmitterNV50::CodeEmitterNV50(const TargetNV50 *target) : + CodeEmitter(target), targNV50(target) +{ + targ = target; // specialized + code = NULL; + codeSize = codeSizeLimit = 0; + relocInfo = NULL; +} + +CodeEmitter * +TargetNV50::getCodeEmitter(Program::Type type) +{ + CodeEmitterNV50 *emit = new CodeEmitterNV50(this); + emit->setProgramType(type); + return emit; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp new file mode 100644 index 0000000..90c409d --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -0,0 +1,2988 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, 
distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "codegen/nv50_ir_target_nvc0.h" + +namespace nv50_ir { + +// Argh, all these assertions ... + +class CodeEmitterNVC0 : public CodeEmitter +{ +public: + CodeEmitterNVC0(const TargetNVC0 *); + + virtual bool emitInstruction(Instruction *); + virtual uint32_t getMinEncodingSize(const Instruction *) const; + virtual void prepareEmission(Function *); + + inline void setProgramType(Program::Type pType) { progType = pType; } + +private: + const TargetNVC0 *targNVC0; + + Program::Type progType; + + const bool writeIssueDelays; + +private: + void emitForm_A(const Instruction *, uint64_t); + void emitForm_B(const Instruction *, uint64_t); + void emitForm_S(const Instruction *, uint32_t, bool pred); + + void emitPredicate(const Instruction *); + + void setAddress16(const ValueRef&); + void setAddress24(const ValueRef&); + void setAddressByFile(const ValueRef&); + void setImmediate(const Instruction *, const int s); // needs op already set + void setImmediateS8(const ValueRef&); + void setSUConst16(const Instruction *, const int s); + void setSUPred(const Instruction *, const int s); + + void emitCondCode(CondCode cc, int pos); + void emitInterpMode(const Instruction *); + 
void emitLoadStoreType(DataType ty); + void emitSUGType(DataType); + void emitCachingMode(CacheMode c); + + void emitShortSrc2(const ValueRef&); + + inline uint8_t getSRegEncoding(const ValueRef&); + + void roundMode_A(const Instruction *); + void roundMode_C(const Instruction *); + void roundMode_CS(const Instruction *); + + void emitNegAbs12(const Instruction *); + + void emitNOP(const Instruction *); + + void emitLOAD(const Instruction *); + void emitSTORE(const Instruction *); + void emitMOV(const Instruction *); + void emitATOM(const Instruction *); + void emitMEMBAR(const Instruction *); + void emitCCTL(const Instruction *); + + void emitINTERP(const Instruction *); + void emitPFETCH(const Instruction *); + void emitVFETCH(const Instruction *); + void emitEXPORT(const Instruction *); + void emitOUT(const Instruction *); + + void emitUADD(const Instruction *); + void emitFADD(const Instruction *); + void emitUMUL(const Instruction *); + void emitFMUL(const Instruction *); + void emitIMAD(const Instruction *); + void emitISAD(const Instruction *); + void emitFMAD(const Instruction *); + void emitMADSP(const Instruction *); + + void emitNOT(Instruction *); + void emitLogicOp(const Instruction *, uint8_t subOp); + void emitPOPC(const Instruction *); + void emitINSBF(const Instruction *); + void emitEXTBF(const Instruction *); + void emitPERMT(const Instruction *); + void emitShift(const Instruction *); + + void emitSFnOp(const Instruction *, uint8_t subOp); + + void emitCVT(Instruction *); + void emitMINMAX(const Instruction *); + void emitPreOp(const Instruction *); + + void emitSET(const CmpInstruction *); + void emitSLCT(const CmpInstruction *); + void emitSELP(const Instruction *); + + void emitTEXBAR(const Instruction *); + void emitTEX(const TexInstruction *); + void emitTEXCSAA(const TexInstruction *); + void emitTXQ(const TexInstruction *); + + void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask); + + void emitFlow(const Instruction *); + 
void emitBAR(const Instruction *); + + void emitSUCLAMPMode(uint16_t); + void emitSUCalc(Instruction *); + void emitSULDGB(const TexInstruction *); + void emitSUSTGx(const TexInstruction *); + + void emitVSHL(const Instruction *); + void emitVectorSubOp(const Instruction *); + + inline void defId(const ValueDef&, const int pos); + inline void defId(const Instruction *, int d, const int pos); + inline void srcId(const ValueRef&, const int pos); + inline void srcId(const ValueRef *, const int pos); + inline void srcId(const Instruction *, int s, const int pos); + inline void srcAddr32(const ValueRef&, int pos, int shr); + + inline bool isLIMM(const ValueRef&, DataType ty); +}; + +// for better visibility +#define HEX64(h, l) 0x##h##l##ULL + +#define SDATA(a) ((a).rep()->reg.data) +#define DDATA(a) ((a).rep()->reg.data) + +void CodeEmitterNVC0::srcId(const ValueRef& src, const int pos) +{ + code[pos / 32] |= (src.get() ? SDATA(src).id : 63) << (pos % 32); +} + +void CodeEmitterNVC0::srcId(const ValueRef *src, const int pos) +{ + code[pos / 32] |= (src ? SDATA(*src).id : 63) << (pos % 32); +} + +void CodeEmitterNVC0::srcId(const Instruction *insn, int s, int pos) +{ + int r = insn->srcExists(s) ? SDATA(insn->src(s)).id : 63; + code[pos / 32] |= r << (pos % 32); +} + +void +CodeEmitterNVC0::srcAddr32(const ValueRef& src, int pos, int shr) +{ + const uint32_t offset = SDATA(src).offset >> shr; + + code[pos / 32] |= offset << (pos % 32); + if (pos && (pos < 32)) + code[1] |= offset >> (32 - pos); +} + +void CodeEmitterNVC0::defId(const ValueDef& def, const int pos) +{ + code[pos / 32] |= (def.get() ? DDATA(def).id : 63) << (pos % 32); +} + +void CodeEmitterNVC0::defId(const Instruction *insn, int d, int pos) +{ + int r = insn->defExists(d) ? 
DDATA(insn->def(d)).id : 63; + code[pos / 32] |= r << (pos % 32); +} + +bool CodeEmitterNVC0::isLIMM(const ValueRef& ref, DataType ty) +{ + const ImmediateValue *imm = ref.get()->asImm(); + + return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000)); +} + +void +CodeEmitterNVC0::roundMode_A(const Instruction *insn) +{ + switch (insn->rnd) { + case ROUND_M: code[1] |= 1 << 23; break; + case ROUND_P: code[1] |= 2 << 23; break; + case ROUND_Z: code[1] |= 3 << 23; break; + default: + assert(insn->rnd == ROUND_N); + break; + } +} + +void +CodeEmitterNVC0::emitNegAbs12(const Instruction *i) +{ + if (i->src(1).mod.abs()) code[0] |= 1 << 6; + if (i->src(0).mod.abs()) code[0] |= 1 << 7; + if (i->src(1).mod.neg()) code[0] |= 1 << 8; + if (i->src(0).mod.neg()) code[0] |= 1 << 9; +} + +void CodeEmitterNVC0::emitCondCode(CondCode cc, int pos) +{ + uint8_t val; + + switch (cc) { + case CC_LT: val = 0x1; break; + case CC_LTU: val = 0x9; break; + case CC_EQ: val = 0x2; break; + case CC_EQU: val = 0xa; break; + case CC_LE: val = 0x3; break; + case CC_LEU: val = 0xb; break; + case CC_GT: val = 0x4; break; + case CC_GTU: val = 0xc; break; + case CC_NE: val = 0x5; break; + case CC_NEU: val = 0xd; break; + case CC_GE: val = 0x6; break; + case CC_GEU: val = 0xe; break; + case CC_TR: val = 0xf; break; + case CC_FL: val = 0x0; break; + + case CC_A: val = 0x14; break; + case CC_NA: val = 0x13; break; + case CC_S: val = 0x15; break; + case CC_NS: val = 0x12; break; + case CC_C: val = 0x16; break; + case CC_NC: val = 0x11; break; + case CC_O: val = 0x17; break; + case CC_NO: val = 0x10; break; + + default: + val = 0; + assert(!"invalid condition code"); + break; + } + code[pos / 32] |= val << (pos % 32); +} + +void +CodeEmitterNVC0::emitPredicate(const Instruction *i) +{ + if (i->predSrc >= 0) { + assert(i->getPredicate()->reg.file == FILE_PREDICATE); + srcId(i->src(i->predSrc), 10); + if (i->cc == CC_NOT_P) + code[0] |= 0x2000; // negate + } else { + code[0] |= 0x1c00; + } 
+} + +void +CodeEmitterNVC0::setAddressByFile(const ValueRef& src) +{ + switch (src.getFile()) { + case FILE_MEMORY_GLOBAL: + srcAddr32(src, 26, 0); + break; + case FILE_MEMORY_LOCAL: + case FILE_MEMORY_SHARED: + setAddress24(src); + break; + default: + assert(src.getFile() == FILE_MEMORY_CONST); + setAddress16(src); + break; + } +} + +void +CodeEmitterNVC0::setAddress16(const ValueRef& src) +{ + Symbol *sym = src.get()->asSym(); + + assert(sym); + + code[0] |= (sym->reg.data.offset & 0x003f) << 26; + code[1] |= (sym->reg.data.offset & 0xffc0) >> 6; +} + +void +CodeEmitterNVC0::setAddress24(const ValueRef& src) +{ + Symbol *sym = src.get()->asSym(); + + assert(sym); + + code[0] |= (sym->reg.data.offset & 0x00003f) << 26; + code[1] |= (sym->reg.data.offset & 0xffffc0) >> 6; +} + +void +CodeEmitterNVC0::setImmediate(const Instruction *i, const int s) +{ + const ImmediateValue *imm = i->src(s).get()->asImm(); + uint32_t u32; + + assert(imm); + u32 = imm->reg.data.u32; + + if ((code[0] & 0xf) == 0x2) { + // LIMM + code[0] |= (u32 & 0x3f) << 26; + code[1] |= u32 >> 6; + } else + if ((code[0] & 0xf) == 0x3 || (code[0] & 0xf) == 4) { + // integer immediate + assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000); + assert(!(code[1] & 0xc000)); + u32 &= 0xfffff; + code[0] |= (u32 & 0x3f) << 26; + code[1] |= 0xc000 | (u32 >> 6); + } else { + // float immediate + assert(!(u32 & 0x00000fff)); + assert(!(code[1] & 0xc000)); + code[0] |= ((u32 >> 12) & 0x3f) << 26; + code[1] |= 0xc000 | (u32 >> 18); + } +} + +void CodeEmitterNVC0::setImmediateS8(const ValueRef &ref) +{ + const ImmediateValue *imm = ref.get()->asImm(); + + int8_t s8 = static_cast<int8_t>(imm->reg.data.s32); + + assert(s8 == imm->reg.data.s32); + + code[0] |= (s8 & 0x3f) << 26; + code[0] |= (s8 >> 6) << 8; +} + +void +CodeEmitterNVC0::emitForm_A(const Instruction *i, uint64_t opc) +{ + code[0] = opc; + code[1] = opc >> 32; + + emitPredicate(i); + + defId(i->def(0), 14); + + int s1 = 26; + if 
(i->srcExists(2) && i->getSrc(2)->reg.file == FILE_MEMORY_CONST) + s1 = 49; + + for (int s = 0; s < 3 && i->srcExists(s); ++s) { + switch (i->getSrc(s)->reg.file) { + case FILE_MEMORY_CONST: + assert(!(code[1] & 0xc000)); + code[1] |= (s == 2) ? 0x8000 : 0x4000; + code[1] |= i->getSrc(s)->reg.fileIndex << 10; + setAddress16(i->src(s)); + break; + case FILE_IMMEDIATE: + assert(s == 1 || + i->op == OP_MOV || i->op == OP_PRESIN || i->op == OP_PREEX2); + assert(!(code[1] & 0xc000)); + setImmediate(i, s); + break; + case FILE_GPR: + if ((s == 2) && ((code[0] & 0x7) == 2)) // LIMM: 3rd src == dst + break; + srcId(i->src(s), s ? ((s == 2) ? 49 : s1) : 20); + break; + default: + // ignore here, can be predicate or flags, but must not be address + break; + } + } +} + +void +CodeEmitterNVC0::emitForm_B(const Instruction *i, uint64_t opc) +{ + code[0] = opc; + code[1] = opc >> 32; + + emitPredicate(i); + + defId(i->def(0), 14); + + switch (i->src(0).getFile()) { + case FILE_MEMORY_CONST: + assert(!(code[1] & 0xc000)); + code[1] |= 0x4000 | (i->src(0).get()->reg.fileIndex << 10); + setAddress16(i->src(0)); + break; + case FILE_IMMEDIATE: + assert(!(code[1] & 0xc000)); + setImmediate(i, 0); + break; + case FILE_GPR: + srcId(i->src(0), 26); + break; + default: + // ignore here, can be predicate or flags, but must not be address + break; + } +} + +void +CodeEmitterNVC0::emitForm_S(const Instruction *i, uint32_t opc, bool pred) +{ + code[0] = opc; + + int ss2a = 0; + if (opc == 0x0d || opc == 0x0e) + ss2a = 2; + + defId(i->def(0), 14); + srcId(i->src(0), 20); + + assert(pred || (i->predSrc < 0)); + if (pred) + emitPredicate(i); + + for (int s = 1; s < 3 && i->srcExists(s); ++s) { + if (i->src(s).get()->reg.file == FILE_MEMORY_CONST) { + assert(!(code[0] & (0x300 >> ss2a))); + switch (i->src(s).get()->reg.fileIndex) { + case 0: code[0] |= 0x100 >> ss2a; break; + case 1: code[0] |= 0x200 >> ss2a; break; + case 16: code[0] |= 0x300 >> ss2a; break; + default: + ERROR("invalid c[] 
space for short form\n"); + break; + } + if (s == 1) + code[0] |= i->getSrc(s)->reg.data.offset << 24; + else + code[0] |= i->getSrc(s)->reg.data.offset << 6; + } else + if (i->src(s).getFile() == FILE_IMMEDIATE) { + assert(s == 1); + setImmediateS8(i->src(s)); + } else + if (i->src(s).getFile() == FILE_GPR) { + srcId(i->src(s), (s == 1) ? 26 : 8); + } + } +} + +void +CodeEmitterNVC0::emitShortSrc2(const ValueRef &src) +{ + if (src.getFile() == FILE_MEMORY_CONST) { + switch (src.get()->reg.fileIndex) { + case 0: code[0] |= 0x100; break; + case 1: code[0] |= 0x200; break; + case 16: code[0] |= 0x300; break; + default: + assert(!"unsupported file index for short op"); + break; + } + srcAddr32(src, 20, 2); + } else { + srcId(src, 20); + assert(src.getFile() == FILE_GPR); + } +} + +void +CodeEmitterNVC0::emitNOP(const Instruction *i) +{ + code[0] = 0x000001e4; + code[1] = 0x40000000; + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitFMAD(const Instruction *i) +{ + bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg(); + + if (i->encSize == 8) { + if (isLIMM(i->src(1), TYPE_F32)) { + emitForm_A(i, HEX64(20000000, 00000002)); + } else { + emitForm_A(i, HEX64(30000000, 00000000)); + + if (i->src(2).mod.neg()) + code[0] |= 1 << 8; + } + roundMode_A(i); + + if (neg1) + code[0] |= 1 << 9; + + if (i->saturate) + code[0] |= 1 << 5; + if (i->ftz) + code[0] |= 1 << 6; + } else { + assert(!i->saturate && !i->src(2).mod.neg()); + emitForm_S(i, (i->src(2).getFile() == FILE_MEMORY_CONST) ? 
0x2e : 0x0e, + false); + if (neg1) + code[0] |= 1 << 4; + } +} + +void +CodeEmitterNVC0::emitFMUL(const Instruction *i) +{ + bool neg = (i->src(0).mod ^ i->src(1).mod).neg(); + + assert(i->postFactor >= -3 && i->postFactor <= 3); + + if (i->encSize == 8) { + if (isLIMM(i->src(1), TYPE_F32)) { + assert(i->postFactor == 0); // constant folded, hopefully + emitForm_A(i, HEX64(30000000, 00000002)); + } else { + emitForm_A(i, HEX64(58000000, 00000000)); + roundMode_A(i); + code[1] |= ((i->postFactor > 0) ? + (7 - i->postFactor) : (0 - i->postFactor)) << 17; + } + if (neg) + code[1] ^= 1 << 25; // aliases with LIMM sign bit + + if (i->saturate) + code[0] |= 1 << 5; + + if (i->dnz) + code[0] |= 1 << 7; + else + if (i->ftz) + code[0] |= 1 << 6; + } else { + assert(!neg && !i->saturate && !i->ftz && !i->postFactor); + emitForm_S(i, 0xa8, true); + } +} + +void +CodeEmitterNVC0::emitUMUL(const Instruction *i) +{ + if (i->encSize == 8) { + if (i->src(1).getFile() == FILE_IMMEDIATE) { + emitForm_A(i, HEX64(10000000, 00000002)); + } else { + emitForm_A(i, HEX64(50000000, 00000003)); + } + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[0] |= 1 << 6; + if (i->sType == TYPE_S32) + code[0] |= 1 << 5; + if (i->dType == TYPE_S32) + code[0] |= 1 << 7; + } else { + emitForm_S(i, i->src(1).getFile() == FILE_IMMEDIATE ? 
0xaa : 0x2a, true); + + if (i->sType == TYPE_S32) + code[0] |= 1 << 6; + } +} + +void +CodeEmitterNVC0::emitFADD(const Instruction *i) +{ + if (i->encSize == 8) { + if (isLIMM(i->src(1), TYPE_F32)) { + assert(!i->saturate); + emitForm_A(i, HEX64(28000000, 00000002)); + + code[0] |= i->src(0).mod.abs() << 7; + code[0] |= i->src(0).mod.neg() << 9; + + if (i->src(1).mod.abs()) + code[1] &= 0xfdffffff; + if ((i->op == OP_SUB) != static_cast<bool>(i->src(1).mod.neg())) + code[1] ^= 0x02000000; + } else { + emitForm_A(i, HEX64(50000000, 00000000)); + + roundMode_A(i); + if (i->saturate) + code[1] |= 1 << 17; + + emitNegAbs12(i); + if (i->op == OP_SUB) code[0] ^= 1 << 8; + } + if (i->ftz) + code[0] |= 1 << 5; + } else { + assert(!i->saturate && i->op != OP_SUB && + !i->src(0).mod.abs() && + !i->src(1).mod.neg() && !i->src(1).mod.abs()); + + emitForm_S(i, 0x49, true); + + if (i->src(0).mod.neg()) + code[0] |= 1 << 7; + } +} + +void +CodeEmitterNVC0::emitUADD(const Instruction *i) +{ + uint32_t addOp = 0; + + assert(!i->src(0).mod.abs() && !i->src(1).mod.abs()); + assert(!i->src(0).mod.neg() || !i->src(1).mod.neg()); + + if (i->src(0).mod.neg()) + addOp |= 0x200; + if (i->src(1).mod.neg()) + addOp |= 0x100; + if (i->op == OP_SUB) { + addOp ^= 0x100; + assert(addOp != 0x300); // would be add-plus-one + } + + if (i->encSize == 8) { + if (isLIMM(i->src(1), TYPE_U32)) { + emitForm_A(i, HEX64(08000000, 00000002)); + if (i->defExists(1)) + code[1] |= 1 << 26; // write carry + } else { + emitForm_A(i, HEX64(48000000, 00000003)); + if (i->defExists(1)) + code[1] |= 1 << 16; // write carry + } + code[0] |= addOp; + + if (i->saturate) + code[0] |= 1 << 5; + if (i->flagsSrc >= 0) // add carry + code[0] |= 1 << 6; + } else { + assert(!(addOp & 0x100)); + emitForm_S(i, (addOp >> 3) | + ((i->src(1).getFile() == FILE_IMMEDIATE) ? 
0xac : 0x2c), true); + } +} + +// TODO: shl-add +void +CodeEmitterNVC0::emitIMAD(const Instruction *i) +{ + assert(i->encSize == 8); + emitForm_A(i, HEX64(20000000, 00000003)); + + if (isSignedType(i->dType)) + code[0] |= 1 << 7; + if (isSignedType(i->sType)) + code[0] |= 1 << 5; + + code[1] |= i->saturate << 24; + + if (i->flagsDef >= 0) code[1] |= 1 << 16; + if (i->flagsSrc >= 0) code[1] |= 1 << 23; + + if (i->src(2).mod.neg()) code[0] |= 0x10; + if (i->src(1).mod.neg() ^ + i->src(0).mod.neg()) code[0] |= 0x20; + + if (i->subOp == NV50_IR_SUBOP_MUL_HIGH) + code[0] |= 1 << 6; +} + +void +CodeEmitterNVC0::emitMADSP(const Instruction *i) +{ + assert(targ->getChipset() >= NVISA_GK104_CHIPSET); + + emitForm_A(i, HEX64(00000000, 00000003)); + + if (i->subOp == NV50_IR_SUBOP_MADSP_SD) { + code[1] |= 0x01800000; + } else { + code[0] |= (i->subOp & 0x00f) << 7; + code[0] |= (i->subOp & 0x0f0) << 1; + code[0] |= (i->subOp & 0x100) >> 3; + code[0] |= (i->subOp & 0x200) >> 2; + code[1] |= (i->subOp & 0xc00) << 13; + } + + if (i->flagsDef >= 0) + code[1] |= 1 << 16; +} + +void +CodeEmitterNVC0::emitISAD(const Instruction *i) +{ + assert(i->dType == TYPE_S32 || i->dType == TYPE_U32); + assert(i->encSize == 8); + + emitForm_A(i, HEX64(38000000, 00000003)); + + if (i->dType == TYPE_S32) + code[0] |= 1 << 5; +} + +void +CodeEmitterNVC0::emitNOT(Instruction *i) +{ + assert(i->encSize == 8); + i->setSrc(1, i->src(0)); + emitForm_A(i, HEX64(68000000, 000001c3)); +} + +void +CodeEmitterNVC0::emitLogicOp(const Instruction *i, uint8_t subOp) +{ + if (i->def(0).getFile() == FILE_PREDICATE) { + code[0] = 0x00000004 | (subOp << 30); + code[1] = 0x0c000000; + + emitPredicate(i); + + defId(i->def(0), 17); + srcId(i->src(0), 20); + if (i->src(0).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 23; + srcId(i->src(1), 26); + if (i->src(1).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 29; + + if (i->defExists(1)) { + defId(i->def(1), 14); + } else { + code[0] |= 7 << 14; + } + // (a OP b) 
OP c + if (i->predSrc != 2 && i->srcExists(2)) { + code[1] |= subOp << 21; + srcId(i->src(2), 17); + if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 20; + } else { + code[1] |= 0x000e0000; + } + } else + if (i->encSize == 8) { + if (isLIMM(i->src(1), TYPE_U32)) { + emitForm_A(i, HEX64(38000000, 00000002)); + + if (i->flagsDef >= 0) + code[1] |= 1 << 26; + } else { + emitForm_A(i, HEX64(68000000, 00000003)); + + if (i->flagsDef >= 0) + code[1] |= 1 << 16; + } + code[0] |= subOp << 6; + + if (i->flagsSrc >= 0) // carry + code[0] |= 1 << 5; + + if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9; + if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8; + } else { + emitForm_S(i, (subOp << 5) | + ((i->src(1).getFile() == FILE_IMMEDIATE) ? 0x1d : 0x8d), true); + } +} + +void +CodeEmitterNVC0::emitPOPC(const Instruction *i) +{ + emitForm_A(i, HEX64(54000000, 00000004)); + + if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9; + if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8; +} + +void +CodeEmitterNVC0::emitINSBF(const Instruction *i) +{ + emitForm_A(i, HEX64(28000000, 00000003)); +} + +void +CodeEmitterNVC0::emitEXTBF(const Instruction *i) +{ + emitForm_A(i, HEX64(70000000, 00000003)); + + if (i->dType == TYPE_S32) + code[0] |= 1 << 5; + if (i->subOp == NV50_IR_SUBOP_EXTBF_REV) + code[0] |= 1 << 8; +} + +void +CodeEmitterNVC0::emitPERMT(const Instruction *i) +{ + emitForm_A(i, HEX64(24000000, 00000004)); + + code[0] |= i->subOp << 5; +} + +void +CodeEmitterNVC0::emitShift(const Instruction *i) +{ + if (i->op == OP_SHR) { + emitForm_A(i, HEX64(58000000, 00000003) + | (isSignedType(i->dType) ? 
0x20 : 0x00));
   } else {
      emitForm_A(i, HEX64(60000000, 00000003));
   }

   // wrap mode: shift amount taken modulo register width instead of clamped
   if (i->subOp == NV50_IR_SUBOP_SHIFT_WRAP)
      code[0] |= 1 << 9;
}

// Emit the argument-reduction pre-ops for sin/ex2 (OP_PRESIN/OP_PREEX2).
// Long form distinguishes PREEX2 via bit 5 and encodes source abs/neg
// modifiers; the short form uses two distinct opcodes instead.
void
CodeEmitterNVC0::emitPreOp(const Instruction *i)
{
   if (i->encSize == 8) {
      emitForm_B(i, HEX64(60000000, 00000000));

      if (i->op == OP_PREEX2)
         code[0] |= 0x20;

      if (i->src(0).mod.abs()) code[0] |= 1 << 6;
      if (i->src(0).mod.neg()) code[0] |= 1 << 8;
   } else {
      emitForm_S(i, i->op == OP_PREEX2 ? 0x74000008 : 0x70000008, true);
   }
}

// Emit a special-function (SFU) op; the caller supplies the function
// selector, which goes into bits 26+ of the first word in both forms.
// Source must be a GPR. The short form cannot encode negation, only abs.
void
CodeEmitterNVC0::emitSFnOp(const Instruction *i, uint8_t subOp)
{
   if (i->encSize == 8) {
      code[0] = 0x00000000 | (subOp << 26);
      code[1] = 0xc8000000;

      emitPredicate(i);

      defId(i->def(0), 14);
      srcId(i->src(0), 20);

      assert(i->src(0).getFile() == FILE_GPR);

      if (i->saturate) code[0] |= 1 << 5;

      if (i->src(0).mod.abs()) code[0] |= 1 << 7;
      if (i->src(0).mod.neg()) code[0] |= 1 << 9;
   } else {
      emitForm_S(i, 0x80000008 | (subOp << 26), true);

      assert(!i->src(0).mod.neg());
      if (i->src(0).mod.abs()) code[0] |= 1 << 30;
   }
}

// Emit MIN/MAX (long form only). The float variant is the base opcode
// (optionally with ftz); integer variants add type bits, with an extra
// bit for signed comparison.
void
CodeEmitterNVC0::emitMINMAX(const Instruction *i)
{
   uint64_t op;

   assert(i->encSize == 8);

   op = (i->op == OP_MIN) ? 0x080e000000000000ULL : 0x081e000000000000ULL;

   if (i->ftz)
      op |= 1 << 5;
   else
   if (!isFloatType(i->dType))
      op |= isSignedType(i->dType) ?
0x23 : 0x03; + + emitForm_A(i, op); + emitNegAbs12(i); +} + +void +CodeEmitterNVC0::roundMode_C(const Instruction *i) +{ + switch (i->rnd) { + case ROUND_M: code[1] |= 1 << 17; break; + case ROUND_P: code[1] |= 2 << 17; break; + case ROUND_Z: code[1] |= 3 << 17; break; + case ROUND_NI: code[0] |= 1 << 7; break; + case ROUND_MI: code[0] |= 1 << 7; code[1] |= 1 << 17; break; + case ROUND_PI: code[0] |= 1 << 7; code[1] |= 2 << 17; break; + case ROUND_ZI: code[0] |= 1 << 7; code[1] |= 3 << 17; break; + case ROUND_N: break; + default: + assert(!"invalid round mode"); + break; + } +} + +void +CodeEmitterNVC0::roundMode_CS(const Instruction *i) +{ + switch (i->rnd) { + case ROUND_M: + case ROUND_MI: code[0] |= 1 << 16; break; + case ROUND_P: + case ROUND_PI: code[0] |= 2 << 16; break; + case ROUND_Z: + case ROUND_ZI: code[0] |= 3 << 16; break; + default: + break; + } +} + +void +CodeEmitterNVC0::emitCVT(Instruction *i) +{ + const bool f2f = isFloatType(i->dType) && isFloatType(i->sType); + + switch (i->op) { + case OP_CEIL: i->rnd = f2f ? ROUND_PI : ROUND_P; break; + case OP_FLOOR: i->rnd = f2f ? ROUND_MI : ROUND_M; break; + case OP_TRUNC: i->rnd = f2f ? 
ROUND_ZI : ROUND_Z; break; + default: + break; + } + + const bool sat = (i->op == OP_SAT) || i->saturate; + const bool abs = (i->op == OP_ABS) || i->src(0).mod.abs(); + const bool neg = (i->op == OP_NEG) || i->src(0).mod.neg(); + + if (i->encSize == 8) { + emitForm_B(i, HEX64(10000000, 00000004)); + + roundMode_C(i); + + // cvt u16 f32 sets high bits to 0, so we don't have to use Value::Size() + code[0] |= util_logbase2(typeSizeof(i->dType)) << 20; + code[0] |= util_logbase2(typeSizeof(i->sType)) << 23; + + if (sat) + code[0] |= 0x20; + if (abs) + code[0] |= 1 << 6; + if (neg && i->op != OP_ABS) + code[0] |= 1 << 8; + + if (i->ftz) + code[1] |= 1 << 23; + + if (isSignedIntType(i->dType)) + code[0] |= 0x080; + if (isSignedIntType(i->sType)) + code[0] |= 0x200; + + if (isFloatType(i->dType)) { + if (!isFloatType(i->sType)) + code[1] |= 0x08000000; + } else { + if (isFloatType(i->sType)) + code[1] |= 0x04000000; + else + code[1] |= 0x0c000000; + } + } else { + if (i->op == OP_CEIL || i->op == OP_FLOOR || i->op == OP_TRUNC) { + code[0] = 0x298; + } else + if (isFloatType(i->dType)) { + if (isFloatType(i->sType)) + code[0] = 0x098; + else + code[0] = 0x088 | (isSignedType(i->sType) ? (1 << 8) : 0); + } else { + assert(isFloatType(i->sType)); + + code[0] = 0x288 | (isSignedType(i->sType) ? 
(1 << 8) : 0); + } + + if (neg) code[0] |= 1 << 16; + if (sat) code[0] |= 1 << 18; + if (abs) code[0] |= 1 << 19; + + roundMode_CS(i); + } +} + +void +CodeEmitterNVC0::emitSET(const CmpInstruction *i) +{ + uint32_t hi; + uint32_t lo = 0; + + if (i->sType == TYPE_F64) + lo = 0x1; + else + if (!isFloatType(i->sType)) + lo = 0x3; + + if (isFloatType(i->dType) || isSignedIntType(i->sType)) + lo |= 0x20; + + switch (i->op) { + case OP_SET_AND: hi = 0x10000000; break; + case OP_SET_OR: hi = 0x10200000; break; + case OP_SET_XOR: hi = 0x10400000; break; + default: + hi = 0x100e0000; + break; + } + emitForm_A(i, (static_cast<uint64_t>(hi) << 32) | lo); + + if (i->op != OP_SET) + srcId(i->src(2), 32 + 17); + + if (i->def(0).getFile() == FILE_PREDICATE) { + if (i->sType == TYPE_F32) + code[1] += 0x10000000; + else + code[1] += 0x08000000; + + code[0] &= ~0xfc000; + defId(i->def(0), 17); + if (i->defExists(1)) + defId(i->def(1), 14); + else + code[0] |= 0x1c000; + } + + if (i->ftz) + code[1] |= 1 << 27; + + emitCondCode(i->setCond, 32 + 23); + emitNegAbs12(i); +} + +void +CodeEmitterNVC0::emitSLCT(const CmpInstruction *i) +{ + uint64_t op; + + switch (i->dType) { + case TYPE_S32: + op = HEX64(30000000, 00000023); + break; + case TYPE_U32: + op = HEX64(30000000, 00000003); + break; + case TYPE_F32: + op = HEX64(38000000, 00000000); + break; + default: + assert(!"invalid type for SLCT"); + op = 0; + break; + } + emitForm_A(i, op); + + CondCode cc = i->setCond; + + if (i->src(2).mod.neg()) + cc = reverseCondCode(cc); + + emitCondCode(cc, 32 + 23); + + if (i->ftz) + code[0] |= 1 << 5; +} + +void CodeEmitterNVC0::emitSELP(const Instruction *i) +{ + emitForm_A(i, HEX64(20000000, 00000004)); + + if (i->cc == CC_NOT_P || i->src(2).mod & Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 20; +} + +void CodeEmitterNVC0::emitTEXBAR(const Instruction *i) +{ + code[0] = 0x00000006 | (i->subOp << 26); + code[1] = 0xf0000000; + emitPredicate(i); + emitCondCode(i->flagsSrc >= 0 ? 
i->cc : CC_ALWAYS, 5);
}

// Emit the coverage-sample (CSAA) texture fetch variant; texture/sampler
// binding indices go into the low bits of the second word.
void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i)
{
   code[0] = 0x00000086;
   code[1] = 0xd0000000;

   code[1] |= i->tex.r;
   code[1] |= i->tex.s << 8;

   if (i->tex.liveOnly)
      code[0] |= 1 << 9;

   defId(i->def(0), 14);
   srcId(i->src(0), 20);
}

// True when the next instruction is also a texture op and none of its
// sources overlap our destination ('interfers' [sic] is the spelling of
// the method declared elsewhere in this IR).  emitTEX uses this to pick
// "t" mode over "p" mode — presumably allowing the two fetches to issue
// without serialization; confirm against the ISA docs.
static inline bool
isNextIndependentTex(const TexInstruction *i)
{
   if (!i->next || !isTextureOp(i->next->op))
      return false;
   if (i->getDef(0)->interfers(i->next->getSrc(0)))
      return false;
   return !i->next->srcExists(1) || !i->getDef(0)->interfers(i->next->getSrc(1));
}

// Emit a texture fetch (TEX/TXB/TXL/TXF/TXG/TXD).  Encodes the opcode
// selector, level-zero and derivative flags, the write mask, texture and
// sampler binding indices, and the target's dimensionality/array/shadow
// bits into the second word.
void
CodeEmitterNVC0::emitTEX(const TexInstruction *i)
{
   code[0] = 0x00000006;

   if (isNextIndependentTex(i))
      code[0] |= 0x080; // t mode
   else
      code[0] |= 0x100; // p mode

   if (i->tex.liveOnly)
      code[0] |= 1 << 9;

   switch (i->op) {
   case OP_TEX: code[1] = 0x80000000; break;
   case OP_TXB: code[1] = 0x84000000; break;
   case OP_TXL: code[1] = 0x86000000; break;
   case OP_TXF: code[1] = 0x90000000; break;
   case OP_TXG: code[1] = 0xa0000000; break;
   case OP_TXD: code[1] = 0xe0000000; break;
   default:
      assert(!"invalid texture op");
      break;
   }
   // for TXF the bit means "apply LOD", for the others it means "LOD zero"
   if (i->op == OP_TXF) {
      if (!i->tex.levelZero)
         code[1] |= 0x02000000;
   } else
   if (i->tex.levelZero) {
      code[1] |= 0x02000000;
   }

   if (i->op != OP_TXD && i->tex.derivAll)
      code[1] |= 1 << 13;

   defId(i->def(0), 14);
   srcId(i->src(0), 20);

   emitPredicate(i);

   if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5;

   code[1] |= i->tex.mask << 14;

   code[1] |= i->tex.r;
   code[1] |= i->tex.s << 8;
   if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0)
      code[1] |= 1 << 18; // in 1st source (with array index)

   // texture target:
   code[1] |= (i->tex.target.getDim() - 1) << 20;
   if (i->tex.target.isCube())
      code[1] += 2 << 20;
   if (i->tex.target.isArray())
      code[1] |= 1 << 19;
   if (i->tex.target.isShadow())
      code[1] |= 1 << 24;

   const int src1 = (i->predSrc == 1) ?
2 : 1; // if predSrc == 1, !srcExists(2) + + if (i->srcExists(src1) && i->src(src1).getFile() == FILE_IMMEDIATE) { + // lzero + if (i->op == OP_TXL) + code[1] &= ~(1 << 26); + else + if (i->op == OP_TXF) + code[1] &= ~(1 << 25); + } + if (i->tex.target == TEX_TARGET_2D_MS || + i->tex.target == TEX_TARGET_2D_MS_ARRAY) + code[1] |= 1 << 23; + + if (i->tex.useOffsets) // in vecSrc0.w + code[1] |= 1 << 22; + + srcId(i, src1, 26); +} + +void +CodeEmitterNVC0::emitTXQ(const TexInstruction *i) +{ + code[0] = 0x00000086; + code[1] = 0xc0000000; + + switch (i->tex.query) { + case TXQ_DIMS: code[1] |= 0 << 22; break; + case TXQ_TYPE: code[1] |= 1 << 22; break; + case TXQ_SAMPLE_POSITION: code[1] |= 2 << 22; break; + case TXQ_FILTER: code[1] |= 3 << 22; break; + case TXQ_LOD: code[1] |= 4 << 22; break; + case TXQ_BORDER_COLOUR: code[1] |= 5 << 22; break; + default: + assert(!"invalid texture query"); + break; + } + + code[1] |= i->tex.mask << 14; + + code[1] |= i->tex.r; + code[1] |= i->tex.s << 8; + if (i->tex.sIndirectSrc >= 0 || i->tex.rIndirectSrc >= 0) + code[1] |= 1 << 18; + + const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2) + + defId(i->def(0), 14); + srcId(i->src(0), 20); + srcId(i, src1, 26); + + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask) +{ + code[0] = 0x00000000 | (laneMask << 6); + code[1] = 0x48000000 | qOp; + + defId(i->def(0), 14); + srcId(i->src(0), 20); + srcId(i->srcExists(1) ? i->src(1) : i->src(0), 26); + + if (i->op == OP_QUADOP && progType != Program::TYPE_FRAGMENT) + code[0] |= 1 << 9; // dall + + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitFlow(const Instruction *i) +{ + const FlowInstruction *f = i->asFlow(); + + unsigned mask; // bit 0: predicate, bit 1: target + + code[0] = 0x00000007; + + switch (i->op) { + case OP_BRA: + code[1] = f->absolute ? 
0x00000000 : 0x40000000; + if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST) + code[0] |= 0x4000; + mask = 3; + break; + case OP_CALL: + code[1] = f->absolute ? 0x10000000 : 0x50000000; + if (f->indirect) + code[0] |= 0x4000; // indirect calls always use c[] source + mask = 2; + break; + + case OP_EXIT: code[1] = 0x80000000; mask = 1; break; + case OP_RET: code[1] = 0x90000000; mask = 1; break; + case OP_DISCARD: code[1] = 0x98000000; mask = 1; break; + case OP_BREAK: code[1] = 0xa8000000; mask = 1; break; + case OP_CONT: code[1] = 0xb0000000; mask = 1; break; + + case OP_JOINAT: code[1] = 0x60000000; mask = 2; break; + case OP_PREBREAK: code[1] = 0x68000000; mask = 2; break; + case OP_PRECONT: code[1] = 0x70000000; mask = 2; break; + case OP_PRERET: code[1] = 0x78000000; mask = 2; break; + + case OP_QUADON: code[1] = 0xc0000000; mask = 0; break; + case OP_QUADPOP: code[1] = 0xc8000000; mask = 0; break; + case OP_BRKPT: code[1] = 0xd0000000; mask = 0; break; + default: + assert(!"invalid flow operation"); + return; + } + + if (mask & 1) { + emitPredicate(i); + if (i->flagsSrc < 0) + code[0] |= 0x1e0; + } + + if (!f) + return; + + if (f->allWarp) + code[0] |= 1 << 15; + if (f->limit) + code[0] |= 1 << 16; + + if (f->indirect) { + if (code[0] & 0x4000) { + assert(i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST); + setAddress16(i->src(0)); + code[1] |= i->getSrc(0)->reg.fileIndex << 10; + if (f->op == OP_BRA) + srcId(f->src(0).getIndirect(0), 20); + } else { + srcId(f, 0, 20); + } + } + + if (f->op == OP_CALL) { + if (f->indirect) { + // nothing + } else + if (f->builtin) { + assert(f->absolute); + uint32_t pcAbs = targNVC0->getBuiltinOffset(f->target.builtin); + addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xfc000000, 26); + addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x03ffffff, -6); + } else { + assert(!f->absolute); + int32_t pcRel = f->target.fn->binPos - (codeSize + 8); + code[0] |= (pcRel & 0x3f) << 26; + code[1] |= (pcRel >> 6) & 
0x3ffff; + } + } else + if (mask & 2) { + int32_t pcRel = f->target.bb->binPos - (codeSize + 8); + // currently we don't want absolute branches + assert(!f->absolute); + code[0] |= (pcRel & 0x3f) << 26; + code[1] |= (pcRel >> 6) & 0x3ffff; + } +} + +void +CodeEmitterNVC0::emitBAR(const Instruction *i) +{ + Value *rDef = NULL, *pDef = NULL; + + switch (i->subOp) { + case NV50_IR_SUBOP_BAR_ARRIVE: code[0] = 0x84; break; + case NV50_IR_SUBOP_BAR_RED_AND: code[0] = 0x24; break; + case NV50_IR_SUBOP_BAR_RED_OR: code[0] = 0x44; break; + case NV50_IR_SUBOP_BAR_RED_POPC: code[0] = 0x04; break; + default: + code[0] = 0x04; + assert(i->subOp == NV50_IR_SUBOP_BAR_SYNC); + break; + } + code[1] = 0x50000000; + + code[0] |= 63 << 14; + code[1] |= 7 << 21; + + emitPredicate(i); + + // barrier id + if (i->src(0).getFile() == FILE_GPR) { + srcId(i->src(0), 20); + } else { + ImmediateValue *imm = i->getSrc(0)->asImm(); + assert(imm); + code[0] |= imm->reg.data.u32 << 20; + } + + // thread count + if (i->src(1).getFile() == FILE_GPR) { + srcId(i->src(1), 26); + } else { + ImmediateValue *imm = i->getSrc(1)->asImm(); + assert(imm); + code[0] |= imm->reg.data.u32 << 26; + code[1] |= imm->reg.data.u32 >> 6; + } + + if (i->srcExists(2) && (i->predSrc != 2)) { + srcId(i->src(2), 32 + 17); + if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 20; + } else { + code[1] |= 7 << 17; + } + + if (i->defExists(0)) { + if (i->def(0).getFile() == FILE_GPR) + rDef = i->getDef(0); + else + pDef = i->getDef(0); + + if (i->defExists(1)) { + if (i->def(1).getFile() == FILE_GPR) + rDef = i->getDef(1); + else + pDef = i->getDef(1); + } + } + if (rDef) { + code[0] &= ~(63 << 14); + defId(rDef, 14); + } + if (pDef) { + code[1] &= ~(7 << 21); + defId(pDef, 32 + 21); + } +} + +void +CodeEmitterNVC0::emitPFETCH(const Instruction *i) +{ + uint32_t prim = i->src(0).get()->reg.data.u32; + + code[0] = 0x00000006 | ((prim & 0x3f) << 26); + code[1] = 0x00000000 | (prim >> 6); + + emitPredicate(i); + + 
defId(i->def(0), 14);
   srcId(i->src(1), 20);
}

// Emit an attribute/varying fetch.  The input's byte offset is embedded in
// the second word; destination size in 32-bit words (minus one) goes into
// bits 5+, and the two indirect sources supply a dynamic offset and the
// vertex address.
void
CodeEmitterNVC0::emitVFETCH(const Instruction *i)
{
   code[0] = 0x00000006;
   code[1] = 0x06000000 | i->src(0).get()->reg.data.offset;

   if (i->perPatch)
      code[0] |= 0x100;
   if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
      code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads

   emitPredicate(i);

   code[0] |= ((i->getDef(0)->reg.size / 4) - 1) << 5;

   defId(i->def(0), 14);
   srcId(i->src(0).getIndirect(0), 20);
   srcId(i->src(0).getIndirect(1), 26); // vertex address
}

// Emit an output store (export).  dType's size selects how many 32-bit
// words are written; the offset must be suitably aligned (checked by the
// assert) and the value to store must already live in a GPR.
void
CodeEmitterNVC0::emitEXPORT(const Instruction *i)
{
   unsigned int size = typeSizeof(i->dType);

   code[0] = 0x00000006 | ((size / 4 - 1) << 5);
   code[1] = 0x0a000000 | i->src(0).get()->reg.data.offset;

   // offset must be aligned to the store size (12-byte stores: 16-aligned)
   assert(!(code[1] & ((size == 12) ? 15 : (size - 1))));

   if (i->perPatch)
      code[0] |= 0x100;

   emitPredicate(i);

   assert(i->src(1).getFile() == FILE_GPR);

   srcId(i->src(0).getIndirect(0), 20);
   srcId(i->src(0).getIndirect(1), 32 + 17); // vertex base address
   srcId(i->src(1), 26);
}

// Emit geometry-shader stream output control (EMIT / RESTART), threading
// the per-stream handle through def(0)/src(0).
void
CodeEmitterNVC0::emitOUT(const Instruction *i)
{
   code[0] = 0x00000006;
   code[1] = 0x1c000000;

   emitPredicate(i);

   defId(i->def(0), 14); // new secret address
   srcId(i->src(0), 20); // old secret address, should be 0 initially

   assert(i->src(0).getFile() == FILE_GPR);

   if (i->op == OP_EMIT)
      code[0] |= 1 << 5;
   if (i->op == OP_RESTART || i->subOp == NV50_IR_SUBOP_EMIT_RESTART)
      code[0] |= 1 << 6;

   // vertex stream
   if (i->src(1).getFile() == FILE_IMMEDIATE) {
      code[1] |= 0xc000;
      code[0] |= SDATA(i->src(1)).u32 << 26;
   } else {
      srcId(i->src(1), 26);
   }
}

// Fold the interpolation-mode bits into an already partially built
// interp instruction; short form only supports centroid-less PINTERP.
void
CodeEmitterNVC0::emitInterpMode(const Instruction *i)
{
   if (i->encSize == 8) {
      code[0] |= i->ipa << 6; // TODO: INTERP_SAMPLEID
   } else {
      if (i->getInterpMode() == NV50_IR_INTERP_SC)
         code[0] |= 0x80;
      assert(i->op == OP_PINTERP && i->getSampleMode() == 0);
   }
}

void
+CodeEmitterNVC0::emitINTERP(const Instruction *i) +{ + const uint32_t base = i->getSrc(0)->reg.data.offset; + + if (i->encSize == 8) { + code[0] = 0x00000000; + code[1] = 0xc0000000 | (base & 0xffff); + + if (i->saturate) + code[0] |= 1 << 5; + + if (i->op == OP_PINTERP) + srcId(i->src(1), 26); + else + code[0] |= 0x3f << 26; + + srcId(i->src(0).getIndirect(0), 20); + } else { + assert(i->op == OP_PINTERP); + code[0] = 0x00000009 | ((base & 0xc) << 6) | ((base >> 4) << 26); + srcId(i->src(1), 20); + } + emitInterpMode(i); + + emitPredicate(i); + defId(i->def(0), 14); + + if (i->getSampleMode() == NV50_IR_INTERP_OFFSET) + srcId(i->src(i->op == OP_PINTERP ? 2 : 1), 17); + else + code[1] |= 0x3f << 17; +} + +void +CodeEmitterNVC0::emitLoadStoreType(DataType ty) +{ + uint8_t val; + + switch (ty) { + case TYPE_U8: + val = 0x00; + break; + case TYPE_S8: + val = 0x20; + break; + case TYPE_F16: + case TYPE_U16: + val = 0x40; + break; + case TYPE_S16: + val = 0x60; + break; + case TYPE_F32: + case TYPE_U32: + case TYPE_S32: + val = 0x80; + break; + case TYPE_F64: + case TYPE_U64: + case TYPE_S64: + val = 0xa0; + break; + case TYPE_B128: + val = 0xc0; + break; + default: + val = 0x80; + assert(!"invalid type"); + break; + } + code[0] |= val; +} + +void +CodeEmitterNVC0::emitCachingMode(CacheMode c) +{ + uint32_t val; + + switch (c) { + case CACHE_CA: +// case CACHE_WB: + val = 0x000; + break; + case CACHE_CG: + val = 0x100; + break; + case CACHE_CS: + val = 0x200; + break; + case CACHE_CV: +// case CACHE_WT: + val = 0x300; + break; + default: + val = 0; + assert(!"invalid caching mode"); + break; + } + code[0] |= val; +} + +static inline bool +uses64bitAddress(const Instruction *ldst) +{ + return ldst->src(0).getFile() == FILE_MEMORY_GLOBAL && + ldst->src(0).isIndirect(0) && + ldst->getIndirect(0, 0)->reg.size == 8; +} + +void +CodeEmitterNVC0::emitSTORE(const Instruction *i) +{ + uint32_t opc; + + switch (i->src(0).getFile()) { + case FILE_MEMORY_GLOBAL: opc = 0x90000000; 
break; + case FILE_MEMORY_LOCAL: opc = 0xc8000000; break; + case FILE_MEMORY_SHARED: opc = 0xc9000000; break; + default: + assert(!"invalid memory file"); + opc = 0; + break; + } + code[0] = 0x00000005; + code[1] = opc; + + setAddressByFile(i->src(0)); + srcId(i->src(1), 14); + srcId(i->src(0).getIndirect(0), 20); + if (uses64bitAddress(i)) + code[1] |= 1 << 26; + + emitPredicate(i); + + emitLoadStoreType(i->dType); + emitCachingMode(i->cache); +} + +void +CodeEmitterNVC0::emitLOAD(const Instruction *i) +{ + uint32_t opc; + + code[0] = 0x00000005; + + switch (i->src(0).getFile()) { + case FILE_MEMORY_GLOBAL: opc = 0x80000000; break; + case FILE_MEMORY_LOCAL: opc = 0xc0000000; break; + case FILE_MEMORY_SHARED: opc = 0xc1000000; break; + case FILE_MEMORY_CONST: + if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) { + emitMOV(i); // not sure if this is any better + return; + } + opc = 0x14000000 | (i->src(0).get()->reg.fileIndex << 10); + code[0] = 0x00000006 | (i->subOp << 8); + break; + default: + assert(!"invalid memory file"); + opc = 0; + break; + } + code[1] = opc; + + defId(i->def(0), 14); + + setAddressByFile(i->src(0)); + srcId(i->src(0).getIndirect(0), 20); + if (uses64bitAddress(i)) + code[1] |= 1 << 26; + + emitPredicate(i); + + emitLoadStoreType(i->dType); + emitCachingMode(i->cache); +} + +uint8_t +CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref) +{ + switch (SDATA(ref).sv.sv) { + case SV_LANEID: return 0x00; + case SV_PHYSID: return 0x03; + case SV_VERTEX_COUNT: return 0x10; + case SV_INVOCATION_ID: return 0x11; + case SV_YDIR: return 0x12; + case SV_TID: return 0x21 + SDATA(ref).sv.index; + case SV_CTAID: return 0x25 + SDATA(ref).sv.index; + case SV_NTID: return 0x29 + SDATA(ref).sv.index; + case SV_GRIDID: return 0x2c; + case SV_NCTAID: return 0x2d + SDATA(ref).sv.index; + case SV_LBASE: return 0x34; + case SV_SBASE: return 0x30; + case SV_CLOCK: return 0x50 + SDATA(ref).sv.index; + default: + assert(!"no sreg for system value"); + return 
0;
   }
}

// Emit a move.  Four encodings depending on operand files:
//  - GPR/immediate/predicate -> predicate register,
//  - system value -> GPR (special-register read),
//  - long form GPR/immediate/predicate -> GPR (with lane mask),
//  - short form with a split small/large immediate encoding.
void
CodeEmitterNVC0::emitMOV(const Instruction *i)
{
   if (i->def(0).getFile() == FILE_PREDICATE) {
      if (i->src(0).getFile() == FILE_GPR) {
         code[0] = 0xfc01c003;
         code[1] = 0x1a8e0000;
         srcId(i->src(0), 20);
      } else {
         code[0] = 0x0001c004;
         code[1] = 0x0c0e0000;
         if (i->src(0).getFile() == FILE_IMMEDIATE) {
            code[0] |= 7 << 20;
            // nonzero immediate sets the predicate, zero clears it
            if (!i->getSrc(0)->reg.data.u32)
               code[0] |= 1 << 23;
         } else {
            srcId(i->src(0), 20);
         }
      }
      defId(i->def(0), 17);
      emitPredicate(i);
   } else
   if (i->src(0).getFile() == FILE_SYSTEM_VALUE) {
      uint8_t sr = getSRegEncoding(i->src(0));

      if (i->encSize == 8) {
         code[0] = 0x00000004 | (sr << 26);
         code[1] = 0x2c000000;
      } else {
         code[0] = 0x40000008 | (sr << 20);
      }
      defId(i->def(0), 14);

      emitPredicate(i);
   } else
   if (i->encSize == 8) {
      uint64_t opc;

      if (i->src(0).getFile() == FILE_IMMEDIATE)
         opc = HEX64(18000000, 000001e2);
      else
      if (i->src(0).getFile() == FILE_PREDICATE)
         opc = HEX64(080e0000, 1c000004);
      else
         opc = HEX64(28000000, 00000004);

      opc |= i->lanes << 5;

      emitForm_B(i, opc);
   } else {
      uint32_t imm;

      if (i->src(0).getFile() == FILE_IMMEDIATE) {
         imm = SDATA(i->src(0)).u32;
         if (imm & 0xfff00000) {
            // high-bits-only immediate variant
            assert(!(imm & 0x000fffff));
            code[0] = 0x00000318 | imm;
         } else {
            // NOTE(review): this assert also admits values in
            // [0x800, 0xfffff] whose bits are lost by the << 20 below —
            // looks like it intends a signed 12-bit range; confirm.
            assert(imm < 0x800 || ((int32_t)imm >= -0x800));
            code[0] = 0x00000118 | (imm << 20);
         }
      } else {
         code[0] = 0x0028;
         emitShortSrc2(i->src(0));
      }
      defId(i->def(0), 14);

      emitPredicate(i);
   }
}

// Emit an atomic/reduction memory op.  Opcode and "write result" bits
// depend on data type and sub-op; EXCH/CAS always produce (or consume)
// a destination register field even if unused (63 = bit bucket).
void
CodeEmitterNVC0::emitATOM(const Instruction *i)
{
   const bool hasDst = i->defExists(0);
   const bool casOrExch =
      i->subOp == NV50_IR_SUBOP_ATOM_EXCH ||
      i->subOp == NV50_IR_SUBOP_ATOM_CAS;

   if (i->dType == TYPE_U64) {
      switch (i->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         code[0] = 0x205;
         if (hasDst)
            code[1] = 0x507e0000;
         else
            code[1] = 0x10000000;
         break;
      case NV50_IR_SUBOP_ATOM_EXCH:
         code[0] = 0x305;
         code[1] = 0x507e0000;
         break;
      case
NV50_IR_SUBOP_ATOM_CAS: + code[0] = 0x325; + code[1] = 0x50000000; + break; + default: + assert(!"invalid u64 red op"); + break; + } + } else + if (i->dType == TYPE_U32) { + switch (i->subOp) { + case NV50_IR_SUBOP_ATOM_EXCH: + code[0] = 0x105; + code[1] = 0x507e0000; + break; + case NV50_IR_SUBOP_ATOM_CAS: + code[0] = 0x125; + code[1] = 0x50000000; + break; + default: + code[0] = 0x5 | (i->subOp << 5); + if (hasDst) + code[1] = 0x507e0000; + else + code[1] = 0x10000000; + break; + } + } else + if (i->dType == TYPE_S32) { + assert(i->subOp <= 2); + code[0] = 0x205 | (i->subOp << 5); + if (hasDst) + code[1] = 0x587e0000; + else + code[1] = 0x18000000; + } else + if (i->dType == TYPE_F32) { + assert(i->subOp == NV50_IR_SUBOP_ATOM_ADD); + code[0] = 0x205; + if (hasDst) + code[1] = 0x687e0000; + else + code[1] = 0x28000000; + } + + emitPredicate(i); + + srcId(i->src(1), 14); + + if (hasDst) + defId(i->def(0), 32 + 11); + else + if (casOrExch) + code[1] |= 63 << 11; + + if (hasDst || casOrExch) { + const int32_t offset = SDATA(i->src(0)).offset; + assert(offset < 0x80000 && offset >= -0x80000); + code[0] |= offset << 26; + code[1] |= (offset & 0x1ffc0) >> 6; + code[1] |= (offset & 0xe0000) << 6; + } else { + srcAddr32(i->src(0), 26, 0); + } + if (i->getIndirect(0, 0)) { + srcId(i->getIndirect(0, 0), 20); + if (i->getIndirect(0, 0)->reg.size == 8) + code[1] |= 1 << 26; + } else { + code[0] |= 63 << 20; + } + + if (i->subOp == NV50_IR_SUBOP_ATOM_CAS) + srcId(i->src(2), 32 + 17); +} + +void +CodeEmitterNVC0::emitMEMBAR(const Instruction *i) +{ + switch (NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp)) { + case NV50_IR_SUBOP_MEMBAR_CTA: code[0] = 0x05; break; + case NV50_IR_SUBOP_MEMBAR_GL: code[0] = 0x25; break; + default: + code[0] = 0x45; + assert(NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) == NV50_IR_SUBOP_MEMBAR_SYS); + break; + } + code[1] = 0xe0000000; + + emitPredicate(i); +} + +void +CodeEmitterNVC0::emitCCTL(const Instruction *i) +{ + code[0] = 0x00000005 | (i->subOp << 5); + + if 
(i->src(0).getFile() == FILE_MEMORY_GLOBAL) { + code[1] = 0x98000000; + srcAddr32(i->src(0), 28, 2); + } else { + code[1] = 0xd0000000; + setAddress24(i->src(0)); + } + if (uses64bitAddress(i)) + code[1] |= 1 << 26; + srcId(i->src(0).getIndirect(0), 20); + + emitPredicate(i); + + defId(i, 0, 14); +} + +void +CodeEmitterNVC0::emitSUCLAMPMode(uint16_t subOp) +{ + uint8_t m; + switch (subOp & ~NV50_IR_SUBOP_SUCLAMP_2D) { + case NV50_IR_SUBOP_SUCLAMP_SD(0, 1): m = 0; break; + case NV50_IR_SUBOP_SUCLAMP_SD(1, 1): m = 1; break; + case NV50_IR_SUBOP_SUCLAMP_SD(2, 1): m = 2; break; + case NV50_IR_SUBOP_SUCLAMP_SD(3, 1): m = 3; break; + case NV50_IR_SUBOP_SUCLAMP_SD(4, 1): m = 4; break; + case NV50_IR_SUBOP_SUCLAMP_PL(0, 1): m = 5; break; + case NV50_IR_SUBOP_SUCLAMP_PL(1, 1): m = 6; break; + case NV50_IR_SUBOP_SUCLAMP_PL(2, 1): m = 7; break; + case NV50_IR_SUBOP_SUCLAMP_PL(3, 1): m = 8; break; + case NV50_IR_SUBOP_SUCLAMP_PL(4, 1): m = 9; break; + case NV50_IR_SUBOP_SUCLAMP_BL(0, 1): m = 10; break; + case NV50_IR_SUBOP_SUCLAMP_BL(1, 1): m = 11; break; + case NV50_IR_SUBOP_SUCLAMP_BL(2, 1): m = 12; break; + case NV50_IR_SUBOP_SUCLAMP_BL(3, 1): m = 13; break; + case NV50_IR_SUBOP_SUCLAMP_BL(4, 1): m = 14; break; + default: + return; + } + code[0] |= m << 5; + if (subOp & NV50_IR_SUBOP_SUCLAMP_2D) + code[1] |= 1 << 16; +} + +void +CodeEmitterNVC0::emitSUCalc(Instruction *i) +{ + ImmediateValue *imm = NULL; + uint64_t opc; + + if (i->srcExists(2)) { + imm = i->getSrc(2)->asImm(); + if (imm) + i->setSrc(2, NULL); // special case, make emitForm_A not assert + } + + switch (i->op) { + case OP_SUCLAMP: opc = HEX64(58000000, 00000004); break; + case OP_SUBFM: opc = HEX64(5c000000, 00000004); break; + case OP_SUEAU: opc = HEX64(60000000, 00000004); break; + default: + assert(0); + return; + } + emitForm_A(i, opc); + + if (i->op == OP_SUCLAMP) { + if (i->dType == TYPE_S32) + code[0] |= 1 << 9; + emitSUCLAMPMode(i->subOp); + } + + if (i->op == OP_SUBFM && i->subOp == 
NV50_IR_SUBOP_SUBFM_3D) + code[1] |= 1 << 16; + + if (i->op != OP_SUEAU) { + if (i->def(0).getFile() == FILE_PREDICATE) { // p, # + code[0] |= 63 << 14; + code[1] |= i->getDef(0)->reg.data.id << 23; + } else + if (i->defExists(1)) { // r, p + assert(i->def(1).getFile() == FILE_PREDICATE); + code[1] |= i->getDef(1)->reg.data.id << 23; + } else { // r, # + code[1] |= 7 << 23; + } + } + if (imm) { + assert(i->op == OP_SUCLAMP); + i->setSrc(2, imm); + code[1] |= (imm->reg.data.u32 & 0x3f) << 17; // sint6 + } +} + +void +CodeEmitterNVC0::emitSUGType(DataType ty) +{ + switch (ty) { + case TYPE_S32: code[1] |= 1 << 13; break; + case TYPE_U8: code[1] |= 2 << 13; break; + case TYPE_S8: code[1] |= 3 << 13; break; + default: + assert(ty == TYPE_U32); + break; + } +} + +void +CodeEmitterNVC0::setSUConst16(const Instruction *i, const int s) +{ + const uint32_t offset = i->getSrc(s)->reg.data.offset; + + assert(i->src(s).getFile() == FILE_MEMORY_CONST); + assert(offset == (offset & 0xfffc)); + + code[1] |= 1 << 21; + code[0] |= offset << 24; + code[1] |= offset >> 8; + code[1] |= i->getSrc(s)->reg.fileIndex << 8; +} + +void +CodeEmitterNVC0::setSUPred(const Instruction *i, const int s) +{ + if (!i->srcExists(s) || (i->predSrc == s)) { + code[1] |= 0x7 << 17; + } else { + if (i->src(s).mod == Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 20; + srcId(i->src(s), 32 + 17); + } +} + +void +CodeEmitterNVC0::emitSULDGB(const TexInstruction *i) +{ + code[0] = 0x5; + code[1] = 0xd4000000 | (i->subOp << 15); + + emitLoadStoreType(i->dType); + emitSUGType(i->sType); + emitCachingMode(i->cache); + + emitPredicate(i); + defId(i->def(0), 14); // destination + srcId(i->src(0), 20); // address + // format + if (i->src(1).getFile() == FILE_GPR) + srcId(i->src(1), 26); + else + setSUConst16(i, 1); + setSUPred(i, 2); +} + +void +CodeEmitterNVC0::emitSUSTGx(const TexInstruction *i) +{ + code[0] = 0x5; + code[1] = 0xdc000000 | (i->subOp << 15); + + if (i->op == OP_SUSTP) + code[1] |= i->tex.mask << 
22; + else + emitLoadStoreType(i->dType); + emitSUGType(i->sType); + emitCachingMode(i->cache); + + emitPredicate(i); + srcId(i->src(0), 20); // address + // format + if (i->src(1).getFile() == FILE_GPR) + srcId(i->src(1), 26); + else + setSUConst16(i, 1); + srcId(i->src(3), 14); // values + setSUPred(i, 2); +} + +void +CodeEmitterNVC0::emitVectorSubOp(const Instruction *i) +{ + switch (NV50_IR_SUBOP_Vn(i->subOp)) { + case 0: + code[1] |= (i->subOp & 0x000f) << 12; // vsrc1 + code[1] |= (i->subOp & 0x00e0) >> 5; // vsrc2 + code[1] |= (i->subOp & 0x0100) << 7; // vsrc2 + code[1] |= (i->subOp & 0x3c00) << 13; // vdst + break; + case 1: + code[1] |= (i->subOp & 0x000f) << 8; // v2src1 + code[1] |= (i->subOp & 0x0010) << 11; // v2src1 + code[1] |= (i->subOp & 0x01e0) >> 1; // v2src2 + code[1] |= (i->subOp & 0x0200) << 6; // v2src2 + code[1] |= (i->subOp & 0x3c00) << 2; // v4dst + code[1] |= (i->mask & 0x3) << 2; + break; + case 2: + code[1] |= (i->subOp & 0x000f) << 8; // v4src1 + code[1] |= (i->subOp & 0x01e0) >> 1; // v4src2 + code[1] |= (i->subOp & 0x3c00) << 2; // v4dst + code[1] |= (i->mask & 0x3) << 2; + code[1] |= (i->mask & 0xc) << 21; + break; + default: + assert(0); + break; + } +} + +void +CodeEmitterNVC0::emitVSHL(const Instruction *i) +{ + uint64_t opc = 0x4; + + switch (NV50_IR_SUBOP_Vn(i->subOp)) { + case 0: opc |= 0xe8ULL << 56; break; + case 1: opc |= 0xb4ULL << 56; break; + case 2: opc |= 0x94ULL << 56; break; + default: + assert(0); + break; + } + if (NV50_IR_SUBOP_Vn(i->subOp) == 1) { + if (isSignedType(i->dType)) opc |= 1ULL << 0x2a; + if (isSignedType(i->sType)) opc |= (1 << 6) | (1 << 5); + } else { + if (isSignedType(i->dType)) opc |= 1ULL << 0x39; + if (isSignedType(i->sType)) opc |= 1 << 6; + } + emitForm_A(i, opc); + emitVectorSubOp(i); + + if (i->saturate) + code[0] |= 1 << 9; + if (i->flagsDef >= 0) + code[1] |= 1 << 16; +} + +bool +CodeEmitterNVC0::emitInstruction(Instruction *insn) +{ + unsigned int size = insn->encSize; + + if 
(writeIssueDelays && !(codeSize & 0x3f)) + size += 8; + + if (!insn->encSize) { + ERROR("skipping unencodable instruction: "); insn->print(); + return false; + } else + if (codeSize + size > codeSizeLimit) { + ERROR("code emitter output buffer too small\n"); + return false; + } + + if (writeIssueDelays) { + if (!(codeSize & 0x3f)) { + code[0] = 0x00000007; // cf issue delay "instruction" + code[1] = 0x20000000; + code += 2; + codeSize += 8; + } + const unsigned int id = (codeSize & 0x3f) / 8 - 1; + uint32_t *data = code - (id * 2 + 2); + if (id <= 2) { + data[0] |= insn->sched << (id * 8 + 4); + } else + if (id == 3) { + data[0] |= insn->sched << 28; + data[1] |= insn->sched >> 4; + } else { + data[1] |= insn->sched << ((id - 4) * 8 + 4); + } + } + + // assert that instructions with multiple defs don't corrupt registers + for (int d = 0; insn->defExists(d); ++d) + assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0); + + switch (insn->op) { + case OP_MOV: + case OP_RDSV: + emitMOV(insn); + break; + case OP_NOP: + break; + case OP_LOAD: + emitLOAD(insn); + break; + case OP_STORE: + emitSTORE(insn); + break; + case OP_LINTERP: + case OP_PINTERP: + emitINTERP(insn); + break; + case OP_VFETCH: + emitVFETCH(insn); + break; + case OP_EXPORT: + emitEXPORT(insn); + break; + case OP_PFETCH: + emitPFETCH(insn); + break; + case OP_EMIT: + case OP_RESTART: + emitOUT(insn); + break; + case OP_ADD: + case OP_SUB: + if (isFloatType(insn->dType)) + emitFADD(insn); + else + emitUADD(insn); + break; + case OP_MUL: + if (isFloatType(insn->dType)) + emitFMUL(insn); + else + emitUMUL(insn); + break; + case OP_MAD: + case OP_FMA: + if (isFloatType(insn->dType)) + emitFMAD(insn); + else + emitIMAD(insn); + break; + case OP_SAD: + emitISAD(insn); + break; + case OP_NOT: + emitNOT(insn); + break; + case OP_AND: + emitLogicOp(insn, 0); + break; + case OP_OR: + emitLogicOp(insn, 1); + break; + case OP_XOR: + emitLogicOp(insn, 2); + break; + case OP_SHL: + case OP_SHR: + 
emitShift(insn); + break; + case OP_SET: + case OP_SET_AND: + case OP_SET_OR: + case OP_SET_XOR: + emitSET(insn->asCmp()); + break; + case OP_SELP: + emitSELP(insn); + break; + case OP_SLCT: + emitSLCT(insn->asCmp()); + break; + case OP_MIN: + case OP_MAX: + emitMINMAX(insn); + break; + case OP_ABS: + case OP_NEG: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_CVT: + case OP_SAT: + emitCVT(insn); + break; + case OP_RSQ: + emitSFnOp(insn, 5); + break; + case OP_RCP: + emitSFnOp(insn, 4); + break; + case OP_LG2: + emitSFnOp(insn, 3); + break; + case OP_EX2: + emitSFnOp(insn, 2); + break; + case OP_SIN: + emitSFnOp(insn, 1); + break; + case OP_COS: + emitSFnOp(insn, 0); + break; + case OP_PRESIN: + case OP_PREEX2: + emitPreOp(insn); + break; + case OP_TEX: + case OP_TXB: + case OP_TXL: + case OP_TXD: + case OP_TXF: + emitTEX(insn->asTex()); + break; + case OP_TXQ: + emitTXQ(insn->asTex()); + break; + case OP_TEXBAR: + emitTEXBAR(insn); + break; + case OP_SUBFM: + case OP_SUCLAMP: + case OP_SUEAU: + emitSUCalc(insn); + break; + case OP_MADSP: + emitMADSP(insn); + break; + case OP_SULDB: + if (targ->getChipset() >= NVISA_GK104_CHIPSET) + emitSULDGB(insn->asTex()); + else + ERROR("SULDB not yet supported on < nve4\n"); + break; + case OP_SUSTB: + case OP_SUSTP: + if (targ->getChipset() >= NVISA_GK104_CHIPSET) + emitSUSTGx(insn->asTex()); + else + ERROR("SUSTx not yet supported on < nve4\n"); + break; + case OP_ATOM: + emitATOM(insn); + break; + case OP_BRA: + case OP_CALL: + case OP_PRERET: + case OP_RET: + case OP_DISCARD: + case OP_EXIT: + case OP_PRECONT: + case OP_CONT: + case OP_PREBREAK: + case OP_BREAK: + case OP_JOINAT: + case OP_BRKPT: + case OP_QUADON: + case OP_QUADPOP: + emitFlow(insn); + break; + case OP_QUADOP: + emitQUADOP(insn, insn->subOp, insn->lanes); + break; + case OP_DFDX: + emitQUADOP(insn, insn->src(0).mod.neg() ? 0x66 : 0x99, 0x4); + break; + case OP_DFDY: + emitQUADOP(insn, insn->src(0).mod.neg() ? 
0x5a : 0xa5, 0x5); + break; + case OP_POPCNT: + emitPOPC(insn); + break; + case OP_INSBF: + emitINSBF(insn); + break; + case OP_EXTBF: + emitEXTBF(insn); + break; + case OP_PERMT: + emitPERMT(insn); + break; + case OP_JOIN: + emitNOP(insn); + insn->join = 1; + break; + case OP_BAR: + emitBAR(insn); + break; + case OP_MEMBAR: + emitMEMBAR(insn); + break; + case OP_CCTL: + emitCCTL(insn); + break; + case OP_VSHL: + emitVSHL(insn); + break; + case OP_PHI: + case OP_UNION: + case OP_CONSTRAINT: + ERROR("operation should have been eliminated"); + return false; + case OP_EXP: + case OP_LOG: + case OP_SQRT: + case OP_POW: + ERROR("operation should have been lowered\n"); + return false; + default: + ERROR("unknow op\n"); + return false; + } + + if (insn->join) { + code[0] |= 0x10; + assert(insn->encSize == 8); + } + + code += insn->encSize / 4; + codeSize += insn->encSize; + return true; +} + +uint32_t +CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const +{ + const Target::OpInfo &info = targ->getOpInfo(i); + + if (writeIssueDelays || info.minEncSize == 8 || 1) + return 8; + + if (i->ftz || i->saturate || i->join) + return 8; + if (i->rnd != ROUND_N) + return 8; + if (i->predSrc >= 0 && i->op == OP_MAD) + return 8; + + if (i->op == OP_PINTERP) { + if (i->getSampleMode() || 1) // XXX: grr, short op doesn't work + return 8; + } else + if (i->op == OP_MOV && i->lanes != 0xf) { + return 8; + } + + for (int s = 0; i->srcExists(s); ++s) { + if (i->src(s).isIndirect(0)) + return 8; + + if (i->src(s).getFile() == FILE_MEMORY_CONST) { + if (SDATA(i->src(s)).offset >= 0x100) + return 8; + if (i->getSrc(s)->reg.fileIndex > 1 && + i->getSrc(s)->reg.fileIndex != 16) + return 8; + } else + if (i->src(s).getFile() == FILE_IMMEDIATE) { + if (i->dType == TYPE_F32) { + if (SDATA(i->src(s)).u32 >= 0x100) + return 8; + } else { + if (SDATA(i->src(s)).u32 > 0xff) + return 8; + } + } + + if (i->op == OP_CVT) + continue; + if (i->src(s).mod != Modifier(0)) { + if (i->src(s).mod == 
Modifier(NV50_IR_MOD_ABS)) + if (i->op != OP_RSQ) + return 8; + if (i->src(s).mod == Modifier(NV50_IR_MOD_NEG)) + if (i->op != OP_ADD || s != 0) + return 8; + } + } + + return 4; +} + +// Simplified, erring on safe side. +class SchedDataCalculator : public Pass +{ +public: + SchedDataCalculator(const Target *targ) : targ(targ) { } + +private: + struct RegScores + { + struct Resource { + int st[DATA_FILE_COUNT]; // LD to LD delay 3 + int ld[DATA_FILE_COUNT]; // ST to ST delay 3 + int tex; // TEX to non-TEX delay 17 (0x11) + int sfu; // SFU to SFU delay 3 (except PRE-ops) + int imul; // integer MUL to MUL delay 3 + } res; + struct ScoreData { + int r[64]; + int p[8]; + int c; + } rd, wr; + int base; + + void rebase(const int base) + { + const int delta = this->base - base; + if (!delta) + return; + this->base = 0; + + for (int i = 0; i < 64; ++i) { + rd.r[i] += delta; + wr.r[i] += delta; + } + for (int i = 0; i < 8; ++i) { + rd.p[i] += delta; + wr.p[i] += delta; + } + rd.c += delta; + wr.c += delta; + + for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) { + res.ld[f] += delta; + res.st[f] += delta; + } + res.sfu += delta; + res.imul += delta; + res.tex += delta; + } + void wipe() + { + memset(&rd, 0, sizeof(rd)); + memset(&wr, 0, sizeof(wr)); + memset(&res, 0, sizeof(res)); + } + int getLatest(const ScoreData& d) const + { + int max = 0; + for (int i = 0; i < 64; ++i) + if (d.r[i] > max) + max = d.r[i]; + for (int i = 0; i < 8; ++i) + if (d.p[i] > max) + max = d.p[i]; + if (d.c > max) + max = d.c; + return max; + } + inline int getLatestRd() const + { + return getLatest(rd); + } + inline int getLatestWr() const + { + return getLatest(wr); + } + inline int getLatest() const + { + const int a = getLatestRd(); + const int b = getLatestWr(); + + int max = MAX2(a, b); + for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) { + max = MAX2(res.ld[f], max); + max = MAX2(res.st[f], max); + } + max = MAX2(res.sfu, max); + max = MAX2(res.imul, max); + max = MAX2(res.tex, max); + 
return max; + } + void setMax(const RegScores *that) + { + for (int i = 0; i < 64; ++i) { + rd.r[i] = MAX2(rd.r[i], that->rd.r[i]); + wr.r[i] = MAX2(wr.r[i], that->wr.r[i]); + } + for (int i = 0; i < 8; ++i) { + rd.p[i] = MAX2(rd.p[i], that->rd.p[i]); + wr.p[i] = MAX2(wr.p[i], that->wr.p[i]); + } + rd.c = MAX2(rd.c, that->rd.c); + wr.c = MAX2(wr.c, that->wr.c); + + for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) { + res.ld[f] = MAX2(res.ld[f], that->res.ld[f]); + res.st[f] = MAX2(res.st[f], that->res.st[f]); + } + res.sfu = MAX2(res.sfu, that->res.sfu); + res.imul = MAX2(res.imul, that->res.imul); + res.tex = MAX2(res.tex, that->res.tex); + } + void print(int cycle) + { + for (int i = 0; i < 64; ++i) { + if (rd.r[i] > cycle) + INFO("rd $r%i @ %i\n", i, rd.r[i]); + if (wr.r[i] > cycle) + INFO("wr $r%i @ %i\n", i, wr.r[i]); + } + for (int i = 0; i < 8; ++i) { + if (rd.p[i] > cycle) + INFO("rd $p%i @ %i\n", i, rd.p[i]); + if (wr.p[i] > cycle) + INFO("wr $p%i @ %i\n", i, wr.p[i]); + } + if (rd.c > cycle) + INFO("rd $c @ %i\n", rd.c); + if (wr.c > cycle) + INFO("wr $c @ %i\n", wr.c); + if (res.sfu > cycle) + INFO("sfu @ %i\n", res.sfu); + if (res.imul > cycle) + INFO("imul @ %i\n", res.imul); + if (res.tex > cycle) + INFO("tex @ %i\n", res.tex); + } + }; + + RegScores *score; // for current BB + std::vector<RegScores> scoreBoards; + int cycle; + int prevData; + operation prevOp; + + const Target *targ; + + bool visit(Function *); + bool visit(BasicBlock *); + + void commitInsn(const Instruction *, int cycle); + int calcDelay(const Instruction *, int cycle) const; + void setDelay(Instruction *, int delay, Instruction *next); + + void recordRd(const Value *, const int ready); + void recordWr(const Value *, const int ready); + void checkRd(const Value *, int cycle, int& delay) const; + void checkWr(const Value *, int cycle, int& delay) const; + + int getCycles(const Instruction *, int origDelay) const; +}; + +void +SchedDataCalculator::setDelay(Instruction *insn, int 
delay, Instruction *next) +{ + if (insn->op == OP_EXIT || insn->op == OP_RET) + delay = MAX2(delay, 14); + + if (insn->op == OP_TEXBAR) { + // TODO: except if results not used before EXIT + insn->sched = 0xc2; + } else + if (insn->op == OP_JOIN || insn->join) { + insn->sched = 0x00; + } else + if (delay >= 0 || prevData == 0x04 || + !next || !targ->canDualIssue(insn, next)) { + insn->sched = static_cast<uint8_t>(MAX2(delay, 0)); + if (prevOp == OP_EXPORT) + insn->sched |= 0x40; + else + insn->sched |= 0x20; + } else { + insn->sched = 0x04; // dual-issue + } + + if (prevData != 0x04 || prevOp != OP_EXPORT) + if (insn->sched != 0x04 || insn->op == OP_EXPORT) + prevOp = insn->op; + + prevData = insn->sched; +} + +int +SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const +{ + if (insn->sched & 0x80) { + int c = (insn->sched & 0x0f) * 2 + 1; + if (insn->op == OP_TEXBAR && origDelay > 0) + c += origDelay; + return c; + } + if (insn->sched & 0x60) + return (insn->sched & 0x1f) + 1; + return (insn->sched == 0x04) ? 
0 : 32; +} + +bool +SchedDataCalculator::visit(Function *func) +{ + scoreBoards.resize(func->cfg.getSize()); + for (size_t i = 0; i < scoreBoards.size(); ++i) + scoreBoards[i].wipe(); + return true; +} + +bool +SchedDataCalculator::visit(BasicBlock *bb) +{ + Instruction *insn; + Instruction *next = NULL; + + int cycle = 0; + + prevData = 0x00; + prevOp = OP_NOP; + score = &scoreBoards.at(bb->getId()); + + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { + // back branches will wait until all target dependencies are satisfied + if (ei.getType() == Graph::Edge::BACK) // sched would be uninitialized + continue; + BasicBlock *in = BasicBlock::get(ei.getNode()); + if (in->getExit()) { + if (prevData != 0x04) + prevData = in->getExit()->sched; + prevOp = in->getExit()->op; + } + score->setMax(&scoreBoards.at(in->getId())); + } + if (bb->cfg.incidentCount() > 1) + prevOp = OP_NOP; + +#ifdef NVC0_DEBUG_SCHED_DATA + INFO("=== BB:%i initial scores\n", bb->getId()); + score->print(cycle); +#endif + + for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) { + next = insn->next; + + commitInsn(insn, cycle); + int delay = calcDelay(next, cycle); + setDelay(insn, delay, next); + cycle += getCycles(insn, delay); + +#ifdef NVC0_DEBUG_SCHED_DATA + INFO("cycle %i, sched %02x\n", cycle, insn->sched); + insn->print(); + next->print(); +#endif + } + if (!insn) + return true; + commitInsn(insn, cycle); + + int bbDelay = -1; + + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + BasicBlock *out = BasicBlock::get(ei.getNode()); + + if (ei.getType() != Graph::Edge::BACK) { + // only test the first instruction of the outgoing block + next = out->getEntry(); + if (next) + bbDelay = MAX2(bbDelay, calcDelay(next, cycle)); + } else { + // wait until all dependencies are satisfied + const int regsFree = score->getLatest(); + next = out->getFirst(); + for (int c = cycle; next && c < regsFree; next = next->next) { + bbDelay = 
MAX2(bbDelay, calcDelay(next, c)); + c += getCycles(next, bbDelay); + } + next = NULL; + } + } + if (bb->cfg.outgoingCount() != 1) + next = NULL; + setDelay(insn, bbDelay, next); + cycle += getCycles(insn, bbDelay); + + score->rebase(cycle); // common base for initializing out blocks' scores + return true; +} + +#define NVE4_MAX_ISSUE_DELAY 0x1f +int +SchedDataCalculator::calcDelay(const Instruction *insn, int cycle) const +{ + int delay = 0, ready = cycle; + + for (int s = 0; insn->srcExists(s); ++s) + checkRd(insn->getSrc(s), cycle, delay); + // WAR & WAW don't seem to matter + // for (int s = 0; insn->srcExists(s); ++s) + // recordRd(insn->getSrc(s), cycle); + + switch (Target::getOpClass(insn->op)) { + case OPCLASS_SFU: + ready = score->res.sfu; + break; + case OPCLASS_ARITH: + if (insn->op == OP_MUL && !isFloatType(insn->dType)) + ready = score->res.imul; + break; + case OPCLASS_TEXTURE: + ready = score->res.tex; + break; + case OPCLASS_LOAD: + ready = score->res.ld[insn->src(0).getFile()]; + break; + case OPCLASS_STORE: + ready = score->res.st[insn->src(0).getFile()]; + break; + default: + break; + } + if (Target::getOpClass(insn->op) != OPCLASS_TEXTURE) + ready = MAX2(ready, score->res.tex); + + delay = MAX2(delay, ready - cycle); + + // if can issue next cycle, delay is 0, not 1 + return MIN2(delay - 1, NVE4_MAX_ISSUE_DELAY); +} + +void +SchedDataCalculator::commitInsn(const Instruction *insn, int cycle) +{ + const int ready = cycle + targ->getLatency(insn); + + for (int d = 0; insn->defExists(d); ++d) + recordWr(insn->getDef(d), ready); + // WAR & WAW don't seem to matter + // for (int s = 0; insn->srcExists(s); ++s) + // recordRd(insn->getSrc(s), cycle); + + switch (Target::getOpClass(insn->op)) { + case OPCLASS_SFU: + score->res.sfu = cycle + 4; + break; + case OPCLASS_ARITH: + if (insn->op == OP_MUL && !isFloatType(insn->dType)) + score->res.imul = cycle + 4; + break; + case OPCLASS_TEXTURE: + score->res.tex = cycle + 18; + break; + case OPCLASS_LOAD: + 
      if (insn->src(0).getFile() == FILE_MEMORY_CONST)
         break;
      // a load blocks later loads from the same file for 4 cycles, and
      // later stores until the loaded value is ready
      score->res.ld[insn->src(0).getFile()] = cycle + 4;
      score->res.st[insn->src(0).getFile()] = ready;
      break;
   case OPCLASS_STORE:
      score->res.st[insn->src(0).getFile()] = cycle + 4;
      score->res.ld[insn->src(0).getFile()] = ready;
      break;
   case OPCLASS_OTHER:
      // TEXBAR releases the texture resource immediately
      if (insn->op == OP_TEXBAR)
         score->res.tex = cycle;
      break;
   default:
      break;
   }

#ifdef NVC0_DEBUG_SCHED_DATA
   score->print(cycle);
#endif
}

// Raise @delay so that reading @v at @cycle happens no earlier than the
// scoreboarded ready time recorded for its register(s).
void
SchedDataCalculator::checkRd(const Value *v, int cycle, int& delay) const
{
   int ready = cycle;
   int a, b;

   switch (v->reg.file) {
   case FILE_GPR:
      // multi-word values occupy reg.size / 4 consecutive GPRs
      a = v->reg.data.id;
      b = a + v->reg.size / 4;
      for (int r = a; r < b; ++r)
         ready = MAX2(ready, score->rd.r[r]);
      break;
   case FILE_PREDICATE:
      ready = MAX2(ready, score->rd.p[v->reg.data.id]);
      break;
   case FILE_FLAGS:
      ready = MAX2(ready, score->rd.c);
      break;
   case FILE_SHADER_INPUT:
   case FILE_SHADER_OUTPUT: // yes, TCPs can read outputs
   case FILE_MEMORY_LOCAL:
   case FILE_MEMORY_CONST:
   case FILE_MEMORY_SHARED:
   case FILE_MEMORY_GLOBAL:
   case FILE_SYSTEM_VALUE:
      // TODO: any restrictions here ?
      break;
   case FILE_IMMEDIATE:
      break;
   default:
      assert(0);
      break;
   }
   if (cycle < ready)
      delay = MAX2(delay, ready - cycle);
}

// Like checkRd, but against the write scoreboard (WAW/WAR ordering).
void
SchedDataCalculator::checkWr(const Value *v, int cycle, int& delay) const
{
   int ready = cycle;
   int a, b;

   switch (v->reg.file) {
   case FILE_GPR:
      a = v->reg.data.id;
      b = a + v->reg.size / 4;
      for (int r = a; r < b; ++r)
         ready = MAX2(ready, score->wr.r[r]);
      break;
   case FILE_PREDICATE:
      ready = MAX2(ready, score->wr.p[v->reg.data.id]);
      break;
   default:
      assert(v->reg.file == FILE_FLAGS);
      ready = MAX2(ready, score->wr.c);
      break;
   }
   if (cycle < ready)
      delay = MAX2(delay, ready - cycle);
}

// Record when a value written by an instruction becomes readable;
// this fills the *read* scoreboard consumed by checkRd.
void
SchedDataCalculator::recordWr(const Value *v, const int ready)
{
   int a = v->reg.data.id;

   if (v->reg.file == FILE_GPR) {
      int b = a + v->reg.size / 4;
      for (int r = a; r < b; ++r)
         score->rd.r[r] = ready;
   } else
   // $c, $pX: shorter issue-to-read delay (at least as exec pred and carry)
   if (v->reg.file == FILE_PREDICATE) {
      score->rd.p[a] = ready + 4;
   } else {
      assert(v->reg.file == FILE_FLAGS);
      score->rd.c = ready + 4;
   }
}

// Record when a value read by an instruction may be overwritten;
// this fills the *write* scoreboard consumed by checkWr.
void
SchedDataCalculator::recordRd(const Value *v, const int ready)
{
   int a = v->reg.data.id;

   if (v->reg.file == FILE_GPR) {
      int b = a + v->reg.size / 4;
      for (int r = a; r < b; ++r)
         score->wr.r[r] = ready;
   } else
   if (v->reg.file == FILE_PREDICATE) {
      score->wr.p[a] = ready;
   } else
   if (v->reg.file == FILE_FLAGS) {
      score->wr.c = ready;
   }
}

// Entry point: compute the per-instruction scheduling/issue-delay bytes
// for @func on targets with software scheduling.
bool
calculateSchedDataNVC0(const Target *targ, Function *func)
{
   SchedDataCalculator sched(targ);
   return sched.run(func, true, true);
}

void
CodeEmitterNVC0::prepareEmission(Function *func)
{
   CodeEmitter::prepareEmission(func);

   // sched data is only encoded on chips that require SW scheduling
   if (targ->hasSWSched)
      calculateSchedDataNVC0(targ, func);
}

CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target)
   : CodeEmitter(target),
     targNVC0(target),
     writeIssueDelays(target->hasSWSched)
{
   code = NULL;
   codeSize =
codeSizeLimit = 0; + relocInfo = NULL; +} + +CodeEmitter * +TargetNVC0::createCodeEmitterNVC0(Program::Type type) +{ + CodeEmitterNVC0 *emit = new CodeEmitterNVC0(this); + emit->setProgramType(type); + return emit; +} + +CodeEmitter * +TargetNVC0::getCodeEmitter(Program::Type type) +{ + if (chipset >= NVISA_GK110_CHIPSET) + return createCodeEmitterGK110(type); + return createCodeEmitterNVC0(type); +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp new file mode 100644 index 0000000..3193ea6 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -0,0 +1,2852 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +extern "C" { +#include "tgsi/tgsi_dump.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_util.h" +} + +#include <set> + +#include "codegen/nv50_ir.h" +#include "codegen/nv50_ir_util.h" +#include "codegen/nv50_ir_build_util.h" + +namespace tgsi { + +class Source; + +static nv50_ir::operation translateOpcode(uint opcode); +static nv50_ir::DataFile translateFile(uint file); +static nv50_ir::TexTarget translateTexture(uint texTarg); +static nv50_ir::SVSemantic translateSysVal(uint sysval); + +class Instruction +{ +public: + Instruction(const struct tgsi_full_instruction *inst) : insn(inst) { } + + class SrcRegister + { + public: + SrcRegister(const struct tgsi_full_src_register *src) + : reg(src->Register), + fsr(src) + { } + + SrcRegister(const struct tgsi_src_register& src) : reg(src), fsr(NULL) { } + + SrcRegister(const struct tgsi_ind_register& ind) + : reg(tgsi_util_get_src_from_ind(&ind)), + fsr(NULL) + { } + + struct tgsi_src_register offsetToSrc(struct tgsi_texture_offset off) + { + struct tgsi_src_register reg; + memset(®, 0, sizeof(reg)); + reg.Index = off.Index; + reg.File = off.File; + reg.SwizzleX = off.SwizzleX; + reg.SwizzleY = off.SwizzleY; + reg.SwizzleZ = off.SwizzleZ; + return reg; + } + + SrcRegister(const struct tgsi_texture_offset& off) : + reg(offsetToSrc(off)), + fsr(NULL) + { } + + uint getFile() const { return reg.File; } + + bool is2D() const { return reg.Dimension; } + + bool isIndirect(int dim) const + { + return (dim && fsr) ? fsr->Dimension.Indirect : reg.Indirect; + } + + int getIndex(int dim) const + { + return (dim && fsr) ? 
fsr->Dimension.Index : reg.Index; + } + + int getSwizzle(int chan) const + { + return tgsi_util_get_src_register_swizzle(®, chan); + } + + nv50_ir::Modifier getMod(int chan) const; + + SrcRegister getIndirect(int dim) const + { + assert(fsr && isIndirect(dim)); + if (dim) + return SrcRegister(fsr->DimIndirect); + return SrcRegister(fsr->Indirect); + } + + uint32_t getValueU32(int c, const struct nv50_ir_prog_info *info) const + { + assert(reg.File == TGSI_FILE_IMMEDIATE); + assert(!reg.Absolute); + assert(!reg.Negate); + return info->immd.data[reg.Index * 4 + getSwizzle(c)]; + } + + private: + const struct tgsi_src_register reg; + const struct tgsi_full_src_register *fsr; + }; + + class DstRegister + { + public: + DstRegister(const struct tgsi_full_dst_register *dst) + : reg(dst->Register), + fdr(dst) + { } + + DstRegister(const struct tgsi_dst_register& dst) : reg(dst), fdr(NULL) { } + + uint getFile() const { return reg.File; } + + bool is2D() const { return reg.Dimension; } + + bool isIndirect(int dim) const + { + return (dim && fdr) ? fdr->Dimension.Indirect : reg.Indirect; + } + + int getIndex(int dim) const + { + return (dim && fdr) ? 
fdr->Dimension.Dimension : reg.Index; + } + + unsigned int getMask() const { return reg.WriteMask; } + + bool isMasked(int chan) const { return !(getMask() & (1 << chan)); } + + SrcRegister getIndirect(int dim) const + { + assert(fdr && isIndirect(dim)); + if (dim) + return SrcRegister(fdr->DimIndirect); + return SrcRegister(fdr->Indirect); + } + + private: + const struct tgsi_dst_register reg; + const struct tgsi_full_dst_register *fdr; + }; + + inline uint getOpcode() const { return insn->Instruction.Opcode; } + + unsigned int srcCount() const { return insn->Instruction.NumSrcRegs; } + unsigned int dstCount() const { return insn->Instruction.NumDstRegs; } + + // mask of used components of source s + unsigned int srcMask(unsigned int s) const; + + SrcRegister getSrc(unsigned int s) const + { + assert(s < srcCount()); + return SrcRegister(&insn->Src[s]); + } + + DstRegister getDst(unsigned int d) const + { + assert(d < dstCount()); + return DstRegister(&insn->Dst[d]); + } + + SrcRegister getTexOffset(unsigned int i) const + { + assert(i < TGSI_FULL_MAX_TEX_OFFSETS); + return SrcRegister(insn->TexOffsets[i]); + } + + unsigned int getNumTexOffsets() const { return insn->Texture.NumOffsets; } + + bool checkDstSrcAliasing() const; + + inline nv50_ir::operation getOP() const { + return translateOpcode(getOpcode()); } + + nv50_ir::DataType inferSrcType() const; + nv50_ir::DataType inferDstType() const; + + nv50_ir::CondCode getSetCond() const; + + nv50_ir::TexInstruction::Target getTexture(const Source *, int s) const; + + inline uint getLabel() { return insn->Label.Label; } + + unsigned getSaturate() const { return insn->Instruction.Saturate; } + + void print() const + { + tgsi_dump_instruction(insn, 1); + } + +private: + const struct tgsi_full_instruction *insn; +}; + +unsigned int Instruction::srcMask(unsigned int s) const +{ + unsigned int mask = insn->Dst[0].Register.WriteMask; + + switch (insn->Instruction.Opcode) { + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + 
return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
   case TGSI_OPCODE_DP2:
      return 0x3;
   case TGSI_OPCODE_DP3:
      return 0x7;
   case TGSI_OPCODE_DP4:
   case TGSI_OPCODE_DPH:
   case TGSI_OPCODE_KILL_IF: /* WriteMask ignored */
      return 0xf;
   case TGSI_OPCODE_DST:
      return mask & (s ? 0xa : 0x6);
   case TGSI_OPCODE_EX2:
   case TGSI_OPCODE_EXP:
   case TGSI_OPCODE_LG2:
   case TGSI_OPCODE_LOG:
   case TGSI_OPCODE_POW:
   case TGSI_OPCODE_RCP:
   case TGSI_OPCODE_RSQ:
   case TGSI_OPCODE_SCS:
      return 0x1;
   case TGSI_OPCODE_IF:
   case TGSI_OPCODE_UIF:
      return 0x1;
   case TGSI_OPCODE_LIT:
      return 0xb;
   case TGSI_OPCODE_TEX2:
   case TGSI_OPCODE_TXB2:
   case TGSI_OPCODE_TXL2:
      // second source only carries 2 extra components
      return (s == 0) ? 0xf : 0x3;
   case TGSI_OPCODE_TEX:
   case TGSI_OPCODE_TXB:
   case TGSI_OPCODE_TXD:
   case TGSI_OPCODE_TXL:
   case TGSI_OPCODE_TXP:
   {
      const struct tgsi_instruction_texture *tex = &insn->Texture;

      assert(insn->Instruction.Texture);

      mask = 0x7;
      if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
          insn->Instruction.Opcode != TGSI_OPCODE_TXD)
         mask |= 0x8; /* bias, lod or proj */

      // trim/extend the coordinate mask per texture target
      switch (tex->Texture) {
      case TGSI_TEXTURE_1D:
         mask &= 0x9;
         break;
      case TGSI_TEXTURE_SHADOW1D:
         mask &= 0xd;
         break;
      case TGSI_TEXTURE_1D_ARRAY:
      case TGSI_TEXTURE_2D:
      case TGSI_TEXTURE_RECT:
         mask &= 0xb;
         break;
      case TGSI_TEXTURE_CUBE_ARRAY:
      case TGSI_TEXTURE_SHADOW2D_ARRAY:
      case TGSI_TEXTURE_SHADOWCUBE:
      case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
         mask |= 0x8;
         break;
      default:
         break;
      }
   }
      return mask;
   case TGSI_OPCODE_XPD:
   {
      // cross product: each written component reads the other two lanes
      unsigned int x = 0;
      if (mask & 1) x |= 0x6;
      if (mask & 2) x |= 0x5;
      if (mask & 4) x |= 0x3;
      return x;
   }
   default:
      break;
   }

   return mask;
}

// Translate TGSI Absolute/Negate bits into an nv50_ir source modifier.
// NOTE(review): the @chan parameter is ignored — modifiers apply to the
// whole register in TGSI, but confirm this was intended.
nv50_ir::Modifier Instruction::SrcRegister::getMod(int chan) const
{
   nv50_ir::Modifier m(0);

   if (reg.Absolute)
      m = m | nv50_ir::Modifier(NV50_IR_MOD_ABS);
   if (reg.Negate)
      m = m | nv50_ir::Modifier(NV50_IR_MOD_NEG);
   return m;
}

// Map a TGSI register file onto the corresponding nv50_ir data file;
// unknown/unsupported files fall through to FILE_NULL.
static nv50_ir::DataFile
translateFile(uint file)
{
   switch (file) {
   case TGSI_FILE_CONSTANT:     return nv50_ir::FILE_MEMORY_CONST;
   case TGSI_FILE_INPUT:        return nv50_ir::FILE_SHADER_INPUT;
   case TGSI_FILE_OUTPUT:       return nv50_ir::FILE_SHADER_OUTPUT;
   case TGSI_FILE_TEMPORARY:    return nv50_ir::FILE_GPR;
   case TGSI_FILE_ADDRESS:      return nv50_ir::FILE_ADDRESS;
   case TGSI_FILE_PREDICATE:    return nv50_ir::FILE_PREDICATE;
   case TGSI_FILE_IMMEDIATE:    return nv50_ir::FILE_IMMEDIATE;
   case TGSI_FILE_SYSTEM_VALUE: return nv50_ir::FILE_SYSTEM_VALUE;
   case TGSI_FILE_RESOURCE:     return nv50_ir::FILE_MEMORY_GLOBAL;
   case TGSI_FILE_SAMPLER:
   case TGSI_FILE_NULL:
   default:
      return nv50_ir::FILE_NULL;
   }
}

// Map a TGSI system-value semantic onto an nv50_ir system value.
static nv50_ir::SVSemantic translateSysVal(uint sysval)
{
   switch (sysval) {
   case TGSI_SEMANTIC_FACE:       return nv50_ir::SV_FACE;
   case TGSI_SEMANTIC_PSIZE:      return nv50_ir::SV_POINT_SIZE;
   case TGSI_SEMANTIC_PRIMID:     return nv50_ir::SV_PRIMITIVE_ID;
   case TGSI_SEMANTIC_INSTANCEID: return nv50_ir::SV_INSTANCE_ID;
   case TGSI_SEMANTIC_VERTEXID:   return nv50_ir::SV_VERTEX_ID;
   case TGSI_SEMANTIC_GRID_SIZE:  return nv50_ir::SV_NCTAID;
   case TGSI_SEMANTIC_BLOCK_ID:   return nv50_ir::SV_CTAID;
   case TGSI_SEMANTIC_BLOCK_SIZE: return nv50_ir::SV_NTID;
   case TGSI_SEMANTIC_THREAD_ID:  return nv50_ir::SV_TID;
   default:
      assert(0);
      return nv50_ir::SV_CLOCK;
   }
}

#define NV50_IR_TEX_TARG_CASE(a, b) \
   case TGSI_TEXTURE_##a: return nv50_ir::TEX_TARGET_##b;

// Map a TGSI texture target onto an nv50_ir texture target.
static nv50_ir::TexTarget translateTexture(uint tex)
{
   switch (tex) {
   NV50_IR_TEX_TARG_CASE(1D, 1D);
   NV50_IR_TEX_TARG_CASE(2D, 2D);
   NV50_IR_TEX_TARG_CASE(2D_MSAA, 2D_MS);
   NV50_IR_TEX_TARG_CASE(3D, 3D);
   NV50_IR_TEX_TARG_CASE(CUBE, CUBE);
   NV50_IR_TEX_TARG_CASE(RECT, RECT);
   NV50_IR_TEX_TARG_CASE(1D_ARRAY, 1D_ARRAY);
   NV50_IR_TEX_TARG_CASE(2D_ARRAY, 2D_ARRAY);
   NV50_IR_TEX_TARG_CASE(2D_ARRAY_MSAA, 2D_MS_ARRAY);
   NV50_IR_TEX_TARG_CASE(CUBE_ARRAY, CUBE_ARRAY);
   NV50_IR_TEX_TARG_CASE(SHADOW1D, 1D_SHADOW);
NV50_IR_TEX_TARG_CASE(SHADOW2D, 2D_SHADOW); + NV50_IR_TEX_TARG_CASE(SHADOWCUBE, CUBE_SHADOW); + NV50_IR_TEX_TARG_CASE(SHADOWRECT, RECT_SHADOW); + NV50_IR_TEX_TARG_CASE(SHADOW1D_ARRAY, 1D_ARRAY_SHADOW); + NV50_IR_TEX_TARG_CASE(SHADOW2D_ARRAY, 2D_ARRAY_SHADOW); + NV50_IR_TEX_TARG_CASE(SHADOWCUBE_ARRAY, CUBE_ARRAY_SHADOW); + NV50_IR_TEX_TARG_CASE(BUFFER, BUFFER); + + case TGSI_TEXTURE_UNKNOWN: + default: + assert(!"invalid texture target"); + return nv50_ir::TEX_TARGET_2D; + } +} + +nv50_ir::DataType Instruction::inferSrcType() const +{ + switch (getOpcode()) { + case TGSI_OPCODE_UIF: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_XOR: + case TGSI_OPCODE_NOT: + case TGSI_OPCODE_U2F: + case TGSI_OPCODE_UADD: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + case TGSI_OPCODE_USHR: + case TGSI_OPCODE_UCMP: + case TGSI_OPCODE_ATOMUADD: + case TGSI_OPCODE_ATOMXCHG: + case TGSI_OPCODE_ATOMCAS: + case TGSI_OPCODE_ATOMAND: + case TGSI_OPCODE_ATOMOR: + case TGSI_OPCODE_ATOMXOR: + case TGSI_OPCODE_ATOMUMIN: + case TGSI_OPCODE_ATOMUMAX: + return nv50_ir::TYPE_U32; + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_IABS: + case TGSI_OPCODE_INEG: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_ISSG: + case TGSI_OPCODE_SAD: // not sure about SAD, but no one has a float version + case TGSI_OPCODE_MOD: + case TGSI_OPCODE_UARL: + case TGSI_OPCODE_ATOMIMIN: + case TGSI_OPCODE_ATOMIMAX: + return nv50_ir::TYPE_S32; + default: + return nv50_ir::TYPE_F32; + } +} + +nv50_ir::DataType Instruction::inferDstType() const +{ + switch (getOpcode()) { + case TGSI_OPCODE_F2U: return nv50_ir::TYPE_U32; + case TGSI_OPCODE_F2I: return nv50_ir::TYPE_S32; + case 
TGSI_OPCODE_FSEQ: + case TGSI_OPCODE_FSGE: + case TGSI_OPCODE_FSLT: + case TGSI_OPCODE_FSNE: + return nv50_ir::TYPE_U32; + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_U2F: + return nv50_ir::TYPE_F32; + default: + return inferSrcType(); + } +} + +nv50_ir::CondCode Instruction::getSetCond() const +{ + using namespace nv50_ir; + + switch (getOpcode()) { + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_FSLT: + return CC_LT; + case TGSI_OPCODE_SLE: + return CC_LE; + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_FSGE: + return CC_GE; + case TGSI_OPCODE_SGT: + return CC_GT; + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_FSEQ: + return CC_EQ; + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_FSNE: + return CC_NEU; + case TGSI_OPCODE_USNE: + return CC_NE; + case TGSI_OPCODE_SFL: + return CC_NEVER; + case TGSI_OPCODE_STR: + default: + return CC_ALWAYS; + } +} + +#define NV50_IR_OPCODE_CASE(a, b) case TGSI_OPCODE_##a: return nv50_ir::OP_##b + +static nv50_ir::operation translateOpcode(uint opcode) +{ + switch (opcode) { + NV50_IR_OPCODE_CASE(ARL, SHL); + NV50_IR_OPCODE_CASE(MOV, MOV); + + NV50_IR_OPCODE_CASE(RCP, RCP); + NV50_IR_OPCODE_CASE(RSQ, RSQ); + + NV50_IR_OPCODE_CASE(MUL, MUL); + NV50_IR_OPCODE_CASE(ADD, ADD); + + NV50_IR_OPCODE_CASE(MIN, MIN); + NV50_IR_OPCODE_CASE(MAX, MAX); + NV50_IR_OPCODE_CASE(SLT, SET); + NV50_IR_OPCODE_CASE(SGE, SET); + NV50_IR_OPCODE_CASE(MAD, MAD); + NV50_IR_OPCODE_CASE(SUB, SUB); + + NV50_IR_OPCODE_CASE(FLR, FLOOR); + NV50_IR_OPCODE_CASE(ROUND, CVT); + NV50_IR_OPCODE_CASE(EX2, EX2); + NV50_IR_OPCODE_CASE(LG2, LG2); + NV50_IR_OPCODE_CASE(POW, POW); + + NV50_IR_OPCODE_CASE(ABS, ABS); + + NV50_IR_OPCODE_CASE(COS, COS); + NV50_IR_OPCODE_CASE(DDX, DFDX); + NV50_IR_OPCODE_CASE(DDY, DFDY); + NV50_IR_OPCODE_CASE(KILL, DISCARD); + + NV50_IR_OPCODE_CASE(SEQ, SET); + NV50_IR_OPCODE_CASE(SFL, SET); + NV50_IR_OPCODE_CASE(SGT, SET); + 
NV50_IR_OPCODE_CASE(SIN, SIN); + NV50_IR_OPCODE_CASE(SLE, SET); + NV50_IR_OPCODE_CASE(SNE, SET); + NV50_IR_OPCODE_CASE(STR, SET); + NV50_IR_OPCODE_CASE(TEX, TEX); + NV50_IR_OPCODE_CASE(TXD, TXD); + NV50_IR_OPCODE_CASE(TXP, TEX); + + NV50_IR_OPCODE_CASE(BRA, BRA); + NV50_IR_OPCODE_CASE(CAL, CALL); + NV50_IR_OPCODE_CASE(RET, RET); + NV50_IR_OPCODE_CASE(CMP, SLCT); + + NV50_IR_OPCODE_CASE(TXB, TXB); + + NV50_IR_OPCODE_CASE(DIV, DIV); + + NV50_IR_OPCODE_CASE(TXL, TXL); + + NV50_IR_OPCODE_CASE(CEIL, CEIL); + NV50_IR_OPCODE_CASE(I2F, CVT); + NV50_IR_OPCODE_CASE(NOT, NOT); + NV50_IR_OPCODE_CASE(TRUNC, TRUNC); + NV50_IR_OPCODE_CASE(SHL, SHL); + + NV50_IR_OPCODE_CASE(AND, AND); + NV50_IR_OPCODE_CASE(OR, OR); + NV50_IR_OPCODE_CASE(MOD, MOD); + NV50_IR_OPCODE_CASE(XOR, XOR); + NV50_IR_OPCODE_CASE(SAD, SAD); + NV50_IR_OPCODE_CASE(TXF, TXF); + NV50_IR_OPCODE_CASE(TXQ, TXQ); + + NV50_IR_OPCODE_CASE(EMIT, EMIT); + NV50_IR_OPCODE_CASE(ENDPRIM, RESTART); + + NV50_IR_OPCODE_CASE(KILL_IF, DISCARD); + + NV50_IR_OPCODE_CASE(F2I, CVT); + NV50_IR_OPCODE_CASE(FSEQ, SET); + NV50_IR_OPCODE_CASE(FSGE, SET); + NV50_IR_OPCODE_CASE(FSLT, SET); + NV50_IR_OPCODE_CASE(FSNE, SET); + NV50_IR_OPCODE_CASE(IDIV, DIV); + NV50_IR_OPCODE_CASE(IMAX, MAX); + NV50_IR_OPCODE_CASE(IMIN, MIN); + NV50_IR_OPCODE_CASE(IABS, ABS); + NV50_IR_OPCODE_CASE(INEG, NEG); + NV50_IR_OPCODE_CASE(ISGE, SET); + NV50_IR_OPCODE_CASE(ISHR, SHR); + NV50_IR_OPCODE_CASE(ISLT, SET); + NV50_IR_OPCODE_CASE(F2U, CVT); + NV50_IR_OPCODE_CASE(U2F, CVT); + NV50_IR_OPCODE_CASE(UADD, ADD); + NV50_IR_OPCODE_CASE(UDIV, DIV); + NV50_IR_OPCODE_CASE(UMAD, MAD); + NV50_IR_OPCODE_CASE(UMAX, MAX); + NV50_IR_OPCODE_CASE(UMIN, MIN); + NV50_IR_OPCODE_CASE(UMOD, MOD); + NV50_IR_OPCODE_CASE(UMUL, MUL); + NV50_IR_OPCODE_CASE(USEQ, SET); + NV50_IR_OPCODE_CASE(USGE, SET); + NV50_IR_OPCODE_CASE(USHR, SHR); + NV50_IR_OPCODE_CASE(USLT, SET); + NV50_IR_OPCODE_CASE(USNE, SET); + + NV50_IR_OPCODE_CASE(SAMPLE, TEX); + NV50_IR_OPCODE_CASE(SAMPLE_B, TXB); + 
NV50_IR_OPCODE_CASE(SAMPLE_C, TEX); + NV50_IR_OPCODE_CASE(SAMPLE_C_LZ, TEX); + NV50_IR_OPCODE_CASE(SAMPLE_D, TXD); + NV50_IR_OPCODE_CASE(SAMPLE_L, TXL); + NV50_IR_OPCODE_CASE(SAMPLE_I, TXF); + NV50_IR_OPCODE_CASE(SAMPLE_I_MS, TXF); + NV50_IR_OPCODE_CASE(GATHER4, TXG); + NV50_IR_OPCODE_CASE(SVIEWINFO, TXQ); + + NV50_IR_OPCODE_CASE(ATOMUADD, ATOM); + NV50_IR_OPCODE_CASE(ATOMXCHG, ATOM); + NV50_IR_OPCODE_CASE(ATOMCAS, ATOM); + NV50_IR_OPCODE_CASE(ATOMAND, ATOM); + NV50_IR_OPCODE_CASE(ATOMOR, ATOM); + NV50_IR_OPCODE_CASE(ATOMXOR, ATOM); + NV50_IR_OPCODE_CASE(ATOMUMIN, ATOM); + NV50_IR_OPCODE_CASE(ATOMUMAX, ATOM); + NV50_IR_OPCODE_CASE(ATOMIMIN, ATOM); + NV50_IR_OPCODE_CASE(ATOMIMAX, ATOM); + + NV50_IR_OPCODE_CASE(TEX2, TEX); + NV50_IR_OPCODE_CASE(TXB2, TXB); + NV50_IR_OPCODE_CASE(TXL2, TXL); + + NV50_IR_OPCODE_CASE(END, EXIT); + + default: + return nv50_ir::OP_NOP; + } +} + +static uint16_t opcodeToSubOp(uint opcode) +{ + switch (opcode) { + case TGSI_OPCODE_LFENCE: return NV50_IR_SUBOP_MEMBAR(L, GL); + case TGSI_OPCODE_SFENCE: return NV50_IR_SUBOP_MEMBAR(S, GL); + case TGSI_OPCODE_MFENCE: return NV50_IR_SUBOP_MEMBAR(M, GL); + case TGSI_OPCODE_ATOMUADD: return NV50_IR_SUBOP_ATOM_ADD; + case TGSI_OPCODE_ATOMXCHG: return NV50_IR_SUBOP_ATOM_EXCH; + case TGSI_OPCODE_ATOMCAS: return NV50_IR_SUBOP_ATOM_CAS; + case TGSI_OPCODE_ATOMAND: return NV50_IR_SUBOP_ATOM_AND; + case TGSI_OPCODE_ATOMOR: return NV50_IR_SUBOP_ATOM_OR; + case TGSI_OPCODE_ATOMXOR: return NV50_IR_SUBOP_ATOM_XOR; + case TGSI_OPCODE_ATOMUMIN: return NV50_IR_SUBOP_ATOM_MIN; + case TGSI_OPCODE_ATOMIMIN: return NV50_IR_SUBOP_ATOM_MIN; + case TGSI_OPCODE_ATOMUMAX: return NV50_IR_SUBOP_ATOM_MAX; + case TGSI_OPCODE_ATOMIMAX: return NV50_IR_SUBOP_ATOM_MAX; + default: + return 0; + } +} + +bool Instruction::checkDstSrcAliasing() const +{ + if (insn->Dst[0].Register.Indirect) // no danger if indirect, using memory + return false; + + for (int s = 0; s < TGSI_FULL_MAX_SRC_REGISTERS; ++s) { + if 
(insn->Src[s].Register.File == TGSI_FILE_NULL) + break; + if (insn->Src[s].Register.File == insn->Dst[0].Register.File && + insn->Src[s].Register.Index == insn->Dst[0].Register.Index) + return true; + } + return false; +} + +class Source +{ +public: + Source(struct nv50_ir_prog_info *); + ~Source(); + +public: + bool scanSource(); + unsigned fileSize(unsigned file) const { return scan.file_max[file] + 1; } + +public: + struct tgsi_shader_info scan; + struct tgsi_full_instruction *insns; + const struct tgsi_token *tokens; + struct nv50_ir_prog_info *info; + + nv50_ir::DynArray tempArrays; + nv50_ir::DynArray immdArrays; + + typedef nv50_ir::BuildUtil::Location Location; + // these registers are per-subroutine, cannot be used for parameter passing + std::set<Location> locals; + + bool mainTempsInLMem; + + int clipVertexOutput; + + struct TextureView { + uint8_t target; // TGSI_TEXTURE_* + }; + std::vector<TextureView> textureViews; + + struct Resource { + uint8_t target; // TGSI_TEXTURE_* + bool raw; + uint8_t slot; // $surface index + }; + std::vector<Resource> resources; + +private: + int inferSysValDirection(unsigned sn) const; + bool scanDeclaration(const struct tgsi_full_declaration *); + bool scanInstruction(const struct tgsi_full_instruction *); + void scanProperty(const struct tgsi_full_property *); + void scanImmediate(const struct tgsi_full_immediate *); + + inline bool isEdgeFlagPassthrough(const Instruction&) const; +}; + +Source::Source(struct nv50_ir_prog_info *prog) : info(prog) +{ + tokens = (const struct tgsi_token *)info->bin.source; + + if (prog->dbgFlags & NV50_IR_DEBUG_BASIC) + tgsi_dump(tokens, 0); + + mainTempsInLMem = FALSE; +} + +Source::~Source() +{ + if (insns) + FREE(insns); + + if (info->immd.data) + FREE(info->immd.data); + if (info->immd.type) + FREE(info->immd.type); +} + +bool Source::scanSource() +{ + unsigned insnCount = 0; + struct tgsi_parse_context parse; + + tgsi_scan_shader(tokens, &scan); + + insns = (struct 
tgsi_full_instruction *)MALLOC(scan.num_instructions * + sizeof(insns[0])); + if (!insns) + return false; + + clipVertexOutput = -1; + + textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1); + resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1); + + info->immd.bufSize = 0; + + info->numInputs = scan.file_max[TGSI_FILE_INPUT] + 1; + info->numOutputs = scan.file_max[TGSI_FILE_OUTPUT] + 1; + info->numSysVals = scan.file_max[TGSI_FILE_SYSTEM_VALUE] + 1; + + if (info->type == PIPE_SHADER_FRAGMENT) { + info->prop.fp.writesDepth = scan.writes_z; + info->prop.fp.usesDiscard = scan.uses_kill; + } else + if (info->type == PIPE_SHADER_GEOMETRY) { + info->prop.gp.instanceCount = 1; // default value + } + + info->immd.data = (uint32_t *)MALLOC(scan.immediate_count * 16); + info->immd.type = (ubyte *)MALLOC(scan.immediate_count * sizeof(ubyte)); + + tgsi_parse_init(&parse, tokens); + while (!tgsi_parse_end_of_tokens(&parse)) { + tgsi_parse_token(&parse); + + switch (parse.FullToken.Token.Type) { + case TGSI_TOKEN_TYPE_IMMEDIATE: + scanImmediate(&parse.FullToken.FullImmediate); + break; + case TGSI_TOKEN_TYPE_DECLARATION: + scanDeclaration(&parse.FullToken.FullDeclaration); + break; + case TGSI_TOKEN_TYPE_INSTRUCTION: + insns[insnCount++] = parse.FullToken.FullInstruction; + scanInstruction(&parse.FullToken.FullInstruction); + break; + case TGSI_TOKEN_TYPE_PROPERTY: + scanProperty(&parse.FullToken.FullProperty); + break; + default: + INFO("unknown TGSI token type: %d\n", parse.FullToken.Token.Type); + break; + } + } + tgsi_parse_free(&parse); + + if (mainTempsInLMem) + info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16; + + if (info->io.genUserClip > 0) { + info->io.clipDistanceMask = (1 << info->io.genUserClip) - 1; + + const unsigned int nOut = (info->io.genUserClip + 3) / 4; + + for (unsigned int n = 0; n < nOut; ++n) { + unsigned int i = info->numOutputs++; + info->out[i].id = i; + info->out[i].sn = TGSI_SEMANTIC_CLIPDIST; + info->out[i].si = n; 
+ info->out[i].mask = info->io.clipDistanceMask >> (n * 4); + } + } + + return info->assignSlots(info) == 0; +} + +void Source::scanProperty(const struct tgsi_full_property *prop) +{ + switch (prop->Property.PropertyName) { + case TGSI_PROPERTY_GS_OUTPUT_PRIM: + info->prop.gp.outputPrim = prop->u[0].Data; + break; + case TGSI_PROPERTY_GS_INPUT_PRIM: + info->prop.gp.inputPrim = prop->u[0].Data; + break; + case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES: + info->prop.gp.maxVertices = prop->u[0].Data; + break; +#if 0 + case TGSI_PROPERTY_GS_INSTANCE_COUNT: + info->prop.gp.instanceCount = prop->u[0].Data; + break; +#endif + case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS: + info->prop.fp.separateFragData = TRUE; + break; + case TGSI_PROPERTY_FS_COORD_ORIGIN: + case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER: + // we don't care + break; + case TGSI_PROPERTY_VS_PROHIBIT_UCPS: + info->io.genUserClip = -1; + break; + default: + INFO("unhandled TGSI property %d\n", prop->Property.PropertyName); + break; + } +} + +void Source::scanImmediate(const struct tgsi_full_immediate *imm) +{ + const unsigned n = info->immd.count++; + + assert(n < scan.immediate_count); + + for (int c = 0; c < 4; ++c) + info->immd.data[n * 4 + c] = imm->u[c].Uint; + + info->immd.type[n] = imm->Immediate.DataType; +} + +int Source::inferSysValDirection(unsigned sn) const +{ + switch (sn) { + case TGSI_SEMANTIC_INSTANCEID: + case TGSI_SEMANTIC_VERTEXID: + return 1; +#if 0 + case TGSI_SEMANTIC_LAYER: + case TGSI_SEMANTIC_VIEWPORTINDEX: + return 0; +#endif + case TGSI_SEMANTIC_PRIMID: + return (info->type == PIPE_SHADER_FRAGMENT) ? 
1 : 0; + default: + return 0; + } +} + +bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) +{ + unsigned i, c; + unsigned sn = TGSI_SEMANTIC_GENERIC; + unsigned si = 0; + const unsigned first = decl->Range.First, last = decl->Range.Last; + + if (decl->Declaration.Semantic) { + sn = decl->Semantic.Name; + si = decl->Semantic.Index; + } + + if (decl->Declaration.Local) { + for (i = first; i <= last; ++i) { + for (c = 0; c < 4; ++c) { + locals.insert( + Location(decl->Declaration.File, decl->Dim.Index2D, i, c)); + } + } + } + + switch (decl->Declaration.File) { + case TGSI_FILE_INPUT: + if (info->type == PIPE_SHADER_VERTEX) { + // all vertex attributes are equal + for (i = first; i <= last; ++i) { + info->in[i].sn = TGSI_SEMANTIC_GENERIC; + info->in[i].si = i; + } + } else { + for (i = first; i <= last; ++i, ++si) { + info->in[i].id = i; + info->in[i].sn = sn; + info->in[i].si = si; + if (info->type == PIPE_SHADER_FRAGMENT) { + // translate interpolation mode + switch (decl->Interp.Interpolate) { + case TGSI_INTERPOLATE_CONSTANT: + info->in[i].flat = 1; + break; + case TGSI_INTERPOLATE_COLOR: + info->in[i].sc = 1; + break; + case TGSI_INTERPOLATE_LINEAR: + info->in[i].linear = 1; + break; + default: + break; + } + if (decl->Interp.Centroid) + info->in[i].centroid = 1; + } + } + } + break; + case TGSI_FILE_OUTPUT: + for (i = first; i <= last; ++i, ++si) { + switch (sn) { + case TGSI_SEMANTIC_POSITION: + if (info->type == PIPE_SHADER_FRAGMENT) + info->io.fragDepth = i; + else + if (clipVertexOutput < 0) + clipVertexOutput = i; + break; + case TGSI_SEMANTIC_COLOR: + if (info->type == PIPE_SHADER_FRAGMENT) + info->prop.fp.numColourResults++; + break; + case TGSI_SEMANTIC_EDGEFLAG: + info->io.edgeFlagOut = i; + break; + case TGSI_SEMANTIC_CLIPVERTEX: + clipVertexOutput = i; + break; + case TGSI_SEMANTIC_CLIPDIST: + info->io.clipDistanceMask |= + decl->Declaration.UsageMask << (si * 4); + info->io.genUserClip = -1; + break; + default: + break; + } + 
info->out[i].id = i; + info->out[i].sn = sn; + info->out[i].si = si; + } + break; + case TGSI_FILE_SYSTEM_VALUE: + switch (sn) { + case TGSI_SEMANTIC_INSTANCEID: + info->io.instanceId = first; + break; + case TGSI_SEMANTIC_VERTEXID: + info->io.vertexId = first; + break; + default: + break; + } + for (i = first; i <= last; ++i, ++si) { + info->sv[i].sn = sn; + info->sv[i].si = si; + info->sv[i].input = inferSysValDirection(sn); + } + break; + case TGSI_FILE_RESOURCE: + for (i = first; i <= last; ++i) { + resources[i].target = decl->Resource.Resource; + resources[i].raw = decl->Resource.Raw; + resources[i].slot = i; + } + break; + case TGSI_FILE_SAMPLER_VIEW: + for (i = first; i <= last; ++i) + textureViews[i].target = decl->SamplerView.Resource; + break; + case TGSI_FILE_NULL: + case TGSI_FILE_TEMPORARY: + case TGSI_FILE_ADDRESS: + case TGSI_FILE_CONSTANT: + case TGSI_FILE_IMMEDIATE: + case TGSI_FILE_PREDICATE: + case TGSI_FILE_SAMPLER: + break; + default: + ERROR("unhandled TGSI_FILE %d\n", decl->Declaration.File); + return false; + } + return true; +} + +inline bool Source::isEdgeFlagPassthrough(const Instruction& insn) const +{ + return insn.getOpcode() == TGSI_OPCODE_MOV && + insn.getDst(0).getIndex(0) == info->io.edgeFlagOut && + insn.getSrc(0).getFile() == TGSI_FILE_INPUT; +} + +bool Source::scanInstruction(const struct tgsi_full_instruction *inst) +{ + Instruction insn(inst); + + if (insn.getOpcode() == TGSI_OPCODE_BARRIER) + info->numBarriers = 1; + + if (insn.dstCount()) { + if (insn.getDst(0).getFile() == TGSI_FILE_OUTPUT) { + Instruction::DstRegister dst = insn.getDst(0); + + if (dst.isIndirect(0)) + for (unsigned i = 0; i < info->numOutputs; ++i) + info->out[i].mask = 0xf; + else + info->out[dst.getIndex(0)].mask |= dst.getMask(); + + if (info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_PSIZE || + info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_PRIMID || + info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_FOG) + info->out[dst.getIndex(0)].mask &= 1; + + if 
(isEdgeFlagPassthrough(insn)) + info->io.edgeFlagIn = insn.getSrc(0).getIndex(0); + } else + if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) { + if (insn.getDst(0).isIndirect(0)) + mainTempsInLMem = TRUE; + } + } + + for (unsigned s = 0; s < insn.srcCount(); ++s) { + Instruction::SrcRegister src = insn.getSrc(s); + if (src.getFile() == TGSI_FILE_TEMPORARY) { + if (src.isIndirect(0)) + mainTempsInLMem = TRUE; + } else + if (src.getFile() == TGSI_FILE_RESOURCE) { + if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL) + info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ? + 0x1 : 0x2; + } + if (src.getFile() != TGSI_FILE_INPUT) + continue; + unsigned mask = insn.srcMask(s); + + if (src.isIndirect(0)) { + for (unsigned i = 0; i < info->numInputs; ++i) + info->in[i].mask = 0xf; + } else { + const int i = src.getIndex(0); + for (unsigned c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) + continue; + int k = src.getSwizzle(c); + if (k <= TGSI_SWIZZLE_W) + info->in[i].mask |= 1 << k; + } + switch (info->in[i].sn) { + case TGSI_SEMANTIC_PSIZE: + case TGSI_SEMANTIC_PRIMID: + case TGSI_SEMANTIC_FOG: + info->in[i].mask &= 0x1; + break; + case TGSI_SEMANTIC_PCOORD: + info->in[i].mask &= 0x3; + break; + default: + break; + } + } + } + return true; +} + +nv50_ir::TexInstruction::Target +Instruction::getTexture(const tgsi::Source *code, int s) const +{ + // XXX: indirect access + unsigned int r; + + switch (getSrc(s).getFile()) { + case TGSI_FILE_RESOURCE: + r = getSrc(s).getIndex(0); + return translateTexture(code->resources.at(r).target); + case TGSI_FILE_SAMPLER_VIEW: + r = getSrc(s).getIndex(0); + return translateTexture(code->textureViews.at(r).target); + default: + return translateTexture(insn->Texture.Texture); + } +} + +} // namespace tgsi + +namespace { + +using namespace nv50_ir; + +class Converter : public BuildUtil +{ +public: + Converter(Program *, const tgsi::Source *); + ~Converter(); + + bool run(); + +private: + struct Subroutine + { + Subroutine(Function 
*f) : f(f) { } + Function *f; + ValueMap values; + }; + + Value *getVertexBase(int s); + DataArray *getArrayForFile(unsigned file, int idx); + Value *fetchSrc(int s, int c); + Value *acquireDst(int d, int c); + void storeDst(int d, int c, Value *); + + Value *fetchSrc(const tgsi::Instruction::SrcRegister src, int c, Value *ptr); + void storeDst(const tgsi::Instruction::DstRegister dst, int c, + Value *val, Value *ptr); + + Value *applySrcMod(Value *, int s, int c); + + Symbol *makeSym(uint file, int fileIndex, int idx, int c, uint32_t addr); + Symbol *srcToSym(tgsi::Instruction::SrcRegister, int c); + Symbol *dstToSym(tgsi::Instruction::DstRegister, int c); + + bool handleInstruction(const struct tgsi_full_instruction *); + void exportOutputs(); + inline Subroutine *getSubroutine(unsigned ip); + inline Subroutine *getSubroutine(Function *); + inline bool isEndOfSubroutine(uint ip); + + void loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask); + + // R,S,L,C,Dx,Dy encode TGSI sources for respective values (0xSf for auto) + void setTexRS(TexInstruction *, unsigned int& s, int R, int S); + void handleTEX(Value *dst0[4], int R, int S, int L, int C, int Dx, int Dy); + void handleTXF(Value *dst0[4], int R, int L_M); + void handleTXQ(Value *dst0[4], enum TexQuery); + void handleLIT(Value *dst0[4]); + void handleUserClipPlanes(); + + Symbol *getResourceBase(int r); + void getResourceCoords(std::vector<Value *>&, int r, int s); + + void handleLOAD(Value *dst0[4]); + void handleSTORE(); + void handleATOM(Value *dst0[4], DataType, uint16_t subOp); + + Value *interpolate(tgsi::Instruction::SrcRegister, int c, Value *ptr); + + void insertConvergenceOps(BasicBlock *conv, BasicBlock *fork); + + Value *buildDot(int dim); + + class BindArgumentsPass : public Pass { + public: + BindArgumentsPass(Converter &conv) : conv(conv) { } + + private: + Converter &conv; + Subroutine *sub; + + inline const Location *getValueLocation(Subroutine *, Value *); + + template<typename 
T> inline void + updateCallArgs(Instruction *i, void (Instruction::*setArg)(int, Value *), + T (Function::*proto)); + + template<typename T> inline void + updatePrototype(BitSet *set, void (Function::*updateSet)(), + T (Function::*proto)); + + protected: + bool visit(Function *); + bool visit(BasicBlock *bb) { return false; } + }; + +private: + const struct tgsi::Source *code; + const struct nv50_ir_prog_info *info; + + struct { + std::map<unsigned, Subroutine> map; + Subroutine *cur; + } sub; + + uint ip; // instruction pointer + + tgsi::Instruction tgsi; + + DataType dstTy; + DataType srcTy; + + DataArray tData; // TGSI_FILE_TEMPORARY + DataArray aData; // TGSI_FILE_ADDRESS + DataArray pData; // TGSI_FILE_PREDICATE + DataArray oData; // TGSI_FILE_OUTPUT (if outputs in registers) + + Value *zero; + Value *fragCoord[4]; + Value *clipVtx[4]; + + Value *vtxBase[5]; // base address of vertex in primitive (for TP/GP) + uint8_t vtxBaseValid; + + Stack condBBs; // fork BB, then else clause BB + Stack joinBBs; // fork BB, for inserting join ops on ENDIF + Stack loopBBs; // loop headers + Stack breakBBs; // end of / after loop +}; + +Symbol * +Converter::srcToSym(tgsi::Instruction::SrcRegister src, int c) +{ + const int swz = src.getSwizzle(c); + + return makeSym(src.getFile(), + src.is2D() ? src.getIndex(1) : 0, + src.isIndirect(0) ? -1 : src.getIndex(0), swz, + src.getIndex(0) * 16 + swz * 4); +} + +Symbol * +Converter::dstToSym(tgsi::Instruction::DstRegister dst, int c) +{ + return makeSym(dst.getFile(), + dst.is2D() ? dst.getIndex(1) : 0, + dst.isIndirect(0) ? 
-1 : dst.getIndex(0), c, + dst.getIndex(0) * 16 + c * 4); +} + +Symbol * +Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address) +{ + Symbol *sym = new_Symbol(prog, tgsi::translateFile(tgsiFile)); + + sym->reg.fileIndex = fileIdx; + + if (idx >= 0) { + if (sym->reg.file == FILE_SHADER_INPUT) + sym->setOffset(info->in[idx].slot[c] * 4); + else + if (sym->reg.file == FILE_SHADER_OUTPUT) + sym->setOffset(info->out[idx].slot[c] * 4); + else + if (sym->reg.file == FILE_SYSTEM_VALUE) + sym->setSV(tgsi::translateSysVal(info->sv[idx].sn), c); + else + sym->setOffset(address); + } else { + sym->setOffset(address); + } + return sym; +} + +static inline uint8_t +translateInterpMode(const struct nv50_ir_varying *var, operation& op) +{ + uint8_t mode = NV50_IR_INTERP_PERSPECTIVE; + + if (var->flat) + mode = NV50_IR_INTERP_FLAT; + else + if (var->linear) + mode = NV50_IR_INTERP_LINEAR; + else + if (var->sc) + mode = NV50_IR_INTERP_SC; + + op = (mode == NV50_IR_INTERP_PERSPECTIVE || mode == NV50_IR_INTERP_SC) + ? OP_PINTERP : OP_LINTERP; + + if (var->centroid) + mode |= NV50_IR_INTERP_CENTROID; + + return mode; +} + +Value * +Converter::interpolate(tgsi::Instruction::SrcRegister src, int c, Value *ptr) +{ + operation op; + + // XXX: no way to know interpolation mode if we don't know what's accessed + const uint8_t mode = translateInterpMode(&info->in[ptr ? 
0 : + src.getIndex(0)], op); + + Instruction *insn = new_Instruction(func, op, TYPE_F32); + + insn->setDef(0, getScratch()); + insn->setSrc(0, srcToSym(src, c)); + if (op == OP_PINTERP) + insn->setSrc(1, fragCoord[3]); + if (ptr) + insn->setIndirect(0, 0, ptr); + + insn->setInterpolate(mode); + + bb->insertTail(insn); + return insn->getDef(0); +} + +Value * +Converter::applySrcMod(Value *val, int s, int c) +{ + Modifier m = tgsi.getSrc(s).getMod(c); + DataType ty = tgsi.inferSrcType(); + + if (m & Modifier(NV50_IR_MOD_ABS)) + val = mkOp1v(OP_ABS, ty, getScratch(), val); + + if (m & Modifier(NV50_IR_MOD_NEG)) + val = mkOp1v(OP_NEG, ty, getScratch(), val); + + return val; +} + +Value * +Converter::getVertexBase(int s) +{ + assert(s < 5); + if (!(vtxBaseValid & (1 << s))) { + const int index = tgsi.getSrc(s).getIndex(1); + Value *rel = NULL; + if (tgsi.getSrc(s).isIndirect(1)) + rel = fetchSrc(tgsi.getSrc(s).getIndirect(1), 0, NULL); + vtxBaseValid |= 1 << s; + vtxBase[s] = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(), mkImm(index), rel); + } + return vtxBase[s]; +} + +Value * +Converter::fetchSrc(int s, int c) +{ + Value *res; + Value *ptr = NULL, *dimRel = NULL; + + tgsi::Instruction::SrcRegister src = tgsi.getSrc(s); + + if (src.isIndirect(0)) + ptr = fetchSrc(src.getIndirect(0), 0, NULL); + + if (src.is2D()) { + switch (src.getFile()) { + case TGSI_FILE_INPUT: + dimRel = getVertexBase(s); + break; + case TGSI_FILE_CONSTANT: + // on NVC0, this is valid and c{I+J}[k] == cI[(J << 16) + k] + if (src.isIndirect(1)) + dimRel = fetchSrc(src.getIndirect(1), 0, 0); + break; + default: + break; + } + } + + res = fetchSrc(src, c, ptr); + + if (dimRel) + res->getInsn()->setIndirect(0, 1, dimRel); + + return applySrcMod(res, s, c); +} + +Converter::DataArray * +Converter::getArrayForFile(unsigned file, int idx) +{ + switch (file) { + case TGSI_FILE_TEMPORARY: + return &tData; + case TGSI_FILE_PREDICATE: + return &pData; + case TGSI_FILE_ADDRESS: + return &aData; + case 
TGSI_FILE_OUTPUT: + assert(prog->getType() == Program::TYPE_FRAGMENT); + return &oData; + default: + assert(!"invalid/unhandled TGSI source file"); + return NULL; + } +} + +Value * +Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr) +{ + const int idx2d = src.is2D() ? src.getIndex(1) : 0; + const int idx = src.getIndex(0); + const int swz = src.getSwizzle(c); + + switch (src.getFile()) { + case TGSI_FILE_IMMEDIATE: + assert(!ptr); + return loadImm(NULL, info->immd.data[idx * 4 + swz]); + case TGSI_FILE_CONSTANT: + return mkLoadv(TYPE_U32, srcToSym(src, c), ptr); + case TGSI_FILE_INPUT: + if (prog->getType() == Program::TYPE_FRAGMENT) { + // don't load masked inputs, won't be assigned a slot + if (!ptr && !(info->in[idx].mask & (1 << swz))) + return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f); + if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE) + return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0)); + return interpolate(src, c, ptr); + } + return mkLoadv(TYPE_U32, srcToSym(src, c), ptr); + case TGSI_FILE_OUTPUT: + assert(!"load from output file"); + return NULL; + case TGSI_FILE_SYSTEM_VALUE: + assert(!ptr); + return mkOp1v(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c)); + default: + return getArrayForFile(src.getFile(), idx2d)->load( + sub.cur->values, idx, swz, ptr); + } +} + +Value * +Converter::acquireDst(int d, int c) +{ + const tgsi::Instruction::DstRegister dst = tgsi.getDst(d); + const unsigned f = dst.getFile(); + const int idx = dst.getIndex(0); + const int idx2d = dst.is2D() ? 
dst.getIndex(1) : 0; + + if (dst.isMasked(c) || f == TGSI_FILE_RESOURCE) + return NULL; + + if (dst.isIndirect(0) || + f == TGSI_FILE_SYSTEM_VALUE || + (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT)) + return getScratch(); + + return getArrayForFile(f, idx2d)-> acquire(sub.cur->values, idx, c); +} + +void +Converter::storeDst(int d, int c, Value *val) +{ + const tgsi::Instruction::DstRegister dst = tgsi.getDst(d); + + switch (tgsi.getSaturate()) { + case TGSI_SAT_NONE: + break; + case TGSI_SAT_ZERO_ONE: + mkOp1(OP_SAT, dstTy, val, val); + break; + case TGSI_SAT_MINUS_PLUS_ONE: + mkOp2(OP_MAX, dstTy, val, val, mkImm(-1.0f)); + mkOp2(OP_MIN, dstTy, val, val, mkImm(+1.0f)); + break; + default: + assert(!"invalid saturation mode"); + break; + } + + Value *ptr = dst.isIndirect(0) ? + fetchSrc(dst.getIndirect(0), 0, NULL) : NULL; + + if (info->io.genUserClip > 0 && + dst.getFile() == TGSI_FILE_OUTPUT && + !dst.isIndirect(0) && dst.getIndex(0) == code->clipVertexOutput) { + mkMov(clipVtx[c], val); + val = clipVtx[c]; + } + + storeDst(dst, c, val, ptr); +} + +void +Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c, + Value *val, Value *ptr) +{ + const unsigned f = dst.getFile(); + const int idx = dst.getIndex(0); + const int idx2d = dst.is2D() ? 
dst.getIndex(1) : 0; + + if (f == TGSI_FILE_SYSTEM_VALUE) { + assert(!ptr); + mkOp2(OP_WRSV, TYPE_U32, NULL, dstToSym(dst, c), val); + } else + if (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT) { + if (ptr || (info->out[idx].mask & (1 << c))) + mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val); + } else + if (f == TGSI_FILE_TEMPORARY || + f == TGSI_FILE_PREDICATE || + f == TGSI_FILE_ADDRESS || + f == TGSI_FILE_OUTPUT) { + getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val); + } else { + assert(!"invalid dst file"); + } +} + +#define FOR_EACH_DST_ENABLED_CHANNEL(d, chan, inst) \ + for (chan = 0; chan < 4; ++chan) \ + if (!inst.getDst(d).isMasked(chan)) + +Value * +Converter::buildDot(int dim) +{ + assert(dim > 0); + + Value *src0 = fetchSrc(0, 0), *src1 = fetchSrc(1, 0); + Value *dotp = getScratch(); + + mkOp2(OP_MUL, TYPE_F32, dotp, src0, src1); + + for (int c = 1; c < dim; ++c) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + mkOp3(OP_MAD, TYPE_F32, dotp, src0, src1, dotp); + } + return dotp; +} + +void +Converter::insertConvergenceOps(BasicBlock *conv, BasicBlock *fork) +{ + FlowInstruction *join = new_FlowInstruction(func, OP_JOIN, NULL); + join->fixed = 1; + conv->insertHead(join); + + fork->joinAt = new_FlowInstruction(func, OP_JOINAT, conv); + fork->insertBefore(fork->getExit(), fork->joinAt); +} + +void +Converter::setTexRS(TexInstruction *tex, unsigned int& s, int R, int S) +{ + unsigned rIdx = 0, sIdx = 0; + + if (R >= 0) + rIdx = tgsi.getSrc(R).getIndex(0); + if (S >= 0) + sIdx = tgsi.getSrc(S).getIndex(0); + + tex->setTexture(tgsi.getTexture(code, R), rIdx, sIdx); + + if (tgsi.getSrc(R).isIndirect(0)) { + tex->tex.rIndirectSrc = s; + tex->setSrc(s++, fetchSrc(tgsi.getSrc(R).getIndirect(0), 0, NULL)); + } + if (S >= 0 && tgsi.getSrc(S).isIndirect(0)) { + tex->tex.sIndirectSrc = s; + tex->setSrc(s++, fetchSrc(tgsi.getSrc(S).getIndirect(0), 0, NULL)); + } +} + +void +Converter::handleTXQ(Value *dst0[4], enum 
TexQuery query) +{ + TexInstruction *tex = new_TexInstruction(func, OP_TXQ); + tex->tex.query = query; + unsigned int c, d; + + for (d = 0, c = 0; c < 4; ++c) { + if (!dst0[c]) + continue; + tex->tex.mask |= 1 << c; + tex->setDef(d++, dst0[c]); + } + tex->setSrc((c = 0), fetchSrc(0, 0)); // mip level + + setTexRS(tex, c, 1, -1); + + bb->insertTail(tex); +} + +void +Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask) +{ + Value *proj = fetchSrc(0, 3); + Instruction *insn = proj->getUniqueInsn(); + int c; + + if (insn->op == OP_PINTERP) { + bb->insertTail(insn = cloneForward(func, insn)); + insn->op = OP_LINTERP; + insn->setInterpolate(NV50_IR_INTERP_LINEAR | insn->getSampleMode()); + insn->setSrc(1, NULL); + proj = insn->getDef(0); + } + proj = mkOp1v(OP_RCP, TYPE_F32, getSSA(), proj); + + for (c = 0; c < 4; ++c) { + if (!(mask & (1 << c))) + continue; + if ((insn = src[c]->getUniqueInsn())->op != OP_PINTERP) + continue; + mask &= ~(1 << c); + + bb->insertTail(insn = cloneForward(func, insn)); + insn->setInterpolate(NV50_IR_INTERP_PERSPECTIVE | insn->getSampleMode()); + insn->setSrc(1, proj); + dst[c] = insn->getDef(0); + } + if (!mask) + return; + + proj = mkOp1v(OP_RCP, TYPE_F32, getSSA(), fetchSrc(0, 3)); + + for (c = 0; c < 4; ++c) + if (mask & (1 << c)) + dst[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), src[c], proj); +} + +// order of nv50 ir sources: x y z layer lod/bias shadow +// order of TGSI TEX sources: x y z layer shadow lod/bias +// lowering will finally set the hw specific order (like array first on nvc0) +void +Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy) +{ + Value *val; + Value *arg[4], *src[8]; + Value *lod = NULL, *shd = NULL; + unsigned int s, c, d; + TexInstruction *texi = new_TexInstruction(func, tgsi.getOP()); + + TexInstruction::Target tgt = tgsi.getTexture(code, R); + + for (s = 0; s < tgt.getArgCount(); ++s) + arg[s] = src[s] = fetchSrc(0, s); + + if (texi->op == OP_TXL || texi->op == 
OP_TXB) + lod = fetchSrc(L >> 4, L & 3); + + if (C == 0x0f) + C = 0x00 | MAX2(tgt.getArgCount(), 2); // guess DC src + + if (tgt.isShadow()) + shd = fetchSrc(C >> 4, C & 3); + + if (texi->op == OP_TXD) { + for (c = 0; c < tgt.getDim(); ++c) { + texi->dPdx[c].set(fetchSrc(Dx >> 4, (Dx & 3) + c)); + texi->dPdy[c].set(fetchSrc(Dy >> 4, (Dy & 3) + c)); + } + } + + // cube textures don't care about projection value, it's divided out + if (tgsi.getOpcode() == TGSI_OPCODE_TXP && !tgt.isCube() && !tgt.isArray()) { + unsigned int n = tgt.getDim(); + if (shd) { + arg[n] = shd; + ++n; + assert(tgt.getDim() == tgt.getArgCount()); + } + loadProjTexCoords(src, arg, (1 << n) - 1); + if (shd) + shd = src[n - 1]; + } + + if (tgt.isCube()) { + for (c = 0; c < 3; ++c) + src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]); + val = getScratch(); + mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]); + mkOp2(OP_MAX, TYPE_F32, val, src[2], val); + mkOp1(OP_RCP, TYPE_F32, val, val); + for (c = 0; c < 3; ++c) + src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val); + } + + for (c = 0, d = 0; c < 4; ++c) { + if (dst[c]) { + texi->setDef(d++, dst[c]); + texi->tex.mask |= 1 << c; + } else { + // NOTE: maybe hook up def too, for CSE + } + } + for (s = 0; s < tgt.getArgCount(); ++s) + texi->setSrc(s, src[s]); + if (lod) + texi->setSrc(s++, lod); + if (shd) + texi->setSrc(s++, shd); + + setTexRS(texi, s, R, S); + + if (tgsi.getOpcode() == TGSI_OPCODE_SAMPLE_C_LZ) + texi->tex.levelZero = true; + + bb->insertTail(texi); +} + +// 1st source: xyz = coordinates, w = lod/sample +// 2nd source: offset +void +Converter::handleTXF(Value *dst[4], int R, int L_M) +{ + TexInstruction *texi = new_TexInstruction(func, tgsi.getOP()); + int ms; + unsigned int c, d, s; + + texi->tex.target = tgsi.getTexture(code, R); + + ms = texi->tex.target.isMS() ? 
1 : 0; + texi->tex.levelZero = ms; /* MS textures don't have mip-maps */ + + for (c = 0, d = 0; c < 4; ++c) { + if (dst[c]) { + texi->setDef(d++, dst[c]); + texi->tex.mask |= 1 << c; + } + } + for (c = 0; c < (texi->tex.target.getArgCount() - ms); ++c) + texi->setSrc(c, fetchSrc(0, c)); + texi->setSrc(c++, fetchSrc(L_M >> 4, L_M & 3)); // lod or ms + + setTexRS(texi, c, R, -1); + + for (s = 0; s < tgsi.getNumTexOffsets(); ++s) { + for (c = 0; c < 3; ++c) { + texi->tex.offset[s][c] = tgsi.getTexOffset(s).getValueU32(c, info); + if (texi->tex.offset[s][c]) + texi->tex.useOffsets = s + 1; + } + } + + bb->insertTail(texi); +} + +void +Converter::handleLIT(Value *dst0[4]) +{ + Value *val0 = NULL; + unsigned int mask = tgsi.getDst(0).getMask(); + + if (mask & (1 << 0)) + loadImm(dst0[0], 1.0f); + + if (mask & (1 << 3)) + loadImm(dst0[3], 1.0f); + + if (mask & (3 << 1)) { + val0 = getScratch(); + mkOp2(OP_MAX, TYPE_F32, val0, fetchSrc(0, 0), zero); + if (mask & (1 << 1)) + mkMov(dst0[1], val0); + } + + if (mask & (1 << 2)) { + Value *src1 = fetchSrc(0, 1), *src3 = fetchSrc(0, 3); + Value *val1 = getScratch(), *val3 = getScratch(); + + Value *pos128 = loadImm(NULL, +127.999999f); + Value *neg128 = loadImm(NULL, -127.999999f); + + mkOp2(OP_MAX, TYPE_F32, val1, src1, zero); + mkOp2(OP_MAX, TYPE_F32, val3, src3, neg128); + mkOp2(OP_MIN, TYPE_F32, val3, val3, pos128); + mkOp2(OP_POW, TYPE_F32, val3, val1, val3); + + mkCmp(OP_SLCT, CC_GT, TYPE_F32, dst0[2], val3, zero, val0); + } +} + +static inline bool +isResourceSpecial(const int r) +{ + return (r == TGSI_RESOURCE_GLOBAL || + r == TGSI_RESOURCE_LOCAL || + r == TGSI_RESOURCE_PRIVATE || + r == TGSI_RESOURCE_INPUT); +} + +static inline bool +isResourceRaw(const struct tgsi::Source *code, const int r) +{ + return isResourceSpecial(r) || code->resources[r].raw; +} + +static inline nv50_ir::TexTarget +getResourceTarget(const struct tgsi::Source *code, int r) +{ + if (isResourceSpecial(r)) + return nv50_ir::TEX_TARGET_BUFFER; + 
return tgsi::translateTexture(code->resources.at(r).target); +} + +Symbol * +Converter::getResourceBase(const int r) +{ + Symbol *sym = NULL; + + switch (r) { + case TGSI_RESOURCE_GLOBAL: + sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL, 15); + break; + case TGSI_RESOURCE_LOCAL: + assert(prog->getType() == Program::TYPE_COMPUTE); + sym = mkSymbol(nv50_ir::FILE_MEMORY_SHARED, 0, TYPE_U32, + info->prop.cp.sharedOffset); + break; + case TGSI_RESOURCE_PRIVATE: + sym = mkSymbol(nv50_ir::FILE_MEMORY_LOCAL, 0, TYPE_U32, + info->bin.tlsSpace); + break; + case TGSI_RESOURCE_INPUT: + assert(prog->getType() == Program::TYPE_COMPUTE); + sym = mkSymbol(nv50_ir::FILE_SHADER_INPUT, 0, TYPE_U32, + info->prop.cp.inputOffset); + break; + default: + sym = new_Symbol(prog, + nv50_ir::FILE_MEMORY_GLOBAL, code->resources.at(r).slot); + break; + } + return sym; +} + +void +Converter::getResourceCoords(std::vector<Value *> &coords, int r, int s) +{ + const int arg = + TexInstruction::Target(getResourceTarget(code, r)).getArgCount(); + + for (int c = 0; c < arg; ++c) + coords.push_back(fetchSrc(s, c)); + + // NOTE: TGSI_RESOURCE_GLOBAL needs FILE_GPR; this is an nv50 quirk + if (r == TGSI_RESOURCE_LOCAL || + r == TGSI_RESOURCE_PRIVATE || + r == TGSI_RESOURCE_INPUT) + coords[0] = mkOp1v(OP_MOV, TYPE_U32, getScratch(4, FILE_ADDRESS), + coords[0]); +} + +static inline int +partitionLoadStore(uint8_t comp[2], uint8_t size[2], uint8_t mask) +{ + int n = 0; + + while (mask) { + if (mask & 1) { + size[n]++; + } else { + if (size[n]) + comp[n = 1] = size[0] + 1; + else + comp[n]++; + } + mask >>= 1; + } + if (size[0] == 3) { + n = 1; + size[0] = (comp[0] == 1) ? 1 : 2; + size[1] = 3 - size[0]; + comp[1] = comp[0] + size[0]; + } + return n + 1; +} + +// For raw loads, granularity is 4 byte. +// Usage of the texture read mask on OP_SULDP is not allowed. 
+void +Converter::handleLOAD(Value *dst0[4]) +{ + const int r = tgsi.getSrc(0).getIndex(0); + int c; + std::vector<Value *> off, src, ldv, def; + + getResourceCoords(off, r, 1); + + if (isResourceRaw(code, r)) { + uint8_t mask = 0; + uint8_t comp[2] = { 0, 0 }; + uint8_t size[2] = { 0, 0 }; + + Symbol *base = getResourceBase(r); + + // determine the base and size of the at most 2 load ops + for (c = 0; c < 4; ++c) + if (!tgsi.getDst(0).isMasked(c)) + mask |= 1 << (tgsi.getSrc(0).getSwizzle(c) - TGSI_SWIZZLE_X); + + int n = partitionLoadStore(comp, size, mask); + + src = off; + + def.resize(4); // index by component, the ones we need will be non-NULL + for (c = 0; c < 4; ++c) { + if (dst0[c] && tgsi.getSrc(0).getSwizzle(c) == (TGSI_SWIZZLE_X + c)) + def[c] = dst0[c]; + else + if (mask & (1 << c)) + def[c] = getScratch(); + } + + const bool useLd = isResourceSpecial(r) || + (info->io.nv50styleSurfaces && + code->resources[r].target == TGSI_TEXTURE_BUFFER); + + for (int i = 0; i < n; ++i) { + ldv.assign(def.begin() + comp[i], def.begin() + comp[i] + size[i]); + + if (comp[i]) // adjust x component of source address if necessary + src[0] = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, off[0]->reg.file), + off[0], mkImm(comp[i] * 4)); + else + src[0] = off[0]; + + if (useLd) { + Instruction *ld = + mkLoad(typeOfSize(size[i] * 4), ldv[0], base, src[0]); + for (size_t c = 1; c < ldv.size(); ++c) + ld->setDef(c, ldv[c]); + } else { + mkTex(OP_SULDB, getResourceTarget(code, r), code->resources[r].slot, + 0, ldv, src)->dType = typeOfSize(size[i] * 4); + } + } + } else { + def.resize(4); + for (c = 0; c < 4; ++c) { + if (!dst0[c] || tgsi.getSrc(0).getSwizzle(c) != (TGSI_SWIZZLE_X + c)) + def[c] = getScratch(); + else + def[c] = dst0[c]; + } + + mkTex(OP_SULDP, getResourceTarget(code, r), code->resources[r].slot, 0, + def, off); + } + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + if (dst0[c] != def[c]) + mkMov(dst0[c], def[tgsi.getSrc(0).getSwizzle(c)]); +} + +// For formatted stores, the 
write mask on OP_SUSTP can be used. +// Raw stores have to be split. +void +Converter::handleSTORE() +{ + const int r = tgsi.getDst(0).getIndex(0); + int c; + std::vector<Value *> off, src, dummy; + + getResourceCoords(off, r, 0); + src = off; + const int s = src.size(); + + if (isResourceRaw(code, r)) { + uint8_t comp[2] = { 0, 0 }; + uint8_t size[2] = { 0, 0 }; + + int n = partitionLoadStore(comp, size, tgsi.getDst(0).getMask()); + + Symbol *base = getResourceBase(r); + + const bool useSt = isResourceSpecial(r) || + (info->io.nv50styleSurfaces && + code->resources[r].target == TGSI_TEXTURE_BUFFER); + + for (int i = 0; i < n; ++i) { + if (comp[i]) // adjust x component of source address if necessary + src[0] = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, off[0]->reg.file), + off[0], mkImm(comp[i] * 4)); + else + src[0] = off[0]; + + const DataType stTy = typeOfSize(size[i] * 4); + + if (useSt) { + Instruction *st = + mkStore(OP_STORE, stTy, base, NULL, fetchSrc(1, comp[i])); + for (c = 1; c < size[i]; ++c) + st->setSrc(1 + c, fetchSrc(1, comp[i] + c)); + st->setIndirect(0, 0, src[0]); + } else { + // attach values to be stored + src.resize(s + size[i]); + for (c = 0; c < size[i]; ++c) + src[s + c] = fetchSrc(1, comp[i] + c); + mkTex(OP_SUSTB, getResourceTarget(code, r), code->resources[r].slot, + 0, dummy, src)->setType(stTy); + } + } + } else { + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + src.push_back(fetchSrc(1, c)); + + mkTex(OP_SUSTP, getResourceTarget(code, r), code->resources[r].slot, 0, + dummy, src)->tex.mask = tgsi.getDst(0).getMask(); + } +} + +// XXX: These only work on resources with the single-component u32/s32 formats. +// Therefore the result is replicated. This might not be intended by TGSI, but +// operating on more than 1 component would produce undefined results because +// they do not exist. 
+void +Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp) +{ + const int r = tgsi.getSrc(0).getIndex(0); + std::vector<Value *> srcv; + std::vector<Value *> defv; + LValue *dst = getScratch(); + + getResourceCoords(srcv, r, 1); + + if (isResourceSpecial(r)) { + assert(r != TGSI_RESOURCE_INPUT); + Instruction *insn; + insn = mkOp2(OP_ATOM, ty, dst, getResourceBase(r), fetchSrc(2, 0)); + insn->subOp = subOp; + if (subOp == NV50_IR_SUBOP_ATOM_CAS) + insn->setSrc(2, fetchSrc(3, 0)); + insn->setIndirect(0, 0, srcv.at(0)); + } else { + operation op = isResourceRaw(code, r) ? OP_SUREDB : OP_SUREDP; + TexTarget targ = getResourceTarget(code, r); + int idx = code->resources[r].slot; + defv.push_back(dst); + srcv.push_back(fetchSrc(2, 0)); + if (subOp == NV50_IR_SUBOP_ATOM_CAS) + srcv.push_back(fetchSrc(3, 0)); + TexInstruction *tex = mkTex(op, targ, idx, 0, defv, srcv); + tex->subOp = subOp; + tex->tex.mask = 1; + tex->setType(ty); + } + + for (int c = 0; c < 4; ++c) + if (dst0[c]) + dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov +} + +Converter::Subroutine * +Converter::getSubroutine(unsigned ip) +{ + std::map<unsigned, Subroutine>::iterator it = sub.map.find(ip); + + if (it == sub.map.end()) + it = sub.map.insert(std::make_pair( + ip, Subroutine(new Function(prog, "SUB", ip)))).first; + + return &it->second; +} + +Converter::Subroutine * +Converter::getSubroutine(Function *f) +{ + unsigned ip = f->getLabel(); + std::map<unsigned, Subroutine>::iterator it = sub.map.find(ip); + + if (it == sub.map.end()) + it = sub.map.insert(std::make_pair(ip, Subroutine(f))).first; + + return &it->second; +} + +bool +Converter::isEndOfSubroutine(uint ip) +{ + assert(ip < code->scan.num_instructions); + tgsi::Instruction insn(&code->insns[ip]); + return (insn.getOpcode() == TGSI_OPCODE_END || + insn.getOpcode() == TGSI_OPCODE_ENDSUB || + // does END occur at end of main or the very end ? 
+ insn.getOpcode() == TGSI_OPCODE_BGNSUB); +} + +bool +Converter::handleInstruction(const struct tgsi_full_instruction *insn) +{ + Instruction *geni; + + Value *dst0[4], *rDst0[4]; + Value *src0, *src1, *src2; + Value *val0, *val1; + int c; + + tgsi = tgsi::Instruction(insn); + + bool useScratchDst = tgsi.checkDstSrcAliasing(); + + operation op = tgsi.getOP(); + dstTy = tgsi.inferDstType(); + srcTy = tgsi.inferSrcType(); + + unsigned int mask = tgsi.dstCount() ? tgsi.getDst(0).getMask() : 0; + + if (tgsi.dstCount()) { + for (c = 0; c < 4; ++c) { + rDst0[c] = acquireDst(0, c); + dst0[c] = (useScratchDst && rDst0[c]) ? getScratch() : rDst0[c]; + } + } + + switch (tgsi.getOpcode()) { + case TGSI_OPCODE_ADD: + case TGSI_OPCODE_UADD: + case TGSI_OPCODE_AND: + case TGSI_OPCODE_DIV: + case TGSI_OPCODE_IDIV: + case TGSI_OPCODE_UDIV: + case TGSI_OPCODE_MAX: + case TGSI_OPCODE_MIN: + case TGSI_OPCODE_IMAX: + case TGSI_OPCODE_IMIN: + case TGSI_OPCODE_UMAX: + case TGSI_OPCODE_UMIN: + case TGSI_OPCODE_MOD: + case TGSI_OPCODE_UMOD: + case TGSI_OPCODE_MUL: + case TGSI_OPCODE_UMUL: + case TGSI_OPCODE_OR: + case TGSI_OPCODE_POW: + case TGSI_OPCODE_SHL: + case TGSI_OPCODE_ISHR: + case TGSI_OPCODE_USHR: + case TGSI_OPCODE_SUB: + case TGSI_OPCODE_XOR: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + mkOp2(op, dstTy, dst0[c], src0, src1); + } + break; + case TGSI_OPCODE_MAD: + case TGSI_OPCODE_UMAD: + case TGSI_OPCODE_SAD: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + src2 = fetchSrc(2, c); + mkOp3(op, dstTy, dst0[c], src0, src1, src2); + } + break; + case TGSI_OPCODE_MOV: + case TGSI_OPCODE_ABS: + case TGSI_OPCODE_CEIL: + case TGSI_OPCODE_FLR: + case TGSI_OPCODE_TRUNC: + case TGSI_OPCODE_RCP: + case TGSI_OPCODE_IABS: + case TGSI_OPCODE_INEG: + case TGSI_OPCODE_NOT: + case TGSI_OPCODE_DDX: + case TGSI_OPCODE_DDY: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkOp1(op, dstTy, dst0[c], 
fetchSrc(0, c)); + break; + case TGSI_OPCODE_RSQ: + src0 = fetchSrc(0, 0); + val0 = getScratch(); + mkOp1(OP_ABS, TYPE_F32, val0, src0); + mkOp1(OP_RSQ, TYPE_F32, val0, val0); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_ARL: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + mkCvt(OP_CVT, TYPE_S32, dst0[c], TYPE_F32, src0)->rnd = ROUND_M; + mkOp2(OP_SHL, TYPE_U32, dst0[c], dst0[c], mkImm(4)); + } + break; + case TGSI_OPCODE_UARL: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkOp2(OP_SHL, TYPE_U32, dst0[c], fetchSrc(0, c), mkImm(4)); + break; + case TGSI_OPCODE_EX2: + case TGSI_OPCODE_LG2: + val0 = mkOp1(op, TYPE_F32, getScratch(), fetchSrc(0, 0))->getDef(0); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkOp1(OP_MOV, TYPE_F32, dst0[c], val0); + break; + case TGSI_OPCODE_COS: + case TGSI_OPCODE_SIN: + val0 = getScratch(); + if (mask & 7) { + mkOp1(OP_PRESIN, TYPE_F32, val0, fetchSrc(0, 0)); + mkOp1(op, TYPE_F32, val0, val0); + for (c = 0; c < 3; ++c) + if (dst0[c]) + mkMov(dst0[c], val0); + } + if (dst0[3]) { + mkOp1(OP_PRESIN, TYPE_F32, val0, fetchSrc(0, 3)); + mkOp1(op, TYPE_F32, dst0[3], val0); + } + break; + case TGSI_OPCODE_SCS: + if (mask & 3) { + val0 = mkOp1v(OP_PRESIN, TYPE_F32, getSSA(), fetchSrc(0, 0)); + if (dst0[0]) + mkOp1(OP_COS, TYPE_F32, dst0[0], val0); + if (dst0[1]) + mkOp1(OP_SIN, TYPE_F32, dst0[1], val0); + } + if (dst0[2]) + loadImm(dst0[2], 0.0f); + if (dst0[3]) + loadImm(dst0[3], 1.0f); + break; + case TGSI_OPCODE_EXP: + src0 = fetchSrc(0, 0); + val0 = mkOp1v(OP_FLOOR, TYPE_F32, getSSA(), src0); + if (dst0[1]) + mkOp2(OP_SUB, TYPE_F32, dst0[1], src0, val0); + if (dst0[0]) + mkOp1(OP_EX2, TYPE_F32, dst0[0], val0); + if (dst0[2]) + mkOp1(OP_EX2, TYPE_F32, dst0[2], src0); + if (dst0[3]) + loadImm(dst0[3], 1.0f); + break; + case TGSI_OPCODE_LOG: + src0 = mkOp1v(OP_ABS, TYPE_F32, getSSA(), fetchSrc(0, 0)); + val0 = mkOp1v(OP_LG2, TYPE_F32, dst0[2] ? 
dst0[2] : getSSA(), src0); + if (dst0[0] || dst0[1]) + val1 = mkOp1v(OP_FLOOR, TYPE_F32, dst0[0] ? dst0[0] : getSSA(), val0); + if (dst0[1]) { + mkOp1(OP_EX2, TYPE_F32, dst0[1], val1); + mkOp1(OP_RCP, TYPE_F32, dst0[1], dst0[1]); + mkOp2(OP_MUL, TYPE_F32, dst0[1], dst0[1], src0); + } + if (dst0[3]) + loadImm(dst0[3], 1.0f); + break; + case TGSI_OPCODE_DP2: + val0 = buildDot(2); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_DP3: + val0 = buildDot(3); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_DP4: + val0 = buildDot(4); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_DPH: + val0 = buildDot(3); + src1 = fetchSrc(1, 3); + mkOp2(OP_ADD, TYPE_F32, val0, val0, src1); + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkMov(dst0[c], val0); + break; + case TGSI_OPCODE_DST: + if (dst0[0]) + loadImm(dst0[0], 1.0f); + if (dst0[1]) { + src0 = fetchSrc(0, 1); + src1 = fetchSrc(1, 1); + mkOp2(OP_MUL, TYPE_F32, dst0[1], src0, src1); + } + if (dst0[2]) + mkMov(dst0[2], fetchSrc(0, 2)); + if (dst0[3]) + mkMov(dst0[3], fetchSrc(1, 3)); + break; + case TGSI_OPCODE_LRP: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + src2 = fetchSrc(2, c); + mkOp3(OP_MAD, TYPE_F32, dst0[c], + mkOp2v(OP_SUB, TYPE_F32, getSSA(), src1, src2), src0, src2); + } + break; + case TGSI_OPCODE_LIT: + handleLIT(dst0); + break; + case TGSI_OPCODE_XPD: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + if (c < 3) { + val0 = getSSA(); + src0 = fetchSrc(1, (c + 1) % 3); + src1 = fetchSrc(0, (c + 2) % 3); + mkOp2(OP_MUL, TYPE_F32, val0, src0, src1); + mkOp1(OP_NEG, TYPE_F32, val0, val0); + + src0 = fetchSrc(0, (c + 1) % 3); + src1 = fetchSrc(1, (c + 2) % 3); + mkOp3(OP_MAD, TYPE_F32, dst0[c], src0, src1, val0); + } else { + loadImm(dst0[c], 1.0f); + } + } + break; + case TGSI_OPCODE_ISSG: + case TGSI_OPCODE_SSG: + 
FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + val0 = getScratch(); + val1 = getScratch(); + mkCmp(OP_SET, CC_GT, srcTy, val0, src0, zero); + mkCmp(OP_SET, CC_LT, srcTy, val1, src0, zero); + if (srcTy == TYPE_F32) + mkOp2(OP_SUB, TYPE_F32, dst0[c], val0, val1); + else + mkOp2(OP_SUB, TYPE_S32, dst0[c], val1, val0); + } + break; + case TGSI_OPCODE_UCMP: + case TGSI_OPCODE_CMP: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + src2 = fetchSrc(2, c); + if (src1 == src2) + mkMov(dst0[c], src1); + else + mkCmp(OP_SLCT, (srcTy == TYPE_F32) ? CC_LT : CC_NE, + srcTy, dst0[c], src1, src2, src0); + } + break; + case TGSI_OPCODE_FRC: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + val0 = getScratch(); + mkOp1(OP_FLOOR, TYPE_F32, val0, src0); + mkOp2(OP_SUB, TYPE_F32, dst0[c], src0, val0); + } + break; + case TGSI_OPCODE_ROUND: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkCvt(OP_CVT, TYPE_F32, dst0[c], TYPE_F32, fetchSrc(0, c)) + ->rnd = ROUND_NI; + break; + case TGSI_OPCODE_CLAMP: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + src2 = fetchSrc(2, c); + val0 = getScratch(); + mkOp2(OP_MIN, TYPE_F32, val0, src0, src1); + mkOp2(OP_MAX, TYPE_F32, dst0[c], val0, src2); + } + break; + case TGSI_OPCODE_SLT: + case TGSI_OPCODE_SGE: + case TGSI_OPCODE_SEQ: + case TGSI_OPCODE_SFL: + case TGSI_OPCODE_SGT: + case TGSI_OPCODE_SLE: + case TGSI_OPCODE_SNE: + case TGSI_OPCODE_STR: + case TGSI_OPCODE_FSEQ: + case TGSI_OPCODE_FSGE: + case TGSI_OPCODE_FSLT: + case TGSI_OPCODE_FSNE: + case TGSI_OPCODE_ISGE: + case TGSI_OPCODE_ISLT: + case TGSI_OPCODE_USEQ: + case TGSI_OPCODE_USGE: + case TGSI_OPCODE_USLT: + case TGSI_OPCODE_USNE: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { + src0 = fetchSrc(0, c); + src1 = fetchSrc(1, c); + mkCmp(op, tgsi.getSetCond(), dstTy, dst0[c], src0, src1); + } + break; + case TGSI_OPCODE_KILL_IF: + val0 = new_LValue(func, 
FILE_PREDICATE); + for (c = 0; c < 4; ++c) { + mkCmp(OP_SET, CC_LT, TYPE_F32, val0, fetchSrc(0, c), zero); + mkOp(OP_DISCARD, TYPE_NONE, NULL)->setPredicate(CC_P, val0); + } + break; + case TGSI_OPCODE_KILL: + mkOp(OP_DISCARD, TYPE_NONE, NULL); + break; + case TGSI_OPCODE_TEX: + case TGSI_OPCODE_TXB: + case TGSI_OPCODE_TXL: + case TGSI_OPCODE_TXP: + // R S L C Dx Dy + handleTEX(dst0, 1, 1, 0x03, 0x0f, 0x00, 0x00); + break; + case TGSI_OPCODE_TXD: + handleTEX(dst0, 3, 3, 0x03, 0x0f, 0x10, 0x20); + break; + case TGSI_OPCODE_TEX2: + handleTEX(dst0, 2, 2, 0x03, 0x10, 0x00, 0x00); + break; + case TGSI_OPCODE_TXB2: + case TGSI_OPCODE_TXL2: + handleTEX(dst0, 2, 2, 0x10, 0x11, 0x00, 0x00); + break; + case TGSI_OPCODE_SAMPLE: + case TGSI_OPCODE_SAMPLE_B: + case TGSI_OPCODE_SAMPLE_D: + case TGSI_OPCODE_SAMPLE_L: + case TGSI_OPCODE_SAMPLE_C: + case TGSI_OPCODE_SAMPLE_C_LZ: + handleTEX(dst0, 1, 2, 0x30, 0x30, 0x30, 0x40); + break; + case TGSI_OPCODE_TXF: + handleTXF(dst0, 1, 0x03); + break; + case TGSI_OPCODE_SAMPLE_I: + handleTXF(dst0, 1, 0x03); + break; + case TGSI_OPCODE_SAMPLE_I_MS: + handleTXF(dst0, 1, 0x20); + break; + case TGSI_OPCODE_TXQ: + case TGSI_OPCODE_SVIEWINFO: + handleTXQ(dst0, TXQ_DIMS); + break; + case TGSI_OPCODE_F2I: + case TGSI_OPCODE_F2U: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c))->rnd = ROUND_Z; + break; + case TGSI_OPCODE_I2F: + case TGSI_OPCODE_U2F: + FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) + mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c)); + break; + case TGSI_OPCODE_EMIT: + case TGSI_OPCODE_ENDPRIM: + // get vertex stream if specified (must be immediate) + src0 = tgsi.srcCount() ? 
+ mkImm(tgsi.getSrc(0).getValueU32(0, info)) : zero; + mkOp1(op, TYPE_U32, NULL, src0)->fixed = 1; + break; + case TGSI_OPCODE_IF: + case TGSI_OPCODE_UIF: + { + BasicBlock *ifBB = new BasicBlock(func); + + bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE); + condBBs.push(bb); + joinBBs.push(bb); + + mkFlow(OP_BRA, NULL, CC_NOT_P, fetchSrc(0, 0))->setType(srcTy); + + setPosition(ifBB, true); + } + break; + case TGSI_OPCODE_ELSE: + { + BasicBlock *elseBB = new BasicBlock(func); + BasicBlock *forkBB = reinterpret_cast<BasicBlock *>(condBBs.pop().u.p); + + forkBB->cfg.attach(&elseBB->cfg, Graph::Edge::TREE); + condBBs.push(bb); + + forkBB->getExit()->asFlow()->target.bb = elseBB; + if (!bb->isTerminated()) + mkFlow(OP_BRA, NULL, CC_ALWAYS, NULL); + + setPosition(elseBB, true); + } + break; + case TGSI_OPCODE_ENDIF: + { + BasicBlock *convBB = new BasicBlock(func); + BasicBlock *prevBB = reinterpret_cast<BasicBlock *>(condBBs.pop().u.p); + BasicBlock *forkBB = reinterpret_cast<BasicBlock *>(joinBBs.pop().u.p); + + if (!bb->isTerminated()) { + // we only want join if none of the clauses ended with CONT/BREAK/RET + if (prevBB->getExit()->op == OP_BRA && joinBBs.getSize() < 6) + insertConvergenceOps(convBB, forkBB); + mkFlow(OP_BRA, convBB, CC_ALWAYS, NULL); + bb->cfg.attach(&convBB->cfg, Graph::Edge::FORWARD); + } + + if (prevBB->getExit()->op == OP_BRA) { + prevBB->cfg.attach(&convBB->cfg, Graph::Edge::FORWARD); + prevBB->getExit()->asFlow()->target.bb = convBB; + } + setPosition(convBB, true); + } + break; + case TGSI_OPCODE_BGNLOOP: + { + BasicBlock *lbgnBB = new BasicBlock(func); + BasicBlock *lbrkBB = new BasicBlock(func); + + loopBBs.push(lbgnBB); + breakBBs.push(lbrkBB); + if (loopBBs.getSize() > func->loopNestingBound) + func->loopNestingBound++; + + mkFlow(OP_PREBREAK, lbrkBB, CC_ALWAYS, NULL); + + bb->cfg.attach(&lbgnBB->cfg, Graph::Edge::TREE); + setPosition(lbgnBB, true); + mkFlow(OP_PRECONT, lbgnBB, CC_ALWAYS, NULL); + } + break; + case TGSI_OPCODE_ENDLOOP: + { + 
BasicBlock *loopBB = reinterpret_cast<BasicBlock *>(loopBBs.pop().u.p); + + if (!bb->isTerminated()) { + mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL); + bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK); + } + setPosition(reinterpret_cast<BasicBlock *>(breakBBs.pop().u.p), true); + } + break; + case TGSI_OPCODE_BRK: + { + if (bb->isTerminated()) + break; + BasicBlock *brkBB = reinterpret_cast<BasicBlock *>(breakBBs.peek().u.p); + mkFlow(OP_BREAK, brkBB, CC_ALWAYS, NULL); + bb->cfg.attach(&brkBB->cfg, Graph::Edge::CROSS); + } + break; + case TGSI_OPCODE_CONT: + { + if (bb->isTerminated()) + break; + BasicBlock *contBB = reinterpret_cast<BasicBlock *>(loopBBs.peek().u.p); + mkFlow(OP_CONT, contBB, CC_ALWAYS, NULL); + contBB->explicitCont = true; + bb->cfg.attach(&contBB->cfg, Graph::Edge::BACK); + } + break; + case TGSI_OPCODE_BGNSUB: + { + Subroutine *s = getSubroutine(ip); + BasicBlock *entry = new BasicBlock(s->f); + BasicBlock *leave = new BasicBlock(s->f); + + // multiple entrypoints possible, keep the graph connected + if (prog->getType() == Program::TYPE_COMPUTE) + prog->main->call.attach(&s->f->call, Graph::Edge::TREE); + + sub.cur = s; + s->f->setEntry(entry); + s->f->setExit(leave); + setPosition(entry, true); + return true; + } + case TGSI_OPCODE_ENDSUB: + { + sub.cur = getSubroutine(prog->main); + setPosition(BasicBlock::get(sub.cur->f->cfg.getRoot()), true); + return true; + } + case TGSI_OPCODE_CAL: + { + Subroutine *s = getSubroutine(tgsi.getLabel()); + mkFlow(OP_CALL, s->f, CC_ALWAYS, NULL); + func->call.attach(&s->f->call, Graph::Edge::TREE); + return true; + } + case TGSI_OPCODE_RET: + { + if (bb->isTerminated()) + return true; + BasicBlock *leave = BasicBlock::get(func->cfgExit); + + if (!isEndOfSubroutine(ip + 1)) { + // insert a PRERET at the entry if this is an early return + // (only needed for sharing code in the epilogue) + BasicBlock *pos = getBB(); + setPosition(BasicBlock::get(func->cfg.getRoot()), false); + mkFlow(OP_PRERET, leave, CC_ALWAYS, 
NULL)->fixed = 1; + setPosition(pos, true); + } + mkFlow(OP_RET, NULL, CC_ALWAYS, NULL)->fixed = 1; + bb->cfg.attach(&leave->cfg, Graph::Edge::CROSS); + } + break; + case TGSI_OPCODE_END: + { + // attach and generate epilogue code + BasicBlock *epilogue = BasicBlock::get(func->cfgExit); + bb->cfg.attach(&epilogue->cfg, Graph::Edge::TREE); + setPosition(epilogue, true); + if (prog->getType() == Program::TYPE_FRAGMENT) + exportOutputs(); + if (info->io.genUserClip > 0) + handleUserClipPlanes(); + mkOp(OP_EXIT, TYPE_NONE, NULL)->terminator = 1; + } + break; + case TGSI_OPCODE_SWITCH: + case TGSI_OPCODE_CASE: + ERROR("switch/case opcode encountered, should have been lowered\n"); + abort(); + break; + case TGSI_OPCODE_LOAD: + handleLOAD(dst0); + break; + case TGSI_OPCODE_STORE: + handleSTORE(); + break; + case TGSI_OPCODE_BARRIER: + geni = mkOp2(OP_BAR, TYPE_U32, NULL, mkImm(0), mkImm(0)); + geni->fixed = 1; + geni->subOp = NV50_IR_SUBOP_BAR_SYNC; + break; + case TGSI_OPCODE_MFENCE: + case TGSI_OPCODE_LFENCE: + case TGSI_OPCODE_SFENCE: + geni = mkOp(OP_MEMBAR, TYPE_NONE, NULL); + geni->fixed = 1; + geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode()); + break; + case TGSI_OPCODE_ATOMUADD: + case TGSI_OPCODE_ATOMXCHG: + case TGSI_OPCODE_ATOMCAS: + case TGSI_OPCODE_ATOMAND: + case TGSI_OPCODE_ATOMOR: + case TGSI_OPCODE_ATOMXOR: + case TGSI_OPCODE_ATOMUMIN: + case TGSI_OPCODE_ATOMIMIN: + case TGSI_OPCODE_ATOMUMAX: + case TGSI_OPCODE_ATOMIMAX: + handleATOM(dst0, dstTy, tgsi::opcodeToSubOp(tgsi.getOpcode())); + break; + default: + ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode()); + assert(0); + break; + } + + if (tgsi.dstCount()) { + for (c = 0; c < 4; ++c) { + if (!dst0[c]) + continue; + if (dst0[c] != rDst0[c]) + mkMov(rDst0[c], dst0[c]); + storeDst(0, c, rDst0[c]); + } + } + vtxBaseValid = 0; + + return true; +} + +void +Converter::handleUserClipPlanes() +{ + Value *res[8]; + int n, i, c; + + for (c = 0; c < 4; ++c) { + for (i = 0; i < info->io.genUserClip; ++i) { + 
Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.ucpCBSlot, + TYPE_F32, info->io.ucpBase + i * 16 + c * 4); + Value *ucp = mkLoadv(TYPE_F32, sym, NULL); + if (c == 0) + res[i] = mkOp2v(OP_MUL, TYPE_F32, getScratch(), clipVtx[c], ucp); + else + mkOp3(OP_MAD, TYPE_F32, res[i], clipVtx[c], ucp, res[i]); + } + } + + const int first = info->numOutputs - (info->io.genUserClip + 3) / 4; + + for (i = 0; i < info->io.genUserClip; ++i) { + n = i / 4 + first; + c = i % 4; + Symbol *sym = + mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info->out[n].slot[c] * 4); + mkStore(OP_EXPORT, TYPE_F32, sym, NULL, res[i]); + } +} + +void +Converter::exportOutputs() +{ + for (unsigned int i = 0; i < info->numOutputs; ++i) { + for (unsigned int c = 0; c < 4; ++c) { + if (!oData.exists(sub.cur->values, i, c)) + continue; + Symbol *sym = mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, + info->out[i].slot[c] * 4); + Value *val = oData.load(sub.cur->values, i, c, NULL); + if (val) + mkStore(OP_EXPORT, TYPE_F32, sym, NULL, val); + } + } +} + +Converter::Converter(Program *ir, const tgsi::Source *code) : BuildUtil(ir), + code(code), + tgsi(NULL), + tData(this), aData(this), pData(this), oData(this) +{ + info = code->info; + + const DataFile tFile = code->mainTempsInLMem ? 
FILE_MEMORY_LOCAL : FILE_GPR; + + const unsigned tSize = code->fileSize(TGSI_FILE_TEMPORARY); + const unsigned pSize = code->fileSize(TGSI_FILE_PREDICATE); + const unsigned aSize = code->fileSize(TGSI_FILE_ADDRESS); + const unsigned oSize = code->fileSize(TGSI_FILE_OUTPUT); + + tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, tFile, 0); + pData.setup(TGSI_FILE_PREDICATE, 0, 0, pSize, 4, 4, FILE_PREDICATE, 0); + aData.setup(TGSI_FILE_ADDRESS, 0, 0, aSize, 4, 4, FILE_ADDRESS, 0); + oData.setup(TGSI_FILE_OUTPUT, 0, 0, oSize, 4, 4, FILE_GPR, 0); + + zero = mkImm((uint32_t)0); + + vtxBaseValid = 0; +} + +Converter::~Converter() +{ +} + +inline const Converter::Location * +Converter::BindArgumentsPass::getValueLocation(Subroutine *s, Value *v) +{ + ValueMap::l_iterator it = s->values.l.find(v); + return it == s->values.l.end() ? NULL : &it->second; +} + +template<typename T> inline void +Converter::BindArgumentsPass::updateCallArgs( + Instruction *i, void (Instruction::*setArg)(int, Value *), + T (Function::*proto)) +{ + Function *g = i->asFlow()->target.fn; + Subroutine *subg = conv.getSubroutine(g); + + for (unsigned a = 0; a < (g->*proto).size(); ++a) { + Value *v = (g->*proto)[a].get(); + const Converter::Location &l = *getValueLocation(subg, v); + Converter::DataArray *array = conv.getArrayForFile(l.array, l.arrayIdx); + + (i->*setArg)(a, array->acquire(sub->values, l.i, l.c)); + } +} + +template<typename T> inline void +Converter::BindArgumentsPass::updatePrototype( + BitSet *set, void (Function::*updateSet)(), T (Function::*proto)) +{ + (func->*updateSet)(); + + for (unsigned i = 0; i < set->getSize(); ++i) { + Value *v = func->getLValue(i); + const Converter::Location *l = getValueLocation(sub, v); + + // only include values with a matching TGSI register + if (set->test(i) && l && !conv.code->locals.count(*l)) + (func->*proto).push_back(v); + } +} + +bool +Converter::BindArgumentsPass::visit(Function *f) +{ + sub = conv.getSubroutine(f); + + for 
(ArrayList::Iterator bi = f->allBBlocks.iterator(); + !bi.end(); bi.next()) { + for (Instruction *i = BasicBlock::get(bi)->getFirst(); + i; i = i->next) { + if (i->op == OP_CALL && !i->asFlow()->builtin) { + updateCallArgs(i, &Instruction::setSrc, &Function::ins); + updateCallArgs(i, &Instruction::setDef, &Function::outs); + } + } + } + + if (func == prog->main && prog->getType() != Program::TYPE_COMPUTE) + return true; + updatePrototype(&BasicBlock::get(f->cfg.getRoot())->liveSet, + &Function::buildLiveSets, &Function::ins); + updatePrototype(&BasicBlock::get(f->cfgExit)->defSet, + &Function::buildDefSets, &Function::outs); + + return true; +} + +bool +Converter::run() +{ + BasicBlock *entry = new BasicBlock(prog->main); + BasicBlock *leave = new BasicBlock(prog->main); + + prog->main->setEntry(entry); + prog->main->setExit(leave); + + setPosition(entry, true); + sub.cur = getSubroutine(prog->main); + + if (info->io.genUserClip > 0) { + for (int c = 0; c < 4; ++c) + clipVtx[c] = getScratch(); + } + + if (prog->getType() == Program::TYPE_FRAGMENT) { + Symbol *sv = mkSysVal(SV_POSITION, 3); + fragCoord[3] = mkOp1v(OP_RDSV, TYPE_F32, getSSA(), sv); + mkOp1(OP_RCP, TYPE_F32, fragCoord[3], fragCoord[3]); + } + + for (ip = 0; ip < code->scan.num_instructions; ++ip) { + if (!handleInstruction(&code->insns[ip])) + return false; + } + + if (!BindArgumentsPass(*this).run(prog)) + return false; + + return true; +} + +} // unnamed namespace + +namespace nv50_ir { + +bool +Program::makeFromTGSI(struct nv50_ir_prog_info *info) +{ + tgsi::Source src(info); + if (!src.scanSource()) + return false; + tlsSize = info->bin.tlsSpace; + + Converter builder(this, &src); + return builder.run(); +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp new file mode 100644 index 0000000..3f8d00a --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp @@ -0,0 +1,436 @@ +/* + * 
Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir_graph.h"
#include <limits>
#include <list>
#include <stack>
#include "codegen/nv50_ir.h"

namespace nv50_ir {

Graph::Graph()
{
   root = NULL;
   size = 0;
   sequence = 0;
}

Graph::~Graph()
{
   // use the safe iterator: cut() deletes edges while we walk the nodes
   for (IteratorRef it = safeIteratorDFS(); !it->end(); it->next())
      reinterpret_cast<Node *>(it->get())->cut();
}

// Add @node to this graph; the first node inserted becomes the root.
void Graph::insert(Node *node)
{
   if (!node->graph)
      root = node;

   node->graph = this;
   size++;
}

// Remove this edge from the circular out/in edge lists of its endpoints.
void Graph::Edge::unlink()
{
   if (origin) {
      prev[0]->next[0] = next[0];
      next[0]->prev[0] = prev[0];
      if (origin->out == this)
         origin->out = (next[0] == this) ? NULL : next[0];

      --origin->outCount;
   }
   if (target) {
      prev[1]->next[1] = next[1];
      next[1]->prev[1] = prev[1];
      if (target->in == this)
         target->in = (next[1] == this) ? NULL : next[1];

      --target->inCount;
   }
}

const char *Graph::Edge::typeStr() const
{
   switch (type) {
   case TREE: return "tree";
   case FORWARD: return "forward";
   case BACK: return "back";
   case CROSS: return "cross";
   case DUMMY: return "dummy";
   case UNKNOWN:
   default:
      return "unk";
   }
}

Graph::Node::Node(void *priv) : data(priv),
                                in(0), out(0), graph(0),
                                visited(0),
                                inCount(0), outCount(0)
{
   // nothing to do
}

// Create an edge of kind @kind from this node to @node; whichever endpoint
// is not yet part of a graph is inserted into the other's graph.
void Graph::Node::attach(Node *node, Edge::Type kind)
{
   Edge *edge = new Edge(this, node, kind);

   // insert head
   if (this->out) {
      edge->next[0] = this->out;
      edge->prev[0] = this->out->prev[0];
      edge->prev[0]->next[0] = edge;
      this->out->prev[0] = edge;
   }
   this->out = edge;

   if (node->in) {
      edge->next[1] = node->in;
      edge->prev[1] = node->in->prev[1];
      edge->prev[1]->next[1] = edge;
      node->in->prev[1] = edge;
   }
   node->in = edge;

   ++this->outCount;
   ++node->inCount;

   assert(graph || node->graph);
   if (!node->graph)
      graph->insert(node);
   if (!graph)
      node->graph->insert(this);

   // edge kind unknown: recompute the tree/forward/back/cross labels
   if (kind == Edge::UNKNOWN)
      graph->classifyEdges();
}

// Delete the (first) edge from this node to @node; returns false if none.
bool Graph::Node::detach(Graph::Node *node)
{
   EdgeIterator ei = this->outgoing();
   for (; !ei.end(); ei.next())
      if (ei.getNode() == node)
         break;
   if (ei.end()) {
      ERROR("no such node attached\n");
      return false;
   }
   delete ei.getEdge();
   return true;
}

// Cut a node from the graph, deleting all attached edges.
void Graph::Node::cut()
{
   while (out)
      delete out;
   while (in)
      delete in;

   if (graph) {
      if (graph->root == this)
         graph->root = NULL;
      graph = NULL;
   }
}

Graph::Edge::Edge(Node *org, Node *tgt, Type kind)
{
   target = tgt;
   origin = org;
   type = kind;

   // each edge starts out as a one-element circular list
   next[0] = next[1] = this;
   prev[0] = prev[1] = this;
}

// Returns true if there is a path from @node to this node that does not
// pass through @term (BACK and DUMMY edges are not followed).
bool
Graph::Node::reachableBy(const Node *node, const Node *term) const
{
   std::stack<const Node *> stack;
   const Node *pos = NULL;
   const int seq = graph->nextSequence();

   stack.push(node);

   while (!stack.empty()) {
      pos = stack.top();
      stack.pop();

      if (pos == this)
         return true;
      if (pos == term)
         continue;

      for (EdgeIterator ei = pos->outgoing(); !ei.end(); ei.next()) {
         if (ei.getType() == Edge::BACK || ei.getType() == Edge::DUMMY)
            continue;
         if (ei.getNode()->visit(seq))
            stack.push(ei.getNode());
      }
   }
   return pos == this;
}

// Depth-first iterator over all nodes reachable from the root; the node
// array is filled eagerly in the constructor, in pre- or post-order.
class DFSIterator : public Iterator
{
public:
   DFSIterator(Graph *graph, const bool preorder)
   {
      unsigned int seq = graph->nextSequence();

      nodes = new Graph::Node * [graph->getSize() + 1];
      count = 0;
      pos = 0;
      nodes[graph->getSize()] = 0;

      if (graph->getRoot()) {
         graph->getRoot()->visit(seq);
         search(graph->getRoot(), preorder, seq);
      }
   }

   ~DFSIterator()
   {
      if (nodes)
         delete[] nodes;
   }

   void search(Graph::Node *node, const bool preorder, const int sequence)
   {
      if (preorder)
         nodes[count++] = node;

      for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
         if (ei.getNode()->visit(sequence))
            search(ei.getNode(), preorder, sequence);

      if (!preorder)
         nodes[count++] = node;
   }

   virtual bool end() const { return pos >= count; }
   virtual void next() { if (pos < count) ++pos; }
   virtual void *get() const { return nodes[pos]; }
   virtual void reset() { pos = 0; }

protected:
   Graph::Node **nodes;
   int count;
   int pos;
};

IteratorRef Graph::iteratorDFS(bool preorder)
{
   return IteratorRef(new
DFSIterator(this, preorder)); +} + +IteratorRef Graph::safeIteratorDFS(bool preorder) +{ + return this->iteratorDFS(preorder); +} + +class CFGIterator : public Iterator +{ +public: + CFGIterator(Graph *graph) + { + nodes = new Graph::Node * [graph->getSize() + 1]; + count = 0; + pos = 0; + nodes[graph->getSize()] = 0; + + // TODO: argh, use graph->sequence instead of tag and just raise it by > 1 + for (IteratorRef it = graph->iteratorDFS(); !it->end(); it->next()) + reinterpret_cast<Graph::Node *>(it->get())->tag = 0; + + if (graph->getRoot()) + search(graph->getRoot(), graph->nextSequence()); + } + + ~CFGIterator() + { + if (nodes) + delete[] nodes; + } + + virtual void *get() const { return nodes[pos]; } + virtual bool end() const { return pos >= count; } + virtual void next() { if (pos < count) ++pos; } + virtual void reset() { pos = 0; } + +private: + void search(Graph::Node *node, const int sequence) + { + Stack bb, cross; + + bb.push(node); + + while (bb.getSize()) { + node = reinterpret_cast<Graph::Node *>(bb.pop().u.p); + assert(node); + if (!node->visit(sequence)) + continue; + node->tag = 0; + + for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) { + switch (ei.getType()) { + case Graph::Edge::TREE: + case Graph::Edge::FORWARD: + case Graph::Edge::DUMMY: + if (++(ei.getNode()->tag) == ei.getNode()->incidentCountFwd()) + bb.push(ei.getNode()); + break; + case Graph::Edge::BACK: + continue; + case Graph::Edge::CROSS: + if (++(ei.getNode()->tag) == 1) + cross.push(ei.getNode()); + break; + default: + assert(!"unknown edge kind in CFG"); + break; + } + } + nodes[count++] = node; + + if (bb.getSize() == 0) + cross.moveTo(bb); + } + } + +private: + Graph::Node **nodes; + int count; + int pos; +}; + +IteratorRef Graph::iteratorCFG() +{ + return IteratorRef(new CFGIterator(this)); +} + +IteratorRef Graph::safeIteratorCFG() +{ + return this->iteratorCFG(); +} + +void Graph::classifyEdges() +{ + int seq; + + for (IteratorRef it = 
iteratorDFS(true); !it->end(); it->next()) { + Node *node = reinterpret_cast<Node *>(it->get()); + node->visit(0); + node->tag = 0; + } + + classifyDFS(root, (seq = 0)); + + sequence = seq; +} + +void Graph::classifyDFS(Node *curr, int& seq) +{ + Graph::Edge *edge; + Graph::Node *node; + + curr->visit(++seq); + curr->tag = 1; + + for (edge = curr->out; edge; edge = edge->next[0]) { + node = edge->target; + if (edge->type == Edge::DUMMY) + continue; + + if (node->getSequence() == 0) { + edge->type = Edge::TREE; + classifyDFS(node, seq); + } else + if (node->getSequence() > curr->getSequence()) { + edge->type = Edge::FORWARD; + } else { + edge->type = node->tag ? Edge::BACK : Edge::CROSS; + } + } + + for (edge = curr->in; edge; edge = edge->next[1]) { + node = edge->origin; + if (edge->type == Edge::DUMMY) + continue; + + if (node->getSequence() == 0) { + edge->type = Edge::TREE; + classifyDFS(node, seq); + } else + if (node->getSequence() > curr->getSequence()) { + edge->type = Edge::FORWARD; + } else { + edge->type = node->tag ? 
Edge::BACK : Edge::CROSS; + } + } + + curr->tag = 0; +} + +// @dist is indexed by Node::tag, returns -1 if no path found +int +Graph::findLightestPathWeight(Node *a, Node *b, const std::vector<int> &weight) +{ + std::vector<int> path(weight.size(), std::numeric_limits<int>::max()); + std::list<Node *> nodeList; + const int seq = nextSequence(); + + path[a->tag] = 0; + for (Node *c = a; c && c != b;) { + const int p = path[c->tag] + weight[c->tag]; + for (EdgeIterator ei = c->outgoing(); !ei.end(); ei.next()) { + Node *t = ei.getNode(); + if (t->getSequence() < seq) { + if (path[t->tag] == std::numeric_limits<int>::max()) + nodeList.push_front(t); + if (p < path[t->tag]) + path[t->tag] = p; + } + } + c->visit(seq); + Node *next = NULL; + for (std::list<Node *>::iterator n = nodeList.begin(); + n != nodeList.end(); ++n) { + if (!next || path[(*n)->tag] < path[next->tag]) + next = *n; + if ((*n) == c) { + // erase visited + n = nodeList.erase(n); + --n; + } + } + c = next; + } + if (path[b->tag] == std::numeric_limits<int>::max()) + return -1; + return path[b->tag]; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h new file mode 100644 index 0000000..b0981ff --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h @@ -0,0 +1,228 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the 
Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef __NV50_IR_GRAPH_H__
#define __NV50_IR_GRAPH_H__

#include "codegen/nv50_ir_util.h"
#include <vector>

namespace nv50_ir {

#define ITER_NODE(x) reinterpret_cast<Graph::Node *>((x).get())
#define ITER_EDGE(x) reinterpret_cast<Graph::Edge *>((x).get())

// A connected graph.
class Graph
{
public:
   class Node;

   class Edge
   {
   public:
      enum Type
      {
         UNKNOWN,
         TREE,
         FORWARD,
         BACK,
         CROSS, // e.g. loop break
         DUMMY
      };

      // NOTE(review): the declared parameter names look swapped -- the
      // definition (Edge::Edge(Node *org, Node *tgt, Type)) and
      // Node::attach() treat the first parameter as the origin and the
      // second as the target; confirm and rename.
      Edge(Node *dst, Node *src, Type kind);
      ~Edge() { unlink(); }

      inline Node *getOrigin() const { return origin; }
      inline Node *getTarget() const { return target; }

      inline Type getType() const { return type; }
      const char *typeStr() const;

   private:
      Node *origin;
      Node *target;

      Type type;
      Edge *next[2]; // next edge outgoing/incident from/to origin/target
      Edge *prev[2];

      void unlink();

      friend class Graph;
   };

   // Iterates a node's circular edge list; @dir selects the direction
   // (0 = outgoing, 1 = incident), @reverse walks the list backwards.
   class EdgeIterator : public Iterator
   {
   public:
      EdgeIterator() : e(0), t(0), d(0), rev(false) { }
      EdgeIterator(Graph::Edge *first, int dir, bool reverse)
         : d(dir), rev(reverse)
      {
         t = e = ((rev && first) ? first->prev[d] : first);
      }

      virtual void next()
      {
         Graph::Edge *n = (rev ? e->prev[d] : e->next[d]);
         e = (n == t ? NULL : n); // stop once we wrap around to the start
      }
      virtual bool end() const { return !e; }
      virtual void *get() const { return e; }

      // the node at the far end of the current edge
      inline Node *getNode() const { assert(e); return d ?
                                     e->origin : e->target; }
      inline Edge *getEdge() const { return e; }
      inline Edge::Type getType() { return e ? e->getType() : Edge::UNKNOWN; }

   private:
      Graph::Edge *e;
      Graph::Edge *t;
      int d;
      bool rev;
   };

   class Node
   {
   public:
      Node(void *);
      ~Node() { cut(); }

      void attach(Node *, Edge::Type);
      bool detach(Node *);
      void cut();

      inline EdgeIterator outgoing(bool reverse = false) const;
      inline EdgeIterator incident(bool reverse = false) const;

      inline Node *parent() const; // returns NULL if count(incident edges) != 1

      bool reachableBy(const Node *node, const Node *term) const;

      inline bool visit(int);
      inline int getSequence() const;

      inline int incidentCountFwd() const; // count of incident non-back edges
      inline int incidentCount() const { return inCount; }
      inline int outgoingCount() const { return outCount; }

      Graph *getGraph() const { return graph; }

      void *data;

   private:
      Edge *in;
      Edge *out;
      Graph *graph;

      int visited; // sequence number of the last visit()

      int16_t inCount;
      int16_t outCount;
   public:
      int tag; // for temporary use

      friend class Graph;
   };

public:
   Graph();
   ~Graph(); // does *not* free the nodes (make it an option ?)

   inline Node *getRoot() const { return root; }

   inline unsigned int getSize() const { return size; }

   inline int nextSequence();

   void insert(Node *node); // attach to or set as root

   IteratorRef iteratorDFS(bool preorder = true);
   IteratorRef iteratorCFG();

   // safe iterators are unaffected by changes to the *edges* of the graph
   IteratorRef safeIteratorDFS(bool preorder = true);
   IteratorRef safeIteratorCFG();

   void classifyEdges();

   // @weights: indexed by Node::tag
   int findLightestPathWeight(Node *, Node *, const std::vector<int>& weights);

private:
   void classifyDFS(Node *, int&);

private:
   Node *root;
   unsigned int size;
   int sequence;
};

int Graph::nextSequence()
{
   return ++sequence;
}

Graph::Node *Graph::Node::parent() const
{
   if (inCount != 1)
      return NULL;
   assert(in);
   return in->origin;
}

// Returns true exactly on the first visit with sequence number @v.
bool Graph::Node::visit(int v)
{
   if (visited == v)
      return false;
   visited = v;
   return true;
}

int Graph::Node::getSequence() const
{
   return visited;
}

Graph::EdgeIterator Graph::Node::outgoing(bool reverse) const
{
   return EdgeIterator(out, 0, reverse);
}

Graph::EdgeIterator Graph::Node::incident(bool reverse) const
{
   return EdgeIterator(in, 1, reverse);
}

int Graph::Node::incidentCountFwd() const
{
   int n = 0;
   for (EdgeIterator ei = incident(); !ei.end(); ei.next())
      if (ei.getType() != Edge::BACK)
         ++n;
   return n;
}

} // namespace nv50_ir

#endif // __NV50_IR_GRAPH_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
new file mode 100644
index 0000000..255324f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
@@ -0,0 +1,420 @@
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without
restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#ifndef __NV50_IR_INLINES_H__
#define __NV50_IR_INLINES_H__

// Swap the operand order of a comparison: LT <-> GT, LE <-> GE; the bits
// above the low 3 (e.g. unordered) are preserved.
static inline CondCode reverseCondCode(CondCode cc)
{
   static const uint8_t ccRev[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };

   return static_cast<CondCode>(ccRev[cc & 7] | (cc & ~7));
}

// Logical negation of a comparison (flips the low LT/EQ/GT bits).
static inline CondCode inverseCondCode(CondCode cc)
{
   return static_cast<CondCode>(cc ^ 7);
}

static inline bool isMemoryFile(DataFile f)
{
   return (f >= FILE_MEMORY_CONST && f <= FILE_MEMORY_LOCAL);
}

// contrary to asTex(), this will never include SULD/SUST
static inline bool isTextureOp(operation op)
{
   return (op >= OP_TEX && op <= OP_TEXPREP);
}

static inline bool isSurfaceOp(operation op)
{
   return (op >= OP_SULDB && op <= OP_SULEA);
}

// Size of @ty in bytes; 0 for TYPE_NONE and unknown types.
static inline unsigned int typeSizeof(DataType ty)
{
   switch (ty) {
   case TYPE_U8:
   case TYPE_S8:
      return 1;
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return 2;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return 4;
   case TYPE_F64:
   case TYPE_U64:
   case TYPE_S64:
      return 8;
   case TYPE_B96:
      return 12;
   case TYPE_B128:
      return 16;
   default:
      return 0;
   }
}

// log2 of the size of @ty in bytes (B96 is lumped in with B128).
static inline unsigned int typeSizeofLog2(DataType ty)
{
   switch (ty) {
   case TYPE_F16:
   case TYPE_U16:
   case TYPE_S16:
      return 1;
   case TYPE_F32:
   case TYPE_U32:
   case TYPE_S32:
      return 2;
   case TYPE_F64:
   case TYPE_U64:
   case TYPE_S64:
      return 3;
   case TYPE_B96:
   case TYPE_B128:
      return 4;
   case TYPE_U8:
   case TYPE_S8:
   default:
      return 0;
   }
}

// Map a byte size (plus float/signed hints) back to a DataType.
static inline DataType typeOfSize(unsigned int size,
                                  bool flt = false, bool sgn = false)
{
   switch (size) {
   case 1: return sgn ? TYPE_S8 : TYPE_U8;
   case 2: return flt ? TYPE_F16 : (sgn ? TYPE_S16 : TYPE_U16);
   case 8: return flt ? TYPE_F64 : (sgn ? TYPE_S64 : TYPE_U64);
   case 12: return TYPE_B96;
   case 16: return TYPE_B128;
   case 4:
      return flt ? TYPE_F32 : (sgn ? TYPE_S32 : TYPE_U32);
   default:
      return TYPE_NONE;
   }
}

static inline bool isFloatType(DataType ty)
{
   return (ty >= TYPE_F16 && ty <= TYPE_F64);
}

// NOTE: deliberately does not include TYPE_S64.
static inline bool isSignedIntType(DataType ty)
{
   return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32);
}

// NOTE(review): TYPE_U64 falls through to the default and is reported as
// signed here -- confirm whether that is intended.
static inline bool isSignedType(DataType ty)
{
   switch (ty) {
   case TYPE_NONE:
   case TYPE_U8:
   case TYPE_U16:
   case TYPE_U32:
   case TYPE_B96:
   case TYPE_B128:
      return false;
   default:
      return true;
   }
}

static inline DataType intTypeToSigned(DataType ty)
{
   switch (ty) {
   case TYPE_U32: return TYPE_S32;
   case TYPE_U16: return TYPE_S16;
   case TYPE_U8: return TYPE_S8;
   default:
      return ty;
   }
}

// The ValueRef supplying the indirect address for dimension @dim, or NULL.
const ValueRef *ValueRef::getIndirect(int dim) const
{
   return isIndirect(dim) ? &insn->src(indirect[dim]) : NULL;
}

DataFile ValueRef::getFile() const
{
   return value ? value->reg.file : FILE_NULL;
}

unsigned int ValueRef::getSize() const
{
   return value ? value->reg.size : 0;
}

// The representative of the referenced value after coalescing (join).
Value *ValueRef::rep() const
{
   assert(value);
   return value->join;
}

Value *ValueDef::rep() const
{
   assert(value);
   return value->join;
}

DataFile ValueDef::getFile() const
{
   return value ?
value->reg.file : FILE_NULL; +} + +unsigned int ValueDef::getSize() const +{ + return value ? value->reg.size : 0; +} + +void ValueDef::setSSA(LValue *lval) +{ + origin = value->asLValue(); + set(lval); +} + +const LValue *ValueDef::preSSA() const +{ + return origin; +} + +Instruction *Value::getInsn() const +{ + return defs.empty() ? NULL : defs.front()->getInsn(); +} + +Instruction *Value::getUniqueInsn() const +{ + if (defs.empty()) + return NULL; + + // after regalloc, the definitions of coalesced values are linked + if (join != this) { + for (DefCIterator it = defs.begin(); it != defs.end(); ++it) + if ((*it)->get() == this) + return (*it)->getInsn(); + // should be unreachable and trigger assertion at the end + } +#ifdef DEBUG + if (reg.data.id < 0) { + int n = 0; + for (DefCIterator it = defs.begin(); n < 2 && it != defs.end(); ++it) + if ((*it)->get() == this) // don't count joined values + ++n; + if (n > 1) + WARN("value %%%i not uniquely defined\n", id); // return NULL ? + } +#endif + assert(defs.front()->get() == this); + return defs.front()->getInsn(); +} + +inline bool Instruction::constrainedDefs() const +{ + return defExists(1) || op == OP_UNION; +} + +Value *Instruction::getIndirect(int s, int dim) const +{ + return srcs[s].isIndirect(dim) ? getSrc(srcs[s].indirect[dim]) : NULL; +} + +Value *Instruction::getPredicate() const +{ + return (predSrc >= 0) ? getSrc(predSrc) : NULL; +} + +void Instruction::setFlagsDef(int d, Value *val) +{ + if (val) { + if (flagsDef < 0) + flagsDef = d; + setDef(flagsDef, val); + } else { + if (flagsDef >= 0) { + setDef(flagsDef, NULL); + flagsDef = -1; + } + } +} + +void Instruction::setFlagsSrc(int s, Value *val) +{ + flagsSrc = s; + setSrc(flagsSrc, val); +} + +Value *TexInstruction::getIndirectR() const +{ + return tex.rIndirectSrc >= 0 ? getSrc(tex.rIndirectSrc) : NULL; +} + +Value *TexInstruction::getIndirectS() const +{ + return tex.rIndirectSrc >= 0 ? 
getSrc(tex.rIndirectSrc) : NULL; +} + +CmpInstruction *Instruction::asCmp() +{ + if (op >= OP_SET_AND && op <= OP_SLCT && op != OP_SELP) + return static_cast<CmpInstruction *>(this); + return NULL; +} + +const CmpInstruction *Instruction::asCmp() const +{ + if (op >= OP_SET_AND && op <= OP_SLCT && op != OP_SELP) + return static_cast<const CmpInstruction *>(this); + return NULL; +} + +FlowInstruction *Instruction::asFlow() +{ + if (op >= OP_BRA && op <= OP_JOIN) + return static_cast<FlowInstruction *>(this); + return NULL; +} + +const FlowInstruction *Instruction::asFlow() const +{ + if (op >= OP_BRA && op <= OP_JOINAT) + return static_cast<const FlowInstruction *>(this); + return NULL; +} + +TexInstruction *Instruction::asTex() +{ + if (op >= OP_TEX && op <= OP_SULEA) + return static_cast<TexInstruction *>(this); + return NULL; +} + +const TexInstruction *Instruction::asTex() const +{ + if (op >= OP_TEX && op <= OP_SULEA) + return static_cast<const TexInstruction *>(this); + return NULL; +} + +static inline Instruction *cloneForward(Function *ctx, Instruction *obj) +{ + DeepClonePolicy<Function> pol(ctx); + + for (int i = 0; obj->srcExists(i); ++i) + pol.set(obj->getSrc(i), obj->getSrc(i)); + + return obj->clone(pol); +} + +// XXX: use a virtual function so we're really really safe ? 
LValue *Value::asLValue()
{
   if (reg.file >= FILE_GPR && reg.file <= FILE_ADDRESS)
      return static_cast<LValue *>(this);
   return NULL;
}

Symbol *Value::asSym()
{
   if (reg.file >= FILE_MEMORY_CONST)
      return static_cast<Symbol *>(this);
   return NULL;
}

const Symbol *Value::asSym() const
{
   if (reg.file >= FILE_MEMORY_CONST)
      return static_cast<const Symbol *>(this);
   return NULL;
}

void Symbol::setOffset(int32_t offset)
{
   reg.data.offset = offset;
}

void Symbol::setAddress(Symbol *base, int32_t offset)
{
   baseSym = base;
   reg.data.offset = offset;
}

void Symbol::setSV(SVSemantic sv, uint32_t index)
{
   reg.data.sv.sv = sv;
   reg.data.sv.index = index;
}

ImmediateValue *Value::asImm()
{
   if (reg.file == FILE_IMMEDIATE)
      return static_cast<ImmediateValue *>(this);
   return NULL;
}

const ImmediateValue *Value::asImm() const
{
   if (reg.file == FILE_IMMEDIATE)
      return static_cast<const ImmediateValue *>(this);
   return NULL;
}

Value *Value::get(Iterator &it)
{
   return reinterpret_cast<Value *>(it.get());
}

// True if @by can reach this block in the CFG without passing through @term.
bool BasicBlock::reachableBy(const BasicBlock *by, const BasicBlock *term)
{
   return cfg.reachableBy(&by->cfg, &term->cfg);
}

BasicBlock *BasicBlock::get(Iterator &iter)
{
   return reinterpret_cast<BasicBlock *>(iter.get());
}

BasicBlock *BasicBlock::get(Graph::Node *node)
{
   assert(node);
   return reinterpret_cast<BasicBlock *>(node->data);
}

Function *Function::get(Graph::Node *node)
{
   assert(node);
   return reinterpret_cast<Function *>(node->data);
}

LValue *Function::getLValue(int id)
{
   assert((unsigned int)id < (unsigned int)allLValues.getSize());
   return reinterpret_cast<LValue *>(allLValues.get(id));
}

#endif // __NV50_IR_INLINES_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
new file mode 100644
index 0000000..56eaad3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -0,0 +1,1101 @@
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nv50.h"

namespace nv50_ir {

// nv50 doesn't support 32 bit integer multiplication
//
//       ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
//       -------------------
//    al*bh 00           HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
// ah*bh 00 00                 ( carry1) << 16 + ( carry2)
//       al*bl
//    ah*bl 00
//
// fffe0001 + fffe0001
//
// Expand a full-width integer MUL of @mul's source type into half-width
// partial products; returns false for unsupported source types.
static bool
expandIntegerMUL(BuildUtil *bld, Instruction *mul)
{
   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;

   DataType fTy = mul->sType; // full type
   DataType hTy;              // half-width type
   switch (fTy) {
   case TYPE_S32: hTy = TYPE_S16; break;
   case TYPE_U32: hTy = TYPE_U16; break;
   case TYPE_U64: hTy = TYPE_U32; break;
   case TYPE_S64: hTy = TYPE_S32; break;
   default:
      return false;
   }
   unsigned int fullSize = typeSizeof(fTy);
   unsigned int halfSize = typeSizeof(hTy);

   Instruction *i[9];

   bld->setPosition(mul, true);

   Value *a[2], *b[2];
   Value *c[2];
   Value *t[4];
   for (int j = 0; j < 4; ++j)
      t[j] = bld->getSSA(fullSize);

   // split sources into halves
   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));

   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);

   if (highResult) {
      Value *r[3];
      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
      c[0] = bld->getSSA(1, FILE_FLAGS);
      c[1] = bld->getSSA(1, FILE_FLAGS);
      for (int j = 0; j < 3; ++j)
         r[j] = bld->getSSA(fullSize);

      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);

      // set carry defs / sources
      i[3]->setFlagsDef(1, c[0]);
      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
      i[6]->setPredicate(CC_C, c[0]);
      i[5]->setFlagsSrc(3, c[1]);
   } else {
      bld->mkMov(mul->getDef(0), t[3]);
   }
   delete_Instruction(bld->getProgram(), mul);

   // the partial products operate on the half-width type
   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
      if (i[j])
         i[j]->sType = hTy;

   return true;
}

#define QOP_ADD 0
#define QOP_SUBR 1
#define QOP_SUB 2
#define QOP_MOV2 3

// UL UR LL LR
#define QUADOP(q, r, s, t)            \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))

class NV50LegalizePostRA : public Pass
{
private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   void handlePRERET(FlowInstruction *);
   void replaceZero(Instruction *);

   // GPR 63: substituted for immediate-zero sources (see replaceZero)
   LValue *r63;
};

bool
NV50LegalizePostRA::visit(Function *fn)
{
   Program *prog = fn->getProgram();

   r63 = new_LValue(fn, FILE_GPR);
   r63->reg.data.id = 63;

   // this is actually per-program, but we can do it all on visiting main()
   std::list<Instruction *> *outWrites =
      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);

   if (outWrites) {
      // re-attach the deferred output writes saved by NV50LegalizeSSA:
      // make the defining instruction write the output directly
      for (std::list<Instruction *>::iterator it = outWrites->begin();
           it != outWrites->end(); ++it)
         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
      // instructions will be deleted on exit
      outWrites->clear();
   }

   return true;
}

// Substitute every immediate-zero source of @i with $r63.
void
NV50LegalizePostRA::replaceZero(Instruction *i)
{
   for (int s = 0; i->srcExists(s); ++s) {
      ImmediateValue *imm = i->getSrc(s)->asImm();
      if (imm && imm->reg.data.u64 == 0)
         i->setSrc(s, r63);
   }
}

// Emulate PRERET: jump to the target and call to the origin from there
//
// WARNING: atm only works if BBs are affected by at most a single PRERET
//
// BB:0
// preret BB:3
// (...)
// BB:3
// (...)
// --->
// BB:0
// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
// (...)
// BB:3
// bra BB:3 + n1 (skip the call)
// call BB:0 + n2 (skip bra at beginning of BB:0)
// (...)
void
NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
{
   BasicBlock *bbE = pre->bb;        // origin block of the PRERET
   BasicBlock *bbT = pre->target.bb; // its target block

   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
   // move the PRERET to the beginning of its BB (n0 in the diagram above)
   bbE->remove(pre);
   bbE->insertHead(pre);

   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);

   bbT->insertHead(call);
   bbT->insertHead(skip);

   // NOTE: maybe split blocks to prevent the instructions from moving ?

   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
}

bool
NV50LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->isNop()) {
         bb->remove(i);
      } else
      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
         // pre-nva0 chips need PRERET emulated
         handlePRERET(i->asFlow());
      } else {
         // TODO: We will want to do this before register allocation,
         // since have to use a $c register for the carry flag.
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
            if (hi)
               next = hi; // process the newly split upper half next
         }

         // these ops cannot take $r63 in place of an immediate zero
         if (i->op != OP_MOV && i->op != OP_PFETCH &&
             i->op != OP_BAR &&
             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
            replaceZero(i);
      }
   }
   // NOTE(review): both paths below return true, which makes the
   // getEntry() check redundant -- confirm whether logic is missing here.
   if (!bb->getEntry())
      return true;

   return true;
}

// SSA-level legalization for nv50: expands unsupported integer multiplies,
// enforces the $a (address register) usage restrictions, and optionally
// folds output writes into their defining instructions.
class NV50LegalizeSSA : public Pass
{
public:
   NV50LegalizeSSA(Program *);

   virtual bool visit(BasicBlock *bb);

private:
   void propagateWriteToOutput(Instruction *);
   void handleDIV(Instruction *);
   void handleMOD(Instruction *);
   void handleMUL(Instruction *);
   void handleAddrDef(Instruction *);

   inline bool isARL(const Instruction *) const;

   BuildUtil bld;

   // per-program list of deferred output writes (NULL if disabled)
   std::list<Instruction *> *outWrites;
};

NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
{
   bld.setProgram(prog);

   // output-write propagation is only enabled for vertex/geometry
   // programs at optLevel >= 2
   if (prog->optLevel >= 2 &&
       (prog->getType() == Program::TYPE_GEOMETRY ||
        prog->getType() == Program::TYPE_VERTEX))
      outWrites =
         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
   else
      outWrites = NULL;
}

// Try to fold an output store: if the stored value has a single use and a
// foldable defining instruction, queue the store so the def can write the
// output directly after register allocation.
void
NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
{
   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
      return;

   // check def instruction can store
   Instruction *di = st->getSrc(1)->defs.front()->getInsn();

   // TODO: move exports (if beneficial) in common opt pass
   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
      return;
   for (int s = 0; di->srcExists(s); ++s)
      if (di->src(s).getFile() == FILE_IMMEDIATE)
         return;

   // We cannot set defs to non-lvalues before register allocation, so
   // save & remove (to save registers) the exports and replace later.
   outWrites->push_back(st);
   st->bb->remove(st);
}

// True for an "address register load": SHL of a GPR by an immediate 0.
bool
NV50LegalizeSSA::isARL(const Instruction *i) const
{
   ImmediateValue imm;

   if (i->op != OP_SHL || i->src(0).getFile() != FILE_GPR)
      return false;
   if (!i->src(1).getImmediate(imm))
      return false;
   return imm.isInteger(0);
}

// Legalize an instruction defining a $a (address) register.
void
NV50LegalizeSSA::handleAddrDef(Instruction *i)
{
   Instruction *arl;

   i->getDef(0)->reg.size = 2; // $aX are only 16 bit

   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
         return;
      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
         return;
   }

   // turn $a sources into $r sources (can't operate on $a)
   for (int s = 0; i->srcExists(s); ++s) {
      Value *a = i->getSrc(s);
      Value *r;
      if (a->reg.file == FILE_ADDRESS) {
         if (a->getInsn() && isARL(a->getInsn())) {
            // reuse the GPR the ARL was fed with directly
            i->setSrc(s, a->getInsn()->getSrc(0));
         } else {
            bld.setPosition(i, false);
            r = bld.getSSA();
            bld.mkMov(r, a);
            i->setSrc(s, r);
         }
      }
   }
   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
      return;

   // turn result back into $a
   bld.setPosition(i, true);
   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
   i->setDef(0, arl->getSrc(0));
}

// Legalize a wide (> 16 bit) integer MUL or MAD; the MAD case is first
// split into a separate MUL + ADD.
void
NV50LegalizeSSA::handleMUL(Instruction *mul)
{
   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
      return;
   Value *def = mul->getDef(0);
   Value *pred = mul->getPredicate();
   CondCode cc = mul->cc;
   // temporarily strip the predicate while rewriting
   if (pred)
      mul->setPredicate(CC_ALWAYS, NULL);

   if (mul->op == OP_MAD) {
      Instruction *add = mul;
      bld.setPosition(add, false);
      Value *res = cloneShallow(func, mul->getDef(0));
      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
      add->op = OP_ADD;
      add->setSrc(0, mul->getDef(0));
      add->setSrc(1, add->getSrc(2));
      for (int s = 2; add->srcExists(s); ++s)
         add->setSrc(s, NULL);
      mul->subOp = add->subOp;
+ add->subOp = 0; + } + expandIntegerMUL(&bld, mul); + if (pred) + def->getInsn()->setPredicate(cc, pred); +} + +// Use f32 division: first compute an approximate result, use it to reduce +// the dividend, which should then be representable as f32, divide the reduced +// dividend, and add the quotients. +void +NV50LegalizeSSA::handleDIV(Instruction *div) +{ + const DataType ty = div->sType; + + if (ty != TYPE_U32 && ty != TYPE_S32) + return; + + Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond; + + bld.setPosition(div, false); + + Value *a, *af = bld.getSSA(); + Value *b, *bf = bld.getSSA(); + + bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0)); + bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1)); + + if (isSignedType(ty)) { + af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); + bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS); + a = bld.getSSA(); + b = bld.getSSA(); + bld.mkOp1(OP_ABS, ty, a, div->getSrc(0)); + bld.mkOp1(OP_ABS, ty, b, div->getSrc(1)); + } else { + a = div->getSrc(0); + b = div->getSrc(1); + } + + bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf); + bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2)); + + bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z; + bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z; + + // get error of 1st result + expandIntegerMUL(&bld, + bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b)); + bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t); + + bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf); + + bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z; + bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf) + ->rnd = ROUND_Z; + bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients + + // correction: if modulus >= divisor, add 1 + expandIntegerMUL(&bld, + bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b)); + bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t); + 
bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b); + if (!isSignedType(ty)) { + div->op = OP_SUB; + div->setSrc(0, q); + div->setSrc(1, s); + } else { + t = q; + bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s); + s = bld.getSSA(); + t = bld.getSSA(); + // fix the sign + bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1)) + ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS))); + bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond); + bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond); + + div->op = OP_UNION; + div->setSrc(0, s); + div->setSrc(1, t); + } +} + +void +NV50LegalizeSSA::handleMOD(Instruction *mod) +{ + if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32) + return; + bld.setPosition(mod, false); + + Value *q = bld.getSSA(); + Value *m = bld.getSSA(); + + bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1)); + handleDIV(q->getInsn()); + + bld.setPosition(mod, false); + expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1))); + + mod->op = OP_SUB; + mod->setSrc(1, m); +} + +bool +NV50LegalizeSSA::visit(BasicBlock *bb) +{ + Instruction *insn, *next; + // skipping PHIs (don't pass them to handleAddrDef) ! 
+ for (insn = bb->getEntry(); insn; insn = next) { + next = insn->next; + + switch (insn->op) { + case OP_EXPORT: + if (outWrites) + propagateWriteToOutput(insn); + break; + case OP_DIV: + handleDIV(insn); + break; + case OP_MOD: + handleMOD(insn); + break; + case OP_MAD: + case OP_MUL: + handleMUL(insn); + break; + default: + break; + } + + if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS) + handleAddrDef(insn); + } + return true; +} + +class NV50LoweringPreSSA : public Pass +{ +public: + NV50LoweringPreSSA(Program *); + +private: + virtual bool visit(Instruction *); + virtual bool visit(Function *); + + bool handleRDSV(Instruction *); + bool handleWRSV(Instruction *); + + bool handleEXPORT(Instruction *); + + bool handleDIV(Instruction *); + bool handleSQRT(Instruction *); + bool handlePOW(Instruction *); + + bool handleSET(Instruction *); + bool handleSLCT(CmpInstruction *); + bool handleSELP(Instruction *); + + bool handleTEX(TexInstruction *); + bool handleTXB(TexInstruction *); // I really + bool handleTXL(TexInstruction *); // hate + bool handleTXD(TexInstruction *); // these 3 + + bool handleCALL(Instruction *); + bool handlePRECONT(Instruction *); + bool handleCONT(Instruction *); + + void checkPredicate(Instruction *); + +private: + const Target *const targ; + + BuildUtil bld; + + Value *tid; +}; + +NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) : + targ(prog->getTarget()), tid(NULL) +{ + bld.setProgram(prog); +} + +bool +NV50LoweringPreSSA::visit(Function *f) +{ + BasicBlock *root = BasicBlock::get(func->cfg.getRoot()); + + if (prog->getType() == Program::TYPE_COMPUTE) { + // Add implicit "thread id" argument in $r0 to the function + Value *arg = new_LValue(func, FILE_GPR); + arg->reg.data.id = 0; + f->ins.push_back(arg); + + bld.setPosition(root, false); + tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0); + } + + return true; +} + +bool +NV50LoweringPreSSA::handleTEX(TexInstruction *i) +{ + const int arg = 
i->tex.target.getArgCount(); + const int dref = arg; + const int lod = i->tex.target.isShadow() ? (arg + 1) : arg; + + // dref comes before bias/lod + if (i->tex.target.isShadow()) + if (i->op == OP_TXB || i->op == OP_TXL) + i->swapSources(dref, lod); + + // array index must be converted to u32 + if (i->tex.target.isArray()) { + Value *layer = i->getSrc(arg - 1); + LValue *src = new_LValue(func, FILE_GPR); + bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer); + bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511)); + i->setSrc(arg - 1, src); + + if (i->tex.target.isCube()) { + std::vector<Value *> acube, a2d; + int c; + + acube.resize(4); + for (c = 0; c < 4; ++c) + acube[c] = i->getSrc(c); + a2d.resize(4); + for (c = 0; c < 3; ++c) + a2d[c] = new_LValue(func, FILE_GPR); + a2d[3] = NULL; + + bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s, + a2d, acube)->asTex()->tex.mask = 0x7; + + for (c = 0; c < 3; ++c) + i->setSrc(c, a2d[c]); + i->setSrc(c, NULL); + for (; i->srcExists(c + 1); ++c) + i->setSrc(c, i->getSrc(c + 1)); + + i->tex.target = i->tex.target.isShadow() ? + TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY; + } + } + + // texel offsets are 3 immediate fields in the instruction, + // nv50 cannot do textureGatherOffsets + assert(i->tex.useOffsets <= 1); + + return true; +} + +// Bias must be equal for all threads of a quad or lod calculation will fail. +// +// The lanes of a quad are grouped by the bit in the condition register they +// have set, which is selected by differing bias values. +// Move the input values for TEX into a new register set for each group and +// execute TEX only for a specific group. +// We always need to use 4 new registers for the inputs/outputs because the +// implicitly calculated derivatives must be correct. 
+//
+// TODO: move to SSA phase so we can easily determine whether bias is constant
+bool
+NV50LoweringPreSSA::handleTXB(TexInstruction *i)
+{
+   // condition codes selecting the 4 possible lane groups (see comment above)
+   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
+   int l, d;
+
+   handleTEX(i);
+   Value *bias = i->getSrc(i->tex.target.getArgCount());
+   if (bias->isUniform())
+      return true; // same bias in all lanes, no splitting required
+
+   // build a per-lane group id: lane group 0 contributes 1, groups 1..3
+   // contribute (1 << l) when their bias matches lane l's bias
+   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
+                                 bld.loadImm(NULL, 1));
+   bld.setPosition(cond, false);
+
+   for (l = 1; l < 4; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *bit = bld.getSSA();
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      Value *imm = bld.loadImm(NULL, (1 << l));
+      // compare this lane's bias against lane l's bias (quadop subtract)
+      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
+      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
+      cond->setSrc(l, bit);
+   }
+   Value *flags = bld.getScratch(1, FILE_FLAGS);
+   bld.setPosition(cond, true);
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+
+   // emit one predicated TEX per lane group
+   Instruction *tex[4];
+   for (l = 0; l < 4; ++l) {
+      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
+      bld.insert(tex[l]);
+   }
+
+   // merge the per-group results: group 0's defs are taken as-is, the other
+   // groups are moved in under the same predicate as their TEX
+   Value *res[4][4];
+   for (d = 0; i->defExists(d); ++d)
+      res[0][d] = tex[0]->getDef(d);
+   for (l = 1; l < 4; ++l) {
+      for (d = 0; tex[l]->defExists(d); ++d) {
+         res[l][d] = cloneShallow(func, res[0][d]);
+         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
+      }
+   }
+
+   for (d = 0; i->defExists(d); ++d) {
+      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
+      for (l = 0; l < 4; ++l)
+         dst->setSrc(l, res[l][d]);
+   }
+   delete_Instruction(prog, i);
+   return true;
+}
+
+// LOD must be equal for all threads of a quad.
+// Unlike with TXB, here we can just diverge since there's no LOD calculation
+// that would require all 4 threads' sources to be set up properly.
+bool
+NV50LoweringPreSSA::handleTXL(TexInstruction *i)
+{
+   handleTEX(i);
+   Value *lod = i->getSrc(i->tex.target.getArgCount());
+   if (lod->isUniform())
+      return true; // same lod in all lanes, no divergence needed
+
+   // isolate the TEX in its own block and build a chain of compare+branch
+   // blocks in front of it, one per lane, all converging on a join block
+   BasicBlock *currBB = i->bb;
+   BasicBlock *texiBB = i->bb->splitBefore(i, false);
+   BasicBlock *joinBB = i->bb->splitAfter(i);
+
+   bld.setPosition(currBB, true);
+   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+
+   for (int l = 0; l <= 3; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      bld.setPosition(currBB, true);
+      // lanes whose lod equals lane l's lod take the branch to the TEX
+      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
+      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
+      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
+      if (l <= 2) {
+         BasicBlock *laneBB = new BasicBlock(func);
+         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
+         currBB = laneBB;
+      }
+   }
+   bld.setPosition(joinBB, false);
+   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
+   return true;
+}
+
+// Lower TXD (explicit derivatives) into a quadop sequence: for each lane,
+// broadcast its coordinates, add its dPdx/dPdy to the neighbour lanes so the
+// implicit derivatives come out right, run a plain TEX, and keep only that
+// lane's result; the 4 per-lane results are then merged with OP_UNION.
+bool
+NV50LoweringPreSSA::handleTXD(TexInstruction *i)
+{
+   static const uint8_t qOps[4][2] =
+   {
+      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
+      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
+   };
+   Value *def[4][4];
+   Value *crd[3];
+   Instruction *tex;
+   Value *zero = bld.loadImm(bld.getSSA(), 0);
+   int l, c;
+   const int dim = i->tex.target.getDim();
+
+   handleTEX(i);
+   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+
+   for (c = 0; c < dim; ++c)
+      crd[c] = bld.getScratch();
+
+   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+   for (l = 0; l < 4; ++l) {
+      // mov coordinates from lane l to all lanes
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+      // add dPdx from lane l to lanes dx
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
+      // add dPdy from lane l to lanes dy
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // texture
+      bld.insert(tex = cloneForward(func, i));
+      for (c = 0; c < dim; ++c)
+         tex->setSrc(c, crd[c]);
+      // save results
+      for (c = 0; i->defExists(c); ++c) {
+         Instruction *mov;
+         def[c][l] = bld.getSSA();
+         mov = bld.mkMov(def[c][l], tex->getDef(c));
+         mov->fixed = 1;
+         mov->lanes = 1 << l; // only lane l's copy is kept
+      }
+   }
+   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
+
+   for (c = 0; i->defExists(c); ++c) {
+      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
+      for (l = 0; l < 4; ++l)
+         u->setSrc(l, def[c][l]);
+   }
+
+   i->bb->remove(i);
+   return true;
+}
+
+// An f32 SET result is produced by computing an integer SET and converting
+// |result| (integer ~0 on true) to float, yielding 1.0f / 0.0f.
+bool
+NV50LoweringPreSSA::handleSET(Instruction *i)
+{
+   if (i->dType == TYPE_F32) {
+      bld.setPosition(i, true);
+      i->dType = TYPE_U32;
+      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
+      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
+   }
+   return true;
+}
+
+// Lower SLCT (select by comparing src2 against 0) into a SET writing a flags
+// register plus two predicated MOVs merged with OP_UNION.
+bool
+NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
+{
+   Value *src0 = bld.getSSA();
+   Value *src1 = bld.getSSA();
+   Value *pred = bld.getScratch(1, FILE_FLAGS);
+
+   Value *v0 = i->getSrc(0);
+   Value *v1 = i->getSrc(1);
+   // XXX: these probably shouldn't be immediates in the first place ...
+   if (v0->asImm())
+      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+   if (v1->asImm())
+      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+   bld.setPosition(i, true);
+   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
+   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
+   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+
+   // turn the SLCT itself into the flag-producing SET: src2 <cc> 0
+   bld.setPosition(i, false);
+   i->op = OP_SET;
+   i->setFlagsDef(0, pred);
+   i->dType = TYPE_U8;
+   i->setSrc(0, i->getSrc(2));
+   i->setSrc(2, NULL);
+   i->setSrc(1, bld.loadImm(NULL, 0));
+
+   return true;
+}
+
+// Lower SELP (select by predicate src2) into two predicated MOVs + OP_UNION.
+bool
+NV50LoweringPreSSA::handleSELP(Instruction *i)
+{
+   Value *src0 = bld.getSSA();
+   Value *src1 = bld.getSSA();
+
+   Value *v0 = i->getSrc(0);
+   Value *v1 = i->getSrc(1);
+   if (v0->asImm())
+      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+   if (v1->asImm())
+      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+   bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
+   bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
+   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+   delete_Instruction(prog, i);
+   return true;
+}
+
+// System value write: translated to an EXPORT to the output address given by
+// the target, or rejected (false) if it does not map to a shader output.
+bool
+NV50LoweringPreSSA::handleWRSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+
+   // these are all shader outputs, $sreg are not writeable
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
+   if (addr >= 0x400)
+      return false;
+   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
+
+   bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
+
+   bld.getBB()->remove(i);
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleCALL(Instruction *i)
+{
+   if (prog->getType() == Program::TYPE_COMPUTE) {
+      // Add implicit "thread id" argument in $r0 to the function
+      i->setSrc(i->srcCount(), tid);
+   }
+   return true;
+}
+
+// PRECONT markers are not needed on nv50, just drop them.
+bool
+NV50LoweringPreSSA::handlePRECONT(Instruction *i)
+{
+   delete_Instruction(prog, i);
+   return true;
+}
+
+// CONT is realized as a plain branch (back to the loop header).
+bool
+NV50LoweringPreSSA::handleCONT(Instruction *i)
+{
+   i->op = OP_BRA;
+   return true;
+}
+
+// System value read: lowered to interpolation, shared-memory loads or
+// arithmetic on the packed thread id, depending on the semantic.
+bool
+NV50LoweringPreSSA::handleRDSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
+   Value *def = i->getDef(0);
+   SVSemantic sv = sym->reg.data.sv.sv;
+   int idx = sym->reg.data.sv.index;
+
+   if (addr >= 0x400) // mov $sreg
+      return true;
+
+   switch (sv) {
+   case SV_POSITION:
+      assert(prog->getType() == Program::TYPE_FRAGMENT);
+      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+      break;
+   case SV_FACE:
+      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
+      if (i->dType == TYPE_F32) {
+         // map the face flag onto +1.0f / -1.0f via its sign bit
+         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
+         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
+      }
+      break;
+   case SV_NCTAID:
+   case SV_CTAID:
+   case SV_NTID:
+      if ((sv == SV_NCTAID && idx >= 2) ||
+          (sv == SV_NTID && idx >= 3)) {
+         bld.mkMov(def, bld.mkImm(1));
+      } else if (sv == SV_CTAID && idx >= 2) {
+         bld.mkMov(def, bld.mkImm(0));
+      } else {
+         // these values are provided in shared memory at the given address
+         Value *x = bld.getSSA(2);
+         bld.mkOp1(OP_LOAD, TYPE_U16, x,
+                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
+         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
+      }
+      break;
+   case SV_TID:
+      // tid components are packed into the implicit $r0 argument:
+      // x in bits 0..15, y in bits 16..25, z in bits 26..31
+      if (idx == 0) {
+         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
+      } else if (idx == 1) {
+         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
+         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
+      } else if (idx == 2) {
+         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
+      } else {
+         bld.mkMov(def, bld.mkImm(0));
+      }
+      break;
+   default:
+      bld.mkFetch(i->getDef(0), i->dType,
+                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
+      break;
+   }
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// Float division is approximated as a * rcp(b).
+bool
+NV50LoweringPreSSA::handleDIV(Instruction *i)
+{
+   if (!isFloatType(i->dType))
+      return true;
+   bld.setPosition(i, false);
+   Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+   i->op = OP_MUL;
+   i->setSrc(1, rcp->getDef(0));
+   return true;
+}
+
+// sqrt(x) is computed as x * rsq(x).
+bool
+NV50LoweringPreSSA::handleSQRT(Instruction *i)
+{
+   Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
+                                bld.getSSA(), i->getSrc(0));
+   i->op = OP_MUL;
+   i->setSrc(1, rsq->getDef(0));
+
+   return true;
+}
+
+// pow(x, y) = ex2(preex2(y * lg2(x)))
+bool
+NV50LoweringPreSSA::handlePOW(Instruction *i)
+{
+   LValue *val = bld.getScratch();
+
+   bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
+   bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
+   bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
+
+   i->op = OP_EX2;
+   i->setSrc(0, val);
+   i->setSrc(1, NULL);
+
+   return true;
+}
+
+// In fragment programs, direct exports become final MOVs into fixed result
+// GPRs; indirectly addressed outputs are not supported yet.
+bool
+NV50LoweringPreSSA::handleEXPORT(Instruction *i)
+{
+   if (prog->getType() == Program::TYPE_FRAGMENT) {
+      if (i->getIndirect(0, 0)) {
+         // TODO: redirect to l[] here, load to GPRs at exit
+         return false;
+      } else {
+         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
+
+         i->op = OP_MOV;
+         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
+         i->src(0).set(i->src(1));
+         i->setSrc(1, NULL);
+         i->setDef(0, new_LValue(func, FILE_GPR));
+         i->getDef(0)->reg.data.id = id;
+
+         prog->maxGPR = MAX2(prog->maxGPR, id);
+      }
+   }
+   return true;
+}
+
+// Set flags according to predicate and make the instruction read $cX.
+void +NV50LoweringPreSSA::checkPredicate(Instruction *insn) +{ + Value *pred = insn->getPredicate(); + Value *cdst; + + if (!pred || pred->reg.file == FILE_FLAGS) + return; + cdst = bld.getSSA(1, FILE_FLAGS); + + bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, bld.loadImm(NULL, 0), pred); + + insn->setPredicate(insn->cc, cdst); +} + +// +// - add quadop dance for texturing +// - put FP outputs in GPRs +// - convert instruction sequences +// +bool +NV50LoweringPreSSA::visit(Instruction *i) +{ + bld.setPosition(i, false); + + if (i->cc != CC_ALWAYS) + checkPredicate(i); + + switch (i->op) { + case OP_TEX: + case OP_TXF: + case OP_TXG: + return handleTEX(i->asTex()); + case OP_TXB: + return handleTXB(i->asTex()); + case OP_TXL: + return handleTXL(i->asTex()); + case OP_TXD: + return handleTXD(i->asTex()); + case OP_EX2: + bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); + i->setSrc(0, i->getDef(0)); + break; + case OP_SET: + return handleSET(i); + case OP_SLCT: + return handleSLCT(i->asCmp()); + case OP_SELP: + return handleSELP(i); + case OP_POW: + return handlePOW(i); + case OP_DIV: + return handleDIV(i); + case OP_SQRT: + return handleSQRT(i); + case OP_EXPORT: + return handleEXPORT(i); + case OP_RDSV: + return handleRDSV(i); + case OP_WRSV: + return handleWRSV(i); + case OP_CALL: + return handleCALL(i); + case OP_PRECONT: + return handlePRECONT(i); + case OP_CONT: + return handleCONT(i); + default: + break; + } + return true; +} + +bool +TargetNV50::runLegalizePass(Program *prog, CGStage stage) const +{ + bool ret = false; + + if (stage == CG_STAGE_PRE_SSA) { + NV50LoweringPreSSA pass(prog); + ret = pass.run(prog, false, true); + } else + if (stage == CG_STAGE_SSA) { + if (!prog->targetPriv) + prog->targetPriv = new std::list<Instruction *>(); + NV50LegalizeSSA pass(prog); + ret = pass.run(prog, false, true); + } else + if (stage == CG_STAGE_POST_RA) { + NV50LegalizePostRA pass; + ret = pass.run(prog, false, true); + if (prog->targetPriv) + delete 
reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv); + } + return ret; +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp new file mode 100644 index 0000000..8d94dd1 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -0,0 +1,1597 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+#include <limits>
+
+namespace nv50_ir {
+
+#define QOP_ADD 0
+#define QOP_SUBR 1
+#define QOP_SUB 2
+#define QOP_MOV2 3
+
+// UL UR LL LR
+#define QUADOP(q, r, s, t) \
+   ((QOP_##q << 6) | (QOP_##r << 4) | \
+    (QOP_##s << 2) | (QOP_##t << 0))
+
+// SSA-stage legalization for nvc0: integer division/modulus become calls
+// into the builtin library, double precision RCP/RSQ is still TODO.
+class NVC0LegalizeSSA : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+   virtual bool visit(Function *);
+
+   // we want to insert calls to the builtin library only after optimization
+   void handleDIV(Instruction *); // integer division, modulus
+   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt
+
+private:
+   BuildUtil bld;
+};
+
+// Replace an integer DIV/MOD with a call to the builtin division routine:
+// operands go in $r0/$r1, quotient comes back in $r0, remainder in $r1.
+void
+NVC0LegalizeSSA::handleDIV(Instruction *i)
+{
+   FlowInstruction *call;
+   int builtin;
+   Value *def[2];
+
+   bld.setPosition(i, false);
+   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
+   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
+   switch (i->dType) {
+   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
+   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
+   default:
+      return;
+   }
+   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
+   // DIV keeps $r0 (quotient), MOD keeps $r1 (remainder)
+   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
+   // declare the registers the builtin destroys besides the result
+   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
+   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
+
+   call->fixed = 1;
+   call->absolute = call->builtin = 1;
+   call->target.builtin = builtin;
+   delete_Instruction(prog, i);
+}
+
+void
+NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
+{
+   // TODO
+}
+
+bool
+NVC0LegalizeSSA::visit(Function *fn)
+{
+   bld.setProgram(fn->getProgram());
+   return true;
+}
+
+bool
+NVC0LegalizeSSA::visit(BasicBlock *bb)
+{
+   Instruction *next;
+   for (Instruction *i = bb->getEntry(); i; i = next) {
+      next = i->next;
+      if (i->dType == TYPE_F32)
+         continue; // f32 DIV/RCP/RSQ are supported natively
+      switch (i->op) {
+      case OP_DIV:
+      case OP_MOD:
+         handleDIV(i);
+         break;
+      case OP_RCP:
+      case OP_RSQ:
+         if (i->dType == TYPE_F64)
+            handleRCPRSQ(i);
+         break;
+      default:
+         break;
+      }
+   }
+   return true;
+}
+
+// Post-RA legalization for nvc0: zero-immediate replacement, CONT/JOIN
+// control flow fixups and (on >= nve0) insertion of texture barriers.
+class NVC0LegalizePostRA : public Pass
+{
+public:
+   NVC0LegalizePostRA(const Program *);
+
+private:
+   virtual bool visit(Function *);
+   virtual bool visit(BasicBlock *);
+
+   void replaceZero(Instruction *);
+   bool tryReplaceContWithBra(BasicBlock *);
+   void propagateJoin(BasicBlock *);
+
+   // a first use of a texture result, annotated with its barrier level
+   struct TexUse
+   {
+      TexUse(Instruction *use, const Instruction *tex)
+         : insn(use), tex(tex), level(-1) { }
+      Instruction *insn;
+      const Instruction *tex; // or split / mov
+      int level;
+   };
+   // min/max count of outstanding TEXes over a block (see the barrier pass)
+   struct Limits
+   {
+      Limits() { }
+      Limits(int min, int max) : min(min), max(max) { }
+      int min, max;
+   };
+   bool insertTextureBarriers(Function *);
+   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
+   void findFirstUses(const Instruction *tex, const Instruction *def,
+                      std::list<TexUse>&);
+   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
+                            const BasicBlock *term,
+                            std::list<TexUse>&);
+   void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
+   const Instruction *recurseDef(const Instruction *);
+
+private:
+   LValue *rZero; // pre-allocated $r63 stand-in for immediate 0
+   LValue *carry; // pre-allocated carry flag register
+   const bool needTexBar;
+};
+
+NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
+   : rZero(NULL),
+     carry(NULL),
+     needTexBar(prog->getTarget()->getChipset() >= 0xe0)
+{
+}
+
+// Whether 'later' is always executed after 'early' (same block: serial
+// order; different blocks: CFG dominance).
+bool
+NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
+                                    const Instruction *early) const
+{
+   if (early->bb == later->bb)
+      return early->serial < later->serial;
+   return later->bb->dominatedBy(early->bb);
+}
+
+// Add a use to the list, keeping only non-redundant entries: a use that is
+// dominated by an already recorded one needs no barrier of its own, and
+// previously recorded uses dominated by the new one are dropped.
+void
+NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
+                              Instruction *usei, const Instruction *insn)
+{
+   bool add = true;
+   for (std::list<TexUse>::iterator it = uses.begin();
+        it != uses.end();) {
+      if (insnDominatedBy(usei, it->insn)) {
+         add = false;
+         break;
+      }
+      if (insnDominatedBy(it->insn, usei))
+         it = uses.erase(it);
+      else
+         ++it;
+   }
+   if (add)
+      uses.push_back(TexUse(usei, insn));
+}
+
+// Track down definitions that may overwrite a TEX result register before its
+// value is consumed (WAW hazard), walking back through value-forwarding ops.
+void
+NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
+                                        Instruction *insn,
+                                        const BasicBlock *term,
+                                        std::list<TexUse> &uses)
+{
+   // skip coalesced copies (post-RA MOV with identical def/src register)
+   while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
+      insn = insn->getSrc(0)->getUniqueInsn();
+
+   if (!insn || !insn->bb->reachableBy(texi->bb, term))
+      return;
+
+   switch (insn->op) {
+   /* Values not connected to the tex's definition through any of these should
+    * not be conflicting.
+    */
+   case OP_SPLIT:
+   case OP_MERGE:
+   case OP_PHI:
+   case OP_UNION:
+      /* recurse again */
+      for (int s = 0; insn->srcExists(s); ++s)
+         findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
+                             uses);
+      break;
+   default:
+      // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
+      addTexUse(uses, insn, texi);
+      break;
+   }
+}
+
+// Collect the first real (machine-code) uses of a TEX's results, looking
+// through pseudo ops and coalesced moves, and recording WAW hazards from
+// PHI/UNION merges.
+void
+NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
+                                  const Instruction *insn,
+                                  std::list<TexUse> &uses)
+{
+   for (int d = 0; insn->defExists(d); ++d) {
+      Value *v = insn->getDef(d);
+      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
+         Instruction *usei = (*u)->getInsn();
+
+         if (usei->op == OP_PHI || usei->op == OP_UNION) {
+            // need a barrier before WAW cases
+            for (int s = 0; usei->srcExists(s); ++s) {
+               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
+               if (defi && &usei->src(s) != *u)
+                  findOverwritingDefs(texi, defi, usei->bb, uses);
+            }
+         }
+
+         if (usei->op == OP_SPLIT ||
+             usei->op == OP_MERGE ||
+             usei->op == OP_PHI ||
+             usei->op == OP_UNION) {
+            // these uses don't manifest in the machine code
+            findFirstUses(texi, usei, uses);
+         } else
+         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
+             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
+            findFirstUses(texi, usei, uses);
+         } else {
+            addTexUse(uses, usei, insn);
+         }
+      }
+   }
+}
+
+// Texture barriers:
+// This pass is a bit long and ugly and can probably be optimized.
+//
+// 1. obtain a list of TEXes and their outputs' first use(s)
+// 2. calculate the barrier level of each first use (minimal number of TEXes,
+// over all paths, between the TEX and the use in question)
+// 3.
for each barrier, if all paths from the source TEX to that barrier
+// contain a barrier of lesser level, it can be culled
+bool
+NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
+{
+   std::list<TexUse> *uses;
+   std::vector<Instruction *> texes;
+   std::vector<int> bbFirstTex;
+   std::vector<int> bbFirstUse;
+   std::vector<int> texCounts;
+   std::vector<TexUse> useVec;
+   ArrayList insns;
+
+   fn->orderInstructions(insns);
+
+   texCounts.resize(fn->allBBlocks.getSize(), 0);
+   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
+   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
+
+   // tag BB CFG nodes by their id for later
+   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
+      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
+      if (bb)
+         bb->cfg.tag = bb->getId();
+   }
+
+   // gather the first uses for each TEX
+   for (int i = 0; i < insns.getSize(); ++i) {
+      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
+      if (isTextureOp(tex->op)) {
+         texes.push_back(tex);
+         if (!texCounts.at(tex->bb->getId()))
+            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
+         texCounts[tex->bb->getId()]++;
+      }
+   }
+   insns.clear();
+   if (texes.empty())
+      return false;
+   uses = new std::list<TexUse>[texes.size()];
+   // NOTE(review): operator new[] throws on failure rather than returning
+   // NULL, so this check is effectively dead code
+   if (!uses)
+      return false;
+   for (size_t i = 0; i < texes.size(); ++i)
+      findFirstUses(texes[i], texes[i], uses[i]);
+
+   // determine the barrier level at each use
+   for (size_t i = 0; i < texes.size(); ++i) {
+      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
+           ++u) {
+         BasicBlock *tb = texes[i]->bb;
+         BasicBlock *ub = u->insn->bb;
+         if (tb == ub) {
+            // same block: count the TEXes issued between TEX and use
+            u->level = 0;
+            for (size_t j = i + 1; j < texes.size() &&
+                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
+                 ++j)
+               u->level++;
+         } else {
+            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
+                                                      &ub->cfg, texCounts);
+            if (u->level < 0) {
+               WARN("Failed to find path TEX -> TEXBAR\n");
+               u->level = 0;
+               continue;
+            }
+            // this counted all TEXes in the origin block, correct that
+            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
+            // and did not count the TEXes in the destination block, add those
+            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
+                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
+                 ++j)
+               u->level++;
+         }
+         assert(u->level >= 0);
+         useVec.push_back(*u);
+      }
+   }
+   delete[] uses;
+   uses = NULL;
+
+   // insert the barriers
+   for (size_t i = 0; i < useVec.size(); ++i) {
+      Instruction *prev = useVec[i].insn->prev;
+      if (useVec[i].level < 0)
+         continue;
+      if (prev && prev->op == OP_TEXBAR) {
+         // merge with an existing adjacent barrier, keeping the lower level
+         if (prev->subOp > useVec[i].level)
+            prev->subOp = useVec[i].level;
+         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
+      } else {
+         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
+         bar->fixed = 1;
+         bar->subOp = useVec[i].level;
+         // make use explicit to ease latency calculation
+         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
+         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
+      }
+   }
+
+   if (fn->getProgram()->optLevel < 3) {
+      // NOTE(review): uses is already NULL at this point, the check and
+      // delete[] below (and at the end of the function) are dead code
+      if (uses)
+         delete[] uses;
+      return true;
+   }
+
+   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single
+
+   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
+   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
+   limitS.resize(fn->allBBlocks.getSize());
+
+   // cull unneeded barriers (should do that earlier, but for simplicity)
+   IteratorRef bi = fn->cfg.iteratorCFG();
+   // first calculate min/max outstanding TEXes for each BB
+   for (bi->reset(); !bi->end(); bi->next()) {
+      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
+      BasicBlock *bb = BasicBlock::get(n);
+      int min = 0;
+      int max = std::numeric_limits<int>::max();
+      for (Instruction *i = bb->getFirst(); i; i = i->next) {
+         if (isTextureOp(i->op)) {
+            min++;
+            if (max < std::numeric_limits<int>::max())
+               max++;
+         } else
+         if (i->op == OP_TEXBAR) {
+            min = MIN2(min, i->subOp);
+            max = MIN2(max, i->subOp);
+         }
+      }
+      // limits when looking at an isolated block
+      limitS[bb->getId()].min = min;
+      limitS[bb->getId()].max = max;
+   }
+   // propagate the min/max values
+   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
+      for (bi->reset(); !bi->end(); bi->next()) {
+         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
+         BasicBlock *bb = BasicBlock::get(n);
+         const int bbId = bb->getId();
+         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
+            BasicBlock *in = BasicBlock::get(ei.getNode());
+            const int inId = in->getId();
+            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
+            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
+         }
+         // I just hope this is correct ...
+         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
+            // no barrier
+            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
+            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
+         } else {
+            // block contained a barrier
+            limitB[bbId].min = MIN2(limitS[bbId].max,
+                                    limitT[bbId].min + limitS[bbId].min);
+            limitB[bbId].max = MIN2(limitS[bbId].max,
+                                    limitT[bbId].max + limitS[bbId].min);
+         }
+      }
+   }
+   // finally delete unnecessary barriers
+   for (bi->reset(); !bi->end(); bi->next()) {
+      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
+      BasicBlock *bb = BasicBlock::get(n);
+      Instruction *prev = NULL;
+      Instruction *next;
+      int max = limitT[bb->getId()].max;
+      for (Instruction *i = bb->getFirst(); i; i = next) {
+         next = i->next;
+         if (i->op == OP_TEXBAR) {
+            if (i->subOp >= max) {
+               // a barrier of lesser level always precedes, cull this one
+               delete_Instruction(prog, i);
+               i = NULL;
+            } else {
+               max = i->subOp;
+               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
+                  delete_Instruction(prog, prev);
+                  prev = NULL;
+               }
+            }
+         } else
+         if (isTextureOp(i->op)) {
+            max++;
+         }
+         if (i && !i->isNop())
+            prev = i;
+      }
+   }
+   if (uses)
+      delete[] uses;
+   return true;
+}
+
+bool
+NVC0LegalizePostRA::visit(Function *fn)
+{
+   if (needTexBar)
+      insertTextureBarriers(fn);
+
+   rZero = new_LValue(fn, FILE_GPR);
+   carry = new_LValue(fn, FILE_FLAGS);
+
+   // $r63 (one past the allocatable GPRs) reads as zero
+   rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
+   carry->reg.data.id = 0;
+
+   return true;
+}
+
+// Replace immediate-0 sources with the zero register, except where the
+// immediate is a required field (e.g. SUCLAMP's 3rd source).
+void
+NVC0LegalizePostRA::replaceZero(Instruction *i)
+{
+   for (int s = 0; i->srcExists(s); ++s) {
+      if (s == 2 && i->op == OP_SUCLAMP)
+         continue;
+      ImmediateValue *imm = i->getSrc(s)->asImm();
+      if (imm && imm->reg.data.u64 == 0)
+         i->setSrc(s, rZero);
+   }
+}
+
+// replace CONT with BRA for single unconditional continue
+bool
+NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
+{
+   // bb must be a loop header: exactly 2 incident edges, one of them a
+   // back edge from the single continue block
+   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
+      return false;
+   Graph::EdgeIterator ei = bb->cfg.incident();
+   if (ei.getType() != Graph::Edge::BACK)
+      ei.next();
+   if (ei.getType() != Graph::Edge::BACK)
+      return false;
+   BasicBlock *contBB = BasicBlock::get(ei.getNode());
+
+   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
+       contBB->getExit()->getPredicate())
+      return false;
+   contBB->getExit()->op = OP_BRA;
+   bb->remove(bb->getEntry()); // delete PRECONT
+
+   ei.next();
+   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
+   return true;
+}
+
+// replace branches to join blocks with join ops
+void
+NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
+{
+   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
+      return;
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      BasicBlock *in = BasicBlock::get(ei.getNode());
+      Instruction *exit = in->getExit();
+      if (!exit) {
+         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
+         // there should always be a terminator instruction
+         WARN("inserted missing terminator in BB:%i\n", in->getId());
+      } else
+      if (exit->op == OP_BRA) {
+         exit->op = OP_JOIN;
+         exit->asFlow()->limit = 1; // must-not-propagate marker
+      }
+   }
+   bb->remove(bb->getEntry());
+}
+
+bool
+NVC0LegalizePostRA::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+
+   // remove pseudo operations and non-fixed no-ops,
split 64 bit operations + for (i = bb->getFirst(); i; i = next) { + next = i->next; + if (i->op == OP_EMIT || i->op == OP_RESTART) { + if (!i->getDef(0)->refCount()) + i->setDef(0, NULL); + if (i->src(0).getFile() == FILE_IMMEDIATE) + i->setSrc(0, rZero); // initial value must be 0 + } else + if (i->isNop()) { + bb->remove(i); + } else { + // TODO: Move this to before register allocation for operations that + // need the $c register ! + if (typeSizeof(i->dType) == 8) { + Instruction *hi; + hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry); + if (hi) + next = hi; + } + + if (i->op != OP_MOV && i->op != OP_PFETCH) + replaceZero(i); + } + } + if (!bb->getEntry()) + return true; + + if (!tryReplaceContWithBra(bb)) + propagateJoin(bb); + + return true; +} + +class NVC0LoweringPass : public Pass +{ +public: + NVC0LoweringPass(Program *); + +private: + virtual bool visit(Function *); + virtual bool visit(BasicBlock *); + virtual bool visit(Instruction *); + + bool handleRDSV(Instruction *); + bool handleWRSV(Instruction *); + bool handleEXPORT(Instruction *); + bool handleOUT(Instruction *); + bool handleDIV(Instruction *); + bool handleMOD(Instruction *); + bool handleSQRT(Instruction *); + bool handlePOW(Instruction *); + bool handleTEX(TexInstruction *); + bool handleTXD(TexInstruction *); + bool handleTXQ(TexInstruction *); + bool handleManualTXD(TexInstruction *); + bool handleATOM(Instruction *); + bool handleCasExch(Instruction *, bool needCctl); + void handleSurfaceOpNVE4(TexInstruction *); + + void checkPredicate(Instruction *); + + void readTessCoord(LValue *dst, int c); + + Value *loadResInfo32(Value *ptr, uint32_t off); + Value *loadMsInfo32(Value *ptr, uint32_t off); + Value *loadTexHandle(Value *ptr, unsigned int slot); + + void adjustCoordinatesMS(TexInstruction *); + void processSurfaceCoordsNVE4(TexInstruction *); + +private: + const Target *const targ; + + BuildUtil bld; + + Symbol *gMemBase; + LValue *gpEmitAddress; +}; + 
NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
{
   bld.setProgram(prog);
   gMemBase = NULL;
}

bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}

bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}

// Load a texture handle (TIC/TSC reference) for the given slot from the
// driver's resource-info constant buffer.
inline Value *
NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
{
   uint8_t b = prog->driver->io.resInfoCBSlot;
   uint32_t off = prog->driver->io.texBindBase + slot * 4;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   // index of the array-layer source (before any sample index for MS targets)
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);

   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
      // nve4+: bindless-style handle; r/s indices are combined into one value
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         WARN("indirect TEX not implemented\n");
      }
      if (i->tex.r == i->tex.s) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s = 0; // only a single cX[] value possible here
      } else {
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         // merge sampler handle into bits [20:8] of the texture handle
         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, layer);
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      for (int s = dim; s >= 1; --s)
         i->setSrc(s, i->getSrc(s - 1));
      i->setSrc(0, arrayIndex);

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      // insert indirect TIC index into bits [31:23], TSC into [22:16]
      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
      }

      i->setSrc(0, src);
   }

   // offset is last source (lod 1st, dc 2nd)
   if (i->tex.useOffsets) {
      uint32_t value = 0;
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->srcExists(s)) // move potential predicate out of the way
         i->moveSources(s, 1);
      // pack each offset component into a 4-bit nibble of the immediate
      for (n = 0; n < i->tex.useOffsets; ++n)
         for (c = 0; c < 3; ++c)
            value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4);
      i->setSrc(s, bld.loadImm(NULL, value));
   }

   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
      //
      // If TEX requires more than 4 sources, the 2nd register tuple must be
      // aligned to 4, even if it consists of just a single 4-byte register.
      //
      // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
      //
      int s = i->srcCount(0xff, true);
      if (s > 4 && s < 7) {
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 7 - s);
         while (s < 7)
            i->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}

// Emulate TXD by broadcasting each lane's coordinates+derivatives across the
// quad with quadops, issuing 4 TEXes, and recombining the per-lane results.
bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(MOV2, MOV2, ADD,  ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD,  ADD) }, // l1
      { QUADOP(MOV2, ADD,  MOV2, ADD),  QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4];
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l;
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // merge the per-lane values back into the original defs
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}

bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim();
   int arg = txd->tex.target.getArgCount();

   handleTEX(txd);
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   // hardware TXD only supports limited targets; emulate the rest
   if (dim > 2 ||
       txd->tex.target.isCube() ||
       arg > 4 ||
       txd->tex.target.isShadow())
      return handleManualTXD(txd);

   // append explicit derivative sources after the regular arguments
   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }
   return true;
}

bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   // TODO: indirect resource/sampler index
   return true;
}

// Rewrite local/shared atomics as global atomics relative to the
// corresponding base sysval (LBASE/SBASE).
bool
NVC0LoweringPass::handleATOM(Instruction *atom)
{
   SVSemantic sv;

   switch (atom->src(0).getFile()) {
   case FILE_MEMORY_LOCAL:
      sv = SV_LBASE;
      break;
   case FILE_MEMORY_SHARED:
      sv = SV_SBASE;
      break;
   default:
      assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
      return true;
   }
   Value *base =
      bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
   Value *ptr = atom->getIndirect(0, 0);

   atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
   atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
   if (ptr)
      base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
   atom->setIndirect(0, 0, base);

   return true;
}

// Fix up CAS/EXCH atomics: optionally invalidate caches first (CCTL), and
// build the 64-bit register pair CAS expects for its compare+swap operands.
bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
      return false;
   bld.setPosition(cas, true);

   if (needCctl) {
      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
      cctl->fixed = 1;
      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
      if (cas->isPredicated())
         cctl->setPredicate(cas->cc, cas->getPredicate());
   }

   if (cas->defExists(0) && cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
      // should be set to the high part of the double reg or bad things will
      // happen elsewhere in the universe.
      // Also, it sometimes returns the new value instead of the old one
      // under mysterious circumstances.
      Value *dreg = bld.getSSA(8);
      bld.setPosition(cas, false);
      bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
      cas->setSrc(1, dreg);
   }

   return true;
}

// Load one 32-bit word of surface info from the driver's constant buffer.
inline Value *
NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.resInfoCBSlot;
   off += prog->driver->io.suInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

// Load one 32-bit word of multisample info from the driver's constant buffer.
inline Value *
NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
{
   uint8_t b = prog->driver->io.msInfoCBSlot;
   off += prog->driver->io.msInfoBase;
   return bld.
      mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
}

/* On nvc0, surface info is obtained via the surface binding points passed
 * to the SULD/SUST instructions.
 * On nve4, surface info is stored in c[] and is used by various special
 * instructions, e.g. for clamping coordiantes or generating an address.
 * They couldn't just have added an equivalent to TIC now, couldn't they ?
 */
// Layout of one per-surface record in the resource-info constant buffer:
#define NVE4_SU_INFO_ADDR   0x00
#define NVE4_SU_INFO_FMT    0x04
#define NVE4_SU_INFO_DIM_X  0x08
#define NVE4_SU_INFO_PITCH  0x0c
#define NVE4_SU_INFO_DIM_Y  0x10
#define NVE4_SU_INFO_ARRAY  0x14
#define NVE4_SU_INFO_DIM_Z  0x18
#define NVE4_SU_INFO_UNK1C  0x1c
#define NVE4_SU_INFO_WIDTH  0x20
#define NVE4_SU_INFO_HEIGHT 0x24
#define NVE4_SU_INFO_DEPTH  0x28
#define NVE4_SU_INFO_TARGET 0x2c
#define NVE4_SU_INFO_CALL   0x30
#define NVE4_SU_INFO_RAW_X  0x34
#define NVE4_SU_INFO_MS_X   0x38
#define NVE4_SU_INFO_MS_Y   0x3c

#define NVE4_SU_INFO__STRIDE 0x40

#define NVE4_SU_INFO_DIM(i)  (0x08 + (i) * 8)
#define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
#define NVE4_SU_INFO_MS(i)   (0x38 + (i) * 4)

// Pick the SUCLAMP subop variant appropriate for coordinate c of the target.
static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
{
   switch (su->tex.target.getEnum()) {
   case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
   case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_1D_ARRAY: return (c == 1) ?
      NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
      NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
   case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
   default:
      assert(0);
      return 0;
   }
}

// Turn MS surface accesses into non-MS accesses by scaling the coordinates
// and adding the per-sample offsets from the MS info buffer; drops the
// sample-index source.
void
NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
{
   const uint16_t base = tex->tex.r * NVE4_SU_INFO__STRIDE;
   const int arg = tex->tex.target.getArgCount();

   if (tex->tex.target == TEX_TARGET_2D_MS)
      tex->tex.target = TEX_TARGET_2D;
   else
   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
      tex->tex.target = TEX_TARGET_2D_ARRAY;
   else
      return;

   Value *x = tex->getSrc(0);
   Value *y = tex->getSrc(1);
   Value *s = tex->getSrc(arg - 1);

   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();

   // MS_X/MS_Y hold log2 of the sample grid dimensions
   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));

   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);

   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));

   // per-sample x/y offsets, indexed by sample id
   Value *dx = loadMsInfo32(ts, 0x0);
   Value *dy = loadMsInfo32(ts, 0x4);

   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);

   tex->setSrc(0, tx);
   tex->setSrc(1, ty);
   tex->moveSources(arg, -1);
}

// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
// They're computed from the coordinates using the surface info in c[] space.
+void +NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su) +{ + Instruction *insn; + const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP; + const bool raw = + su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB; + const int idx = su->tex.r; + const int dim = su->tex.target.getDim(); + const int arg = dim + (su->tex.target.isArray() ? 1 : 0); + const uint16_t base = idx * NVE4_SU_INFO__STRIDE; + int c; + Value *zero = bld.mkImm(0); + Value *p1 = NULL; + Value *v; + Value *src[3]; + Value *bf, *eau, *off; + Value *addr, *pred; + + off = bld.getScratch(4); + bf = bld.getScratch(4); + addr = bld.getSSA(8); + pred = bld.getScratch(1, FILE_PREDICATE); + + bld.setPosition(su, false); + + adjustCoordinatesMS(su); + + // calculate clamped coordinates + for (c = 0; c < arg; ++c) { + src[c] = bld.getScratch(); + if (c == 0 && raw) + v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X); + else + v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c)); + bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero) + ->subOp = getSuClampSubOp(su, c); + } + for (; c < 3; ++c) + src[c] = zero; + + // set predicate output + if (su->tex.target == TEX_TARGET_BUFFER) { + src[0]->getInsn()->setFlagsDef(1, pred); + } else + if (su->tex.target.isArray()) { + p1 = bld.getSSA(1, FILE_PREDICATE); + src[dim]->getInsn()->setFlagsDef(1, p1); + } + + // calculate pixel offset + if (dim == 1) { + if (su->tex.target != TEX_TARGET_BUFFER) + bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff)); + } else + if (dim == 3) { + v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C); + bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1]) + ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l + + v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH); + bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0]) + ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l + } else { + assert(dim == 2); + v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH); + bld.mkOp3(OP_MADSP, 
TYPE_U32, off, src[1], v, src[0]) + ->subOp = su->tex.target.isArray() ? + NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l + } + + // calculate effective address part 1 + if (su->tex.target == TEX_TARGET_BUFFER) { + if (raw) { + bf = src[0]; + } else { + v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT); + bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero) + ->subOp = NV50_IR_SUBOP_V1(7,6,8|2); + } + } else { + Value *y = src[1]; + Value *z = src[2]; + uint16_t subOp = 0; + + switch (dim) { + case 1: + y = zero; + z = zero; + break; + case 2: + z = off; + if (!su->tex.target.isArray()) { + z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C); + subOp = NV50_IR_SUBOP_SUBFM_3D; + } + break; + default: + subOp = NV50_IR_SUBOP_SUBFM_3D; + assert(dim == 3); + break; + } + insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z); + insn->subOp = subOp; + insn->setFlagsDef(1, pred); + } + + // part 2 + v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR); + + if (su->tex.target == TEX_TARGET_BUFFER) { + eau = v; + } else { + eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v); + } + // add array layer offset + if (su->tex.target.isArray()) { + v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY); + if (dim == 1) + bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau) + ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32 + else + bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau) + ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32 + // combine predicates + assert(p1); + bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1); + } + + if (atom) { + Value *lo = bf; + if (su->tex.target == TEX_TARGET_BUFFER) { + lo = zero; + bld.mkMov(off, bf); + } + // bf == g[] address & 0xff + // eau == g[] address >> 8 + bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau); + bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau); + } else + if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) { + // Convert from u32 to u8 
address format, which is what the library code + // doing SULDP currently uses. + // XXX: can SUEAU do this ? + // XXX: does it matter that we don't mask high bytes in bf ? + // Grrr. + bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8)); + bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off); + } + + bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau); + + if (atom && su->tex.target == TEX_TARGET_BUFFER) + bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off); + + // let's just set it 0 for raw access and hope it works + v = raw ? + bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT); + + // get rid of old coordinate sources, make space for fmt info and predicate + su->moveSources(arg, 3 - arg); + // set 64 bit address and 32-bit format sources + su->setSrc(0, addr); + su->setSrc(1, v); + su->setSrc(2, pred); +} + +void +NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su) +{ + processSurfaceCoordsNVE4(su); + + // Who do we hate more ? The person who decided that nvc0's SULD doesn't + // have to support conversion or the person who decided that, in OpenCL, + // you don't have to specify the format here like you do in OpenGL ? + + if (su->op == OP_SULDP) { + // We don't patch shaders. Ever. + // You get an indirect call to our library blob here. + // But at least it's uniform. + FlowInstruction *call; + LValue *p[3]; + LValue *r[5]; + uint16_t base = su->tex.r * NVE4_SU_INFO__STRIDE + NVE4_SU_INFO_CALL; + + for (int i = 0; i < 4; ++i) + (r[i] = bld.getScratch(4, FILE_GPR))->reg.data.id = i; + for (int i = 0; i < 3; ++i) + (p[i] = bld.getScratch(1, FILE_PREDICATE))->reg.data.id = i; + (r[4] = bld.getScratch(8, FILE_GPR))->reg.data.id = 4; + + bld.mkMov(p[1], bld.mkImm((su->cache == CACHE_CA) ? 1 : 0), TYPE_U8); + bld.mkMov(p[2], bld.mkImm((su->cache == CACHE_CG) ? 
1 : 0), TYPE_U8); + bld.mkMov(p[0], su->getSrc(2), TYPE_U8); + bld.mkMov(r[4], su->getSrc(0), TYPE_U64); + bld.mkMov(r[2], su->getSrc(1), TYPE_U32); + + call = bld.mkFlow(OP_CALL, NULL, su->cc, su->getPredicate()); + + call->indirect = 1; + call->absolute = 1; + call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST, + prog->driver->io.resInfoCBSlot, TYPE_U32, + prog->driver->io.suInfoBase + base)); + call->setSrc(1, r[2]); + call->setSrc(2, r[4]); + for (int i = 0; i < 3; ++i) + call->setSrc(3 + i, p[i]); + for (int i = 0; i < 4; ++i) { + call->setDef(i, r[i]); + bld.mkMov(su->getDef(i), r[i]); + } + call->setDef(4, p[1]); + delete_Instruction(bld.getProgram(), su); + } + + if (su->op == OP_SUREDB || su->op == OP_SUREDP) { + // FIXME: for out of bounds access, destination value will be undefined ! + Value *pred = su->getSrc(2); + CondCode cc = CC_NOT_P; + if (su->getPredicate()) { + pred = bld.getScratch(1, FILE_PREDICATE); + cc = su->cc; + if (cc == CC_NOT_P) { + bld.mkOp2(OP_OR, TYPE_U8, pred, su->getPredicate(), su->getSrc(2)); + } else { + bld.mkOp2(OP_AND, TYPE_U8, pred, su->getPredicate(), su->getSrc(2)); + pred->getInsn()->src(1).mod = Modifier(NV50_IR_MOD_NOT); + } + } + Instruction *red = bld.mkOp(OP_ATOM, su->dType, su->getDef(0)); + red->subOp = su->subOp; + if (!gMemBase) + gMemBase = bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0); + red->setSrc(0, gMemBase); + red->setSrc(1, su->getSrc(3)); + if (su->subOp == NV50_IR_SUBOP_ATOM_CAS) + red->setSrc(2, su->getSrc(4)); + red->setIndirect(0, 0, su->getSrc(0)); + red->setPredicate(cc, pred); + delete_Instruction(bld.getProgram(), su); + handleCasExch(red, true); + } else { + su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? 
TYPE_U32 : TYPE_U8; + } +} + +bool +NVC0LoweringPass::handleWRSV(Instruction *i) +{ + Instruction *st; + Symbol *sym; + uint32_t addr; + + // must replace, $sreg are not writeable + addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym()); + if (addr >= 0x400) + return false; + sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr); + + st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), + i->getSrc(1)); + st->perPatch = i->perPatch; + + bld.getBB()->remove(i); + return true; +} + +void +NVC0LoweringPass::readTessCoord(LValue *dst, int c) +{ + Value *laneid = bld.getSSA(); + Value *x, *y; + + bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0)); + + if (c == 0) { + x = dst; + y = NULL; + } else + if (c == 1) { + x = NULL; + y = dst; + } else { + assert(c == 2); + x = bld.getSSA(); + y = bld.getSSA(); + } + if (x) + bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid); + if (y) + bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid); + + if (c == 2) { + bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y); + bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst); + } +} + +bool +NVC0LoweringPass::handleRDSV(Instruction *i) +{ + Symbol *sym = i->getSrc(0)->asSym(); + const SVSemantic sv = sym->reg.data.sv.sv; + Value *vtx = NULL; + Instruction *ld; + uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym); + + if (addr >= 0x400) { + // mov $sreg + if (sym->reg.data.sv.index == 3) { + // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID + i->op = OP_MOV; + i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 
1 : 0)); + } + return true; + } + + switch (sv) { + case SV_POSITION: + assert(prog->getType() == Program::TYPE_FRAGMENT); + bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL); + break; + case SV_FACE: + { + Value *face = i->getDef(0); + bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL); + if (i->dType == TYPE_F32) { + bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000)); + bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000)); + } + } + break; + case SV_TESS_COORD: + assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL); + readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index); + break; + case SV_NTID: + case SV_NCTAID: + case SV_GRIDID: + assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise + if (sym->reg.data.sv.index == 3) { + i->op = OP_MOV; + i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1)); + return true; + } + addr += prog->driver->prop.cp.gridInfoBase; + bld.mkLoad(TYPE_U32, i->getDef(0), + bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL); + break; + default: + if (prog->getType() == Program::TYPE_TESSELLATION_EVAL) + vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0)); + ld = bld.mkFetch(i->getDef(0), i->dType, + FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx); + ld->perPatch = i->perPatch; + break; + } + bld.getBB()->remove(i); + return true; +} + +bool +NVC0LoweringPass::handleDIV(Instruction *i) +{ + if (!isFloatType(i->dType)) + return true; + bld.setPosition(i, false); + Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1)); + i->op = OP_MUL; + i->setSrc(1, rcp->getDef(0)); + return true; +} + +bool +NVC0LoweringPass::handleMOD(Instruction *i) +{ + if (i->dType != TYPE_F32) + return true; + LValue *value = bld.getScratch(); + bld.mkOp1(OP_RCP, TYPE_F32, value, i->getSrc(1)); + bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(0), value); + bld.mkOp1(OP_TRUNC, TYPE_F32, value, value); + bld.mkOp2(OP_MUL, TYPE_F32, value, i->getSrc(1), 
value); + i->op = OP_SUB; + i->setSrc(1, value); + return true; +} + +bool +NVC0LoweringPass::handleSQRT(Instruction *i) +{ + Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, + bld.getSSA(), i->getSrc(0)); + i->op = OP_MUL; + i->setSrc(1, rsq->getDef(0)); + + return true; +} + +bool +NVC0LoweringPass::handlePOW(Instruction *i) +{ + LValue *val = bld.getScratch(); + + bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0)); + bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1; + bld.mkOp1(OP_PREEX2, TYPE_F32, val, val); + + i->op = OP_EX2; + i->setSrc(0, val); + i->setSrc(1, NULL); + + return true; +} + +bool +NVC0LoweringPass::handleEXPORT(Instruction *i) +{ + if (prog->getType() == Program::TYPE_FRAGMENT) { + int id = i->getSrc(0)->reg.data.offset / 4; + + if (i->src(0).isIndirect(0)) // TODO, ugly + return false; + i->op = OP_MOV; + i->subOp = NV50_IR_SUBOP_MOV_FINAL; + i->src(0).set(i->src(1)); + i->setSrc(1, NULL); + i->setDef(0, new_LValue(func, FILE_GPR)); + i->getDef(0)->reg.data.id = id; + + prog->maxGPR = MAX2(prog->maxGPR, id); + } else + if (prog->getType() == Program::TYPE_GEOMETRY) { + i->setIndirect(0, 1, gpEmitAddress); + } + return true; +} + +bool +NVC0LoweringPass::handleOUT(Instruction *i) +{ + if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) { + i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART; + delete_Instruction(prog, i); + } else { + assert(gpEmitAddress); + i->setDef(0, gpEmitAddress); + if (i->srcExists(0)) + i->setSrc(1, i->getSrc(0)); + i->setSrc(0, gpEmitAddress); + } + return true; +} + +// Generate a binary predicate if an instruction is predicated by +// e.g. an f32 value. 
+void +NVC0LoweringPass::checkPredicate(Instruction *insn) +{ + Value *pred = insn->getPredicate(); + Value *pdst; + + if (!pred || pred->reg.file == FILE_PREDICATE) + return; + pdst = new_LValue(func, FILE_PREDICATE); + + // CAUTION: don't use pdst->getInsn, the definition might not be unique, + // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass + + bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, bld.mkImm(0), pred); + + insn->setPredicate(insn->cc, pdst); +} + +// +// - add quadop dance for texturing +// - put FP outputs in GPRs +// - convert instruction sequences +// +bool +NVC0LoweringPass::visit(Instruction *i) +{ + bld.setPosition(i, false); + + if (i->cc != CC_ALWAYS) + checkPredicate(i); + + switch (i->op) { + case OP_TEX: + case OP_TXB: + case OP_TXL: + case OP_TXF: + case OP_TXG: + return handleTEX(i->asTex()); + case OP_TXD: + return handleTXD(i->asTex()); + case OP_TXQ: + return handleTXQ(i->asTex()); + case OP_EX2: + bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0)); + i->setSrc(0, i->getDef(0)); + break; + case OP_POW: + return handlePOW(i); + case OP_DIV: + return handleDIV(i); + case OP_MOD: + return handleMOD(i); + case OP_SQRT: + return handleSQRT(i); + case OP_EXPORT: + return handleEXPORT(i); + case OP_EMIT: + case OP_RESTART: + return handleOUT(i); + case OP_RDSV: + return handleRDSV(i); + case OP_WRSV: + return handleWRSV(i); + case OP_LOAD: + if (i->src(0).getFile() == FILE_SHADER_INPUT) { + if (prog->getType() == Program::TYPE_COMPUTE) { + i->getSrc(0)->reg.file = FILE_MEMORY_CONST; + i->getSrc(0)->reg.fileIndex = 0; + } else { + i->op = OP_VFETCH; + assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP + } + } + break; + case OP_ATOM: + { + const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL; + handleATOM(i); + handleCasExch(i, cctl); + } + break; + case OP_SULDB: + case OP_SULDP: + case OP_SUSTB: + case OP_SUSTP: + case OP_SUREDB: + case OP_SUREDP: + if (targ->getChipset() >= NVISA_GK104_CHIPSET) + 
         // Kepler+ surface ops need the NVE4-specific lowering
         handleSurfaceOpNVE4(i->asTex());
      break;
   default:
      break;
   }
   return true;
}

// Entry point used by the code generator: run the legalization/lowering pass
// appropriate for the given compilation stage.
bool
TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
{
   if (stage == CG_STAGE_PRE_SSA) {
      NVC0LoweringPass pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_POST_RA) {
      NVC0LegalizePostRA pass(prog);
      return pass.run(prog, false, true);
   } else
   if (stage == CG_STAGE_SSA) {
      NVC0LegalizeSSA pass;
      return pass.run(prog, false, true);
   }
   return false;
}

} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
new file mode 100644
index 0000000..99bd2bf
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -0,0 +1,2464 @@
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
+ */ + +#include "codegen/nv50_ir.h" +#include "codegen/nv50_ir_target.h" +#include "codegen/nv50_ir_build_util.h" + +extern "C" { +#include "util/u_math.h" +} + +namespace nv50_ir { + +bool +Instruction::isNop() const +{ + if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT) + return true; + if (terminator || join) // XXX: should terminator imply flow ? + return false; + if (op == OP_ATOM) + return false; + if (!fixed && op == OP_NOP) + return true; + + if (defExists(0) && def(0).rep()->reg.data.id < 0) { + for (int d = 1; defExists(d); ++d) + if (def(d).rep()->reg.data.id >= 0) + WARN("part of vector result is unused !\n"); + return true; + } + + if (op == OP_MOV || op == OP_UNION) { + if (!getDef(0)->equals(getSrc(0))) + return false; + if (op == OP_UNION) + if (!def(0).rep()->equals(getSrc(1))) + return false; + return true; + } + + return false; +} + +bool Instruction::isDead() const +{ + if (op == OP_STORE || + op == OP_EXPORT || + op == OP_ATOM || + op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB || + op == OP_WRSV) + return false; + + for (int d = 0; defExists(d); ++d) + if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0) + return false; + + if (terminator || asFlow()) + return false; + if (fixed) + return false; + + return true; +}; + +// ============================================================================= + +class CopyPropagation : public Pass +{ +private: + virtual bool visit(BasicBlock *); +}; + +// Propagate all MOVs forward to make subsequent optimization easier, except if +// the sources stem from a phi, in which case we don't want to mess up potential +// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def. 
+bool +CopyPropagation::visit(BasicBlock *bb) +{ + Instruction *mov, *si, *next; + + for (mov = bb->getEntry(); mov; mov = next) { + next = mov->next; + if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue()) + continue; + if (mov->getPredicate()) + continue; + if (mov->def(0).getFile() != mov->src(0).getFile()) + continue; + si = mov->getSrc(0)->getInsn(); + if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) { + // propagate + mov->def(0).replace(mov->getSrc(0), false); + delete_Instruction(prog, mov); + } + } + return true; +} + +// ============================================================================= + +class LoadPropagation : public Pass +{ +private: + virtual bool visit(BasicBlock *); + + void checkSwapSrc01(Instruction *); + + bool isCSpaceLoad(Instruction *); + bool isImmd32Load(Instruction *); + bool isAttribOrSharedLoad(Instruction *); +}; + +bool +LoadPropagation::isCSpaceLoad(Instruction *ld) +{ + return ld && ld->op == OP_LOAD && ld->src(0).getFile() == FILE_MEMORY_CONST; +} + +bool +LoadPropagation::isImmd32Load(Instruction *ld) +{ + if (!ld || (ld->op != OP_MOV) || (typeSizeof(ld->dType) != 4)) + return false; + return ld->src(0).getFile() == FILE_IMMEDIATE; +} + +bool +LoadPropagation::isAttribOrSharedLoad(Instruction *ld) +{ + return ld && + (ld->op == OP_VFETCH || + (ld->op == OP_LOAD && + (ld->src(0).getFile() == FILE_SHADER_INPUT || + ld->src(0).getFile() == FILE_MEMORY_SHARED))); +} + +void +LoadPropagation::checkSwapSrc01(Instruction *insn) +{ + if (!prog->getTarget()->getOpInfo(insn).commutative) + if (insn->op != OP_SET && insn->op != OP_SLCT) + return; + if (insn->src(1).getFile() != FILE_GPR) + return; + + Instruction *i0 = insn->getSrc(0)->getInsn(); + Instruction *i1 = insn->getSrc(1)->getInsn(); + + if (isCSpaceLoad(i0)) { + if (!isCSpaceLoad(i1)) + insn->swapSources(0, 1); + else + return; + } else + if (isImmd32Load(i0)) { + if (!isCSpaceLoad(i1) && !isImmd32Load(i1)) + insn->swapSources(0, 1); + else + 
return; + } else + if (isAttribOrSharedLoad(i1)) { + if (!isAttribOrSharedLoad(i0)) + insn->swapSources(0, 1); + else + return; + } else { + return; + } + + if (insn->op == OP_SET) + insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond); + else + if (insn->op == OP_SLCT) + insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond); +} + +bool +LoadPropagation::visit(BasicBlock *bb) +{ + const Target *targ = prog->getTarget(); + Instruction *next; + + for (Instruction *i = bb->getEntry(); i; i = next) { + next = i->next; + + if (i->op == OP_CALL) // calls have args as sources, they must be in regs + continue; + + if (i->srcExists(1)) + checkSwapSrc01(i); + + for (int s = 0; i->srcExists(s); ++s) { + Instruction *ld = i->getSrc(s)->getInsn(); + + if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV)) + continue; + if (!targ->insnCanLoad(i, s, ld)) + continue; + + // propagate ! + i->setSrc(s, ld->getSrc(0)); + if (ld->src(0).isIndirect(0)) + i->setIndirect(s, 0, ld->getIndirect(0, 0)); + + if (ld->getDef(0)->refCount() == 0) + delete_Instruction(prog, ld); + } + } + return true; +} + +// ============================================================================= + +// Evaluate constant expressions. 
+class ConstantFolding : public Pass +{ +public: + bool foldAll(Program *); + +private: + virtual bool visit(BasicBlock *); + + void expr(Instruction *, ImmediateValue&, ImmediateValue&); + void opnd(Instruction *, ImmediateValue&, int s); + + void unary(Instruction *, const ImmediateValue&); + + void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&); + + // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET + CmpInstruction *findOriginForTestWithZero(Value *); + + unsigned int foldCount; + + BuildUtil bld; +}; + +// TODO: remember generated immediates and only revisit these +bool +ConstantFolding::foldAll(Program *prog) +{ + unsigned int iterCount = 0; + do { + foldCount = 0; + if (!run(prog)) + return false; + } while (foldCount && ++iterCount < 2); + return true; +} + +bool +ConstantFolding::visit(BasicBlock *bb) +{ + Instruction *i, *next; + + for (i = bb->getEntry(); i; i = next) { + next = i->next; + if (i->op == OP_MOV || i->op == OP_CALL) + continue; + + ImmediateValue src0, src1; + + if (i->srcExists(1) && + i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1)) + expr(i, src0, src1); + else + if (i->srcExists(0) && i->src(0).getImmediate(src0)) + opnd(i, src0, 0); + else + if (i->srcExists(1) && i->src(1).getImmediate(src1)) + opnd(i, src1, 1); + } + return true; +} + +CmpInstruction * +ConstantFolding::findOriginForTestWithZero(Value *value) +{ + if (!value) + return NULL; + Instruction *insn = value->getInsn(); + + while (insn && insn->op != OP_SET) { + Instruction *next = NULL; + switch (insn->op) { + case OP_NEG: + case OP_ABS: + case OP_CVT: + next = insn->getSrc(0)->getInsn(); + if (insn->sType != next->dType) + return NULL; + break; + case OP_MOV: + next = insn->getSrc(0)->getInsn(); + break; + default: + return NULL; + } + insn = next; + } + return insn ? insn->asCmp() : NULL; +} + +void +Modifier::applyTo(ImmediateValue& imm) const +{ + if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. 
b128) + return; + switch (imm.reg.type) { + case TYPE_F32: + if (bits & NV50_IR_MOD_ABS) + imm.reg.data.f32 = fabsf(imm.reg.data.f32); + if (bits & NV50_IR_MOD_NEG) + imm.reg.data.f32 = -imm.reg.data.f32; + if (bits & NV50_IR_MOD_SAT) { + if (imm.reg.data.f32 < 0.0f) + imm.reg.data.f32 = 0.0f; + else + if (imm.reg.data.f32 > 1.0f) + imm.reg.data.f32 = 1.0f; + } + assert(!(bits & NV50_IR_MOD_NOT)); + break; + + case TYPE_S8: // NOTE: will be extended + case TYPE_S16: + case TYPE_S32: + case TYPE_U8: // NOTE: treated as signed + case TYPE_U16: + case TYPE_U32: + if (bits & NV50_IR_MOD_ABS) + imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ? + imm.reg.data.s32 : -imm.reg.data.s32; + if (bits & NV50_IR_MOD_NEG) + imm.reg.data.s32 = -imm.reg.data.s32; + if (bits & NV50_IR_MOD_NOT) + imm.reg.data.s32 = ~imm.reg.data.s32; + break; + + case TYPE_F64: + if (bits & NV50_IR_MOD_ABS) + imm.reg.data.f64 = fabs(imm.reg.data.f64); + if (bits & NV50_IR_MOD_NEG) + imm.reg.data.f64 = -imm.reg.data.f64; + if (bits & NV50_IR_MOD_SAT) { + if (imm.reg.data.f64 < 0.0) + imm.reg.data.f64 = 0.0; + else + if (imm.reg.data.f64 > 1.0) + imm.reg.data.f64 = 1.0; + } + assert(!(bits & NV50_IR_MOD_NOT)); + break; + + default: + assert(!"invalid/unhandled type"); + imm.reg.data.u64 = 0; + break; + } +} + +operation +Modifier::getOp() const +{ + switch (bits) { + case NV50_IR_MOD_ABS: return OP_ABS; + case NV50_IR_MOD_NEG: return OP_NEG; + case NV50_IR_MOD_SAT: return OP_SAT; + case NV50_IR_MOD_NOT: return OP_NOT; + case 0: + return OP_MOV; + default: + return OP_CVT; + } +} + +void +ConstantFolding::expr(Instruction *i, + ImmediateValue &imm0, ImmediateValue &imm1) +{ + struct Storage *const a = &imm0.reg, *const b = &imm1.reg; + struct Storage res; + + memset(&res.data, 0, sizeof(res.data)); + + switch (i->op) { + case OP_MAD: + case OP_FMA: + case OP_MUL: + if (i->dnz && i->dType == TYPE_F32) { + if (!isfinite(a->data.f32)) + a->data.f32 = 0.0f; + if (!isfinite(b->data.f32)) + b->data.f32 = 0.0f; + } 
+ switch (i->dType) { + case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break; + case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break; + case TYPE_S32: + case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break; + default: + return; + } + break; + case OP_DIV: + if (b->data.u32 == 0) + break; + switch (i->dType) { + case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break; + case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break; + case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break; + case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break; + default: + return; + } + break; + case OP_ADD: + switch (i->dType) { + case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break; + case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break; + case TYPE_S32: + case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break; + default: + return; + } + break; + case OP_POW: + switch (i->dType) { + case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break; + case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break; + default: + return; + } + break; + case OP_MAX: + switch (i->dType) { + case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break; + case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break; + case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break; + case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break; + default: + return; + } + break; + case OP_MIN: + switch (i->dType) { + case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break; + case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break; + case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break; + case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break; + default: + return; + } + break; + case OP_AND: + res.data.u64 = a->data.u64 & b->data.u64; + break; + case OP_OR: + res.data.u64 = a->data.u64 | b->data.u64; + break; + case OP_XOR: + 
res.data.u64 = a->data.u64 ^ b->data.u64; + break; + case OP_SHL: + res.data.u32 = a->data.u32 << b->data.u32; + break; + case OP_SHR: + switch (i->dType) { + case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break; + case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break; + default: + return; + } + break; + case OP_SLCT: + if (a->data.u32 != b->data.u32) + return; + res.data.u32 = a->data.u32; + break; + default: + return; + } + ++foldCount; + + i->src(0).mod = Modifier(0); + i->src(1).mod = Modifier(0); + + i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32)); + i->setSrc(1, NULL); + + i->getSrc(0)->reg.data = res.data; + + if (i->op == OP_MAD || i->op == OP_FMA) { + i->op = OP_ADD; + + i->setSrc(1, i->getSrc(0)); + i->src(1).mod = i->src(2).mod; + i->setSrc(0, i->getSrc(2)); + i->setSrc(2, NULL); + + ImmediateValue src0; + if (i->src(0).getImmediate(src0)) + expr(i, src0, *i->getSrc(1)->asImm()); + } else { + i->op = OP_MOV; + } +} + +void +ConstantFolding::unary(Instruction *i, const ImmediateValue &imm) +{ + Storage res; + + if (i->dType != TYPE_F32) + return; + switch (i->op) { + case OP_NEG: res.data.f32 = -imm.reg.data.f32; break; + case OP_ABS: res.data.f32 = fabsf(imm.reg.data.f32); break; + case OP_RCP: res.data.f32 = 1.0f / imm.reg.data.f32; break; + case OP_RSQ: res.data.f32 = 1.0f / sqrtf(imm.reg.data.f32); break; + case OP_LG2: res.data.f32 = log2f(imm.reg.data.f32); break; + case OP_EX2: res.data.f32 = exp2f(imm.reg.data.f32); break; + case OP_SIN: res.data.f32 = sinf(imm.reg.data.f32); break; + case OP_COS: res.data.f32 = cosf(imm.reg.data.f32); break; + case OP_SQRT: res.data.f32 = sqrtf(imm.reg.data.f32); break; + case OP_PRESIN: + case OP_PREEX2: + // these should be handled in subsequent OP_SIN/COS/EX2 + res.data.f32 = imm.reg.data.f32; + break; + default: + return; + } + i->op = OP_MOV; + i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32)); + i->src(0).mod = Modifier(0); +} + +void 
// Collapse MUL chains where one factor of each MUL is immediate:
//   mul(mul(r, imm1), imm2) -> mul(r, imm1 * imm2), or attach the constant
// as a hardware post-multiply factor when the target supports it.
ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
                                        const int s, ImmediateValue& imm2)
{
   const int t = s ? 0 : 1;
   Instruction *insn;
   Instruction *mul1 = NULL; // mul1 before mul2
   int e = 0;
   float f = imm2.reg.data.f32;
   ImmediateValue imm1;

   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);

   if (mul2->getSrc(t)->refCount() == 1) {
      insn = mul2->getSrc(t)->getInsn();
      if (!mul2->src(t).mod && insn->op == OP_MUL && insn->dType == TYPE_F32)
         mul1 = insn;
      if (mul1 && !mul1->saturate) {
         int s1;

         if (mul1->src(s1 = 0).getImmediate(imm1) ||
             mul1->src(s1 = 1).getImmediate(imm1)) {
            bld.setPosition(mul1, false);
            // a = mul r, imm1
            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
            mul1->src(s1).mod = Modifier(0);
            mul2->def(0).replace(mul1->getDef(0), false);
         } else
         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
            // c = mul a, b
            // d = mul c, imm -> d = mul_x_imm a, b
            mul1->postFactor = e;
            mul2->def(0).replace(mul1->getDef(0), false);
            if (f < 0)
               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
         }
         mul1->saturate = mul2->saturate;
         return;
      }
   }
   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
      // b = mul a, imm
      // d = mul b, c -> d = mul_x_imm a, c
      int s2, t2;
      insn = mul2->getDef(0)->uses.front()->getInsn();
      if (!insn)
         return;
      mul1 = mul2;
      mul2 = NULL;
      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
      t2 = s2 ? 0 : 1;
      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
            mul2 = insn;
      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
         mul2->postFactor = e;
         mul2->setSrc(s2, mul1->src(t));
         if (f < 0)
            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
      }
   }
}

// Strength-reduce an instruction with one immediate source @s (the other
// source is @t): MUL by 0/±1/±2/pow2, ADD of 0, DIV/MOD by constant (via
// multiply-by-reciprocal magic numbers), comparisons against 0, chained
// shifts, and unary ops on immediates.
void
ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
{
   const int t = !s;
   const operation op = i->op; // remember to detect whether we changed it

   switch (i->op) {
   case OP_MUL:
      if (i->dType == TYPE_F32)
         tryCollapseChainedMULs(i, s, imm0);

      if (imm0.isInteger(0)) {
         // x * 0 -> 0
         i->op = OP_MOV;
         i->setSrc(0, new_ImmediateValue(prog, 0u));
         i->src(0).mod = Modifier(0);
         i->setSrc(1, NULL);
      } else
      if (imm0.isInteger(1) || imm0.isInteger(-1)) {
         // x * ±1 -> ±x (modifier op of the remaining source)
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = i->src(t).mod.getOp();
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
            i->src(1).mod = 0;
         }
         if (i->op != OP_CVT)
            i->src(0).mod = 0;
         i->setSrc(1, NULL);
      } else
      if (imm0.isInteger(2) || imm0.isInteger(-2)) {
         // x * ±2 -> ±(x + x)
         if (imm0.isNegative())
            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
         i->op = OP_ADD;
         i->setSrc(s, i->getSrc(t));
         i->src(s).mod = i->src(t).mod;
      } else
      if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
         // integer x * 2^n -> x << n
         i->op = OP_SHL;
         imm0.applyLog2();
         i->setSrc(0, i->getSrc(t));
         i->src(0).mod = i->src(t).mod;
         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
         i->src(1).mod = 0;
      }
      break;
   case OP_ADD:
      if (i->usesFlags())
         break;
      if (imm0.isInteger(0)) {
         // x + 0 -> x (possibly just the source modifier op)
         if (s == 0) {
            i->setSrc(0, i->getSrc(1));
            i->src(0).mod = i->src(1).mod;
         }
         i->setSrc(1, NULL);
         i->op = i->src(0).mod.getOp();
         if (i->op != OP_CVT)
            i->src(0).mod = Modifier(0);
      }
      break;

   case OP_DIV:
      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
         break;
      bld.setPosition(i, false);
      if (imm0.reg.data.u32 == 0) {
         break;
      } else
      if (imm0.reg.data.u32 == 1) {
         i->op = OP_MOV;
         i->setSrc(1, NULL);
      } else
      if (i->dType == TYPE_U32 && imm0.isPow2()) {
         i->op = OP_SHR;
         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
      } else
      if (i->dType == TYPE_U32) {
         // unsigned division by invariant integer via multiply-high with a
         // magic constant (Granlund/Montgomery style)
         Instruction *mul;
         Value *tA, *tB;
         const uint32_t d = imm0.reg.data.u32;
         uint32_t m;
         int r, s;
         uint32_t l = util_logbase2(d);
         if (((uint32_t)1 << l) < d)
            ++l;
         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
         r = l ? 1 : 0;
         s = l ? (l - 1) : 0;

         tA = bld.getSSA();
         tB = bld.getSSA();
         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
                         bld.loadImm(NULL, m));
         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
         tA = bld.getSSA();
         if (r)
            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(r));
         else
            tA = tB;
         tB = s ? bld.getSSA() : i->getDef(0);
         bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
         if (s)
            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(s));

         delete_Instruction(prog, i);
      } else
      if (imm0.reg.data.s32 == -1) {
         i->op = OP_NEG;
         i->setSrc(1, NULL);
      } else {
         // signed division by constant: mul-high with magic + sign fixup
         LValue *tA, *tB;
         LValue *tD;
         const int32_t d = imm0.reg.data.s32;
         int32_t m;
         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
         if ((1 << l) < abs(d))
            ++l;
         if (!l)
            l = 1;
         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);

         tA = bld.getSSA();
         tB = bld.getSSA();
         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
         if (l > 1)
            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
         else
            tB = tA;
         tA = bld.getSSA();
         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, i->getSrc(0), bld.mkImm(0));
         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
         bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
         if (d < 0)
            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tB);

         delete_Instruction(prog, i);
      }
      break;

   case OP_MOD:
      // unsigned x % 2^n -> x & (2^n - 1)
      if (i->sType == TYPE_U32 && imm0.isPow2()) {
         bld.setPosition(i, false);
         i->op = OP_AND;
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
      }
      break;

   case OP_SET: // TODO: SET_AND,OR,XOR
   {
      // fold a compare-against-zero of a SET result into a single SET with
      // an adjusted condition code
      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
      CondCode cc, ccZ;
      if (i->src(t).mod != Modifier(0))
         return;
      if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
         return;
      cc = si->setCond;
      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
      if (s == 0)
         ccZ = reverseCondCode(ccZ);
      switch (ccZ) {
      case CC_LT: cc = CC_FL; break; // bool < 0 -- never true
      case CC_GE: cc = CC_TR; break; // bool >= 0 -- always true
      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
      case CC_GT: break; // bool > 0 -- bool
      case CC_NE: break; // bool != 0 -- bool
      default:
         return;
      }
      i->asCmp()->setCond = cc;
      i->setSrc(0, si->src(0));
      i->setSrc(1, si->src(1));
      i->sType = si->sType;
   }
      break;

   case OP_SHL:
   {
      if (s != 1 || i->src(0).mod != Modifier(0))
         break;
      // try to concatenate shifts
      Instruction *si = i->getSrc(0)->getInsn();
      if (!si || si->op != OP_SHL)
         break;
      ImmediateValue imm1;
      if (si->src(1).getImmediate(imm1)) {
         bld.setPosition(i, false);
         i->setSrc(0, si->getSrc(0));
         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
      }
   }
      break;

   case OP_ABS:
   case OP_NEG:
   case OP_LG2:
   case OP_RCP:
   case OP_SQRT:
   case OP_RSQ:
   case OP_PRESIN:
   case OP_SIN:
   case OP_COS:
   case OP_PREEX2:
   case OP_EX2:
      unary(i, imm0);
      break;
   default:
      return;
   }
   if (i->op != op)
      foldCount++;
}

// =============================================================================

// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
class ModifierFolding : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

bool
ModifierFolding::visit(BasicBlock *bb)
{
   const Target *target = prog->getTarget();

   Instruction *i, *next, *mi;
   Modifier mod;

   for (i = bb->getEntry(); i; i = next) {
      next = i->next;

      if (0 && i->op == OP_SUB) {
         // turn "sub" into "add neg" (do we really want this ?)
         i->op = OP_ADD;
         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
      }

      // try to fold each source's defining ABS/NEG/NOT into a src modifier
      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
         mi = i->getSrc(s)->getInsn();
         if (!mi ||
             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
            continue;
         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
            if ((i->op != OP_ADD &&
                 i->op != OP_MUL) ||
                (mi->op != OP_ABS &&
                 mi->op != OP_NEG))
               continue;
         } else
         if (i->sType != mi->dType) {
            continue;
         }
         if ((mod = Modifier(mi->op)) == Modifier(0))
            continue;
         mod *= mi->src(0).mod; // compose with the mod already on mi's source

         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
            // abs neg [abs] = abs
            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
         } else
         if ((i->op == OP_NEG) && mod.neg()) {
            assert(s == 0);
            // neg as both opcode and modifier on same insn is prohibited
            // neg neg abs = abs, neg neg = identity
            mod = mod & Modifier(~NV50_IR_MOD_NEG);
            i->op = mod.getOp();
            mod = mod & Modifier(~NV50_IR_MOD_ABS);
            if (mod == Modifier(0))
               i->op = OP_MOV;
         }

         if (target->isModSupported(i, s, mod)) {
            i->setSrc(s, mi->getSrc(0));
            i->src(s).mod *= mod;
         }
      }

      // fold a trailing SAT into the producing instruction if supported
      if (i->op == OP_SAT) {
         mi = i->getSrc(0)->getInsn();
         if (mi &&
             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
            mi->saturate = 1;
            mi->setDef(0, i->getDef(0));
            delete_Instruction(prog, i);
         }
      }
   }

   return true;
}

// =============================================================================

// MUL + ADD -> MAD/FMA
// MIN/MAX(a, a) -> a, etc.
// SLCT(a, b, const) -> cc(const) ?
a : b +// RCP(RCP(a)) -> a +// MUL(MUL(a, b), const) -> MUL_Xconst(a, b) +class AlgebraicOpt : public Pass +{ +private: + virtual bool visit(BasicBlock *); + + void handleABS(Instruction *); + bool handleADD(Instruction *); + bool tryADDToMADOrSAD(Instruction *, operation toOp); + void handleMINMAX(Instruction *); + void handleRCP(Instruction *); + void handleSLCT(Instruction *); + void handleLOGOP(Instruction *); + void handleCVT(Instruction *); + void handleSUCLAMP(Instruction *); + + BuildUtil bld; +}; + +void +AlgebraicOpt::handleABS(Instruction *abs) +{ + Instruction *sub = abs->getSrc(0)->getInsn(); + DataType ty; + if (!sub || + !prog->getTarget()->isOpSupported(OP_SAD, abs->dType)) + return; + // expect not to have mods yet, if we do, bail + if (sub->src(0).mod || sub->src(1).mod) + return; + // hidden conversion ? + ty = intTypeToSigned(sub->dType); + if (abs->dType != abs->sType || ty != abs->sType) + return; + + if ((sub->op != OP_ADD && sub->op != OP_SUB) || + sub->src(0).getFile() != FILE_GPR || sub->src(0).mod || + sub->src(1).getFile() != FILE_GPR || sub->src(1).mod) + return; + + Value *src0 = sub->getSrc(0); + Value *src1 = sub->getSrc(1); + + if (sub->op == OP_ADD) { + Instruction *neg = sub->getSrc(1)->getInsn(); + if (neg && neg->op != OP_NEG) { + neg = sub->getSrc(0)->getInsn(); + src0 = sub->getSrc(1); + } + if (!neg || neg->op != OP_NEG || + neg->dType != neg->sType || neg->sType != ty) + return; + src1 = neg->getSrc(0); + } + + // found ABS(SUB)) + abs->moveSources(1, 2); // move sources >=1 up by 2 + abs->op = OP_SAD; + abs->setType(sub->dType); + abs->setSrc(0, src0); + abs->setSrc(1, src1); + bld.setPosition(abs, false); + abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0)); +} + +bool +AlgebraicOpt::handleADD(Instruction *add) +{ + Value *src0 = add->getSrc(0); + Value *src1 = add->getSrc(1); + + if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR) + return false; + + bool changed = false; + if (!changed && 
prog->getTarget()->isOpSupported(OP_MAD, add->dType)) + changed = tryADDToMADOrSAD(add, OP_MAD); + if (!changed && prog->getTarget()->isOpSupported(OP_SAD, add->dType)) + changed = tryADDToMADOrSAD(add, OP_SAD); + return changed; +} + +// ADD(SAD(a,b,0), c) -> SAD(a,b,c) +// ADD(MUL(a,b), c) -> MAD(a,b,c) +bool +AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp) +{ + Value *src0 = add->getSrc(0); + Value *src1 = add->getSrc(1); + Value *src; + int s; + const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL; + const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0)); + Modifier mod[4]; + + if (src0->refCount() == 1 && + src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp) + s = 0; + else + if (src1->refCount() == 1 && + src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp) + s = 1; + else + return false; + + if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) || + (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb)) + return false; + + src = add->getSrc(s); + + if (src->getInsn()->postFactor) + return false; + if (toOp == OP_SAD) { + ImmediateValue imm; + if (!src->getInsn()->src(2).getImmediate(imm)) + return false; + if (!imm.isInteger(0)) + return false; + } + + mod[0] = add->src(0).mod; + mod[1] = add->src(1).mod; + mod[2] = src->getUniqueInsn()->src(0).mod; + mod[3] = src->getUniqueInsn()->src(1).mod; + + if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad) + return false; + + add->op = toOp; + add->subOp = src->getInsn()->subOp; // potentially mul-high + + add->setSrc(2, add->src(s ? 
0 : 1)); + + add->setSrc(0, src->getInsn()->getSrc(0)); + add->src(0).mod = mod[2] ^ mod[s]; + add->setSrc(1, src->getInsn()->getSrc(1)); + add->src(1).mod = mod[3]; + + return true; +} + +void +AlgebraicOpt::handleMINMAX(Instruction *minmax) +{ + Value *src0 = minmax->getSrc(0); + Value *src1 = minmax->getSrc(1); + + if (src0 != src1 || src0->reg.file != FILE_GPR) + return; + if (minmax->src(0).mod == minmax->src(1).mod) { + if (minmax->def(0).mayReplace(minmax->src(0))) { + minmax->def(0).replace(minmax->src(0), false); + minmax->bb->remove(minmax); + } else { + minmax->op = OP_CVT; + minmax->setSrc(1, NULL); + } + } else { + // TODO: + // min(x, -x) = -abs(x) + // min(x, -abs(x)) = -abs(x) + // min(x, abs(x)) = x + // max(x, -abs(x)) = x + // max(x, abs(x)) = abs(x) + // max(x, -x) = abs(x) + } +} + +void +AlgebraicOpt::handleRCP(Instruction *rcp) +{ + Instruction *si = rcp->getSrc(0)->getUniqueInsn(); + + if (si && si->op == OP_RCP) { + Modifier mod = rcp->src(0).mod * si->src(0).mod; + rcp->op = mod.getOp(); + rcp->setSrc(0, si->getSrc(0)); + } +} + +void +AlgebraicOpt::handleSLCT(Instruction *slct) +{ + if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) { + if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f)) + slct->setSrc(0, slct->getSrc(1)); + } else + if (slct->getSrc(0) != slct->getSrc(1)) { + return; + } + slct->op = OP_MOV; + slct->setSrc(1, NULL); + slct->setSrc(2, NULL); +} + +void +AlgebraicOpt::handleLOGOP(Instruction *logop) +{ + Value *src0 = logop->getSrc(0); + Value *src1 = logop->getSrc(1); + + if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR) + return; + + if (src0 == src1) { + if ((logop->op == OP_AND || logop->op == OP_OR) && + logop->def(0).mayReplace(logop->src(0))) { + logop->def(0).replace(logop->src(0), false); + delete_Instruction(prog, logop); + } + } else { + // try AND(SET, SET) -> SET_AND(SET) + Instruction *set0 = src0->getInsn(); + Instruction *set1 = src1->getInsn(); + + if (!set0 || set0->fixed || 
!set1 || set1->fixed) + return; + if (set1->op != OP_SET) { + Instruction *xchg = set0; + set0 = set1; + set1 = xchg; + if (set1->op != OP_SET) + return; + } + operation redOp = (logop->op == OP_AND ? OP_SET_AND : + logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR); + if (!prog->getTarget()->isOpSupported(redOp, set1->sType)) + return; + if (set0->op != OP_SET && + set0->op != OP_SET_AND && + set0->op != OP_SET_OR && + set0->op != OP_SET_XOR) + return; + if (set0->getDef(0)->refCount() > 1 && + set1->getDef(0)->refCount() > 1) + return; + if (set0->getPredicate() || set1->getPredicate()) + return; + // check that they don't source each other + for (int s = 0; s < 2; ++s) + if (set0->getSrc(s) == set1->getDef(0) || + set1->getSrc(s) == set0->getDef(0)) + return; + + set0 = cloneForward(func, set0); + set1 = cloneShallow(func, set1); + logop->bb->insertAfter(logop, set1); + logop->bb->insertAfter(logop, set0); + + set0->dType = TYPE_U8; + set0->getDef(0)->reg.file = FILE_PREDICATE; + set0->getDef(0)->reg.size = 1; + set1->setSrc(2, set0->getDef(0)); + set1->op = redOp; + set1->setDef(0, logop->getDef(0)); + delete_Instruction(prog, logop); + } +} + +// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0 +// nv50: +// F2I(NEG(I2F(ABS(SET)))) +void +AlgebraicOpt::handleCVT(Instruction *cvt) +{ + if (cvt->sType != TYPE_F32 || + cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0)) + return; + Instruction *insn = cvt->getSrc(0)->getInsn(); + if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32) + return; + if (insn->src(0).mod != Modifier(0)) + return; + insn = insn->getSrc(0)->getInsn(); + + // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET + if (insn && insn->op == OP_CVT && + insn->dType == TYPE_F32 && + insn->sType == TYPE_S32) { + insn = insn->getSrc(0)->getInsn(); + if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 || + insn->src(0).mod) + return; + insn = insn->getSrc(0)->getInsn(); + if (!insn || insn->op != OP_SET || 
insn->dType != TYPE_U32) + return; + } else + if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) { + return; + } + + Instruction *bset = cloneShallow(func, insn); + bset->dType = TYPE_U32; + bset->setDef(0, cvt->getDef(0)); + cvt->bb->insertAfter(cvt, bset); + delete_Instruction(prog, cvt); +} + +// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6) +void +AlgebraicOpt::handleSUCLAMP(Instruction *insn) +{ + ImmediateValue imm; + int32_t val = insn->getSrc(2)->asImm()->reg.data.s32; + int s; + Instruction *add; + + assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR); + + // look for ADD (TODO: only count references by non-SUCLAMP) + if (insn->getSrc(0)->refCount() > 1) + return; + add = insn->getSrc(0)->getInsn(); + if (!add || add->op != OP_ADD || + (add->dType != TYPE_U32 && + add->dType != TYPE_S32)) + return; + + // look for immediate + for (s = 0; s < 2; ++s) + if (add->src(s).getImmediate(imm)) + break; + if (s >= 2) + return; + s = s ? 0 : 1; + // determine if immediate fits + val += imm.reg.data.s32; + if (val > 31 || val < -32) + return; + // determine if other addend fits + if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0)) + return; + + bld.setPosition(insn, false); // make sure bld is init'ed + // replace sources + insn->setSrc(2, bld.mkImm(val)); + insn->setSrc(0, add->getSrc(s)); +} + +bool +AlgebraicOpt::visit(BasicBlock *bb) +{ + Instruction *next; + for (Instruction *i = bb->getEntry(); i; i = next) { + next = i->next; + switch (i->op) { + case OP_ABS: + handleABS(i); + break; + case OP_ADD: + handleADD(i); + break; + case OP_RCP: + handleRCP(i); + break; + case OP_MIN: + case OP_MAX: + handleMINMAX(i); + break; + case OP_SLCT: + handleSLCT(i); + break; + case OP_AND: + case OP_OR: + case OP_XOR: + handleLOGOP(i); + break; + case OP_CVT: + handleCVT(i); + break; + case OP_SUCLAMP: + handleSUCLAMP(i); + break; + default: + break; + } + } + + return true; +} + +// 
// =============================================================================

// Rewrite the address offset of a load/store, cloning the address symbol
// first if it is shared with other instructions.
static inline void
updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
{
   if (offset != ldst->getSrc(0)->reg.data.offset) {
      if (ldst->getSrc(0)->refCount() > 1)
         ldst->setSrc(0, cloneShallow(fn, ldst->getSrc(0)));
      ldst->getSrc(0)->reg.data.offset = offset;
   }
}

// Combine loads and stores, forward stores to loads where possible.
class MemoryOpt : public Pass
{
private:
   // One remembered memory access: instruction plus its decomposed address
   // (base symbol, indirect registers, file index, offset, size).
   class Record
   {
   public:
      Record *next;
      Instruction *insn;
      const Value *rel[2];
      const Value *base;
      int32_t offset;
      int8_t fileIndex;
      uint8_t size;
      bool locked;    // store may no longer be eliminated/merged (see lockStores)
      Record *prev;

      bool overlaps(const Instruction *ldst) const;

      inline void link(Record **);
      inline void unlink(Record **);
      inline void set(const Instruction *ldst);
   };

public:
   MemoryOpt();

   // Per-file lists of still-valid load/store records.
   Record *loads[DATA_FILE_COUNT];
   Record *stores[DATA_FILE_COUNT];

   MemoryPool recordPool;

private:
   virtual bool visit(BasicBlock *);
   bool runOpt(BasicBlock *);

   Record **getList(const Instruction *);

   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;

   // merge @insn into load/store instruction from @rec
   bool combineLd(Record *rec, Instruction *ld);
   bool combineSt(Record *rec, Instruction *st);

   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
   bool replaceLdFromSt(Instruction *ld, Record *stRec);
   bool replaceStFromSt(Instruction *restrict st, Record *stRec);

   void addRecord(Instruction *ldst);
   void purgeRecords(Instruction *const st, DataFile);
   void lockStores(Instruction *const ld);
   void reset();

private:
   Record *prevRecord;
};

MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
{
   for (int i = 0; i < DATA_FILE_COUNT; ++i) {
      loads[i] = NULL;
      stores[i] = NULL;
   }
   prevRecord = NULL;
}

// Release all records back to the pool and empty every list.
void
MemoryOpt::reset()
{
   for (unsigned int i = 0; i < DATA_FILE_COUNT; ++i) {
      Record *it, *next;
      for (it = loads[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      loads[i] = NULL;
      for (it = stores[i]; it; it = next) {
         next = it->next;
         recordPool.release(it);
      }
      stores[i] = NULL;
   }
}

// Merge load @ld into the adjacent recorded load @rec->insn, producing a
// single wider load covering both ranges. Returns false if the combined
// access is unsupported or would be misaligned.
bool
MemoryOpt::combineLd(Record *rec, Instruction *ld)
{
   int32_t offRc = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeLd = typeSizeof(ld->dType);
   int size = sizeRc + sizeLd;
   int d, j;

   if (!prog->getTarget()->
       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   // no unaligned loads
   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
      return false;

   assert(sizeRc + sizeLd <= 16 && offRc != offLd);

   // j: count definitions already present in rec->insn
   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);

   if (offLd < offRc) {
      int sz;
      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
      // d: nr of definitions in ld
      // j: nr of definitions in rec->insn, move:
      for (d = d + j - 1; j > 0; --j, --d)
         rec->insn->setDef(d, rec->insn->getDef(j - 1));

      if (rec->insn->getSrc(0)->refCount() > 1)
         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;

      d = 0;
   } else {
      d = j;
   }
   // move definitions of @ld to @rec->insn
   for (j = 0; sizeLd; ++j, ++d) {
      sizeLd -= ld->getDef(j)->reg.size;
      rec->insn->setDef(d, ld->getDef(j));
   }

   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));

   delete_Instruction(prog, ld);

   return true;
}

// Merge the recorded store @rec->insn into the later adjacent store @st,
// so a single wider store writes both ranges. @st survives, @rec->insn
// is deleted and the record re-points to @st.
bool
MemoryOpt::combineSt(Record *rec, Instruction *st)
{
   int32_t offRc = rec->offset;
   int32_t offSt = st->getSrc(0)->reg.data.offset;
   int sizeRc = rec->size;
   int sizeSt = typeSizeof(st->dType);
   int s = sizeSt / 4;
   int size = sizeRc + sizeSt;
   int j, k;
   Value *src[4]; // no modifiers in ValueRef allowed for st
   Value *extra[3];

   if (!prog->getTarget()->
       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
      return false;
   if (size == 8 && MIN2(offRc, offSt) & 0x7)
      return false;

   st->takeExtraSources(0, extra); // save predicate and indirect address

   if (offRc < offSt) {
      // save values from @st
      for (s = 0; sizeSt; ++s) {
         sizeSt -= st->getSrc(s + 1)->reg.size;
         src[s] = st->getSrc(s + 1);
      }
      // set record's values as low sources of @st
      for (j = 1; sizeRc; ++j) {
         sizeRc -= rec->insn->getSrc(j)->reg.size;
         st->setSrc(j, rec->insn->getSrc(j));
      }
      // set saved values as high sources of @st
      for (k = j, j = 0; j < s; ++j)
         st->setSrc(k++, src[j]);

      updateLdStOffset(st, offRc, func);
   } else {
      for (j = 1; sizeSt; ++j)
         sizeSt -= st->getSrc(j)->reg.size;
      for (s = 1; sizeRc; ++j, ++s) {
         sizeRc -= rec->insn->getSrc(s)->reg.size;
         st->setSrc(j, rec->insn->getSrc(s));
      }
      rec->offset = offSt;
   }
   st->putExtraSources(0, extra); // restore pointer and predicate

   delete_Instruction(prog, rec->insn);
   rec->insn = st;
   rec->size = size;
   rec->insn->getSrc(0)->reg.size = size;
   rec->insn->setType(typeOfSize(size));
   return true;
}

// Decompose @ldst's address symbol into the record's comparison key.
void
MemoryOpt::Record::set(const Instruction *ldst)
{
   const Symbol *mem = ldst->getSrc(0)->asSym();
   fileIndex = mem->reg.fileIndex;
   rel[0] = ldst->getIndirect(0, 0);
   rel[1] = ldst->getIndirect(0, 1);
   offset = mem->reg.data.offset;
   base = mem->getBase();
   size = typeSizeof(ldst->sType);
}

// Insert this record at the head of @list (doubly linked).
void
MemoryOpt::Record::link(Record **list)
{
   next = *list;
   if (next)
      next->prev = this;
   prev = NULL;
   *list = this;
}

// Remove this record from @list; next/prev pointers of this are left intact.
void
MemoryOpt::Record::unlink(Record **list)
{
   if (next)
      next->prev = prev;
   if (prev)
      prev->next = next;
   else
      *list = next;
}

// Select the load or store list matching @insn's kind and data file.
MemoryOpt::Record **
MemoryOpt::getList(const Instruction *insn)
{
   if (insn->op == OP_LOAD || insn->op == OP_VFETCH)
      return &loads[insn->src(0).getFile()];
   return &stores[insn->src(0).getFile()];
}

void
// Remember @i as a candidate for later combining/forwarding.
MemoryOpt::addRecord(Instruction *i)
{
   Record **list = getList(i);
   Record *it = reinterpret_cast<Record *>(recordPool.allocate());

   it->link(list);
   it->set(i);
   it->insn = i;
   it->locked = false;
}

// Find a previously recorded access matching @insn's address. Sets @isAdj
// to indicate whether the match is merely adjacent (combine candidate)
// rather than overlapping (forward/replace candidate).
MemoryOpt::Record *
MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
{
   const Symbol *sym = insn->getSrc(0)->asSym();
   const int size = typeSizeof(insn->sType);
   Record *rec = NULL;
   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];

   for (; it; it = it->next) {
      if (it->locked && insn->op != OP_LOAD)
         continue;
      // must be within the same 16-byte chunk and agree on indirection
      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
          it->rel[0] != insn->getIndirect(0, 0) ||
          it->fileIndex != sym->reg.fileIndex ||
          it->rel[1] != insn->getIndirect(0, 1))
         continue;

      if (it->offset < sym->reg.data.offset) {
         if (it->offset + it->size >= sym->reg.data.offset) {
            isAdj = (it->offset + it->size == sym->reg.data.offset);
            if (!isAdj)
               return it;
            if (!(it->offset & 0x7))
               rec = it;
         }
      } else {
         isAdj = it->offset != sym->reg.data.offset;
         if (size <= it->size && !isAdj)
            return it;
         else
         if (!(sym->reg.data.offset & 0x7))
            if (it->offset - size <= sym->reg.data.offset)
               rec = it;
      }
   }
   return rec;
}

// Forward the values of the recorded store to @ld (store-to-load
// forwarding); @ld is removed on success.
bool
MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
{
   Instruction *st = rec->insn;
   int32_t offSt = rec->offset;
   int32_t offLd = ld->getSrc(0)->reg.data.offset;
   int d, s;

   // advance to the store source that lies at @ld's offset
   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
      offSt += st->getSrc(s)->reg.size;
   if (offSt != offLd)
      return false;

   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
         return false;
      if (st->getSrc(s)->reg.file != FILE_GPR)
         return false;
      ld->def(d).replace(st->src(s), false);
   }
   ld->bb->remove(ld);
   return true;
}

// Replace load @ldE with the values already loaded by the recorded @ldR
// (redundant load elimination); @ldE is deleted on success.
bool
MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
{
   Instruction *ldR = rec->insn;
   int32_t offR = rec->offset;
   int32_t offE = ldE->getSrc(0)->reg.data.offset;
   int dR, dE;

   assert(offR <= offE);
   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
      offR += ldR->getDef(dR)->reg.size;
   if (offR != offE)
      return false;

   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
         return false;
      ldE->def(dE).replace(ldR->getDef(dR), false);
   }

   delete_Instruction(prog, ldE);
   return true;
}

// Merge the overlapping recorded store into the later store @st: keep the
// record's values where @st does not overwrite them, then delete rec->insn.
bool
MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
{
   const Instruction *const ri = rec->insn;
   Value *extra[3];

   int32_t offS = st->getSrc(0)->reg.data.offset;
   int32_t offR = rec->offset;
   int32_t endS = offS + typeSizeof(st->dType);
   int32_t endR = offR + typeSizeof(ri->dType);

   rec->size = MAX2(endS, endR) - MIN2(offS, offR);

   st->takeExtraSources(0, extra);

   if (offR < offS) {
      Value *vals[10];
      int s, n;
      int k = 0;
      // get non-replaced sources of ri
      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      n = s;
      // get replaced sources of st
      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
         vals[k++] = st->getSrc(s);
      // skip replaced sources of ri
      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
      // get non-replaced sources after values covered by st
      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
         vals[k++] = ri->getSrc(s);
      assert((unsigned int)k <= Elements(vals));
      for (s = 0; s < k; ++s)
         st->setSrc(s + 1, vals[s]);
      st->setSrc(0, ri->getSrc(0));
   } else
   if (endR > endS) {
      int j, s;
      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
         st->setSrc(s++, ri->getSrc(j));
   }
   st->putExtraSources(0, extra);

   delete_Instruction(prog, rec->insn);

   rec->insn = st;
   rec->offset = st->getSrc(0)->reg.data.offset;

   st->setType(typeOfSize(rec->size));

   return true;
}

// True if the two accesses may touch the same memory (same file index and,
// absent indirection, overlapping [offset, offset+size) ranges).
bool
MemoryOpt::Record::overlaps(const Instruction *ldst) const
{
   Record that;
   that.set(ldst);

   if (this->fileIndex != that.fileIndex)
      return false;

   // with indirect addressing only a matching base proves overlap
   if (this->rel[0] || that.rel[0])
      return this->base == that.base;
   return
      (this->offset < that.offset + that.size) &&
      (this->offset + this->size > that.offset);
}

// We must not eliminate stores that affect the result of @ld if
// we find later stores to the same location, and we may no longer
// merge them with later stores.
// The stored value can, however, still be used to determine the value
// returned by future loads.
void
MemoryOpt::lockStores(Instruction *const ld)
{
   for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
      if (!r->locked && r->overlaps(ld))
         r->locked = true;
}

// Prior loads from the location of @st are no longer valid.
// Stores to the location of @st may no longer be used to derive
// the value at it nor be coalesced into later stores.
void
MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
{
   if (st)
      f = st->src(0).getFile();

   // unlink() leaves r->next intact, so iterating past it is safe
   for (Record *r = loads[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&loads[f]);

   for (Record *r = stores[f]; r; r = r->next)
      if (!st || r->overlaps(st))
         r->unlink(&stores[f]);
}

bool
MemoryOpt::visit(BasicBlock *bb)
{
   bool ret = runOpt(bb);
   // Run again, one pass won't combine 4 32 bit ld/st to a single 128 bit ld/st
   // where 96 bit memory operations are forbidden.
   if (ret)
      ret = runOpt(bb);
   return ret;
}

// One forward sweep over @bb: record memory accesses, forward stores to
// loads, eliminate redundant loads, and combine adjacent accesses.
bool
MemoryOpt::runOpt(BasicBlock *bb)
{
   Instruction *ldst, *next;
   Record *rec;
   bool isAdjacent = true;

   for (ldst = bb->getEntry(); ldst; ldst = next) {
      bool keep = true;
      bool isLoad = true;
      next = ldst->next;

      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
         if (ldst->isDead()) {
            // might have been produced by earlier optimization
            delete_Instruction(prog, ldst);
            continue;
         }
      } else
      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
         isLoad = false;
      } else {
         // TODO: maybe have all fixed ops act as barrier ?
         if (ldst->op == OP_CALL ||
             ldst->op == OP_BAR ||
             ldst->op == OP_MEMBAR) {
            purgeRecords(NULL, FILE_MEMORY_LOCAL);
            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
            purgeRecords(NULL, FILE_MEMORY_SHARED);
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         } else
         if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
            if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
               purgeRecords(NULL, FILE_MEMORY_LOCAL);
               purgeRecords(NULL, FILE_MEMORY_GLOBAL);
               purgeRecords(NULL, FILE_MEMORY_SHARED);
            } else {
               purgeRecords(NULL, ldst->src(0).getFile());
            }
         } else
         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
            purgeRecords(NULL, FILE_SHADER_OUTPUT);
         }
         continue;
      }
      if (ldst->getPredicate()) // TODO: handle predicated ld/st
         continue;

      if (isLoad) {
         DataFile file = ldst->src(0).getFile();

         // if ld l[]/g[] look for previous store to eliminate the reload
         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
            // TODO: shared memory ?
            rec = findRecord(ldst, false, isAdjacent);
            if (rec && !isAdjacent)
               keep = !replaceLdFromSt(ldst, rec);
         }

         // or look for ld from the same location and replace this one
         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
         if (rec) {
            if (!isAdjacent)
               keep = !replaceLdFromLd(ldst, rec);
            else
               // or combine a previous load with this one
               keep = !combineLd(rec, ldst);
         }
         if (keep)
            lockStores(ldst);
      } else {
         rec = findRecord(ldst, false, isAdjacent);
         if (rec) {
            if (!isAdjacent)
               keep = !replaceStFromSt(ldst, rec);
            else
               keep = !combineSt(rec, ldst);
         }
         if (keep)
            purgeRecords(ldst, DATA_FILE_COUNT);
      }
      if (keep)
         addRecord(ldst);
   }
   reset();

   return true;
}

// =============================================================================

// Turn control flow into predicated instructions (after register allocation !).
// TODO:
// Could move this to before register allocation on NVC0 and also handle nested
// constructs.
class FlatteningPass : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   bool tryPredicateConditional(BasicBlock *);
   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
   void tryPropagateBranch(BasicBlock *);
   inline bool isConstantCondition(Value *pred);
   inline bool mayPredicate(const Instruction *, const Value *pred) const;
   inline void removeFlow(Instruction *);
};

// True if @pred is a SET whose operands are all immediates/constants, i.e.
// the branch condition is uniform across the warp.
bool
FlatteningPass::isConstantCondition(Value *pred)
{
   Instruction *insn = pred->getUniqueInsn();
   assert(insn);
   if (insn->op != OP_SET || insn->srcExists(2))
      return false;

   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
      DataFile file;
      if (ld) {
         if (ld->op != OP_MOV && ld->op != OP_LOAD)
            return false;
         if (ld->src(0).isIndirect(0))
            return false;
         file = ld->src(0).getFile();
      } else {
         file = insn->src(s).getFile();
         // catch $r63 on NVC0
         if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
            file = FILE_IMMEDIATE;
      }
      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
         return false;
   }
   return true;
}

// Delete a BRA/JOIN at a flattened fork point, and the now-dead
// predicate-producing SET if nothing else uses it.
void
FlatteningPass::removeFlow(Instruction *insn)
{
   FlowInstruction *term = insn ? insn->asFlow() : NULL;
   if (!term)
      return;
   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();

   if (term->op == OP_BRA) {
      // TODO: this might get more difficult when we get arbitrary BRAs
      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
         return;
   } else
   if (term->op != OP_JOIN)
      return;

   Value *pred = term->getPredicate();

   delete_Instruction(prog, term);

   if (pred && pred->refCount() == 0) {
      Instruction *pSet = pred->getUniqueInsn();
      pred->join->reg.data.id = -1; // deallocate
      if (pSet->isDead())
         delete_Instruction(prog, pSet);
   }
}

// Predicate every instruction of @bb with (@cc, @pred) and drop the
// control flow instruction at its exit.
void
FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
{
   for (Instruction *i = bb->getEntry(); i; i = i->next) {
      if (i->isNop())
         continue;
      assert(!i->getPredicate());
      i->setPredicate(cc, pred);
   }
   removeFlow(bb->getExit());
}

// True if @insn may safely be executed under a predicate: the target
// allows it and it does not overwrite the predicate itself.
bool
FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
{
   if (insn->isPseudo())
      return true;
   // TODO: calls where we don't know which registers are modified

   if (!prog->getTarget()->mayPredicate(insn, pred))
      return false;
   for (int d = 0; insn->defExists(d); ++d)
      if (insn->getDef(d)->equals(pred))
         return false;
   return true;
}

// If we jump to BRA/RET/EXIT, replace the jump with it.
// NOTE: We do not update the CFG anymore here !
//
// TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
//  BB:0
//   @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
//  BB1:
//   bra BB:3
//  BB2:
//  ...
//  BB3:
//  ...
void
FlatteningPass::tryPropagateBranch(BasicBlock *bb)
{
   for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
      BasicBlock *bf = i->asFlow()->target.bb;

      if (bf->getInsnCount() != 1)
         continue;

      FlowInstruction *bra = i->asFlow();
      FlowInstruction *rep = bf->getExit()->asFlow();

      if (!rep || rep->getPredicate())
         continue;
      if (rep->op != OP_BRA &&
          rep->op != OP_JOIN &&
          rep->op != OP_EXIT)
         continue;

      // TODO: If there are multiple branches to @rep, only the first would
      //  be replaced, so only remove them after this pass is done ?
      // Also, need to check all incident blocks for fall-through exits and
      //  add the branch there.
      bra->op = rep->op;
      bra->target.bb = rep->target.bb;
      if (bf->cfg.incidentCount() == 1)
         bf->remove(rep);
   }
}

bool
FlatteningPass::visit(BasicBlock *bb)
{
   if (tryPredicateConditional(bb))
      return true;

   // try to attach join to previous instruction
   Instruction *insn = bb->getExit();
   if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
      insn = insn->prev;
      if (insn && !insn->getPredicate() &&
          !insn->asFlow() &&
          insn->op != OP_TEXBAR &&
          !isTextureOp(insn->op) && // probably just nve4
          !isSurfaceOp(insn->op) && // not confirmed
          insn->op != OP_LINTERP && // probably just nve4
          insn->op != OP_PINTERP && // probably just nve4
          ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
           typeSizeof(insn->dType) <= 4) &&
          !insn->isNop()) {
         insn->join = 1;
         bb->remove(bb->getExit());
         return true;
      }
   }

   tryPropagateBranch(bb);

   return true;
}

// If @bb ends a simple if/else, predicate the instructions of both arms
// with the branch condition instead of branching.
bool
FlatteningPass::tryPredicateConditional(BasicBlock *bb)
{
   BasicBlock *bL = NULL, *bR = NULL;
   unsigned int nL = 0, nR = 0, limit = 12;
   Instruction *insn;
   unsigned int mask;

   mask = bb->initiatesSimpleConditional();
   if (!mask)
      return false;

   assert(bb->getExit());
   Value *pred = bb->getExit()->getPredicate();
   assert(pred);

   // uniform conditions rarely pay off when predicated; keep arms short
   if (isConstantCondition(pred))
      limit = 4;

   Graph::EdgeIterator ei = bb->cfg.outgoing();

   if (mask & 1) {
      bL = BasicBlock::get(ei.getNode());
      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
         if (!mayPredicate(insn, pred))
            return false;
      if (nL > limit)
         return false; // too long, do a real branch
   }
   ei.next();

   if (mask & 2) {
      bR = BasicBlock::get(ei.getNode());
      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
         if (!mayPredicate(insn, pred))
            return false;
      if (nR > limit)
         return false; // too long, do a real branch
   }

   if (bL)
      predicateInstructions(bL, pred, bb->getExit()->cc);
   if (bR)
      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));

   if (bb->joinAt) {
      bb->remove(bb->joinAt);
      bb->joinAt = NULL;
   }
   removeFlow(bb->getExit()); // delete the branch/join at the fork point

   // remove potential join operations at the end of the conditional
   if (prog->getTarget()->joinAnterior) {
      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
         removeFlow(bb->getEntry());
   }

   return true;
}

// =============================================================================

// Common subexpression elimination. Stupid O^2 implementation.
class LocalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);

   inline bool tryReplace(Instruction **, Instruction *);

   DLList ops[OP_LAST + 1]; // fixed instructions bucketed by opcode
};

class GlobalCSE : public Pass
{
private:
   virtual bool visit(BasicBlock *);
};

// Compare everything about two instructions except their operands:
// opcode, types, and all per-subclass/per-instruction flags.
bool
Instruction::isActionEqual(const Instruction *that) const
{
   if (this->op != that->op ||
       this->dType != that->dType ||
       this->sType != that->sType)
      return false;
   if (this->cc != that->cc)
      return false;

   if (this->asTex()) {
      if (memcmp(&this->asTex()->tex,
                 &that->asTex()->tex,
                 sizeof(this->asTex()->tex)))
         return false;
   } else
   if (this->asCmp()) {
      if (this->asCmp()->setCond != that->asCmp()->setCond)
         return false;
   } else
   if (this->asFlow()) {
      return false;
   } else {
      if (this->ipa != that->ipa ||
          this->lanes != that->lanes ||
          this->perPatch != that->perPatch)
         return false;
      if (this->postFactor != that->postFactor)
         return false;
   }

   if (this->subOp != that->subOp ||
       this->saturate != that->saturate ||
       this->rnd != that->rnd ||
       this->ftz != that->ftz ||
       this->dnz != that->dnz ||
       this->cache != that->cache ||
       this->mask != that->mask)
      return false;

   return true;
}

// True if executing @that would produce exactly the results of this
// instruction (same action, same defs, same sources) — i.e. one of the
// two is redundant.
bool
Instruction::isResultEqual(const Instruction *that) const
{
   unsigned int d, s;

   // NOTE: location of discard only affects tex with liveOnly and quadops
   if (!this->defExists(0) && this->op != OP_DISCARD)
      return false;

   if (!isActionEqual(that))
      return false;

   if (this->predSrc != that->predSrc)
      return false;

   for (d = 0; this->defExists(d); ++d) {
      if (!that->defExists(d) ||
          !this->getDef(d)->equals(that->getDef(d), false))
         return false;
   }
   if (that->defExists(d))
      return false;

   for (s = 0; this->srcExists(s); ++s) {
      if (!that->srcExists(s))
         return false;
      if (this->src(s).mod != that->src(s).mod)
         return false;
      if (!this->getSrc(s)->equals(that->getSrc(s), true))
         return false;
   }
   if (that->srcExists(s))
      return false;

   // loads only repeat their result if the memory is read-only
   if (op == OP_LOAD || op == OP_VFETCH) {
      switch (src(0).getFile()) {
      case FILE_MEMORY_CONST:
      case FILE_SHADER_INPUT:
         return true;
      default:
         return false;
      }
   }

   return true;
}

// pull through common expressions from different in-blocks
bool
GlobalCSE::visit(BasicBlock *bb)
{
   Instruction *phi, *next, *ik;
   int s;

   // TODO: maybe do this with OP_UNION, too

   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
      next = phi->next;
      if (phi->getSrc(0)->refCount() > 1)
         continue;
      ik = phi->getSrc(0)->getInsn();
      if (!ik)
         continue; // probably a function input
      for (s = 1; phi->srcExists(s); ++s) {
         if (phi->getSrc(s)->refCount() > 1)
            break;
         if (!phi->getSrc(s)->getInsn() ||
             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
            break;
      }
      // all phi inputs equal: hoist one copy into this block, drop the phi
      if (!phi->srcExists(s)) {
         Instruction *entry = bb->getEntry();
         ik->bb->remove(ik);
         if (!entry || entry->op != OP_JOIN)
            bb->insertHead(ik);
         else
            bb->insertAfter(entry, ik);
         ik->setDef(0, phi->getDef(0));
         delete_Instruction(prog, phi);
      }
   }

   return true;
}

// If @i makes *@ptr redundant, redirect *@ptr's uses to @i and delete it.
bool
LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
{
   Instruction *old = *ptr;

   // TODO: maybe relax this later (causes trouble with OP_UNION)
   if (i->isPredicated())
      return false;

   if (!old->isResultEqual(i))
      return false;

   for (int d = 0; old->defExists(d); ++d)
      old->def(d).replace(i->getDef(d), false);
   delete_Instruction(prog, old);
   *ptr = NULL;
   return true;
}

// Repeatedly scan the block; for each instruction look for an earlier
// equal one via the use lists of its least-used source (or, for fixed
// instructions, via the per-opcode lists).
bool
LocalCSE::visit(BasicBlock *bb)
{
   unsigned int replaced;

   do {
      Instruction *ir, *next;

      replaced = 0;

      // will need to know the order of instructions
      int serial = 0;
      for (ir = bb->getFirst(); ir; ir = ir->next)
         ir->serial = serial++;

      for (ir = bb->getEntry(); ir; ir = next) {
         int s;
         Value *src = NULL;

         next = ir->next;

         if (ir->fixed) {
            ops[ir->op].insert(ir);
            continue;
         }

         // pick the source with the fewest uses to keep the scan short
         for (s = 0; ir->srcExists(s); ++s)
            if (ir->getSrc(s)->asLValue())
               if (!src ||
                   ir->getSrc(s)->refCount() < src->refCount())
                  src = ir->getSrc(s);

         if (src) {
            // compare against earlier instructions in this block using @src
            for (Value::UseIterator it = src->uses.begin();
                 it != src->uses.end(); ++it) {
               Instruction *ik = (*it)->getInsn();
               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
                  if (tryReplace(&ir, ik))
                     break;
            }
         } else {
            DLLIST_FOR_EACH(&ops[ir->op], iter)
            {
               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
               if (tryReplace(&ir, ik))
                  break;
            }
         }

         if (ir)
            ops[ir->op].insert(ir);
         else
            ++replaced;
      }
      for (unsigned int i = 0; i <= OP_LAST; ++i)
         ops[i].clear();

   } while (replaced);

   return true;
}

// =============================================================================

// Remove computations of unused values.
class DeadCodeElim : public Pass
{
public:
   bool buryAll(Program *);

private:
   virtual bool visit(BasicBlock *);

   void checkSplitLoad(Instruction *ld); // for partially dead loads

   unsigned int deadCount;
};

// Run DCE to a fixed point: deleting an instruction may kill its sources.
bool
DeadCodeElim::buryAll(Program *prog)
{
   do {
      deadCount = 0;
      if (!this->run(prog, false, false))
         return false;
   } while (deadCount);

   return true;
}

bool
DeadCodeElim::visit(BasicBlock *bb)
{
   Instruction *next;

   for (Instruction *i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->isDead()) {
         ++deadCount;
         delete_Instruction(prog, i);
      } else
      if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
         checkSplitLoad(i);
      } else
      if (i->defExists(0) && !i->getDef(0)->refCount()) {
         // side effects keep the op alive, but the unused result can go
         if (i->op == OP_ATOM ||
             i->op == OP_SUREDP ||
             i->op == OP_SUREDB)
            i->setDef(0, NULL);
      }
   }
   return true;
}

// Split a multi-word load whose middle/edge components are unused into
// (at most two) narrower loads covering only the live components.
void
DeadCodeElim::checkSplitLoad(Instruction *ld1)
{
   Instruction *ld2 = NULL; // can get at most 2 loads
   Value *def1[4];
   Value *def2[4];
   int32_t addr1, addr2;
   int32_t size1, size2;
   int d, n1, n2;
   uint32_t mask = 0xffffffff; // bit per def: clear = dead component

   for (d = 0; ld1->defExists(d); ++d)
      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
         mask &= ~(1 << d);
   if (mask == 0xffffffff)
      return;

   addr1 = ld1->getSrc(0)->reg.data.offset;
   n1 = n2 = 0;
   size1 = size2 = 0;
   // first contiguous run of live defs
   for (d = 0; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         if (size1 && (addr1 & 0x7))
            break;
         def1[n1] = ld1->getDef(d);
         size1 += def1[n1++]->reg.size;
      } else
      if (!n1) {
         addr1 += ld1->getDef(d)->reg.size;
      } else {
         break;
      }
   }
   // second run (if any) goes into ld2
   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
      if (mask & (1 << d)) {
         def2[n2] = ld1->getDef(d);
         size2 += def2[n2++]->reg.size;
      } else {
         assert(!n2);
         addr2 += ld1->getDef(d)->reg.size;
      }
   }

   updateLdStOffset(ld1, addr1, func);
   ld1->setType(typeOfSize(size1));
   for (d = 0; d < 4; ++d)
      ld1->setDef(d, (d < n1) ? def1[d] : NULL);

   if (!n2)
      return;

   ld2 = cloneShallow(func, ld1);
   updateLdStOffset(ld2, addr2, func);
   ld2->setType(typeOfSize(size2));
   for (d = 0; d < 4; ++d)
      ld2->setDef(d, (d < n2) ? def2[d] : NULL);

   ld1->bb->insertAfter(ld1, ld2);
}

// =============================================================================

// Instantiate and run pass @n if the optimization level is at least @l.
#define RUN_PASS(l, n, f)                       \
   if (level >= (l)) {                          \
      if (dbgFlags & NV50_IR_DEBUG_VERBOSE)     \
         INFO("PEEPHOLE: %s\n", #n);            \
      n pass;                                   \
      if (!pass.f(this))                        \
         return false;                          \
   }

bool
Program::optimizeSSA(int level)
{
   RUN_PASS(1, DeadCodeElim, buryAll);
   RUN_PASS(1, CopyPropagation, run);
   RUN_PASS(2, GlobalCSE, run);
   RUN_PASS(1, LocalCSE, run);
   RUN_PASS(2, AlgebraicOpt, run);
   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
   RUN_PASS(1, ConstantFolding, foldAll);
   RUN_PASS(1, LoadPropagation, run);
   RUN_PASS(2, MemoryOpt, run);
   RUN_PASS(2, LocalCSE, run);
   RUN_PASS(0, DeadCodeElim, buryAll);

   return true;
}

bool
Program::optimizePostRA(int level)
{
   RUN_PASS(2, FlatteningPass, run);
   return true;
}

} // namespace nv50_ir

// diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
//          b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp (new file)
// new file mode 100644, index 0000000..ee39b3c (nv50_ir_print.cpp follows)
/*
 * Copyright 2011 Christoph Bumiller
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */

#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_target.h"

#define __STDC_FORMAT_MACROS
#include <inttypes.h>

namespace nv50_ir {

// Indices into the colour escape tables below.
enum TextStyle
{
   TXT_DEFAULT,
   TXT_GPR,
   TXT_REGISTER,
   TXT_FLAGS,
   TXT_MEM,
   TXT_IMMD,
   TXT_BRA,
   TXT_INSN
};

// ANSI colour escapes, one per TextStyle.
static const char *_colour[8] =
{
   "\x1b[00m",
   "\x1b[34m",
   "\x1b[35m",
   "\x1b[35m",
   "\x1b[36m",
   "\x1b[33m",
   "\x1b[37m",
   "\x1b[32m"
};

static const char *_nocolour[8] =
{
   "", "", "", "", "", "", "", ""
};

// Selected table; set by init_colours().
static const char **colour;

// Disable colours when NV50_PROG_DEBUG_NO_COLORS is set in the environment.
static void init_colours()
{
   if (getenv("NV50_PROG_DEBUG_NO_COLORS") != NULL)
      colour = _nocolour;
   else
      colour = _colour;
}

// Mnemonic for each operation, indexed by the operation enum.
const char *operationStr[OP_LAST + 1] =
{
   "nop",
   "phi",
   "union",
   "split",
   "merge",
   "consec",
   "mov",
   "ld",
   "st",
   "add",
   "sub",
   "mul",
   "div",
   "mod",
   "mad",
   "fma",
   "sad",
   "abs",
   "neg",
   "not",
   "and",
   "or",
   "xor",
   "shl",
   "shr",
   "max",
   "min",
   "sat",
   "ceil",
   "floor",
   "trunc",
   "cvt",
   "set and",
   "set or",
   "set xor",
   "set",
   "selp",
   "slct",
   "rcp",
   "rsq",
   "lg2",
   "sin",
   "cos",
   "ex2",
   "exp",
   "log",
   "presin",
   "preex2",
   "sqrt",
   "pow",
   "bra",
   "call",
   "ret",
   "cont",
   "break",
   "preret",
   "precont",
   "prebreak",
   "brkpt",
   "joinat",
   "join",
   "discard",
   "exit",
   "membar",
   "vfetch",
   "pfetch",
   "export",
   "linterp",
   "pinterp",
   "emit",
   "restart",
   "tex",
   "texbias",
   "texlod",
   "texfetch",
   "texquery",
   "texgrad",
   "texgather",
   "texcsaa",
   "texprep",
   "suldb",
   "suldp",
   "sustb",
   "sustp",
   "suredb",
   "suredp",
   "sulea",
   "subfm",
   "suclamp",
   "sueau",
   "madsp",
   "texbar",
   "dfdx",
   "dfdy",
   "rdsv",
   "wrsv",
   "quadop",
   "quadon",
   "quadpop",
   "popcnt",
   "insbf",
   "extbf",
   "permt",
   "atom",
   "bar",
   "vadd",
   "vavg",
   "vmin",
   "vmax",
   "vsad",
   "vset",
   "vshr",
   "vshl",
   "vsel",
   "cctl",
   "(invalid)"
};

static const char *atomSubOpStr[] =
{
"add", "min", "max", "inc", "dec", "and", "or", "xor", "cas", "exch" +}; + +static const char *DataTypeStr[] = +{ + "-", + "u8", "s8", + "u16", "s16", + "u32", "s32", + "u64", "s64", + "f16", "f32", "f64", + "b96", "b128" +}; + +static const char *RoundModeStr[] = +{ + "", "rm", "rz", "rp", "rni", "rmi", "rzi", "rpi" +}; + +static const char *CondCodeStr[] = +{ + "never", + "lt", + "eq", + "le", + "gt", + "ne", + "ge", + "", + "(invalid)", + "ltu", + "equ", + "leu", + "gtu", + "neu", + "geu", + "", + "no", + "nc", + "ns", + "na", + "a", + "s", + "c", + "o" +}; + +static const char *SemanticStr[SV_LAST + 1] = +{ + "POSITION", + "VERTEX_ID", + "INSTANCE_ID", + "INVOCATION_ID", + "PRIMITIVE_ID", + "VERTEX_COUNT", + "LAYER", + "VIEWPORT_INDEX", + "Y_DIR", + "FACE", + "POINT_SIZE", + "POINT_COORD", + "CLIP_DISTANCE", + "SAMPLE_INDEX", + "TESS_FACTOR", + "TESS_COORD", + "TID", + "CTAID", + "NTID", + "GRIDID", + "NCTAID", + "LANEID", + "PHYSID", + "NPHYSID", + "CLOCK", + "LBASE", + "SBASE", + "?", + "(INVALID)" +}; + +static const char *interpStr[16] = +{ + "pass", + "mul", + "flat", + "sc", + "cent pass", + "cent mul", + "cent flat", + "cent sc", + "off pass", + "off mul", + "off flat", + "off sc", + "samp pass", + "samp mul", + "samp flat", + "samp sc" +}; + +#define PRINT(args...) \ + do { \ + pos += snprintf(&buf[pos], size - pos, args); \ + } while(0) + +#define SPACE_PRINT(cond, args...) 
\ + do { \ + if (cond) \ + buf[pos++] = ' '; \ + pos += snprintf(&buf[pos], size - pos, args); \ + } while(0) + +#define SPACE() \ + do { \ + if (pos < size) \ + buf[pos++] = ' '; \ + } while(0) + +int Modifier::print(char *buf, size_t size) const +{ + size_t pos = 0; + + if (bits) + PRINT("%s", colour[TXT_INSN]); + + size_t base = pos; + + if (bits & NV50_IR_MOD_NOT) + PRINT("not"); + if (bits & NV50_IR_MOD_SAT) + SPACE_PRINT(pos > base && pos < size, "sat"); + if (bits & NV50_IR_MOD_NEG) + SPACE_PRINT(pos > base && pos < size, "neg"); + if (bits & NV50_IR_MOD_ABS) + SPACE_PRINT(pos > base && pos < size, "abs"); + + return pos; +} + +int LValue::print(char *buf, size_t size, DataType ty) const +{ + const char *postFix = ""; + size_t pos = 0; + int idx = join->reg.data.id >= 0 ? join->reg.data.id : id; + char p = join->reg.data.id >= 0 ? '$' : '%'; + char r; + int col = TXT_DEFAULT; + + switch (reg.file) { + case FILE_GPR: + r = 'r'; col = TXT_GPR; + if (reg.size == 2) { + if (p == '$') { + postFix = (idx & 1) ? 
"h" : "l"; + idx /= 2; + } else { + postFix = "s"; + } + } else + if (reg.size == 8) { + postFix = "d"; + } else + if (reg.size == 16) { + postFix = "q"; + } else + if (reg.size == 12) { + postFix = "t"; + } + break; + case FILE_PREDICATE: + r = 'p'; col = TXT_REGISTER; + if (reg.size == 2) + postFix = "d"; + else + if (reg.size == 4) + postFix = "q"; + break; + case FILE_FLAGS: + r = 'c'; col = TXT_FLAGS; + break; + case FILE_ADDRESS: + r = 'a'; col = TXT_REGISTER; + break; + default: + assert(!"invalid file for lvalue"); + r = '?'; + break; + } + + PRINT("%s%c%c%i%s", colour[col], p, r, idx, postFix); + + return pos; +} + +int ImmediateValue::print(char *buf, size_t size, DataType ty) const +{ + size_t pos = 0; + + PRINT("%s", colour[TXT_IMMD]); + + switch (ty) { + case TYPE_F32: PRINT("%f", reg.data.f32); break; + case TYPE_F64: PRINT("%f", reg.data.f64); break; + case TYPE_U8: PRINT("0x%02x", reg.data.u8); break; + case TYPE_S8: PRINT("%i", reg.data.s8); break; + case TYPE_U16: PRINT("0x%04x", reg.data.u16); break; + case TYPE_S16: PRINT("%i", reg.data.s16); break; + case TYPE_U32: PRINT("0x%08x", reg.data.u32); break; + case TYPE_S32: PRINT("%i", reg.data.s32); break; + case TYPE_U64: + case TYPE_S64: + default: + PRINT("0x%016"PRIx64, reg.data.u64); + break; + } + return pos; +} + +int Symbol::print(char *buf, size_t size, DataType ty) const +{ + return print(buf, size, NULL, NULL, ty); +} + +int Symbol::print(char *buf, size_t size, + Value *rel, Value *dimRel, DataType ty) const +{ + size_t pos = 0; + char c; + + if (ty == TYPE_NONE) + ty = typeOfSize(reg.size); + + if (reg.file == FILE_SYSTEM_VALUE) { + PRINT("%ssv[%s%s:%i%s", colour[TXT_MEM], + colour[TXT_REGISTER], + SemanticStr[reg.data.sv.sv], reg.data.sv.index, colour[TXT_MEM]); + if (rel) { + PRINT("%s+", colour[TXT_DEFAULT]); + pos += rel->print(&buf[pos], size - pos); + } + PRINT("%s]", colour[TXT_MEM]); + return pos; + } + + switch (reg.file) { + case FILE_MEMORY_CONST: c = 'c'; break; + case 
FILE_SHADER_INPUT: c = 'a'; break; + case FILE_SHADER_OUTPUT: c = 'o'; break; + case FILE_MEMORY_GLOBAL: c = 'g'; break; + case FILE_MEMORY_SHARED: c = 's'; break; + case FILE_MEMORY_LOCAL: c = 'l'; break; + default: + assert(!"invalid file"); + c = '?'; + break; + } + + if (c == 'c') + PRINT("%s%c%i[", colour[TXT_MEM], c, reg.fileIndex); + else + PRINT("%s%c[", colour[TXT_MEM], c); + + if (dimRel) { + pos += dimRel->print(&buf[pos], size - pos, TYPE_S32); + PRINT("%s][", colour[TXT_MEM]); + } + + if (rel) { + pos += rel->print(&buf[pos], size - pos); + PRINT("%s%c", colour[TXT_DEFAULT], (reg.data.offset < 0) ? '-' : '+'); + } else { + assert(reg.data.offset >= 0); + } + PRINT("%s0x%x%s]", colour[TXT_IMMD], abs(reg.data.offset), colour[TXT_MEM]); + + return pos; +} + +void Instruction::print() const +{ + #define BUFSZ 512 + + const size_t size = BUFSZ; + + char buf[BUFSZ]; + int s, d; + size_t pos = 0; + + PRINT("%s", colour[TXT_INSN]); + + if (join) + PRINT("join "); + + if (predSrc >= 0) { + const size_t pre = pos; + if (getSrc(predSrc)->reg.file == FILE_PREDICATE) { + if (cc == CC_NOT_P) + PRINT("not"); + } else { + PRINT("%s", CondCodeStr[cc]); + } + if (pos > pre) + SPACE(); + pos += getSrc(predSrc)->print(&buf[pos], BUFSZ - pos); + PRINT(" %s", colour[TXT_INSN]); + } + + if (saturate) + PRINT("sat "); + + if (asFlow()) { + PRINT("%s", operationStr[op]); + if (asFlow()->indirect) + PRINT(" ind"); + if (asFlow()->absolute) + PRINT(" abs"); + if (op == OP_CALL && asFlow()->builtin) { + PRINT(" %sBUILTIN:%i", colour[TXT_BRA], asFlow()->target.builtin); + } else + if (op == OP_CALL && asFlow()->target.fn) { + PRINT(" %s%s:%i", colour[TXT_BRA], + asFlow()->target.fn->getName(), + asFlow()->target.fn->getLabel()); + } else + if (asFlow()->target.bb) + PRINT(" %sBB:%i", colour[TXT_BRA], asFlow()->target.bb->getId()); + } else { + PRINT("%s ", operationStr[op]); + if (op == OP_LINTERP || op == OP_PINTERP) + PRINT("%s ", interpStr[ipa]); + switch (op) { + case 
OP_SUREDP: + case OP_ATOM: + if (subOp < Elements(atomSubOpStr)) + PRINT("%s ", atomSubOpStr[subOp]); + break; + default: + if (subOp) + PRINT("(SUBOP:%u) ", subOp); + break; + } + if (perPatch) + PRINT("patch "); + if (asTex()) + PRINT("%s %s$r%u $s%u %s", asTex()->tex.target.getName(), + colour[TXT_MEM], asTex()->tex.r, asTex()->tex.s, + colour[TXT_INSN]); + if (postFactor) + PRINT("x2^%i ", postFactor); + PRINT("%s%s", dnz ? "dnz " : (ftz ? "ftz " : ""), DataTypeStr[dType]); + } + + if (rnd != ROUND_N) + PRINT(" %s", RoundModeStr[rnd]); + + if (defExists(1)) + PRINT(" {"); + for (d = 0; defExists(d); ++d) { + SPACE(); + pos += getDef(d)->print(&buf[pos], size - pos); + } + if (d > 1) + PRINT(" %s}", colour[TXT_INSN]); + else + if (!d && !asFlow()) + PRINT(" %s#", colour[TXT_INSN]); + + if (asCmp()) + PRINT(" %s%s", colour[TXT_INSN], CondCodeStr[asCmp()->setCond]); + + if (sType != dType) + PRINT(" %s%s", colour[TXT_INSN], DataTypeStr[sType]); + + for (s = 0; srcExists(s); ++s) { + if (s == predSrc || src(s).usedAsPtr) + continue; + const size_t pre = pos; + SPACE(); + pos += src(s).mod.print(&buf[pos], BUFSZ - pos); + if (pos > pre + 1) + SPACE(); + if (src(s).isIndirect(0) || src(s).isIndirect(1)) + pos += getSrc(s)->asSym()->print(&buf[pos], BUFSZ - pos, + getIndirect(s, 0), + getIndirect(s, 1)); + else + pos += getSrc(s)->print(&buf[pos], BUFSZ - pos, sType); + } + if (exit) + PRINT("%s exit", colour[TXT_INSN]); + + PRINT("%s", colour[TXT_DEFAULT]); + + buf[MIN2(pos, BUFSZ - 1)] = 0; + + INFO("%s (%u)\n", buf, encSize); +} + +class PrintPass : public Pass +{ +public: + PrintPass() : serial(0) { } + + virtual bool visit(Function *); + virtual bool visit(BasicBlock *); + virtual bool visit(Instruction *); + +private: + int serial; +}; + +bool +PrintPass::visit(Function *fn) +{ + char str[16]; + + INFO("\n%s:%i (", fn->getName(), fn->getLabel()); + + if (!fn->outs.empty()) + INFO("out"); + for (std::deque<ValueRef>::iterator it = fn->outs.begin(); + it != 
fn->outs.end(); + ++it) { + it->get()->print(str, sizeof(str), typeOfSize(it->get()->reg.size)); + INFO(" %s", str); + } + + if (!fn->ins.empty()) + INFO("%s%sin", colour[TXT_DEFAULT], fn->outs.empty() ? "" : ", "); + for (std::deque<ValueDef>::iterator it = fn->ins.begin(); + it != fn->ins.end(); + ++it) { + it->get()->print(str, sizeof(str), typeOfSize(it->get()->reg.size)); + INFO(" %s", str); + } + INFO("%s)\n", colour[TXT_DEFAULT]); + + return true; +} + +bool +PrintPass::visit(BasicBlock *bb) +{ +#if 0 + INFO("---\n"); + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) + INFO(" <- BB:%i (%s)\n", + BasicBlock::get(ei.getNode())->getId(), + ei.getEdge()->typeStr()); +#endif + INFO("BB:%i (%u instructions) - ", bb->getId(), bb->getInsnCount()); + + if (bb->idom()) + INFO("idom = BB:%i, ", bb->idom()->getId()); + + INFO("df = { "); + for (DLList::Iterator df = bb->getDF().iterator(); !df.end(); df.next()) + INFO("BB:%i ", BasicBlock::get(df)->getId()); + + INFO("}\n"); + + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) + INFO(" -> BB:%i (%s)\n", + BasicBlock::get(ei.getNode())->getId(), + ei.getEdge()->typeStr()); + + return true; +} + +bool +PrintPass::visit(Instruction *insn) +{ + INFO("%3i: ", serial++); + insn->print(); + return true; +} + +void +Function::print() +{ + PrintPass pass; + pass.run(this, true, false); +} + +void +Program::print() +{ + PrintPass pass; + init_colours(); + pass.run(this, true, false); +} + +void +Function::printLiveIntervals() const +{ + INFO("printing live intervals ...\n"); + + for (ArrayList::Iterator it = allLValues.iterator(); !it.end(); it.next()) { + const Value *lval = Value::get(it)->asLValue(); + if (lval && !lval->livei.isEmpty()) { + INFO("livei(%%%i): ", lval->id); + lval->livei.print(); + } + } +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp new file mode 100644 index 
0000000..d65003c --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp @@ -0,0 +1,2050 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "codegen/nv50_ir.h" +#include "codegen/nv50_ir_target.h" + +#include <stack> +#include <limits> + +namespace nv50_ir { + +#define MAX_REGISTER_FILE_SIZE 256 + +class RegisterSet +{ +public: + RegisterSet(const Target *); + + void init(const Target *); + void reset(DataFile, bool resetMax = false); + + void periodicMask(DataFile f, uint32_t lock, uint32_t unlock); + void intersect(DataFile f, const RegisterSet *); + + bool assign(int32_t& reg, DataFile f, unsigned int size); + void release(DataFile f, int32_t reg, unsigned int size); + void occupy(DataFile f, int32_t reg, unsigned int size); + void occupy(const Value *); + void occupyMask(DataFile f, int32_t reg, uint8_t mask); + bool isOccupied(DataFile f, int32_t reg, unsigned int size) const; + bool testOccupy(const Value *); + bool testOccupy(DataFile f, int32_t reg, unsigned int size); + + inline int getMaxAssigned(DataFile f) const { return fill[f]; } + + inline unsigned int getFileSize(DataFile f, uint8_t regSize) const + { + if (restrictedGPR16Range && f == FILE_GPR && regSize == 2) + return (last[f] + 1) / 2; + return last[f] + 1; + } + + inline unsigned int units(DataFile f, unsigned int size) const + { + return size >> unit[f]; + } + // for regs of size >= 4, id is counted in 4-byte words (like nv50/c0 binary) + inline unsigned int idToBytes(const Value *v) const + { + return v->reg.data.id * MIN2(v->reg.size, 4); + } + inline unsigned int idToUnits(const Value *v) const + { + return units(v->reg.file, idToBytes(v)); + } + inline int bytesToId(Value *v, unsigned int bytes) const + { + if (v->reg.size < 4) + return units(v->reg.file, bytes); + return bytes / 4; + } + inline int unitsToId(DataFile f, int u, uint8_t size) const + { + if (u < 0) + return -1; + return (size < 4) ? 
u : ((u << unit[f]) / 4); + } + + void print() const; + +private: + BitSet bits[LAST_REGISTER_FILE + 1]; + + int unit[LAST_REGISTER_FILE + 1]; // log2 of allocation granularity + + int last[LAST_REGISTER_FILE + 1]; + int fill[LAST_REGISTER_FILE + 1]; + + const bool restrictedGPR16Range; +}; + +void +RegisterSet::reset(DataFile f, bool resetMax) +{ + bits[f].fill(0); + if (resetMax) + fill[f] = -1; +} + +void +RegisterSet::init(const Target *targ) +{ + for (unsigned int rf = 0; rf <= FILE_ADDRESS; ++rf) { + DataFile f = static_cast<DataFile>(rf); + last[rf] = targ->getFileSize(f) - 1; + unit[rf] = targ->getFileUnit(f); + fill[rf] = -1; + assert(last[rf] < MAX_REGISTER_FILE_SIZE); + bits[rf].allocate(last[rf] + 1, true); + } +} + +RegisterSet::RegisterSet(const Target *targ) + : restrictedGPR16Range(targ->getChipset() < 0xc0) +{ + init(targ); + for (unsigned int i = 0; i <= LAST_REGISTER_FILE; ++i) + reset(static_cast<DataFile>(i)); +} + +void +RegisterSet::periodicMask(DataFile f, uint32_t lock, uint32_t unlock) +{ + bits[f].periodicMask32(lock, unlock); +} + +void +RegisterSet::intersect(DataFile f, const RegisterSet *set) +{ + bits[f] |= set->bits[f]; +} + +void +RegisterSet::print() const +{ + INFO("GPR:"); + bits[FILE_GPR].print(); + INFO("\n"); +} + +bool +RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size) +{ + reg = bits[f].findFreeRange(size); + if (reg < 0) + return false; + fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1)); + return true; +} + +bool +RegisterSet::isOccupied(DataFile f, int32_t reg, unsigned int size) const +{ + return bits[f].testRange(reg, size); +} + +void +RegisterSet::occupy(const Value *v) +{ + occupy(v->reg.file, idToUnits(v), v->reg.size >> unit[v->reg.file]); +} + +void +RegisterSet::occupyMask(DataFile f, int32_t reg, uint8_t mask) +{ + bits[f].setMask(reg & ~31, static_cast<uint32_t>(mask) << (reg % 32)); +} + +void +RegisterSet::occupy(DataFile f, int32_t reg, unsigned int size) +{ + bits[f].setRange(reg, size); + 
+ INFO_DBG(0, REG_ALLOC, "reg occupy: %u[%i] %u\n", f, reg, size); + + fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1)); +} + +bool +RegisterSet::testOccupy(const Value *v) +{ + return testOccupy(v->reg.file, + idToUnits(v), v->reg.size >> unit[v->reg.file]); +} + +bool +RegisterSet::testOccupy(DataFile f, int32_t reg, unsigned int size) +{ + if (isOccupied(f, reg, size)) + return false; + occupy(f, reg, size); + return true; +} + +void +RegisterSet::release(DataFile f, int32_t reg, unsigned int size) +{ + bits[f].clrRange(reg, size); + + INFO_DBG(0, REG_ALLOC, "reg release: %u[%i] %u\n", f, reg, size); +} + +class RegAlloc +{ +public: + RegAlloc(Program *program) : prog(program), sequence(0) { } + + bool exec(); + bool execFunc(); + +private: + class PhiMovesPass : public Pass { + private: + virtual bool visit(BasicBlock *); + inline bool needNewElseBlock(BasicBlock *b, BasicBlock *p); + }; + + class ArgumentMovesPass : public Pass { + private: + virtual bool visit(BasicBlock *); + }; + + class BuildIntervalsPass : public Pass { + private: + virtual bool visit(BasicBlock *); + void collectLiveValues(BasicBlock *); + void addLiveRange(Value *, const BasicBlock *, int end); + }; + + class InsertConstraintsPass : public Pass { + public: + bool exec(Function *func); + private: + virtual bool visit(BasicBlock *); + + bool insertConstraintMoves(); + + void condenseDefs(Instruction *); + void condenseSrcs(Instruction *, const int first, const int last); + + void addHazard(Instruction *i, const ValueRef *src); + void textureMask(TexInstruction *); + void addConstraint(Instruction *, int s, int n); + bool detectConflict(Instruction *, int s); + + // target specific functions, TODO: put in subclass or Target + void texConstraintNV50(TexInstruction *); + void texConstraintNVC0(TexInstruction *); + void texConstraintNVE0(TexInstruction *); + + std::list<Instruction *> constrList; + + const Target *targ; + }; + + bool buildLiveSets(BasicBlock *); + +private: + Program *prog; 
+ Function *func; + + // instructions in control flow / chronological order + ArrayList insns; + + int sequence; // for manual passes through CFG +}; + +typedef std::pair<Value *, Value *> ValuePair; + +class SpillCodeInserter +{ +public: + SpillCodeInserter(Function *fn) : func(fn), stackSize(0), stackBase(0) { } + + bool run(const std::list<ValuePair>&); + + Symbol *assignSlot(const Interval&, const unsigned int size); + inline int32_t getStackSize() const { return stackSize; } + +private: + Function *func; + + struct SpillSlot + { + Interval occup; + std::list<Value *> residents; // needed to recalculate occup + Symbol *sym; + int32_t offset; + inline uint8_t size() const { return sym->reg.size; } + }; + std::list<SpillSlot> slots; + int32_t stackSize; + int32_t stackBase; + + LValue *unspill(Instruction *usei, LValue *, Value *slot); + void spill(Instruction *defi, Value *slot, LValue *); +}; + +void +RegAlloc::BuildIntervalsPass::addLiveRange(Value *val, + const BasicBlock *bb, + int end) +{ + Instruction *insn = val->getUniqueInsn(); + + if (!insn) + insn = bb->getFirst(); + + assert(bb->getFirst()->serial <= bb->getExit()->serial); + assert(bb->getExit()->serial + 1 >= end); + + int begin = insn->serial; + if (begin < bb->getEntry()->serial || begin > bb->getExit()->serial) + begin = bb->getEntry()->serial; + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "%%%i <- live range [%i(%i), %i)\n", + val->id, begin, insn->serial, end); + + if (begin != end) // empty ranges are only added as hazards for fixed regs + val->livei.extend(begin, end); +} + +bool +RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p) +{ + if (b->cfg.incidentCount() <= 1) + return false; + + int n = 0; + for (Graph::EdgeIterator ei = p->cfg.outgoing(); !ei.end(); ei.next()) + if (ei.getType() == Graph::Edge::TREE || + ei.getType() == Graph::Edge::FORWARD) + ++n; + return (n == 2); +} + +// For each operand of each PHI in b, generate a new value by inserting a MOV +// at the end of 
the block it is coming from and replace the operand with its +// result. This eliminates liveness conflicts and enables us to let values be +// copied to the right register if such a conflict exists nonetheless. +// +// These MOVs are also crucial in making sure the live intervals of phi srces +// are extended until the end of the loop, since they are not included in the +// live-in sets. +bool +RegAlloc::PhiMovesPass::visit(BasicBlock *bb) +{ + Instruction *phi, *mov; + BasicBlock *pb, *pn; + + std::stack<BasicBlock *> stack; + + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { + pb = BasicBlock::get(ei.getNode()); + assert(pb); + if (needNewElseBlock(bb, pb)) + stack.push(pb); + } + while (!stack.empty()) { + pb = stack.top(); + pn = new BasicBlock(func); + stack.pop(); + + pb->cfg.detach(&bb->cfg); + pb->cfg.attach(&pn->cfg, Graph::Edge::TREE); + pn->cfg.attach(&bb->cfg, Graph::Edge::FORWARD); + + assert(pb->getExit()->op != OP_CALL); + if (pb->getExit()->asFlow()->target.bb == bb) + pb->getExit()->asFlow()->target.bb = pn; + } + + // insert MOVs (phi->src(j) should stem from j-th in-BB) + int j = 0; + for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) { + pb = BasicBlock::get(ei.getNode()); + if (!pb->isTerminated()) + pb->insertTail(new_FlowInstruction(func, OP_BRA, bb)); + + for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) { + mov = new_Instruction(func, OP_MOV, TYPE_U32); + + mov->setSrc(0, phi->getSrc(j)); + mov->setDef(0, new_LValue(func, phi->getDef(0)->asLValue())); + phi->setSrc(j, mov->getDef(0)); + + pb->insertBefore(pb->getExit(), mov); + } + ++j; + } + + return true; +} + +bool +RegAlloc::ArgumentMovesPass::visit(BasicBlock *bb) +{ + // Bind function call inputs/outputs to the same physical register + // the callee uses, inserting moves as appropriate for the case a + // conflict arises. 
+ for (Instruction *i = bb->getEntry(); i; i = i->next) { + FlowInstruction *cal = i->asFlow(); + // TODO: Handle indirect calls. + // Right now they should only be generated for builtins. + if (!cal || cal->op != OP_CALL || cal->builtin || cal->indirect) + continue; + RegisterSet clobberSet(prog->getTarget()); + + // Bind input values. + for (int s = cal->indirect ? 1 : 0; cal->srcExists(s); ++s) { + const int t = cal->indirect ? (s - 1) : s; + LValue *tmp = new_LValue(func, cal->getSrc(s)->asLValue()); + tmp->reg.data.id = cal->target.fn->ins[t].rep()->reg.data.id; + + Instruction *mov = + new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size)); + mov->setDef(0, tmp); + mov->setSrc(0, cal->getSrc(s)); + cal->setSrc(s, tmp); + + bb->insertBefore(cal, mov); + } + + // Bind output values. + for (int d = 0; cal->defExists(d); ++d) { + LValue *tmp = new_LValue(func, cal->getDef(d)->asLValue()); + tmp->reg.data.id = cal->target.fn->outs[d].rep()->reg.data.id; + + Instruction *mov = + new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size)); + mov->setSrc(0, tmp); + mov->setDef(0, cal->getDef(d)); + cal->setDef(d, tmp); + + bb->insertAfter(cal, mov); + clobberSet.occupy(tmp); + } + + // Bind clobbered values. + for (std::deque<Value *>::iterator it = cal->target.fn->clobbers.begin(); + it != cal->target.fn->clobbers.end(); + ++it) { + if (clobberSet.testOccupy(*it)) { + Value *tmp = new_LValue(func, (*it)->asLValue()); + tmp->reg.data.id = (*it)->reg.data.id; + cal->setDef(cal->defCount(), tmp); + } + } + } + + // Update the clobber set of the function. + if (BasicBlock::get(func->cfgExit) == bb) { + func->buildDefSets(); + for (unsigned int i = 0; i < bb->defSet.getSize(); ++i) + if (bb->defSet.test(i)) + func->clobbers.push_back(func->getLValue(i)); + } + + return true; +} + +// Build the set of live-in variables of bb. 
+bool +RegAlloc::buildLiveSets(BasicBlock *bb) +{ + Function *f = bb->getFunction(); + BasicBlock *bn; + Instruction *i; + unsigned int s, d; + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "buildLiveSets(BB:%i)\n", bb->getId()); + + bb->liveSet.allocate(func->allLValues.getSize(), false); + + int n = 0; + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + bn = BasicBlock::get(ei.getNode()); + if (bn == bb) + continue; + if (bn->cfg.visit(sequence)) + if (!buildLiveSets(bn)) + return false; + if (n++ || bb->liveSet.marker) + bb->liveSet |= bn->liveSet; + else + bb->liveSet = bn->liveSet; + } + if (!n && !bb->liveSet.marker) + bb->liveSet.fill(0); + bb->liveSet.marker = true; + + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) { + INFO("BB:%i live set of out blocks:\n", bb->getId()); + bb->liveSet.print(); + } + + // if (!bb->getEntry()) + // return true; + + if (bb == BasicBlock::get(f->cfgExit)) { + for (std::deque<ValueRef>::iterator it = f->outs.begin(); + it != f->outs.end(); ++it) { + assert(it->get()->asLValue()); + bb->liveSet.set(it->get()->id); + } + } + + for (i = bb->getExit(); i && i != bb->getEntry()->prev; i = i->prev) { + for (d = 0; i->defExists(d); ++d) + bb->liveSet.clr(i->getDef(d)->id); + for (s = 0; i->srcExists(s); ++s) + if (i->getSrc(s)->asLValue()) + bb->liveSet.set(i->getSrc(s)->id); + } + for (i = bb->getPhi(); i && i->op == OP_PHI; i = i->next) + bb->liveSet.clr(i->getDef(0)->id); + + if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) { + INFO("BB:%i live set after propagation:\n", bb->getId()); + bb->liveSet.print(); + } + + return true; +} + +void +RegAlloc::BuildIntervalsPass::collectLiveValues(BasicBlock *bb) +{ + BasicBlock *bbA = NULL, *bbB = NULL; + + if (bb->cfg.outgoingCount()) { + // trickery to save a loop of OR'ing liveSets + // aliasing works fine with BitSet::setOr + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + if (ei.getType() == Graph::Edge::DUMMY) + continue; + if (bbA) { + 
bb->liveSet.setOr(&bbA->liveSet, &bbB->liveSet); + bbA = bb; + } else { + bbA = bbB; + } + bbB = BasicBlock::get(ei.getNode()); + } + bb->liveSet.setOr(&bbB->liveSet, bbA ? &bbA->liveSet : NULL); + } else + if (bb->cfg.incidentCount()) { + bb->liveSet.fill(0); + } +} + +bool +RegAlloc::BuildIntervalsPass::visit(BasicBlock *bb) +{ + collectLiveValues(bb); + + INFO_DBG(prog->dbgFlags, REG_ALLOC, "BuildIntervals(BB:%i)\n", bb->getId()); + + // go through out blocks and delete phi sources that do not originate from + // the current block from the live set + for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) { + BasicBlock *out = BasicBlock::get(ei.getNode()); + + for (Instruction *i = out->getPhi(); i && i->op == OP_PHI; i = i->next) { + bb->liveSet.clr(i->getDef(0)->id); + + for (int s = 0; i->srcExists(s); ++s) { + assert(i->src(s).getInsn()); + if (i->getSrc(s)->getUniqueInsn()->bb == bb) // XXX: reachableBy ? + bb->liveSet.set(i->getSrc(s)->id); + else + bb->liveSet.clr(i->getSrc(s)->id); + } + } + } + + // remaining live-outs are live until end + if (bb->getExit()) { + for (unsigned int j = 0; j < bb->liveSet.getSize(); ++j) + if (bb->liveSet.test(j)) + addLiveRange(func->getLValue(j), bb, bb->getExit()->serial + 1); + } + + for (Instruction *i = bb->getExit(); i && i->op != OP_PHI; i = i->prev) { + for (int d = 0; i->defExists(d); ++d) { + bb->liveSet.clr(i->getDef(d)->id); + if (i->getDef(d)->reg.data.id >= 0) // add hazard for fixed regs + i->getDef(d)->livei.extend(i->serial, i->serial); + } + + for (int s = 0; i->srcExists(s); ++s) { + if (!i->getSrc(s)->asLValue()) + continue; + if (!bb->liveSet.test(i->getSrc(s)->id)) { + bb->liveSet.set(i->getSrc(s)->id); + addLiveRange(i->getSrc(s), bb, i->serial); + } + } + } + + if (bb == BasicBlock::get(func->cfg.getRoot())) { + for (std::deque<ValueDef>::iterator it = func->ins.begin(); + it != func->ins.end(); ++it) { + if (it->get()->reg.data.id >= 0) // add hazard for fixed regs + 
it->get()->livei.extend(0, 1); + } + } + + return true; +} + + +#define JOIN_MASK_PHI (1 << 0) +#define JOIN_MASK_UNION (1 << 1) +#define JOIN_MASK_MOV (1 << 2) +#define JOIN_MASK_TEX (1 << 3) + +class GCRA +{ +public: + GCRA(Function *, SpillCodeInserter&); + ~GCRA(); + + bool allocateRegisters(ArrayList& insns); + + void printNodeInfo() const; + +private: + class RIG_Node : public Graph::Node + { + public: + RIG_Node(); + + void init(const RegisterSet&, LValue *); + + void addInterference(RIG_Node *); + void addRegPreference(RIG_Node *); + + inline LValue *getValue() const + { + return reinterpret_cast<LValue *>(data); + } + inline void setValue(LValue *lval) { data = lval; } + + inline uint8_t getCompMask() const + { + return ((1 << colors) - 1) << (reg & 7); + } + + static inline RIG_Node *get(const Graph::EdgeIterator& ei) + { + return static_cast<RIG_Node *>(ei.getNode()); + } + + public: + uint32_t degree; + uint16_t degreeLimit; // if deg < degLimit, node is trivially colourable + uint16_t colors; + + DataFile f; + int32_t reg; + + float weight; + + // list pointers for simplify() phase + RIG_Node *next; + RIG_Node *prev; + + // union of the live intervals of all coalesced values (we want to retain + // the separate intervals for testing interference of compound values) + Interval livei; + + std::list<RIG_Node *> prefRegs; + }; + +private: + inline RIG_Node *getNode(const LValue *v) const { return &nodes[v->id]; } + + void buildRIG(ArrayList&); + bool coalesce(ArrayList&); + bool doCoalesce(ArrayList&, unsigned int mask); + void calculateSpillWeights(); + void simplify(); + bool selectRegisters(); + void cleanup(const bool success); + + void simplifyEdge(RIG_Node *, RIG_Node *); + void simplifyNode(RIG_Node *); + + bool coalesceValues(Value *, Value *, bool force); + void resolveSplitsAndMerges(); + void makeCompound(Instruction *, bool isSplit); + + inline void checkInterference(const RIG_Node *, Graph::EdgeIterator&); + + inline void 
insertOrderedTail(std::list<RIG_Node *>&, RIG_Node *); + void checkList(std::list<RIG_Node *>&); + +private: + std::stack<uint32_t> stack; + + // list headers for simplify() phase + RIG_Node lo[2]; + RIG_Node hi; + + Graph RIG; + RIG_Node *nodes; + unsigned int nodeCount; + + Function *func; + Program *prog; + + static uint8_t relDegree[17][17]; + + RegisterSet regs; + + // need to fixup register id for participants of OP_MERGE/SPLIT + std::list<Instruction *> merges; + std::list<Instruction *> splits; + + SpillCodeInserter& spill; + std::list<ValuePair> mustSpill; +}; + +uint8_t GCRA::relDegree[17][17]; + +GCRA::RIG_Node::RIG_Node() : Node(NULL), next(this), prev(this) +{ + colors = 0; +} + +void +GCRA::printNodeInfo() const +{ + for (unsigned int i = 0; i < nodeCount; ++i) { + if (!nodes[i].colors) + continue; + INFO("RIG_Node[%%%i]($[%u]%i): %u colors, weight %f, deg %u/%u\n X", + i, + nodes[i].f,nodes[i].reg,nodes[i].colors, + nodes[i].weight, + nodes[i].degree, nodes[i].degreeLimit); + + for (Graph::EdgeIterator ei = nodes[i].outgoing(); !ei.end(); ei.next()) + INFO(" %%%i", RIG_Node::get(ei)->getValue()->id); + for (Graph::EdgeIterator ei = nodes[i].incident(); !ei.end(); ei.next()) + INFO(" %%%i", RIG_Node::get(ei)->getValue()->id); + INFO("\n"); + } +} + +void +GCRA::RIG_Node::init(const RegisterSet& regs, LValue *lval) +{ + setValue(lval); + if (lval->reg.data.id >= 0) + lval->noSpill = lval->fixedReg = 1; + + colors = regs.units(lval->reg.file, lval->reg.size); + f = lval->reg.file; + reg = -1; + if (lval->reg.data.id >= 0) + reg = regs.idToUnits(lval); + + weight = std::numeric_limits<float>::infinity(); + degree = 0; + degreeLimit = regs.getFileSize(f, lval->reg.size); + + livei.insert(lval->livei); +} + +bool +GCRA::coalesceValues(Value *dst, Value *src, bool force) +{ + LValue *rep = dst->join->asLValue(); + LValue *val = src->join->asLValue(); + + if (!force && val->reg.data.id >= 0) { + rep = src->join->asLValue(); + val = dst->join->asLValue(); + } 
   // (continuation of GCRA::coalesceValues(Value *dst, Value *src, bool force):
   //  rep/val are the join representatives of dst/src, computed above)
   RIG_Node *nRep = &nodes[rep->id];
   RIG_Node *nVal = &nodes[val->id];

   if (src->reg.file != dst->reg.file) {
      if (!force)
         return false;
      WARN("forced coalescing of values in different files !\n");
   }
   if (!force && dst->reg.size != src->reg.size)
      return false;

   // refuse (or warn about) merging values bound to different fixed registers
   if ((rep->reg.data.id >= 0) && (rep->reg.data.id != val->reg.data.id)) {
      if (force) {
         if (val->reg.data.id >= 0)
            WARN("forced coalescing of values in different fixed regs !\n");
      } else {
         if (val->reg.data.id >= 0)
            return false;
         // make sure that there is no overlap with the fixed register of rep
         for (ArrayList::Iterator it = func->allLValues.iterator();
              !it.end(); it.next()) {
            Value *reg = reinterpret_cast<Value *>(it.get())->asLValue();
            assert(reg);
            if (reg->interfers(rep) && reg->livei.overlaps(nVal->livei))
               return false;
         }
      }
   }

   // overlapping live ranges cannot share one register unless forced
   if (!force && nRep->livei.overlaps(nVal->livei))
      return false;

   INFO_DBG(prog->dbgFlags, REG_ALLOC, "joining %%%i($%i) <- %%%i\n",
            rep->id, rep->reg.data.id, val->id);

   // set join pointer of all values joined with val
   for (Value::DefIterator def = val->defs.begin(); def != val->defs.end();
        ++def)
      (*def)->get()->join = rep;
   assert(rep->join == rep && val->join == rep);

   // add val's definitions to rep and extend the live interval of its RIG node
   rep->defs.insert(rep->defs.end(), val->defs.begin(), val->defs.end());
   nRep->livei.unify(nVal->livei);
   return true;
}

// Run the coalescing passes in order: phi operands first (mandatory),
// then chipset-specific forced unions (tex result groups on nv50-class
// chips also coalesce), and finally optional copy-coalescing of MOVs.
bool
GCRA::coalesce(ArrayList& insns)
{
   bool ret = doCoalesce(insns, JOIN_MASK_PHI);
   if (!ret)
      return false;
   switch (func->getProgram()->getTarget()->getChipset() & ~0xf) {
   case 0x50:
   case 0x80:
   case 0x90:
   case 0xa0:
      ret = doCoalesce(insns, JOIN_MASK_UNION | JOIN_MASK_TEX);
      break;
   case 0xc0:
   case 0xd0:
   case 0xe0:
      ret = doCoalesce(insns, JOIN_MASK_UNION);
      break;
   default:
      break;
   }
   if (!ret)
      return false;
   return doCoalesce(insns, JOIN_MASK_MOV);
}

// Build the byte mask of a compound member: a value of <size> register
// units placed at offset <base> inside a compound of <compSize> units.
// compSize <= 2 replicates the mask so it matches any aligned placement.
static inline uint8_t makeCompMask(int compSize, int base, int size)
{
   uint8_t m = ((1 << size) - 1) << base;

   switch (compSize) {
   case 1:
      return 0xff;
   case 2:
      m |= (m << 2);
      return (m << 4) | m;
   case 3:
   case 4:
      return (m << 4) | m;
   default:
      assert(compSize <= 8);
      return m;
   }
}

// Used when coalescing moves. The non-compound value will become one, e.g.:
// mov b32 $r0 $r2 / merge b64 $r0d { $r0 $r1 }
// split b64 { $r0 $r1 } $r0d / mov b64 $r0d f64 $r2d
static inline void copyCompound(Value *dst, Value *src)
{
   LValue *ldst = dst->asLValue();
   LValue *lsrc = src->asLValue();

   // always propagate from the compound value to the plain one
   if (ldst->compound && !lsrc->compound) {
      LValue *swap = lsrc;
      lsrc = ldst;
      ldst = swap;
   }

   ldst->compound = lsrc->compound;
   ldst->compMask = lsrc->compMask;
}

// Mark the representative and all members of a MERGE (split = false) or
// SPLIT (split = true) as compound values and assign each member the byte
// mask of its position within the representative.
void
GCRA::makeCompound(Instruction *insn, bool split)
{
   LValue *rep = (split ? insn->getSrc(0) : insn->getDef(0))->asLValue();

   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
      INFO("makeCompound(split = %i): ", split);
      insn->print();
   }

   const unsigned int size = getNode(rep)->colors;
   unsigned int base = 0;

   if (!rep->compound)
      rep->compMask = 0xff;
   rep->compound = 1;

   for (int c = 0; split ? insn->defExists(c) : insn->srcExists(c); ++c) {
      LValue *val = (split ? insn->getDef(c) : insn->getSrc(c))->asLValue();

      val->compound = 1;
      if (!val->compMask)
         val->compMask = 0xff;
      val->compMask &= makeCompMask(size, base, getNode(val)->colors);
      assert(val->compMask);

      INFO_DBG(prog->dbgFlags, REG_ALLOC, "compound: %%%i:%02x <- %%%i:%02x\n",
               rep->id, rep->compMask, val->id, val->compMask);

      base += getNode(val)->colors;
   }
   assert(base == size);
}

// One coalescing pass over all instructions; <mask> selects which kinds of
// joins (phi / union-merge-split / tex / mov) are attempted in this pass.
bool
GCRA::doCoalesce(ArrayList& insns, unsigned int mask)
{
   int c, n;

   for (n = 0; n < insns.getSize(); ++n) {
      Instruction *i;
      Instruction *insn = reinterpret_cast<Instruction *>(insns.get(n));

      switch (insn->op) {
      case OP_PHI:
         if (!(mask & JOIN_MASK_PHI))
            break;
         for (c = 0; insn->srcExists(c); ++c)
            if (!coalesceValues(insn->getDef(0), insn->getSrc(c), false)) {
               // this is bad
               ERROR("failed to coalesce phi operands\n");
               return false;
            }
         break;
      case OP_UNION:
      case OP_MERGE:
         if (!(mask & JOIN_MASK_UNION))
            break;
         for (c = 0; insn->srcExists(c); ++c)
            coalesceValues(insn->getDef(0), insn->getSrc(c), true);
         if (insn->op == OP_MERGE) {
            merges.push_back(insn);
            if (insn->srcExists(1))
               makeCompound(insn, false);
         }
         break;
      case OP_SPLIT:
         if (!(mask & JOIN_MASK_UNION))
            break;
         splits.push_back(insn);
         for (c = 0; insn->defExists(c); ++c)
            coalesceValues(insn->getSrc(0), insn->getDef(c), true);
         makeCompound(insn, true);
         break;
      case OP_MOV:
         if (!(mask & JOIN_MASK_MOV))
            break;
         i = NULL;
         if (!insn->getDef(0)->uses.empty())
            i = insn->getDef(0)->uses.front()->getInsn();
         // if this is a constraint-move there will only be a single use
         if (i && i->op == OP_MERGE) // do we really still need this ?
            break;
         i = insn->getSrc(0)->getUniqueInsn();
         if (i && !i->constrainedDefs()) {
            if (coalesceValues(insn->getDef(0), insn->getSrc(0), false))
               copyCompound(insn->getSrc(0), insn->getDef(0));
         }
         break;
      case OP_TEX:
      case OP_TXB:
      case OP_TXL:
      case OP_TXF:
      case OP_TXQ:
      case OP_TXD:
      case OP_TXG:
      case OP_TEXCSAA:
         if (!(mask & JOIN_MASK_TEX))
            break;
         // join tex sources with their corresponding defs (skip predicate src)
         for (c = 0; insn->srcExists(c) && c != insn->predSrc; ++c)
            coalesceValues(insn->getDef(c), insn->getSrc(c), true);
         break;
      default:
         break;
      }
   }
   return true;
}

// Record an interference edge; degree contributions are looked up in the
// precomputed relDegree table (accounts for differing node sizes).
void
GCRA::RIG_Node::addInterference(RIG_Node *node)
{
   this->degree += relDegree[node->colors][colors];
   node->degree += relDegree[colors][node->colors];

   this->attach(node, Graph::Edge::CROSS);
}

void
GCRA::RIG_Node::addRegPreference(RIG_Node *node)
{
   prefRegs.push_back(node);
}

GCRA::GCRA(Function *fn, SpillCodeInserter& spill) :
   func(fn),
   regs(fn->getProgram()->getTarget()),
   spill(spill)
{
   prog = func->getProgram();

   // initialize relative degrees array - i takes away from j
   for (int i = 1; i <= 16; ++i)
      for (int j = 1; j <= 16; ++j)
         relDegree[i][j] = j * ((i + j - 1) / j);
}

GCRA::~GCRA()
{
   if (nodes)
      delete[] nodes;
}

// Debug aid: assert that the list is sorted by live interval start and
// contains only join representatives.
void
GCRA::checkList(std::list<RIG_Node *>& lst)
{
   GCRA::RIG_Node *prev = NULL;

   for (std::list<RIG_Node *>::iterator it = lst.begin();
        it != lst.end();
        ++it) {
      assert((*it)->getValue()->join == (*it)->getValue());
      if (prev)
         assert(prev->livei.begin() <= (*it)->livei.begin());
      prev = *it;
   }
}

// Insert node into the list keeping it sorted by live interval start,
// scanning backwards from the tail (nodes usually arrive nearly in order).
void
GCRA::insertOrderedTail(std::list<RIG_Node *>& list, RIG_Node *node)
{
   if (node->livei.isEmpty())
      return;
   // only the intervals of joined values don't necessarily arrive in order
   std::list<RIG_Node *>::iterator prev, it;
   for (it = list.end(); it != list.begin(); it = prev) {
      prev = it;
      --prev;
      if ((*prev)->livei.begin() <= node->livei.begin())
         break;
   }
   list.insert(it, node);
}

void
GCRA::buildRIG(ArrayList&
              insns)
{
   // (body of GCRA::buildRIG: linear-scan style construction of the
   //  register interference graph from sorted live intervals)
   std::list<RIG_Node *> values, active;

   // function inputs are live from the start, add them first
   for (std::deque<ValueDef>::iterator it = func->ins.begin();
        it != func->ins.end(); ++it)
      insertOrderedTail(values, getNode(it->get()->asLValue()));

   // add one node per join representative, ordered by live interval start
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *insn = reinterpret_cast<Instruction *>(insns.get(i));
      for (int d = 0; insn->defExists(d); ++d)
         if (insn->getDef(d)->rep() == insn->getDef(d))
            insertOrderedTail(values, getNode(insn->getDef(d)->asLValue()));
   }
   checkList(values);

   // sweep: keep an active set of intervals, add edges on overlap
   while (!values.empty()) {
      RIG_Node *cur = values.front();

      for (std::list<RIG_Node *>::iterator it = active.begin();
           it != active.end();) {
         RIG_Node *node = *it;

         if (node->livei.end() <= cur->livei.begin()) {
            it = active.erase(it); // expired, cannot interfere anymore
         } else {
            // same register file and overlapping lifetime => interference
            if (node->f == cur->f && node->livei.overlaps(cur->livei))
               cur->addInterference(node);
            ++it;
         }
      }
      values.pop_front();
      active.push_back(cur);
   }
}

// Compute spill weights (refcount^2 / live range extent) and distribute
// nodes into the low-degree (trivially colorable) and high-degree lists.
void
GCRA::calculateSpillWeights()
{
   for (unsigned int i = 0; i < nodeCount; ++i) {
      RIG_Node *const n = &nodes[i];
      if (!nodes[i].colors || nodes[i].livei.isEmpty())
         continue;
      if (nodes[i].reg >= 0) {
         // update max reg
         regs.occupy(n->f, n->reg, n->colors);
         continue;
      }
      LValue *val = nodes[i].getValue();

      if (!val->noSpill) {
         int rc = 0;
         for (Value::DefIterator it = val->defs.begin();
              it != val->defs.end();
              ++it)
            rc += (*it)->get()->refCount();

         // heavily used, short-lived values are expensive to spill
         nodes[i].weight =
            (float)rc * (float)rc / (float)nodes[i].livei.extent();
      }

      if (nodes[i].degree < nodes[i].degreeLimit) {
         int l = 0;
         if (val->reg.size > 4)
            l = 1; // wide values go on a separate low-degree list
         DLLIST_ADDHEAD(&lo[l], &nodes[i]);
      } else {
         DLLIST_ADDHEAD(&hi, &nodes[i]);
      }
   }
   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
      printNodeInfo();
}

// Remove a's contribution to b's degree; if that makes b trivially
// colorable, move it from the high-degree list to a low-degree list.
void
GCRA::simplifyEdge(RIG_Node *a, RIG_Node *b)
{
   bool move = b->degree >= b->degreeLimit;

   INFO_DBG(prog->dbgFlags, REG_ALLOC,
            "edge: (%%%i, deg %u/%u) >-< (%%%i, deg %u/%u)\n",
            a->getValue()->id, a->degree, a->degreeLimit,
            b->getValue()->id, b->degree, b->degreeLimit);

   b->degree -= relDegree[a->colors][b->colors];

   move = move && b->degree < b->degreeLimit;
   if (move && !DLLIST_EMPTY(b)) {
      int l = (b->getValue()->reg.size > 4) ? 1 : 0;
      DLLIST_DEL(b);
      DLLIST_ADDTAIL(&lo[l], b);
   }
}

// Remove a node from the graph (both edge directions), pushing it on the
// select stack for later coloring.
void
GCRA::simplifyNode(RIG_Node *node)
{
   for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
      simplifyEdge(node, RIG_Node::get(ei));

   for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next())
      simplifyEdge(node, RIG_Node::get(ei));

   DLLIST_DEL(node);
   stack.push(node->getValue()->id);

   INFO_DBG(prog->dbgFlags, REG_ALLOC, "SIMPLIFY: pushed %%%i%s\n",
            node->getValue()->id,
            (node->degree < node->degreeLimit) ? "" : "(spill)");
}

// SIMPLIFY phase: repeatedly remove trivially colorable nodes; when only
// high-degree nodes remain, optimistically push the cheapest spill candidate.
void
GCRA::simplify()
{
   for (;;) {
      if (!DLLIST_EMPTY(&lo[0])) {
         do {
            simplifyNode(lo[0].next);
         } while (!DLLIST_EMPTY(&lo[0]));
      } else
      if (!DLLIST_EMPTY(&lo[1])) {
         simplifyNode(lo[1].next);
      } else
      if (!DLLIST_EMPTY(&hi)) {
         RIG_Node *best = hi.next;
         float bestScore = best->weight / (float)best->degree;
         // spill candidate
         for (RIG_Node *it = best->next; it != &hi; it = it->next) {
            float score = it->weight / (float)it->degree;
            if (score < bestScore) {
               best = it;
               bestScore = score;
            }
         }
         // infinite score means every remaining candidate is noSpill
         if (isinf(bestScore)) {
            ERROR("no viable spill candidates left\n");
            break;
         }
         simplifyNode(best);
      } else {
         break;
      }
   }
}

// Mark the register units taken by an already-colored neighbour as
// occupied; handles partial overlap via compound byte masks.
void
GCRA::checkInterference(const RIG_Node *node, Graph::EdgeIterator& ei)
{
   const RIG_Node *intf = RIG_Node::get(ei);

   if (intf->reg < 0)
      return; // neighbour not colored yet
   const LValue *vA = node->getValue();
   const LValue *vB = intf->getValue();

   const uint8_t intfMask = ((1 << intf->colors) - 1) << (intf->reg & 7);

   if (vA->compound | vB->compound) {
      // NOTE: this only works for >aligned< register tuples !
      // check the individual defs pairwise; only the parts whose
      // lifetimes really overlap need to be kept apart
      for (Value::DefCIterator D = vA->defs.begin(); D != vA->defs.end(); ++D) {
         for (Value::DefCIterator d = vB->defs.begin(); d != vB->defs.end(); ++d) {
            const LValue *vD = (*D)->get()->asLValue();
            const LValue *vd = (*d)->get()->asLValue();

            if (!vD->livei.overlaps(vd->livei)) {
               INFO_DBG(prog->dbgFlags, REG_ALLOC, "(%%%i) X (%%%i): no overlap\n",
                        vD->id, vd->id);
               continue;
            }

            uint8_t mask = vD->compound ? vD->compMask : ~0;
            if (vd->compound) {
               assert(vB->compound);
               mask &= vd->compMask & vB->compMask;
            } else {
               mask &= intfMask;
            }

            INFO_DBG(prog->dbgFlags, REG_ALLOC,
                     "(%%%i)%02x X (%%%i)%02x & %02x: $r%i.%02x\n",
                     vD->id,
                     vD->compound ? vD->compMask : 0xff,
                     vd->id,
                     vd->compound ? vd->compMask : intfMask,
                     vB->compMask, intf->reg & ~7, mask);
            if (mask)
               regs.occupyMask(node->f, intf->reg & ~7, mask);
         }
      }
   } else {
      INFO_DBG(prog->dbgFlags, REG_ALLOC,
               "(%%%i) X (%%%i): $r%i + %u\n",
               vA->id, vB->id, intf->reg, intf->colors);
      regs.occupy(node->f, intf->reg, intf->colors);
   }
}

// SELECT phase: pop nodes from the simplify stack and assign registers,
// honoring register preferences; nodes that cannot be colored are queued
// for spilling.
bool
GCRA::selectRegisters()
{
   INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nSELECT phase\n");

   while (!stack.empty()) {
      RIG_Node *node = &nodes[stack.top()];
      stack.pop();

      regs.reset(node->f);

      INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nNODE[%%%i, %u colors]\n",
               node->getValue()->id, node->colors);

      // block out units claimed by colored neighbours
      for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
         checkInterference(node, ei);
      for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next())
         checkInterference(node, ei);

      // try to honor a register preference (e.g. from tex/merge groups)
      if (!node->prefRegs.empty()) {
         for (std::list<RIG_Node *>::const_iterator it = node->prefRegs.begin();
              it != node->prefRegs.end();
              ++it) {
            if ((*it)->reg >= 0 &&
                regs.testOccupy(node->f, (*it)->reg, node->colors)) {
               node->reg = (*it)->reg;
               break;
            }
         }
      }
      if (node->reg >= 0)
         continue;
      LValue *lval = node->getValue();
      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
         regs.print();
      bool ret = regs.assign(node->reg, node->f, node->colors);
      if (ret) {
         INFO_DBG(prog->dbgFlags, REG_ALLOC, "assigned reg %i\n", node->reg);
         lval->compMask = node->getCompMask();
      } else {
         INFO_DBG(prog->dbgFlags, REG_ALLOC, "must spill: %%%i (size %u)\n",
                  lval->id, lval->reg.size);
         Symbol *slot = NULL;
         if (lval->reg.file == FILE_GPR)
            slot = spill.assignSlot(node->livei, lval->reg.size);
         mustSpill.push_back(ValuePair(lval, slot));
      }
   }
   if (!mustSpill.empty())
      return false;
   // success: translate register units back to register ids on the values
   for (unsigned int i = 0; i < nodeCount; ++i) {
      LValue *lval = nodes[i].getValue();
      if (nodes[i].reg >= 0 && nodes[i].colors > 0)
         lval->reg.data.id =
            regs.unitsToId(nodes[i].f, nodes[i].reg, lval->reg.size);
   }
   return true;
}

// Top-level driver of one coloring attempt: build nodes, coalesce, build
// the interference graph, simplify, select; on failure the caller inserts
// spill code and retries.
bool
GCRA::allocateRegisters(ArrayList& insns)
{
   bool ret;

   INFO_DBG(prog->dbgFlags, REG_ALLOC,
            "allocateRegisters to %u instructions\n", insns.getSize());

   nodeCount = func->allLValues.getSize();
   nodes = new RIG_Node[nodeCount];
   if (!nodes)
      return false;
   for (unsigned int i = 0; i < nodeCount; ++i) {
      LValue *lval = reinterpret_cast<LValue *>(func->allLValues.get(i));
      if (lval) {
         nodes[i].init(regs, lval);
         RIG.insert(&nodes[i]);
      }
   }

   // coalesce first, we use only 1 RIG node for a group of joined values
   ret = coalesce(insns);
   if (!ret)
      goto out;

   if (func->getProgram()->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
      func->printLiveIntervals();

   buildRIG(insns);
   calculateSpillWeights();
   simplify();

   ret = selectRegisters();
   if (!ret) {
      INFO_DBG(prog->dbgFlags, REG_ALLOC,
               "selectRegisters failed, inserting spill code ...\n");
      regs.reset(FILE_GPR, true);
      spill.run(mustSpill);
      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
         func->print();
   } else {
      prog->maxGPR = std::max(prog->maxGPR, regs.getMaxAssigned(FILE_GPR));
   }

out:
   cleanup(ret);
   return ret;
}

// Tear down per-attempt state; on success propagate assigned register ids
// to joined values, on failure undo the joins so the next attempt starts
// from a clean slate.
void
GCRA::cleanup(const bool success)
{
   mustSpill.clear();

   for (ArrayList::Iterator it = func->allLValues.iterator();
        !it.end(); it.next()) {
      LValue *lval = reinterpret_cast<LValue *>(it.get());

      lval->livei.clear();

      lval->compound = 0;
      lval->compMask = 0;

      if (lval->join == lval)
         continue;

      if (success) {
         // propagate the representative's register to all joined values
         lval->reg.data.id = lval->join->reg.data.id;
      } else {
         // undo the join so the next allocation attempt starts clean
         for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
              ++d)
            lval->join->defs.remove(*d);
         lval->join = lval;
      }
   }

   if (success)
      resolveSplitsAndMerges();
   splits.clear(); // avoid duplicate entries on next coalesce pass
   merges.clear();

   delete[] nodes;
   nodes = NULL;
}

// Find (or create) a stack slot for a spilled value. Existing slots are
// reused when their recorded occupancy intervals don't overlap livei;
// otherwise a new slot is appended and the stack grown.
Symbol *
SpillCodeInserter::assignSlot(const Interval &livei, const unsigned int size)
{
   SpillSlot slot;
   int32_t offsetBase = stackSize;
   int32_t offset;
   std::list<SpillSlot>::iterator pos = slots.end(), it = slots.begin();

   // align the search start to the slot size
   if (offsetBase % size)
      offsetBase += size - (offsetBase % size);

   slot.sym = NULL;

   for (offset = offsetBase; offset < stackSize; offset += size) {
      const int32_t entryEnd = offset + size;
      while (it != slots.end() && it->offset < offset)
         ++it;
      if (it == slots.end()) // no slots left
         break;
      std::list<SpillSlot>::iterator bgn = it;

      while (it != slots.end() && it->offset < entryEnd) {
         it->occup.print();
         if (it->occup.overlaps(livei))
            break;
         ++it;
      }
      if (it == slots.end() || it->offset >= entryEnd) {
         // fits
         for (; bgn != slots.end() && bgn->offset < entryEnd; ++bgn) {
            bgn->occup.insert(livei);
            if (bgn->size() == size)
               slot.sym = bgn->sym;
         }
         break;
      }
   }
   if (!slot.sym) {
      // no reusable slot: allocate a fresh one at the end of the stack
      stackSize = offset + size;
      slot.offset = offset;
      slot.sym = new_Symbol(func->getProgram(), FILE_MEMORY_LOCAL);
      if (!func->stackPtr)
         offset += func->tlsBase;
      slot.sym->setAddress(NULL, offset);
      slot.sym->reg.size = size;
      slots.insert(pos, slot)->occup.insert(livei);
   }
   return slot.sym;
}

// Emit the store (to local memory) or register copy that writes a spilled
// value to its slot, right after its definition.
void
SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)
{
   const DataType ty = typeOfSize(slot->reg.size);

   Instruction *st;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
      st = new_Instruction(func, OP_STORE, ty);
      st->setSrc(0, slot);
      st->setSrc(1, lval);
      lval->noSpill = 1; // don't spill the value that feeds the spill store
   } else {
      // spill to a register file: move via CVT
      st = new_Instruction(func, OP_CVT, ty);
      st->setDef(0, slot);
      st->setSrc(0, lval);
   }
   defi->bb->insertAfter(defi, st);
}

// Emit a reload of a spilled value right before a use; returns the fresh
// clone of the value that the use should reference.
LValue *
SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
{
   const DataType ty = typeOfSize(slot->reg.size);

   lval = cloneShallow(func, lval);

   Instruction *ld;
   if (slot->reg.file == FILE_MEMORY_LOCAL) {
      lval->noSpill = 1; // the reloaded value must not be spilled again
      ld = new_Instruction(func, OP_LOAD, ty);
   } else {
      ld = new_Instruction(func, OP_CVT, ty);
   }
   ld->setDef(0, lval);
   ld->setSrc(0, slot);

   usei->bb->insertBefore(usei, ld);
   return lval;
}

// Insert spill stores after each def and reloads before each use for every
// value in lst; phi defs/uses are rewritten to reference the slot directly.
bool
SpillCodeInserter::run(const std::list<ValuePair>& lst)
{
   for (std::list<ValuePair>::const_iterator it = lst.begin(); it != lst.end();
        ++it) {
      LValue *lval = it->first->asLValue();
      Symbol *mem = it->second ? it->second->asSym() : NULL;

      for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
           ++d) {
         // no memory slot available (non-GPR file): spill to a new register
         Value *slot = mem ?
            static_cast<Value *>(mem) : new_LValue(func, FILE_GPR);
         Value *tmp = NULL;
         Instruction *last = NULL;

         LValue *dval = (*d)->get()->asLValue();
         Instruction *defi = (*d)->getInsn();

         // handle uses first or they'll contain the spill stores
         while (!dval->uses.empty()) {
            ValueRef *u = dval->uses.front();
            Instruction *usei = u->getInsn();
            assert(usei);
            if (usei->op == OP_PHI) {
               // phis read the slot itself (memory slots become NULL srcs)
               tmp = (slot->reg.file == FILE_MEMORY_LOCAL) ? NULL : slot;
               last = NULL;
            } else
            if (!last || usei != last->next) { // TODO: sort uses
               tmp = unspill(usei, dval, slot);
               last = usei;
            }
            u->set(tmp);
         }

         assert(defi);
         if (defi->op == OP_PHI) {
            // the phi result lives in the slot; the phi itself goes away
            // when spilling to memory
            d = lval->defs.erase(d);
            --d;
            if (slot->reg.file == FILE_MEMORY_LOCAL)
               delete_Instruction(func->getProgram(), defi);
            else
               defi->setDef(0, slot);
         } else {
            spill(defi, slot, dval);
         }
      }

   }

   // TODO: We're not trying to reuse old slots in a potential next iteration.
   // We have to update the slots' livei intervals to be able to do that.
   stackBase = stackSize;
   slots.clear();
   return true;
}

// Allocate registers function by function in call-graph DFS order,
// accumulating the program's thread-local storage requirement.
bool
RegAlloc::exec()
{
   for (IteratorRef it = prog->calls.iteratorDFS(false);
        !it->end(); it->next()) {
      func = Function::get(reinterpret_cast<Graph::Node *>(it->get()));

      func->tlsBase = prog->tlsSize;
      if (!execFunc())
         return false;
      prog->tlsSize += func->tlsSize;
   }
   return true;
}

// Run register allocation for a single function: insert constraints and
// phi/argument moves, then iterate (live sets -> intervals -> coloring),
// inserting spill code between attempts.
bool
RegAlloc::execFunc()
{
   InsertConstraintsPass insertConstr;
   PhiMovesPass insertPhiMoves;
   ArgumentMovesPass insertArgMoves;
   BuildIntervalsPass buildIntervals;
   SpillCodeInserter insertSpills(func);

   GCRA gcra(func, insertSpills);

   unsigned int i, retries;
   bool ret;

   if (!func->ins.empty()) {
      // Insert a nop at the entry so inputs only used by the first instruction
      // don't count as having an empty live range.
      Instruction *nop = new_Instruction(func, OP_NOP, TYPE_NONE);
      BasicBlock::get(func->cfg.getRoot())->insertHead(nop);
   }

   ret = insertConstr.exec(func);
   if (!ret)
      goto out;

   ret = insertPhiMoves.run(func);
   if (!ret)
      goto out;

   ret = insertArgMoves.run(func);
   if (!ret)
      goto out;

   // TODO: need to fix up spill slot usage ranges to support > 1 retry
   for (retries = 0; retries < 3; ++retries) {
      if (retries && (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC))
         INFO("Retry: %i\n", retries);
      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
         func->print();

      // spilling to registers may add live ranges, need to rebuild everything
      ret = true;
      for (sequence = func->cfg.nextSequence(), i = 0;
           ret && i <= func->loopNestingBound;
           sequence = func->cfg.nextSequence(), ++i)
         ret = buildLiveSets(BasicBlock::get(func->cfg.getRoot()));
      if (!ret)
         break;
      func->orderInstructions(this->insns);

      ret = buildIntervals.run(func);
      if (!ret)
         break;
      ret = gcra.allocateRegisters(insns);
      if (ret)
         break; // success
   }
   INFO_DBG(prog->dbgFlags, REG_ALLOC, "RegAlloc done: %i\n", ret);

   func->tlsSize = insertSpills.getStackSize();
out:
   return ret;
}

// TODO: check if modifying Instruction::join here breaks anything
void
GCRA::resolveSplitsAndMerges()
{
   // give each SPLIT def the register id of its byte offset into the source
   for (std::list<Instruction *>::iterator it = splits.begin();
        it != splits.end();
        ++it) {
      Instruction *split = *it;
      unsigned int reg = regs.idToBytes(split->getSrc(0));
      for (int d = 0; split->defExists(d); ++d) {
         Value *v = split->getDef(d);
         v->reg.data.id = regs.bytesToId(v, reg);
         v->join = v;
         reg += v->reg.size;
      }
   }
   splits.clear();

   // likewise for MERGE sources relative to the merged destination
   for (std::list<Instruction *>::iterator it = merges.begin();
        it != merges.end();
        ++it) {
      Instruction *merge = *it;
      unsigned int reg = regs.idToBytes(merge->getDef(0));
      for (int s = 0; merge->srcExists(s); ++s) {
         Value *v = merge->getSrc(s);
         v->reg.data.id = regs.bytesToId(v, reg);
         v->join = v;
         reg +=
                v->reg.size;
      }
   }
   merges.clear();
}

// Entry point called from the program: run the whole register allocator.
bool Program::registerAllocation()
{
   RegAlloc ra(this);
   return ra.exec();
}

bool
RegAlloc::InsertConstraintsPass::exec(Function *ir)
{
   constrList.clear();

   bool ret = run(ir, true, true);
   if (ret)
      ret = insertConstraintMoves();
   return ret;
}

// TODO: make part of texture insn
// Drop unused texture result components from the write mask and compact
// the remaining defs into the low def slots.
void
RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex)
{
   Value *def[4];
   int c, k, d;
   uint8_t mask = 0;

   for (d = 0, k = 0, c = 0; c < 4; ++c) {
      if (!(tex->tex.mask & (1 << c)))
         continue;
      if (tex->getDef(k)->refCount()) {
         mask |= 1 << c;
         def[d++] = tex->getDef(k);
      }
      ++k;
   }
   tex->tex.mask = mask;

   for (c = 0; c < d; ++c)
      tex->setDef(c, def[c]);
   for (; c < 4; ++c)
      tex->setDef(c, NULL);
}

// Return true if source s of the constraint op cst cannot be coalesced
// directly and therefore needs an extra move.
bool
RegAlloc::InsertConstraintsPass::detectConflict(Instruction *cst, int s)
{
   Value *v = cst->getSrc(s);

   // current register allocation can't handle it if a value participates in
   // multiple constraints
   for (Value::UseIterator it = v->uses.begin(); it != v->uses.end(); ++it) {
      if (cst != (*it)->getInsn())
         return true;
   }

   // can start at s + 1 because detectConflict is called on all sources
   for (int c = s + 1; cst->srcExists(c); ++c)
      if (v == cst->getSrc(c))
         return true;

   Instruction *defi = v->getInsn();

   return (!defi || defi->constrainedDefs());
}

// Wrap sources [s, s+n) of instruction i in an OP_CONSTRAINT so they get
// allocated to consecutive registers; identical dominating constraints
// are reused instead of duplicated.
void
RegAlloc::InsertConstraintsPass::addConstraint(Instruction *i, int s, int n)
{
   Instruction *cst;
   int d;

   // first, look for an existing identical constraint op
   for (std::list<Instruction *>::iterator it = constrList.begin();
        it != constrList.end();
        ++it) {
      cst = (*it);
      if (!i->bb->dominatedBy(cst->bb))
         break;
      for (d = 0; d < n; ++d)
         if (cst->getSrc(d) != i->getSrc(d + s))
            break;
      if (d >= n) {
         for (d = 0; d < n; ++d, ++s)
            i->setSrc(s, cst->getDef(d));
         return;
      }
   }
   cst = new_Instruction(func, OP_CONSTRAINT, i->dType);

   for (d = 0; d < n; ++s, ++d) {
      cst->setDef(d, new_LValue(func, FILE_GPR));
      cst->setSrc(d, i->getSrc(s));
      i->setSrc(s, cst->getDef(d));
   }
   i->bb->insertBefore(i, cst);

   constrList.push_back(cst);
}

// Add a dummy use of the pointer source of >= 8 byte loads after the load
// to prevent it from being assigned a register which overlaps the load's
// destination, which would produce random corruptions.
void
RegAlloc::InsertConstraintsPass::addHazard(Instruction *i, const ValueRef *src)
{
   Instruction *hzd = new_Instruction(func, OP_NOP, TYPE_NONE);
   hzd->setSrc(0, src->get());
   i->bb->insertAfter(i, hzd);

}

// b32 { %r0 %r1 %r2 %r3 } -> b128 %r0q
void
RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn)
{
   uint8_t size = 0;
   int n;
   for (n = 0; insn->defExists(n) && insn->def(n).getFile() == FILE_GPR; ++n)
      size += insn->getDef(n)->reg.size;
   if (n < 2)
      return; // nothing to condense
   LValue *lval = new_LValue(func, FILE_GPR);
   lval->reg.size = size;

   // replace the n defs by one wide def followed by an OP_SPLIT
   Instruction *split = new_Instruction(func, OP_SPLIT, typeOfSize(size));
   split->setSrc(0, lval);
   for (int d = 0; d < n; ++d) {
      split->setDef(d, insn->getDef(d));
      insn->setDef(d, NULL);
   }
   insn->setDef(0, lval);

   // compact any remaining (non-GPR) defs down after the new wide def
   for (int k = 1, d = n; insn->defExists(d); ++d, ++k) {
      insn->setDef(k, insn->getDef(d));
      insn->setDef(d, NULL);
   }
   // carry over predicate if any (mainly for OP_UNION uses)
   split->setPredicate(insn->cc, insn->getPredicate());

   insn->bb->insertAfter(insn, split);
   constrList.push_back(split);
}
// Replace sources [a, b] of insn by one wide value fed by an OP_MERGE of
// the original sources (inverse of condenseDefs).
void
RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn,
                                              const int a, const int b)
{
   uint8_t size = 0;
   if (a >= b)
      return;
   for (int s = a; s <= b; ++s)
      size += insn->getSrc(s)->reg.size;
   if (!size)
      return;
   LValue *lval = new_LValue(func, FILE_GPR);
   lval->reg.size = size;

   Value *save[3];
   insn->takeExtraSources(0, save);

   Instruction *merge = new_Instruction(func, OP_MERGE, typeOfSize(size));
   merge->setDef(0, lval);
   for (int s = a, i = 0; s <= b; ++s, ++i) {
      merge->setSrc(i, insn->getSrc(s));
      insn->setSrc(s, NULL);
   }
   insn->setSrc(a, lval);

   // shift the trailing sources down to close the gap
   for (int k = a + 1, s = b + 1; insn->srcExists(s); ++s, ++k) {
      insn->setSrc(k, insn->getSrc(s));
      insn->setSrc(s, NULL);
   }
   insn->bb->insertBefore(insn, merge);

   insn->putExtraSources(0, save);

   constrList.push_back(merge);
}

// nve0 (Kepler): tex sources are packed into (up to) two register quads.
void
RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
{
   if (isTextureOp(tex->op))
      textureMask(tex);
   condenseDefs(tex);

   if (tex->op == OP_SUSTB || tex->op == OP_SUSTP) {
      condenseSrcs(tex, 3, (3 + typeSizeof(tex->dType) / 4) - 1);
   } else
   if (isTextureOp(tex->op)) {
      int n = tex->srcCount(0xff, true);
      if (n > 4) {
         condenseSrcs(tex, 0, 3);
         if (n > 5) // NOTE: first call modified positions already
            condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1));
      } else
      if (n > 1) {
         condenseSrcs(tex, 0, n - 1);
      }
   }
}

// nvc0 (Fermi): coordinates and the remaining arguments each form one
// consecutive register group.
void
RegAlloc::InsertConstraintsPass::texConstraintNVC0(TexInstruction *tex)
{
   int n, s;

   textureMask(tex);

   if (tex->op == OP_TXQ) {
      s = tex->srcCount(0xff);
      n = 0;
   } else {
      s = tex->tex.target.getArgCount();
      if (!tex->tex.target.isArray() &&
          (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0))
         ++s;
      if (tex->op == OP_TXD && tex->tex.useOffsets)
         ++s;
      n = tex->srcCount(0xff) - s;
      assert(n <= 4);
   }

   if (s > 1)
      condenseSrcs(tex, 0, s - 1);
   if (n > 1) // NOTE: first call modified positions already
      condenseSrcs(tex, 1, n);

   condenseDefs(tex);
}

// nv50: tex sources and defs share the same registers, so pad both to the
// same count before condensing.
void
RegAlloc::InsertConstraintsPass::texConstraintNV50(TexInstruction *tex)
{
   Value *pred = tex->getPredicate();
   if (pred)
      tex->setPredicate(tex->cc, NULL);

   textureMask(tex);

   assert(tex->defExists(0) && tex->srcExists(0));
   // make src and def count match
   int c;
   for (c = 0; tex->srcExists(c) || tex->defExists(c); ++c) {
      if (!tex->srcExists(c))
         tex->setSrc(c, new_LValue(func, tex->getSrc(0)->asLValue()));
      if (!tex->defExists(c))
         tex->setDef(c, new_LValue(func, tex->getDef(0)->asLValue()));
   }
   if (pred)
      tex->setPredicate(tex->cc, pred);
   condenseDefs(tex);
   condenseSrcs(tex, 0, c - 1);
}

// Insert constraint markers for instructions whose multiple sources must be
// located in consecutive registers.
bool
RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
{
   TexInstruction *tex;
   Instruction *next;
   int s, size;

   targ = bb->getProgram()->getTarget();

   for (Instruction *i = bb->getEntry(); i; i = next) {
      next = i->next;

      if ((tex = i->asTex())) {
         // texture handling is chipset-generation specific
         switch (targ->getChipset() & ~0xf) {
         case 0x50:
         case 0x80:
         case 0x90:
         case 0xa0:
            texConstraintNV50(tex);
            break;
         case 0xc0:
         case 0xd0:
            texConstraintNVC0(tex);
            break;
         case 0xe0:
         case NVISA_GK110_CHIPSET:
            texConstraintNVE0(tex);
            break;
         default:
            break;
         }
      } else
      if (i->op == OP_EXPORT || i->op == OP_STORE) {
         // the stored data sources (from src 1 on) must be consecutive
         for (size = typeSizeof(i->dType), s = 1; size > 0; ++s) {
            assert(i->srcExists(s));
            size -= i->getSrc(s)->reg.size;
         }
         condenseSrcs(i, 1, s - 1);
      } else
      if (i->op == OP_LOAD || i->op == OP_VFETCH) {
         condenseDefs(i);
         if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8)
            addHazard(i, i->src(0).getIndirect(0));
      } else
      if (i->op == OP_UNION ||
          i->op == OP_MERGE ||
          i->op == OP_SPLIT) {
         constrList.push_back(i);
      }
   }
   return true;
}

// Insert extra moves so that, if multiple register constraints on a value are
// in conflict, these conflicts can be resolved.
bool
RegAlloc::InsertConstraintsPass::insertConstraintMoves()
{
   for (std::list<Instruction *>::iterator it = constrList.begin();
        it != constrList.end();
        ++it) {
      Instruction *cst = *it;
      Instruction *mov;

      // NOTE: the "&& 0" deliberately disables this branch; kept for reference
      if (cst->op == OP_SPLIT && 0) {
         // spilling splits is annoying, just make sure they're separate
         for (int d = 0; cst->defExists(d); ++d) {
            if (!cst->getDef(d)->refCount())
               continue;
            LValue *lval = new_LValue(func, cst->def(d).getFile());
            const uint8_t size = cst->def(d).getSize();
            lval->reg.size = size;

            mov = new_Instruction(func, OP_MOV, typeOfSize(size));
            mov->setSrc(0, lval);
            mov->setDef(0, cst->getDef(d));
            cst->setDef(d, mov->getSrc(0));
            cst->bb->insertAfter(cst, mov);

            cst->getSrc(0)->asLValue()->noSpill = 1;
            mov->getSrc(0)->asLValue()->noSpill = 1;
         }
      } else
      if (cst->op == OP_MERGE || cst->op == OP_UNION) {
         for (int s = 0; cst->srcExists(s); ++s) {
            const uint8_t size = cst->src(s).getSize();

            // undefined source: give it a definition via a NOP
            if (!cst->getSrc(s)->defs.size()) {
               mov = new_Instruction(func, OP_NOP, typeOfSize(size));
               mov->setDef(0, cst->getSrc(s));
               cst->bb->insertBefore(cst, mov);
               continue;
            }
            assert(cst->getSrc(s)->defs.size() == 1); // still SSA

            Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
            // catch some cases where don't really need MOVs
            if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs())
               continue;

            // insert a copy so coalescing the constraint cannot conflict
            // with other constraints on the same value
            LValue *lval = new_LValue(func, cst->src(s).getFile());
            lval->reg.size = size;

            mov = new_Instruction(func, OP_MOV, typeOfSize(size));
            mov->setDef(0, lval);
            mov->setSrc(0, cst->getSrc(s));
            cst->setSrc(s, mov->getDef(0));
            cst->bb->insertBefore(cst, mov);

            cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help

            if (cst->op == OP_UNION)
               mov->setPredicate(defi->cc, defi->getPredicate());
         }
      }
   }

   return true;
}

} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
new file
mode 100644 index 0000000..2e43234 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp @@ -0,0 +1,552 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "codegen/nv50_ir.h" +#include "codegen/nv50_ir_target.h" + +namespace nv50_ir { + +// Converts nv50 IR generated from TGSI to SSA form. + +// DominatorTree implements an algorithm for finding immediate dominators, +// as described by T. Lengauer & R. Tarjan. 
+class DominatorTree : public Graph +{ +public: + DominatorTree(Graph *cfg); + ~DominatorTree() { } + + bool dominates(BasicBlock *, BasicBlock *); + + void findDominanceFrontiers(); + +private: + void build(); + void buildDFS(Node *); + + void squash(int); + inline void link(int, int); + inline int eval(int); + + void debugPrint(); + + Graph *cfg; + + Node **vert; + int *data; + const int count; + + #define SEMI(i) (data[(i) + 0 * count]) + #define ANCESTOR(i) (data[(i) + 1 * count]) + #define PARENT(i) (data[(i) + 2 * count]) + #define LABEL(i) (data[(i) + 3 * count]) + #define DOM(i) (data[(i) + 4 * count]) +}; + +void DominatorTree::debugPrint() +{ + for (int i = 0; i < count; ++i) { + INFO("SEMI(%i) = %i\n", i, SEMI(i)); + INFO("ANCESTOR(%i) = %i\n", i, ANCESTOR(i)); + INFO("PARENT(%i) = %i\n", i, PARENT(i)); + INFO("LABEL(%i) = %i\n", i, LABEL(i)); + INFO("DOM(%i) = %i\n", i, DOM(i)); + } +} + +DominatorTree::DominatorTree(Graph *cfgraph) : cfg(cfgraph), + count(cfg->getSize()) +{ + int i = 0; + + vert = new Node * [count]; + data = new int[5 * count]; + + for (IteratorRef it = cfg->iteratorDFS(true); !it->end(); it->next(), ++i) { + vert[i] = reinterpret_cast<Node *>(it->get()); + vert[i]->tag = i; + LABEL(i) = i; + SEMI(i) = ANCESTOR(i) = -1; + } + + build(); + + delete[] vert; + delete[] data; +} + +void DominatorTree::buildDFS(Graph::Node *node) +{ + SEMI(node->tag) = node->tag; + + for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) { + if (SEMI(ei.getNode()->tag) < 0) { + buildDFS(ei.getNode()); + PARENT(ei.getNode()->tag) = node->tag; + } + } +} + +void DominatorTree::squash(int v) +{ + if (ANCESTOR(ANCESTOR(v)) >= 0) { + squash(ANCESTOR(v)); + + if (SEMI(LABEL(ANCESTOR(v))) < SEMI(LABEL(v))) + LABEL(v) = LABEL(ANCESTOR(v)); + ANCESTOR(v) = ANCESTOR(ANCESTOR(v)); + } +} + +int DominatorTree::eval(int v) +{ + if (ANCESTOR(v) < 0) + return v; + squash(v); + return LABEL(v); +} + +void DominatorTree::link(int v, int w) +{ + ANCESTOR(w) = 
v; +} + +void DominatorTree::build() +{ + DLList *bucket = new DLList[count]; + Node *nv, *nw; + int p, u, v, w; + + buildDFS(cfg->getRoot()); + + for (w = count - 1; w >= 1; --w) { + nw = vert[w]; + assert(nw->tag == w); + for (Graph::EdgeIterator ei = nw->incident(); !ei.end(); ei.next()) { + nv = ei.getNode(); + v = nv->tag; + u = eval(v); + if (SEMI(u) < SEMI(w)) + SEMI(w) = SEMI(u); + } + p = PARENT(w); + bucket[SEMI(w)].insert(nw); + link(p, w); + + for (DLList::Iterator it = bucket[p].iterator(); !it.end(); it.erase()) { + v = reinterpret_cast<Node *>(it.get())->tag; + u = eval(v); + DOM(v) = (SEMI(u) < SEMI(v)) ? u : p; + } + } + for (w = 1; w < count; ++w) { + if (DOM(w) != SEMI(w)) + DOM(w) = DOM(DOM(w)); + } + DOM(0) = 0; + + insert(&BasicBlock::get(cfg->getRoot())->dom); + do { + p = 0; + for (v = 1; v < count; ++v) { + nw = &BasicBlock::get(vert[DOM(v)])->dom;; + nv = &BasicBlock::get(vert[v])->dom; + if (nw->getGraph() && !nv->getGraph()) { + ++p; + nw->attach(nv, Graph::Edge::TREE); + } + } + } while (p); + + delete[] bucket; +} + +#undef SEMI +#undef ANCESTOR +#undef PARENT +#undef LABEL +#undef DOM + +void DominatorTree::findDominanceFrontiers() +{ + BasicBlock *bb; + + for (IteratorRef dtIt = iteratorDFS(false); !dtIt->end(); dtIt->next()) { + EdgeIterator succIt, chldIt; + + bb = BasicBlock::get(reinterpret_cast<Node *>(dtIt->get())); + bb->getDF().clear(); + + for (succIt = bb->cfg.outgoing(); !succIt.end(); succIt.next()) { + BasicBlock *dfLocal = BasicBlock::get(succIt.getNode()); + if (dfLocal->idom() != bb) + bb->getDF().insert(dfLocal); + } + + for (chldIt = bb->dom.outgoing(); !chldIt.end(); chldIt.next()) { + BasicBlock *cb = BasicBlock::get(chldIt.getNode()); + + DLList::Iterator dfIt = cb->getDF().iterator(); + for (; !dfIt.end(); dfIt.next()) { + BasicBlock *dfUp = BasicBlock::get(dfIt); + if (dfUp->idom() != bb) + bb->getDF().insert(dfUp); + } + } + } +} + +// liveIn(bb) = usedBeforeAssigned(bb) U (liveOut(bb) - assigned(bb)) +void 
+// Backwards liveness over the CFG before SSA construction: recurse into
+// unvisited successors first, union their live-in sets to get our
+// live-out, then apply the equation in the comment above.
+Function::buildLiveSetsPreSSA(BasicBlock *bb, const int seq)
+{
+ Function *f = bb->getFunction();
+ BitSet usedBeforeAssigned(allLValues.getSize(), true);
+ BitSet assigned(allLValues.getSize(), true);
+
+ bb->liveSet.allocate(allLValues.getSize(), false);
+
+ int n = 0;
+ for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+ BasicBlock *out = BasicBlock::get(ei.getNode());
+ if (out == bb)
+ continue; // ignore trivial self-loop
+ if (out->cfg.visit(seq))
+ buildLiveSetsPreSSA(out, seq);
+ // first successor initializes liveOut, the rest are unioned in
+ if (!n++)
+ bb->liveSet = out->liveSet;
+ else
+ bb->liveSet |= out->liveSet;
+ }
+ // 'marker' flags that liveSet already holds meaningful data; with no
+ // successors and no prior data, start from the empty set.
+ if (!n && !bb->liveSet.marker)
+ bb->liveSet.fill(0);
+ bb->liveSet.marker = true;
+
+ // Scan forward: a source read before any def in this BB is live-in.
+ for (Instruction *i = bb->getEntry(); i; i = i->next) {
+ for (int s = 0; i->srcExists(s); ++s)
+ if (i->getSrc(s)->asLValue() && !assigned.test(i->getSrc(s)->id))
+ usedBeforeAssigned.set(i->getSrc(s)->id)	;
+ for (int d = 0; i->defExists(d); ++d)
+ assigned.set(i->getDef(d)->id);
+ }
+
+ // Function outputs count as used at the exit block.
+ if (bb == BasicBlock::get(f->cfgExit)) {
+ for (std::deque<ValueRef>::iterator it = f->outs.begin();
+ it != f->outs.end(); ++it) {
+ if (!assigned.test(it->get()->id))
+ usedBeforeAssigned.set(it->get()->id);
+ }
+ }
+
+ // liveIn = usedBeforeAssigned U (liveOut - assigned)
+ bb->liveSet.andNot(assigned);
+ bb->liveSet |= usedBeforeAssigned;
+}
+
+// Forward reachability of definitions: defSet(bb) is the union of all
+// predecessors' defSets plus everything defined in bb itself.
+void
+Function::buildDefSetsPreSSA(BasicBlock *bb, const int seq)
+{
+ bb->defSet.allocate(allLValues.getSize(), !bb->liveSet.marker);
+ bb->liveSet.marker = true;
+
+ for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+ BasicBlock *in = BasicBlock::get(ei.getNode());
+
+ if (in->cfg.visit(seq))
+ buildDefSetsPreSSA(in, seq);
+
+ bb->defSet |= in->defSet;
+ }
+
+ for (Instruction *i = bb->getEntry(); i; i = i->next) {
+ for (int d = 0; i->defExists(d); ++d)
+ bb->defSet.set(i->getDef(d)->id);
+ }
+}
+
+// Renaming phase of SSA construction: walks the dominator tree keeping,
+// per pre-SSA value, a stack of its current SSA replacements.
+class RenamePass
+{
+public:
+ RenamePass(Function *);
+ ~RenamePass();
+
+ bool run();
+ void search(BasicBlock *);
+
+ // Current SSA value for a pre-SSA value, or NULL if none pushed yet.
+ inline LValue *getStackTop(Value *);
+
+ LValue *mkUndefined(Value *);
+
+private:
+ Stack *stack
+ Function *func;
+ Program *prog;
+};
+
+// Convert every function of the program to SSA form; abort on the first
+// failure.
+bool
+Program::convertToSSA()
+{
+ for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) {
+ Function *fn = reinterpret_cast<Function *>(fi.get());
+ if (!fn->convertToSSA())
+ return false;
+ }
+ return true;
+}
+
+// XXX: add edge from entry to exit ?
+
+// Efficiently Computing Static Single Assignment Form and
+// the Control Dependence Graph,
+// R. Cytron, J. Ferrante, B. K. Rosen, M. N. Wegman, F. K. Zadeck
+bool
+Function::convertToSSA()
+{
+ // 0. calculate live in variables (for pruned SSA)
+ buildLiveSets();
+
+ // 1. create the dominator tree
+ domTree = new DominatorTree(&cfg);
+ reinterpret_cast<DominatorTree *>(domTree)->findDominanceFrontiers();
+
+ // 2. insert PHI functions
+ DLList workList;
+ LValue *lval;
+ BasicBlock *bb;
+ int var;
+ int iterCount = 0;
+ // hasAlready/work are shared across all variables: instead of clearing
+ // them for each variable, entries are compared against the per-variable
+ // iterCount (classic trick from the paper) — hence the single memset.
+ int *hasAlready = new int[allBBlocks.getSize() * 2];
+ int *work = &hasAlready[allBBlocks.getSize()];
+
+ memset(hasAlready, 0, allBBlocks.getSize() * 2 * sizeof(int));
+
+ // for each variable
+ for (var = 0; var < allLValues.getSize(); ++var) {
+ if (!allLValues.get(var))
+ continue;
+ lval = reinterpret_cast<Value *>(allLValues.get(var))->asLValue();
+ if (!lval || lval->defs.empty())
+ continue;
+ ++iterCount;
+
+ // TODO: don't add phi functions for values that aren't used outside
+ // the BB they're defined in
+
+ // gather blocks with assignments to lval in workList
+ for (Value::DefIterator d = lval->defs.begin();
+ d != lval->defs.end(); ++d) {
+ bb = ((*d)->getInsn() ?
(*d)->getInsn()->bb : NULL);
+ if (!bb)
+ continue; // instruction has likely been removed, but not deleted yet
+
+ if (work[bb->getId()] == iterCount)
+ continue;
+ work[bb->getId()] = iterCount;
+ workList.insert(bb);
+ }
+
+ // for each block in workList, insert a phi for lval in the block's
+ // dominance frontier (if we haven't already done so)
+ for (DLList::Iterator wI = workList.iterator(); !wI.end(); wI.erase()) {
+ bb = BasicBlock::get(wI);
+
+ DLList::Iterator dfIter = bb->getDF().iterator();
+ for (; !dfIter.end(); dfIter.next()) {
+ Instruction *phi;
+ BasicBlock *dfBB = BasicBlock::get(dfIter);
+
+ if (hasAlready[dfBB->getId()] >= iterCount)
+ continue;
+ hasAlready[dfBB->getId()] = iterCount;
+
+ // pruned SSA: don't need a phi if the value is not live-in
+ if (!dfBB->liveSet.test(lval->id))
+ continue;
+
+ phi = new_Instruction(this, OP_PHI, typeOfSize(lval->reg.size));
+ dfBB->insertTail(phi);
+
+ // one source per predecessor; all still refer to the pre-SSA
+ // value — the RenamePass below fills in the real SSA values
+ phi->setDef(0, lval);
+ for (int s = 0; s < dfBB->cfg.incidentCount(); ++s)
+ phi->setSrc(s, lval);
+
+ // the phi is itself a new assignment: iterate on its block
+ if (work[dfBB->getId()] < iterCount) {
+ work[dfBB->getId()] = iterCount;
+ wI.insert(dfBB);
+ }
+ }
+ }
+ }
+ delete[] hasAlready;
+
+ // 3. rename values
+ RenamePass rename(this);
+ return rename.run();
+}
+
+// One stack per pre-SSA LValue, holding its current SSA replacements.
+RenamePass::RenamePass(Function *fn) : func(fn), prog(fn->getProgram())
+{
+ stack = new Stack[func->allLValues.getSize()];
+}
+
+RenamePass::~RenamePass()
+{
+ if (stack)
+ delete[] stack;
+}
+
+// Current SSA value for pre-SSA value @val, or NULL if nothing has been
+// pushed for it yet (i.e. no reaching definition was seen).
+LValue *
+RenamePass::getStackTop(Value *val)
+{
+ if (!stack[val->id].getSize())
+ return 0;
+ return reinterpret_cast<LValue *>(stack[val->id].peek().u.p);
+}
+
+// Create a fresh value "defined" by an OP_NOP inserted at the function
+// entry; used as replacement for sources with no reaching definition.
+LValue *
+RenamePass::mkUndefined(Value *val)
+{
+ LValue *lval = val->asLValue();
+ assert(lval);
+ LValue *ud = new_LValue(func, lval);
+ Instruction *nop = new_Instruction(func, OP_NOP, typeOfSize(lval->reg.size));
+ nop->setDef(0, ud);
+ BasicBlock::get(func->cfg.getRoot())->insertHead(nop);
+ return ud;
+}
+
+bool RenamePass::run()
+{
+ if (!stack)
+ return false;
+
search(BasicBlock::get(func->domTree->getRoot()));
+
+ return true;
+}
+
+// Go through BBs in dominance order, create new values for each definition,
+// and replace all sources with their current new values.
+//
+// NOTE: The values generated for function inputs/outputs have no connection
+// to their corresponding outputs/inputs in other functions. Only allocation
+// of physical registers will establish this connection.
+//
+void RenamePass::search(BasicBlock *bb)
+{
+ LValue *lval, *ssa;
+ int d, s;
+ const Target *targ = prog->getTarget();
+
+ // Put current definitions for function input values on the stack.
+ // They can be used before any redefinitions are pushed.
+ if (bb == BasicBlock::get(func->cfg.getRoot())) {
+ for (std::deque<ValueDef>::iterator it = func->ins.begin();
+ it != func->ins.end(); ++it) {
+ lval = it->get()->asLValue();
+ assert(lval);
+
+ // fresh SSA value in the target's native register file,
+ // inheriting size and register id from the pre-SSA value
+ ssa = new_LValue(func, targ->nativeFile(lval->reg.file));
+ ssa->reg.size = lval->reg.size;
+ ssa->reg.data.id = lval->reg.data.id;
+
+ it->setSSA(ssa);
+ stack[lval->id].push(ssa);
+ }
+ }
+
+ for (Instruction *stmt = bb->getFirst(); stmt; stmt = stmt->next) {
+ // PHI sources get definitions from the passes through the incident BBs,
+ // so skip them here.
+ if (stmt->op != OP_PHI) {
+ for (s = 0; stmt->srcExists(s); ++s) {
+ lval = stmt->getSrc(s)->asLValue();
+ if (!lval)
+ continue;
+ // Values on the stack created in previously visited blocks, and
+ // function inputs, will be valid because they dominate this one.
+ lval = getStackTop(lval);
+ if (!lval)
+ lval = mkUndefined(stmt->getSrc(s));
+ stmt->setSrc(s, lval);
+ }
+ }
+ // Replace every definition by a fresh SSA value and push it as the
+ // current value of its pre-SSA original.
+ for (d = 0; stmt->defExists(d); ++d) {
+ lval = stmt->def(d).get()->asLValue();
+ assert(lval);
+ stmt->def(d).setSSA(
+ new_LValue(func, targ->nativeFile(lval->reg.file)));
+ stmt->def(d).get()->reg.size = lval->reg.size;
+ stmt->def(d).get()->reg.data.id = lval->reg.data.id;
+ stack[lval->id].push(stmt->def(d).get());
+ }
+ }
+
+ // Update sources of PHI ops corresponding to this BB in outgoing BBs.
+ for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+ Instruction *phi;
+ int p = 0;
+ BasicBlock *sb = BasicBlock::get(ei.getNode());
+
+ // which predecessor of sb is bb ?
+ // NOTE(review): this inner 'ei' shadows the outer iterator —
+ // intentional (it walks sb's incident edges), but easy to misread.
+ for (Graph::EdgeIterator ei = sb->cfg.incident(); !ei.end(); ei.next()) {
+ if (ei.getNode() == &bb->cfg)
+ break;
+ ++p;
+ }
+ assert(p < sb->cfg.incidentCount());
+
+ // fill in the p-th source of each leading phi with our current value
+ for (phi = sb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
+ lval = getStackTop(phi->getSrc(p));
+ if (!lval)
+ lval = mkUndefined(phi->getSrc(p));
+ phi->setSrc(p, lval);
+ }
+ }
+
+ // Visit the BBs we dominate.
+ for (Graph::EdgeIterator ei = bb->dom.outgoing(); !ei.end(); ei.next())
+ search(BasicBlock::get(ei.getNode()));
+
+ // Update function outputs to the last definitions of their pre-SSA values.
+ // I hope they're unique, i.e. that we get PHIs for all of them ...
+ if (bb == BasicBlock::get(func->cfgExit)) {
+ for (std::deque<ValueRef>::iterator it = func->outs.begin();
+ it != func->outs.end(); ++it) {
+ lval = it->get()->asLValue();
+ if (!lval)
+ continue;
+ lval = getStackTop(lval);
+ if (!lval)
+ lval = mkUndefined(it->get());
+ it->set(lval);
+ }
+ }
+
+ // Pop the values we created in this block from the stack because we will
+ // return to blocks that we do not dominate.
+ // The stacks are indexed by the pre-SSA id, hence preSSA()->id.
+ // OP_NOP defs stem from mkUndefined() and were never pushed, so skip.
+ for (Instruction *stmt = bb->getFirst(); stmt; stmt = stmt->next) {
+ if (stmt->op == OP_NOP)
+ continue;
+ for (d = 0; stmt->defExists(d); ++d)
+ stack[stmt->def(d).preSSA()->id].pop();
+ }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
new file mode 100644
index 0000000..443acfc
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -0,0 +1,469 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */ + +#include "codegen/nv50_ir.h" +#include "codegen/nv50_ir_target.h" + +namespace nv50_ir { + +const uint8_t Target::operationSrcNr[OP_LAST + 1] = +{ + 0, 0, // NOP, PHI + 0, 0, 0, 0, // UNION, SPLIT, MERGE, CONSTRAINT + 1, 1, 2, // MOV, LOAD, STORE + 2, 2, 2, 2, 2, 3, 3, 3, // ADD, SUB, MUL, DIV, MOD, MAD, FMA, SAD + 1, 1, 1, // ABS, NEG, NOT + 2, 2, 2, 2, 2, // AND, OR, XOR, SHL, SHR + 2, 2, 1, // MAX, MIN, SAT + 1, 1, 1, 1, // CEIL, FLOOR, TRUNC, CVT + 3, 3, 3, 2, 3, 3, // SET_AND,OR,XOR, SET, SELP, SLCT + 1, 1, 1, 1, 1, 1, // RCP, RSQ, LG2, SIN, COS, EX2 + 1, 1, 1, 1, 1, 2, // EXP, LOG, PRESIN, PREEX2, SQRT, POW + 0, 0, 0, 0, 0, // BRA, CALL, RET, CONT, BREAK, + 0, 0, 0, // PRERET,CONT,BREAK + 0, 0, 0, 0, 0, 0, // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR + 1, 1, 2, 1, 2, // VFETCH, PFETCH, EXPORT, LINTERP, PINTERP + 1, 1, // EMIT, RESTART + 1, 1, 1, // TEX, TXB, TXL, + 1, 1, 1, 1, 1, 2, // TXF, TXQ, TXD, TXG, TEXCSAA, TEXPREP + 1, 1, 2, 2, 2, 2, 2, // SULDB, SULDP, SUSTB, SUSTP, SUREDB, SUREDP, SULEA + 3, 3, 3, 3, // SUBFM, SUCLAMP, SUEAU, MADSP + 0, // TEXBAR + 1, 1, // DFDX, DFDY + 1, 2, 2, 0, 0, // RDSV, WRSV, QUADOP, QUADON, QUADPOP + 2, 3, 2, 3, // POPCNT, INSBF, EXTBF, PERMT + 2, 2, // ATOM, BAR + 2, 2, 2, 2, 3, 2, // VADD, VAVG, VMIN, VMAX, VSAD, VSET, + 2, 2, 2, 1, // VSHR, VSHL, VSEL, CCTL + 0 +}; + +const OpClass Target::operationClass[OP_LAST + 1] = +{ + // NOP; PHI; UNION, SPLIT, MERGE, CONSTRAINT + OPCLASS_OTHER, + OPCLASS_PSEUDO, + OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO, + // MOV; LOAD; STORE + OPCLASS_MOVE, + OPCLASS_LOAD, + OPCLASS_STORE, + // ADD, SUB, MUL; DIV, MOD; MAD, FMA, SAD + OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, + OPCLASS_ARITH, OPCLASS_ARITH, + OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH, + // ABS, NEG; NOT, AND, OR, XOR; SHL, SHR + OPCLASS_CONVERT, OPCLASS_CONVERT, + OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, + OPCLASS_SHIFT, OPCLASS_SHIFT, + // MAX, MIN + OPCLASS_COMPARE, 
OPCLASS_COMPARE, + // SAT, CEIL, FLOOR, TRUNC; CVT + OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT, + OPCLASS_CONVERT, + // SET(AND,OR,XOR); SELP, SLCT + OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE, + OPCLASS_COMPARE, OPCLASS_COMPARE, + // RCP, RSQ, LG2, SIN, COS; EX2, EXP, LOG, PRESIN, PREEX2; SQRT, POW + OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, + OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, + OPCLASS_SFU, OPCLASS_SFU, + // BRA, CALL, RET; CONT, BREAK, PRE(RET,CONT,BREAK); BRKPT, JOINAT, JOIN + OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, + OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, + OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, + // DISCARD, EXIT + OPCLASS_FLOW, OPCLASS_FLOW, + // MEMBAR + OPCLASS_CONTROL, + // VFETCH, PFETCH, EXPORT + OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE, + // LINTERP, PINTERP + OPCLASS_SFU, OPCLASS_SFU, + // EMIT, RESTART + OPCLASS_CONTROL, OPCLASS_CONTROL, + // TEX, TXB, TXL, TXF; TXQ, TXD, TXG, TEXCSAA; TEXPREP + OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, + OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, + OPCLASS_TEXTURE, + // SULDB, SULDP, SUSTB, SUSTP; SUREDB, SUREDP, SULEA + OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_ATOMIC, OPCLASS_SURFACE, + OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_SURFACE, + // SUBFM, SUCLAMP, SUEAU, MADSP + OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_ARITH, + // TEXBAR + OPCLASS_OTHER, + // DFDX, DFDY, RDSV, WRSV; QUADOP, QUADON, QUADPOP + OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, + OPCLASS_OTHER, OPCLASS_CONTROL, OPCLASS_CONTROL, + // POPCNT, INSBF, EXTBF, PERMT + OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, + // ATOM, BAR + OPCLASS_ATOMIC, OPCLASS_CONTROL, + // VADD, VAVG, VMIN, VMAX + OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR, + // VSAD, VSET, VSHR, VSHL + OPCLASS_VECTOR, 
OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR, + // VSEL, CCTL + OPCLASS_VECTOR, OPCLASS_CONTROL, + OPCLASS_PSEUDO // LAST +}; + + +extern Target *getTargetNVC0(unsigned int chipset); +extern Target *getTargetNV50(unsigned int chipset); + +Target *Target::create(unsigned int chipset) +{ + switch (chipset & 0xf0) { + case 0xc0: + case 0xd0: + case 0xe0: + case NVISA_GK110_CHIPSET: + return getTargetNVC0(chipset); + case 0x50: + case 0x80: + case 0x90: + case 0xa0: + return getTargetNV50(chipset); + default: + ERROR("unsupported target: NV%x\n", chipset); + return 0; + } +} + +void Target::destroy(Target *targ) +{ + delete targ; +} + +CodeEmitter::CodeEmitter(const Target *target) : targ(target) +{ +} + +void +CodeEmitter::setCodeLocation(void *ptr, uint32_t size) +{ + code = reinterpret_cast<uint32_t *>(ptr); + codeSize = 0; + codeSizeLimit = size; +} + +void +CodeEmitter::printBinary() const +{ + uint32_t *bin = code - codeSize / 4; + INFO("program binary (%u bytes)", codeSize); + for (unsigned int pos = 0; pos < codeSize / 4; ++pos) { + if ((pos % 8) == 0) + INFO("\n"); + INFO("%08x ", bin[pos]); + } + INFO("\n"); +} + +static inline uint32_t sizeToBundlesNVE4(uint32_t size) +{ + return (size + 55) / 56; +} + +void +CodeEmitter::prepareEmission(Program *prog) +{ + for (ArrayList::Iterator fi = prog->allFuncs.iterator(); + !fi.end(); fi.next()) { + Function *func = reinterpret_cast<Function *>(fi.get()); + func->binPos = prog->binSize; + prepareEmission(func); + + // adjust sizes & positions for schedulding info: + if (prog->getTarget()->hasSWSched) { + uint32_t adjPos = func->binPos; + BasicBlock *bb = NULL; + for (int i = 0; i < func->bbCount; ++i) { + bb = func->bbArray[i]; + int32_t adjSize = bb->binSize; + if (adjPos % 64) { + adjSize -= 64 - adjPos % 64; + if (adjSize < 0) + adjSize = 0; + } + adjSize = bb->binSize + sizeToBundlesNVE4(adjSize) * 8; + bb->binPos = adjPos; + bb->binSize = adjSize; + adjPos += adjSize; + } + if (bb) + func->binSize = adjPos - 
func->binPos; + } + + prog->binSize += func->binSize; + } +} + +void +CodeEmitter::prepareEmission(Function *func) +{ + func->bbCount = 0; + func->bbArray = new BasicBlock * [func->cfg.getSize()]; + + BasicBlock::get(func->cfg.getRoot())->binPos = func->binPos; + + for (IteratorRef it = func->cfg.iteratorCFG(); !it->end(); it->next()) + prepareEmission(BasicBlock::get(*it)); +} + +void +CodeEmitter::prepareEmission(BasicBlock *bb) +{ + Instruction *i, *next; + Function *func = bb->getFunction(); + int j; + unsigned int nShort; + + for (j = func->bbCount - 1; j >= 0 && !func->bbArray[j]->binSize; --j); + + for (; j >= 0; --j) { + BasicBlock *in = func->bbArray[j]; + Instruction *exit = in->getExit(); + + if (exit && exit->op == OP_BRA && exit->asFlow()->target.bb == bb) { + in->binSize -= 8; + func->binSize -= 8; + + for (++j; j < func->bbCount; ++j) + func->bbArray[j]->binPos -= 8; + + in->remove(exit); + } + bb->binPos = in->binPos + in->binSize; + if (in->binSize) // no more no-op branches to bb + break; + } + func->bbArray[func->bbCount++] = bb; + + if (!bb->getExit()) + return; + + // determine encoding size, try to group short instructions + nShort = 0; + for (i = bb->getEntry(); i; i = next) { + next = i->next; + + if (i->op == OP_MEMBAR && !targ->isOpSupported(OP_MEMBAR, TYPE_NONE)) { + bb->remove(i); + continue; + } + + i->encSize = getMinEncodingSize(i); + if (next && i->encSize < 8) + ++nShort; + else + if ((nShort & 1) && next && getMinEncodingSize(next) == 4) { + if (i->isCommutationLegal(i->next)) { + bb->permuteAdjacent(i, next); + next->encSize = 4; + next = i; + i = i->prev; + ++nShort; + } else + if (i->isCommutationLegal(i->prev) && next->next) { + bb->permuteAdjacent(i->prev, i); + next->encSize = 4; + next = next->next; + bb->binSize += 4; + ++nShort; + } else { + i->encSize = 8; + i->prev->encSize = 8; + bb->binSize += 4; + nShort = 0; + } + } else { + i->encSize = 8; + if (nShort & 1) { + i->prev->encSize = 8; + bb->binSize += 4; + } + nShort 
= 0; + } + bb->binSize += i->encSize; + } + + if (bb->getExit()->encSize == 4) { + assert(nShort); + bb->getExit()->encSize = 8; + bb->binSize += 4; + + if ((bb->getExit()->prev->encSize == 4) && !(nShort & 1)) { + bb->binSize += 8; + bb->getExit()->prev->encSize = 8; + } + } + assert(!bb->getEntry() || (bb->getExit() && bb->getExit()->encSize == 8)); + + func->binSize += bb->binSize; +} + +void +Program::emitSymbolTable(struct nv50_ir_prog_info *info) +{ + unsigned int n = 0, nMax = allFuncs.getSize(); + + info->bin.syms = + (struct nv50_ir_prog_symbol *)MALLOC(nMax * sizeof(*info->bin.syms)); + + for (ArrayList::Iterator fi = allFuncs.iterator(); + !fi.end(); + fi.next(), ++n) { + Function *f = (Function *)fi.get(); + assert(n < nMax); + + info->bin.syms[n].label = f->getLabel(); + info->bin.syms[n].offset = f->binPos; + } + + info->bin.numSyms = n; +} + +bool +Program::emitBinary(struct nv50_ir_prog_info *info) +{ + CodeEmitter *emit = target->getCodeEmitter(progType); + + emit->prepareEmission(this); + + if (dbgFlags & NV50_IR_DEBUG_BASIC) + this->print(); + + if (!binSize) { + code = NULL; + return false; + } + code = reinterpret_cast<uint32_t *>(MALLOC(binSize)); + if (!code) + return false; + emit->setCodeLocation(code, binSize); + + for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) { + Function *fn = reinterpret_cast<Function *>(fi.get()); + + assert(emit->getCodeSize() == fn->binPos); + + for (int b = 0; b < fn->bbCount; ++b) + for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next) + emit->emitInstruction(i); + } + info->bin.relocData = emit->getRelocInfo(); + + emitSymbolTable(info); + + // the nvc0 driver will print the binary iself together with the header + if ((dbgFlags & NV50_IR_DEBUG_BASIC) && getTarget()->getChipset() < 0xc0) + emit->printBinary(); + + delete emit; + return true; +} + +#define RELOC_ALLOC_INCREMENT 8 + +bool +CodeEmitter::addReloc(RelocEntry::Type ty, int w, uint32_t data, uint32_t m, + int s) +{ + 
unsigned int n = relocInfo ? relocInfo->count : 0; + + if (!(n % RELOC_ALLOC_INCREMENT)) { + size_t size = sizeof(RelocInfo) + n * sizeof(RelocEntry); + relocInfo = reinterpret_cast<RelocInfo *>( + REALLOC(relocInfo, n ? size : 0, + size + RELOC_ALLOC_INCREMENT * sizeof(RelocEntry))); + if (!relocInfo) + return false; + if (n == 0) + memset(relocInfo, 0, sizeof(RelocInfo)); + } + ++relocInfo->count; + + relocInfo->entry[n].data = data; + relocInfo->entry[n].mask = m; + relocInfo->entry[n].offset = codeSize + w * 4; + relocInfo->entry[n].bitPos = s; + relocInfo->entry[n].type = ty; + + return true; +} + +void +RelocEntry::apply(uint32_t *binary, const RelocInfo *info) const +{ + uint32_t value = 0; + + switch (type) { + case TYPE_CODE: value = info->codePos; break; + case TYPE_BUILTIN: value = info->libPos; break; + case TYPE_DATA: value = info->dataPos; break; + default: + assert(0); + break; + } + value += data; + value = (bitPos < 0) ? (value >> -bitPos) : (value << bitPos); + + binary[offset / 4] &= ~mask; + binary[offset / 4] |= value & mask; +} + +} // namespace nv50_ir + + +#include "codegen/nv50_ir_driver.h" + +extern "C" { + +void +nv50_ir_relocate_code(void *relocData, uint32_t *code, + uint32_t codePos, + uint32_t libPos, + uint32_t dataPos) +{ + nv50_ir::RelocInfo *info = reinterpret_cast<nv50_ir::RelocInfo *>(relocData); + + info->codePos = codePos; + info->libPos = libPos; + info->dataPos = dataPos; + + for (unsigned int i = 0; i < info->count; ++i) + info->entry[i].apply(code, info); +} + +void +nv50_ir_get_target_library(uint32_t chipset, + const uint32_t **code, uint32_t *size) +{ + nv50_ir::Target *targ = nv50_ir::Target::create(chipset); + targ->getBuiltinCode(code, size); + nv50_ir::Target::destroy(targ); +} + +} diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h new file mode 100644 index 0000000..9913ca1 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h 
@@ -0,0 +1,235 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef __NV50_IR_TARGET_H__ +#define __NV50_IR_TARGET_H__ + +#include "codegen/nv50_ir.h" + +namespace nv50_ir { + +struct RelocInfo; + +struct RelocEntry +{ + enum Type + { + TYPE_CODE, + TYPE_BUILTIN, + TYPE_DATA + }; + + uint32_t data; + uint32_t mask; + uint32_t offset; + int8_t bitPos; + Type type; + + inline void apply(uint32_t *binary, const RelocInfo *info) const; +}; + +struct RelocInfo +{ + uint32_t codePos; + uint32_t libPos; + uint32_t dataPos; + + uint32_t count; + + RelocEntry entry[0]; +}; + +class CodeEmitter +{ +public: + CodeEmitter(const Target *); + virtual ~CodeEmitter() { } + + // returns whether the instruction was encodable and written + virtual bool emitInstruction(Instruction *) = 0; + + virtual uint32_t getMinEncodingSize(const Instruction *) const = 0; + + void setCodeLocation(void *, uint32_t size); + inline void *getCodeLocation() const { return code; } + inline uint32_t getCodeSize() const { return codeSize; } + + bool addReloc(RelocEntry::Type, int w, uint32_t data, uint32_t m, + int s); + + inline void *getRelocInfo() const { return relocInfo; } + + void prepareEmission(Program *); + virtual void prepareEmission(Function *); + virtual void prepareEmission(BasicBlock *); + + void printBinary() const; + +protected: + const Target *targ; + + uint32_t *code; + uint32_t codeSize; + uint32_t codeSizeLimit; + + RelocInfo *relocInfo; +}; + + +enum OpClass +{ + OPCLASS_MOVE = 0, + OPCLASS_LOAD = 1, + OPCLASS_STORE = 2, + OPCLASS_ARITH = 3, + OPCLASS_SHIFT = 4, + OPCLASS_SFU = 5, + OPCLASS_LOGIC = 6, + OPCLASS_COMPARE = 7, + OPCLASS_CONVERT = 8, + OPCLASS_ATOMIC = 9, + OPCLASS_TEXTURE = 10, + OPCLASS_SURFACE = 11, + OPCLASS_FLOW = 12, + OPCLASS_PSEUDO = 14, + OPCLASS_VECTOR = 15, + OPCLASS_BITFIELD = 16, + OPCLASS_CONTROL = 17, + OPCLASS_OTHER = 18 +}; + +class Target +{ +public: + Target(bool j, bool s) : joinAnterior(j), hasSWSched(s) { } + virtual ~Target() { } + + static Target *create(uint32_t chipset); + static void 
destroy(Target *); + + // 0x50 and 0x84 to 0xaf for nv50 + // 0xc0 to 0xdf for nvc0 + inline uint32_t getChipset() const { return chipset; } + + virtual CodeEmitter *getCodeEmitter(Program::Type) = 0; + + // Drivers should upload this so we can use it from all programs. + // The address chosen is supplied to the relocation routine. + virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const = 0; + + virtual void parseDriverInfo(const struct nv50_ir_prog_info *info) { } + + virtual bool runLegalizePass(Program *, CGStage stage) const = 0; + +public: + struct OpInfo + { + OpInfo *variants; + operation op; + uint16_t srcTypes; + uint16_t dstTypes; + uint32_t immdBits; + uint8_t srcNr; + uint8_t srcMods[3]; + uint8_t dstMods; + uint8_t srcFiles[3]; + uint8_t dstFiles; + unsigned int minEncSize : 4; + unsigned int vector : 1; + unsigned int predicate : 1; + unsigned int commutative : 1; + unsigned int pseudo : 1; + unsigned int flow : 1; + unsigned int hasDest : 1; + unsigned int terminator : 1; + }; + + inline const OpInfo& getOpInfo(const Instruction *) const; + inline const OpInfo& getOpInfo(const operation) const; + + inline DataFile nativeFile(DataFile f) const; + + virtual bool insnCanLoad(const Instruction *insn, int s, + const Instruction *ld) const = 0; + virtual bool isOpSupported(operation, DataType) const = 0; + virtual bool isAccessSupported(DataFile, DataType) const = 0; + virtual bool isModSupported(const Instruction *, + int s, Modifier) const = 0; + virtual bool isSatSupported(const Instruction *) const = 0; + virtual bool isPostMultiplySupported(operation op, float f, + int& e) const { return false; } + virtual bool mayPredicate(const Instruction *, + const Value *) const = 0; + + // whether @insn can be issued together with @next (order matters) + virtual bool canDualIssue(const Instruction *insn, + const Instruction *next) const { return false; } + virtual int getLatency(const Instruction *) const { return 1; } + virtual int 
getThroughput(const Instruction *) const { return 1; } + + virtual unsigned int getFileSize(DataFile) const = 0; + virtual unsigned int getFileUnit(DataFile) const = 0; + + virtual uint32_t getSVAddress(DataFile, const Symbol *) const = 0; + +public: + const bool joinAnterior; // true if join is executed before the op + const bool hasSWSched; // true if code should provide scheduling data + + static const uint8_t operationSrcNr[OP_LAST + 1]; + static const OpClass operationClass[OP_LAST + 1]; + + static inline uint8_t getOpSrcNr(operation op) + { + return operationSrcNr[op]; + } + static inline OpClass getOpClass(operation op) + { + return operationClass[op]; + } + +protected: + uint32_t chipset; + + DataFile nativeFileMap[DATA_FILE_COUNT]; + + OpInfo opInfo[OP_LAST + 1]; +}; + +const Target::OpInfo& Target::getOpInfo(const Instruction *insn) const +{ + return opInfo[MIN2(insn->op, OP_LAST)]; +} + +const Target::OpInfo& Target::getOpInfo(const operation op) const +{ + return opInfo[op]; +} + +inline DataFile Target::nativeFile(DataFile f) const +{ + return nativeFileMap[f]; +} + +} // namespace nv50_ir + +#endif // __NV50_IR_TARGET_H__ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp new file mode 100644 index 0000000..ade9be0 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp @@ -0,0 +1,552 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice 
shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "codegen/nv50_ir_target_nv50.h" + +namespace nv50_ir { + +Target *getTargetNV50(unsigned int chipset) +{ + return new TargetNV50(chipset); +} + +TargetNV50::TargetNV50(unsigned int card) : Target(true, false) +{ + chipset = card; + + wposMask = 0; + for (unsigned int i = 0; i <= SV_LAST; ++i) + sysvalLocation[i] = ~0; + + initOpInfo(); +} + +#if 0 +// BULTINS / LIBRARY FUNCTIONS: + +// TODO +static const uint32_t nvc0_builtin_code[] = +{ +}; + +static const uint16_t nvc0_builtin_offsets[NV50_BUILTIN_COUNT] = +{ +}; +#endif + +void +TargetNV50::getBuiltinCode(const uint32_t **code, uint32_t *size) const +{ + *code = NULL; + *size = 0; +} + +uint32_t +TargetNV50::getBuiltinOffset(int builtin) const +{ + return 0; +} + +struct opProperties +{ + operation op; + unsigned int mNeg : 4; + unsigned int mAbs : 4; + unsigned int mNot : 4; + unsigned int mSat : 4; + unsigned int fConst : 3; + unsigned int fShared : 3; + unsigned int fAttrib : 3; + unsigned int fImm : 3; +}; + +static const struct opProperties _initProps[] = +{ + // neg abs not sat c[] s[], a[], imm + { OP_ADD, 0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 }, + { OP_SUB, 0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 }, + { OP_MUL, 0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 }, + { OP_MAX, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, + { OP_MIN, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, + { OP_MAD, 0x7, 0x0, 0x0, 0x0, 0x6, 0x1, 0x1, 0x0 }, // special 
constraint + { OP_ABS, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 }, + { OP_NEG, 0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 }, + { OP_CVT, 0x1, 0x1, 0x0, 0x8, 0x0, 0x1, 0x1, 0x0 }, + { OP_AND, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_OR, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_XOR, 0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_SHL, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_SHR, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 }, + { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 }, + { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_LG2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_RCP, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_RSQ, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_DFDX, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_DFDY, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 }, +}; + +void TargetNV50::initOpInfo() +{ + unsigned int i, j; + + static const uint32_t commutative[(OP_LAST + 31) / 32] = + { + // ADD,MAD,MUL,AND,OR,XOR,MAX,MIN + 0x0670ca00, 0x0000003f, 0x00000000, 0x00000000 + }; + static const uint32_t shortForm[(OP_LAST + 31) / 32] = + { + // MOV,ADD,SUB,MUL,SAD,L/PINTERP,RCP,TEX,TXF + 0x00010e40, 0x00000040, 0x00000498, 0x00000000 + }; + static const operation noDestList[] = + { + OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT, + OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET, + OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART, + OP_QUADON, OP_QUADPOP, OP_TEXBAR, OP_SUSTB, OP_SUSTP, OP_SUREDP, + OP_SUREDB, OP_BAR + }; + static const operation noPredList[] = + { + OP_CALL, OP_PREBREAK, OP_PRERET, OP_QUADON, OP_QUADPOP, OP_JOINAT + }; + + for (i = 0; i < DATA_FILE_COUNT; ++i) + nativeFileMap[i] = (DataFile)i; + nativeFileMap[FILE_PREDICATE] = FILE_FLAGS; + + for (i = 0; i < OP_LAST; ++i) { + opInfo[i].variants = NULL; + opInfo[i].op = (operation)i; + opInfo[i].srcTypes = 1 << (int)TYPE_F32; + opInfo[i].dstTypes = 1 
<< (int)TYPE_F32; + opInfo[i].immdBits = 0xffffffff; + opInfo[i].srcNr = operationSrcNr[i]; + + for (j = 0; j < opInfo[i].srcNr; ++j) { + opInfo[i].srcMods[j] = 0; + opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR; + } + opInfo[i].dstMods = 0; + opInfo[i].dstFiles = 1 << (int)FILE_GPR; + + opInfo[i].hasDest = 1; + opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA); + opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1; + opInfo[i].pseudo = (i < OP_MOV); + opInfo[i].predicate = !opInfo[i].pseudo; + opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN); + opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8; + } + for (i = 0; i < sizeof(noDestList) / sizeof(noDestList[0]); ++i) + opInfo[noDestList[i]].hasDest = 0; + for (i = 0; i < sizeof(noPredList) / sizeof(noPredList[0]); ++i) + opInfo[noPredList[i]].predicate = 0; + + for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) { + const struct opProperties *prop = &_initProps[i]; + + for (int s = 0; s < 3; ++s) { + if (prop->mNeg & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG; + if (prop->mAbs & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS; + if (prop->mNot & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT; + if (prop->fConst & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST; + if (prop->fShared & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_SHARED; + if (prop->fAttrib & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_SHADER_INPUT; + if (prop->fImm & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE; + } + if (prop->mSat & 8) + opInfo[prop->op].dstMods = NV50_IR_MOD_SAT; + } +} + +unsigned int +TargetNV50::getFileSize(DataFile file) const +{ + switch (file) { + case FILE_NULL: return 0; + case FILE_GPR: return 256; // in 16-bit units ** + case FILE_PREDICATE: return 0; + case FILE_FLAGS: return 4; + case FILE_ADDRESS: return 4; + case FILE_IMMEDIATE: return 0; + case 
FILE_MEMORY_CONST: return 65536; + case FILE_SHADER_INPUT: return 0x200; + case FILE_SHADER_OUTPUT: return 0x200; + case FILE_MEMORY_GLOBAL: return 0xffffffff; + case FILE_MEMORY_SHARED: return 16 << 10; + case FILE_MEMORY_LOCAL: return 48 << 10; + case FILE_SYSTEM_VALUE: return 16; + default: + assert(!"invalid file"); + return 0; + } + // ** only first 128 units encodable for 16-bit regs +} + +unsigned int +TargetNV50::getFileUnit(DataFile file) const +{ + if (file == FILE_GPR || file == FILE_ADDRESS) + return 1; + if (file == FILE_SYSTEM_VALUE) + return 2; + return 0; +} + +uint32_t +TargetNV50::getSVAddress(DataFile shaderFile, const Symbol *sym) const +{ + switch (sym->reg.data.sv.sv) { + case SV_FACE: + return 0x3fc; + case SV_POSITION: + { + uint32_t addr = sysvalLocation[sym->reg.data.sv.sv]; + for (int c = 0; c < sym->reg.data.sv.index; ++c) + if (wposMask & (1 << c)) + addr += 4; + return addr; + } + case SV_NCTAID: + return 0x8 + 2 * sym->reg.data.sv.index; + case SV_CTAID: + return 0xc + 2 * sym->reg.data.sv.index; + case SV_NTID: + return 0x2 + 2 * sym->reg.data.sv.index; + case SV_TID: + return 0; + default: + return sysvalLocation[sym->reg.data.sv.sv]; + } +} + +// long: rrr, arr, rcr, acr, rrc, arc, gcr, grr +// short: rr, ar, rc, gr +// immd: ri, gi +bool +TargetNV50::insnCanLoad(const Instruction *i, int s, + const Instruction *ld) const +{ + DataFile sf = ld->src(0).getFile(); + + if (sf == FILE_IMMEDIATE && (i->predSrc >= 0 || i->flagsDef >= 0)) + return false; + if (s >= opInfo[i->op].srcNr) + return false; + if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf))) + return false; + if (s == 2 && i->src(1).getFile() != FILE_GPR) + return false; + + // NOTE: don't rely on flagsDef + for (int d = 0; i->defExists(d); ++d) + if (i->def(d).getFile() == FILE_FLAGS) + return false; + + unsigned mode = 0; + + for (int z = 0; z < Target::operationSrcNr[i->op]; ++z) { + DataFile zf = (z == s) ? 
sf : i->src(z).getFile(); + switch (zf) { + case FILE_GPR: + break; + case FILE_MEMORY_SHARED: + case FILE_SHADER_INPUT: + mode |= 1 << (z * 2); + break; + case FILE_MEMORY_CONST: + mode |= 2 << (z * 2); + break; + case FILE_IMMEDIATE: + mode |= 3 << (z * 2); + default: + break; + } + } + + switch (mode) { + case 0x00: + case 0x01: + case 0x03: + case 0x08: + case 0x09: + case 0x0c: + case 0x20: + case 0x21: + break; + case 0x0d: + if (ld->bb->getProgram()->getType() != Program::TYPE_GEOMETRY) + return false; + default: + return false; + } + + uint8_t ldSize; + + if ((i->op == OP_MUL || i->op == OP_MAD) && !isFloatType(i->dType)) { + // 32-bit MUL will be split into 16-bit MULs + if (ld->src(0).isIndirect(0)) + return false; + if (sf == FILE_IMMEDIATE) + return false; + ldSize = 2; + } else { + ldSize = typeSizeof(ld->dType); + } + + if (sf == FILE_IMMEDIATE) + return true; + + + // Check if memory access is encodable: + + if (ldSize < 4 && sf == FILE_SHADER_INPUT) // no < 4-byte aligned a[] access + return false; + if (ld->getSrc(0)->reg.data.offset > (int32_t)(127 * ldSize)) + return false; + + if (ld->src(0).isIndirect(0)) { + for (int z = 0; i->srcExists(z); ++z) + if (i->src(z).isIndirect(0)) + return false; + + // s[] access only possible in CP, $aX always applies + if (sf == FILE_MEMORY_SHARED) + return true; + if (!ld->bb) // can't check type ... 
+ return false; + Program::Type pt = ld->bb->getProgram()->getType(); + + // $aX applies to c[] only in VP, FP, GP if p[] is not accessed + if (pt == Program::TYPE_COMPUTE) + return false; + if (pt == Program::TYPE_GEOMETRY) { + if (sf == FILE_MEMORY_CONST) + return i->src(s).getFile() != FILE_SHADER_INPUT; + return sf == FILE_SHADER_INPUT; + } + return sf == FILE_MEMORY_CONST; + } + return true; +} + +bool +TargetNV50::isAccessSupported(DataFile file, DataType ty) const +{ + if (ty == TYPE_B96 || ty == TYPE_NONE) + return false; + if (typeSizeof(ty) > 4) + return (file == FILE_MEMORY_LOCAL) || (file == FILE_MEMORY_GLOBAL); + return true; +} + +bool +TargetNV50::isOpSupported(operation op, DataType ty) const +{ + if (ty == TYPE_F64 && chipset < 0xa0) + return false; + + switch (op) { + case OP_PRERET: + return chipset >= 0xa0; + case OP_TXG: + return chipset >= 0xa3; + case OP_POW: + case OP_SQRT: + case OP_DIV: + case OP_MOD: + case OP_SET_AND: + case OP_SET_OR: + case OP_SET_XOR: + case OP_SLCT: + case OP_SELP: + case OP_POPCNT: + case OP_INSBF: + case OP_EXTBF: + case OP_EXIT: // want exit modifier instead (on NOP if required) + case OP_MEMBAR: + return false; + case OP_SAD: + return ty == TYPE_S32; + default: + return true; + } +} + +bool +TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const +{ + if (!isFloatType(insn->dType)) { + switch (insn->op) { + case OP_ABS: + case OP_NEG: + case OP_CVT: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_AND: + case OP_OR: + case OP_XOR: + break; + case OP_ADD: + if (insn->src(s ? 0 : 1).mod.neg()) + return false; + break; + case OP_SUB: + if (s == 0) + return insn->src(1).mod.neg() ? 
false : true; + break; + case OP_SET: + if (insn->sType != TYPE_F32) + return false; + break; + default: + return false; + } + } + if (s > 3) + return false; + return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod; +} + +bool +TargetNV50::mayPredicate(const Instruction *insn, const Value *pred) const +{ + if (insn->getPredicate() || insn->flagsSrc >= 0) + return false; + for (int s = 0; insn->srcExists(s); ++s) + if (insn->src(s).getFile() == FILE_IMMEDIATE) + return false; + return opInfo[insn->op].predicate; +} + +bool +TargetNV50::isSatSupported(const Instruction *insn) const +{ + if (insn->op == OP_CVT) + return true; + if (insn->dType != TYPE_F32) + return false; + return opInfo[insn->op].dstMods & NV50_IR_MOD_SAT; +} + +int TargetNV50::getLatency(const Instruction *i) const +{ + // TODO: tune these values + if (i->op == OP_LOAD) { + switch (i->src(0).getFile()) { + case FILE_MEMORY_LOCAL: + case FILE_MEMORY_GLOBAL: + return 100; // really 400 to 800 + default: + return 22; + } + } + return 22; +} + +// These are "inverse" throughput values, i.e. the number of cycles required +// to issue a specific instruction for a full warp (32 threads). +// +// Assuming we have more than 1 warp in flight, a higher issue latency results +// in a lower result latency since the MP will have spent more time with other +// warps. +// This also helps to determine the number of cycles between instructions in +// a single warp. 
+// +int TargetNV50::getThroughput(const Instruction *i) const +{ + // TODO: tune these values + if (i->dType == TYPE_F32) { + switch (i->op) { + case OP_RCP: + case OP_RSQ: + case OP_LG2: + case OP_SIN: + case OP_COS: + case OP_PRESIN: + case OP_PREEX2: + return 16; + default: + return 4; + } + } else + if (i->dType == TYPE_U32 || i->dType == TYPE_S32) { + return 4; + } else + if (i->dType == TYPE_F64) { + return 32; + } else { + return 1; + } +} + +static void +recordLocation(uint16_t *locs, uint8_t *masks, + const struct nv50_ir_varying *var) +{ + uint16_t addr = var->slot[0] * 4; + + switch (var->sn) { + case TGSI_SEMANTIC_POSITION: locs[SV_POSITION] = addr; break; + case TGSI_SEMANTIC_INSTANCEID: locs[SV_INSTANCE_ID] = addr; break; + case TGSI_SEMANTIC_VERTEXID: locs[SV_VERTEX_ID] = addr; break; + case TGSI_SEMANTIC_PRIMID: locs[SV_PRIMITIVE_ID] = addr; break; + case NV50_SEMANTIC_LAYER: locs[SV_LAYER] = addr; break; + case NV50_SEMANTIC_VIEWPORTINDEX: locs[SV_VIEWPORT_INDEX] = addr; break; + default: + break; + } + if (var->sn == TGSI_SEMANTIC_POSITION && masks) + masks[0] = var->mask; +} + +void +TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info) +{ + unsigned int i; + for (i = 0; i < info->numOutputs; ++i) + recordLocation(sysvalLocation, NULL, &info->out[i]); + for (i = 0; i < info->numInputs; ++i) + recordLocation(sysvalLocation, &wposMask, &info->in[i]); + for (i = 0; i < info->numSysVals; ++i) + recordLocation(sysvalLocation, NULL, &info->sv[i]); + + if (sysvalLocation[SV_POSITION] >= 0x200) { + // not assigned by driver, but we need it internally + wposMask = 0x8; + sysvalLocation[SV_POSITION] = 0; + } +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h new file mode 100644 index 0000000..0cbf180 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h @@ -0,0 +1,72 @@ +/* + * Copyright 2011 Christoph Bumiller 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "codegen/nv50_ir_target.h" + +namespace nv50_ir { + +#define NVC0_BUILTIN_DIV_U32 0 +#define NVC0_BUILTIN_DIV_S32 1 +#define NVC0_BUILTIN_RCP_F64 2 +#define NVC0_BUILTIN_RSQ_F64 3 + +#define NVC0_BUILTIN_COUNT 4 + +class TargetNV50 : public Target +{ +public: + TargetNV50(unsigned int chipset); + + virtual CodeEmitter *getCodeEmitter(Program::Type); + + virtual bool runLegalizePass(Program *, CGStage stage) const; + + virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const; + + virtual void parseDriverInfo(const struct nv50_ir_prog_info *); + + virtual bool insnCanLoad(const Instruction *insn, int s, + const Instruction *ld) const; + virtual bool isOpSupported(operation, DataType) const; + virtual bool isAccessSupported(DataFile, DataType) const; + virtual bool isModSupported(const Instruction *, int s, Modifier) const; + virtual bool isSatSupported(const Instruction *) const; + virtual bool mayPredicate(const Instruction *, const Value *) const; + + virtual int getLatency(const Instruction *) const; + virtual int getThroughput(const Instruction *) const; + + virtual unsigned int getFileSize(DataFile) const; + virtual unsigned int getFileUnit(DataFile) const; + + virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const; + + uint32_t getBuiltinOffset(int builtin) const; + +private: + void initOpInfo(); + + uint16_t sysvalLocation[SV_LAST + 1]; + uint8_t wposMask; +}; + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp new file mode 100644 index 0000000..47e9c55 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp @@ -0,0 +1,604 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, 
including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#include "codegen/nv50_ir_target_nvc0.h" + +namespace nv50_ir { + +Target *getTargetNVC0(unsigned int chipset) +{ + return new TargetNVC0(chipset); +} + +TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4) +{ + chipset = card; + initOpInfo(); +} + +// BULTINS / LIBRARY FUNCTIONS: + +// lazyness -> will just hardcode everything for the time being + +#include "target_lib_nvc0.asm.h" +#include "target_lib_nve4.asm.h" +#include "target_lib_nvf0.asm.h" + +void +TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const +{ + switch (chipset & 0xf0) { + case 0xe0: + *code = (const uint32_t *)&nve4_builtin_code[0]; + *size = sizeof(nve4_builtin_code); + break; + case 0xf0: + *code = (const uint32_t *)&nvf0_builtin_code[0]; + *size = sizeof(nvf0_builtin_code); + break; + default: + *code = (const uint32_t *)&nvc0_builtin_code[0]; + *size = sizeof(nvc0_builtin_code); + break; + } +} + +uint32_t +TargetNVC0::getBuiltinOffset(int builtin) const +{ + assert(builtin < NVC0_BUILTIN_COUNT); + + switch (chipset & 0xf0) { + case 0xe0: return nve4_builtin_offsets[builtin]; + 
case 0xf0: return nvf0_builtin_offsets[builtin]; + default: + return nvc0_builtin_offsets[builtin]; + } +} + +struct opProperties +{ + operation op; + unsigned int mNeg : 4; + unsigned int mAbs : 4; + unsigned int mNot : 4; + unsigned int mSat : 4; + unsigned int fConst : 3; + unsigned int fImmd : 4; // last bit indicates if full immediate is suppoted +}; + +static const struct opProperties _initProps[] = +{ + // neg abs not sat c[] imm + { OP_ADD, 0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 }, + { OP_SUB, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 }, + { OP_MUL, 0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 }, + { OP_MAX, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_MIN, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_MAD, 0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint + { OP_MADSP, 0x0, 0x0, 0x0, 0x0, 0x6, 0x2 }, + { OP_ABS, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 }, + { OP_NEG, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0 }, + { OP_CVT, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 }, + { OP_CEIL, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 }, + { OP_FLOOR, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 }, + { OP_TRUNC, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 }, + { OP_AND, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 }, + { OP_OR, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 }, + { OP_XOR, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 }, + { OP_SHL, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 }, + { OP_SHR, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 }, + { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_SLCT, 0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint + { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 }, + { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 }, + { OP_COS, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_SIN, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_EX2, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_LG2, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_RCP, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_RSQ, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 }, + { OP_DFDX, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_DFDY, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 }, + { OP_CALL, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 }, + { OP_INSBF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4 }, + { OP_PERMT, 0x0, 0x0, 0x0, 0x0, 0x6, 
0x2 }, + { OP_SET_AND, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_SET_OR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + { OP_SET_XOR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 }, + // saturate only: + { OP_LINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 }, + { OP_PINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 }, + // nve4 ops: + { OP_SULDB, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0 }, + { OP_SUSTB, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0 }, + { OP_SUSTP, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0 }, + { OP_SUCLAMP, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 }, + { OP_SUBFM, 0x0, 0x0, 0x0, 0x0, 0x6, 0x2 }, + { OP_SUEAU, 0x0, 0x0, 0x0, 0x0, 0x6, 0x2 } +}; + +void TargetNVC0::initOpInfo() +{ + unsigned int i, j; + + static const uint32_t commutative[(OP_LAST + 31) / 32] = + { + // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN + 0x0670ca00, 0x0000003f, 0x00000000, 0x00000000 + }; + + static const uint32_t shortForm[(OP_LAST + 31) / 32] = + { + // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV + 0x0670ca00, 0x00000000, 0x00000000, 0x00000000 + }; + + static const operation noDest[] = + { + OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT, + OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET, + OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART, + OP_QUADON, OP_QUADPOP, OP_TEXBAR, OP_SUSTB, OP_SUSTP, OP_SUREDP, + OP_SUREDB, OP_BAR + }; + + static const operation noPred[] = + { + OP_CALL, OP_PRERET, OP_QUADON, OP_QUADPOP, + OP_JOINAT, OP_PREBREAK, OP_PRECONT, OP_BRKPT + }; + + for (i = 0; i < DATA_FILE_COUNT; ++i) + nativeFileMap[i] = (DataFile)i; + nativeFileMap[FILE_ADDRESS] = FILE_GPR; + + for (i = 0; i < OP_LAST; ++i) { + opInfo[i].variants = NULL; + opInfo[i].op = (operation)i; + opInfo[i].srcTypes = 1 << (int)TYPE_F32; + opInfo[i].dstTypes = 1 << (int)TYPE_F32; + opInfo[i].immdBits = 0; + opInfo[i].srcNr = operationSrcNr[i]; + + for (j = 0; j < opInfo[i].srcNr; ++j) { + opInfo[i].srcMods[j] = 0; + opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR; + } + opInfo[i].dstMods = 0; + opInfo[i].dstFiles = 1 << (int)FILE_GPR; + 
+ opInfo[i].hasDest = 1; + opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA); + opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1; + opInfo[i].pseudo = (i < OP_MOV); + opInfo[i].predicate = !opInfo[i].pseudo; + opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN); + opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8; + } + for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i) + opInfo[noDest[i]].hasDest = 0; + for (i = 0; i < sizeof(noPred) / sizeof(noPred[0]); ++i) + opInfo[noPred[i]].predicate = 0; + + for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) { + const struct opProperties *prop = &_initProps[i]; + + for (int s = 0; s < 3; ++s) { + if (prop->mNeg & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG; + if (prop->mAbs & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS; + if (prop->mNot & (1 << s)) + opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT; + if (prop->fConst & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST; + if (prop->fImmd & (1 << s)) + opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE; + if (prop->fImmd & 8) + opInfo[prop->op].immdBits = 0xffffffff; + } + if (prop->mSat & 8) + opInfo[prop->op].dstMods = NV50_IR_MOD_SAT; + } +} + +unsigned int +TargetNVC0::getFileSize(DataFile file) const +{ + switch (file) { + case FILE_NULL: return 0; + case FILE_GPR: return (chipset >= NVISA_GK110_CHIPSET) ? 
255 : 63; + case FILE_PREDICATE: return 7; + case FILE_FLAGS: return 1; + case FILE_ADDRESS: return 0; + case FILE_IMMEDIATE: return 0; + case FILE_MEMORY_CONST: return 65536; + case FILE_SHADER_INPUT: return 0x400; + case FILE_SHADER_OUTPUT: return 0x400; + case FILE_MEMORY_GLOBAL: return 0xffffffff; + case FILE_MEMORY_SHARED: return 16 << 10; + case FILE_MEMORY_LOCAL: return 48 << 10; + case FILE_SYSTEM_VALUE: return 32; + default: + assert(!"invalid file"); + return 0; + } +} + +unsigned int +TargetNVC0::getFileUnit(DataFile file) const +{ + if (file == FILE_GPR || file == FILE_ADDRESS || file == FILE_SYSTEM_VALUE) + return 2; + return 0; +} + +uint32_t +TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const +{ + const int idx = sym->reg.data.sv.index; + const SVSemantic sv = sym->reg.data.sv.sv; + + const bool isInput = shaderFile == FILE_SHADER_INPUT; + const bool kepler = getChipset() >= NVISA_GK104_CHIPSET; + + switch (sv) { + case SV_POSITION: return 0x070 + idx * 4; + case SV_INSTANCE_ID: return 0x2f8; + case SV_VERTEX_ID: return 0x2fc; + case SV_PRIMITIVE_ID: return isInput ? 0x060 : 0x040; + case SV_LAYER: return 0x064; + case SV_VIEWPORT_INDEX: return 0x068; + case SV_POINT_SIZE: return 0x06c; + case SV_CLIP_DISTANCE: return 0x2c0 + idx * 4; + case SV_POINT_COORD: return 0x2e0 + idx * 4; + case SV_FACE: return 0x3fc; + case SV_TESS_FACTOR: return 0x000 + idx * 4; + case SV_TESS_COORD: return 0x2f0 + idx * 4; + case SV_NTID: return kepler ? (0x00 + idx * 4) : ~0; + case SV_NCTAID: return kepler ? (0x0c + idx * 4) : ~0; + case SV_GRIDID: return kepler ? 
0x18 : ~0; + default: + return 0xffffffff; + } +} + +bool +TargetNVC0::insnCanLoad(const Instruction *i, int s, + const Instruction *ld) const +{ + DataFile sf = ld->src(0).getFile(); + + // immediate 0 can be represented by GPR $r63/$r255 + if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0) + return (!i->isPseudo() && + !i->asTex() && + i->op != OP_EXPORT && i->op != OP_STORE); + + if (s >= opInfo[i->op].srcNr) + return false; + if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf))) + return false; + + // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0 + if (ld->src(0).isIndirect(0)) + return false; + + for (int k = 0; i->srcExists(k); ++k) { + if (i->src(k).getFile() == FILE_IMMEDIATE) { + if (k == 2 && i->op == OP_SUCLAMP) // special case + continue; + if (i->getSrc(k)->reg.data.u64 != 0) + return false; + } else + if (i->src(k).getFile() != FILE_GPR && + i->src(k).getFile() != FILE_PREDICATE) { + return false; + } + } + + // not all instructions support full 32 bit immediates + if (sf == FILE_IMMEDIATE) { + Storage ® = ld->getSrc(0)->asImm()->reg; + + if (opInfo[i->op].immdBits != 0xffffffff) { + if (i->sType == TYPE_F32) { + if (reg.data.u32 & 0xfff) + return false; + } else + if (i->sType == TYPE_S32 || i->sType == TYPE_U32) { + // with u32, 0xfffff counts as 0xffffffff as well + if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000) + return false; + } + } else + if (i->op == OP_MAD || i->op == OP_FMA) { + // requires src == dst, cannot decide before RA + // (except if we implement more constraints) + if (ld->getSrc(0)->asImm()->reg.data.u32 & 0xfff) + return false; + } else + if (i->op == OP_ADD && i->sType == TYPE_F32) { + // add f32 LIMM cannot saturate + if (i->saturate && (reg.data.u32 & 0xfff)) + return false; + } + } + + return true; +} + +bool +TargetNVC0::isAccessSupported(DataFile file, DataType ty) const +{ + if (ty == TYPE_NONE) + return false; + if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ? 
+ return typeSizeof(ty) <= 8; + if (ty == TYPE_B96) + return false; + if (getChipset() >= 0xf0) { + // XXX: find wide vfetch/export + if (ty == TYPE_B128) + return false; + if (ty == TYPE_U64) + return false; + } + return true; +} + +bool +TargetNVC0::isOpSupported(operation op, DataType ty) const +{ + if ((op == OP_MAD || op == OP_FMA) && (ty != TYPE_F32)) + return false; + if (op == OP_SAD && ty != TYPE_S32 && ty != TYPE_U32) + return false; + if (op == OP_POW || op == OP_SQRT || op == OP_DIV || op == OP_MOD) + return false; + return true; +} + +bool +TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const +{ + if (!isFloatType(insn->dType)) { + switch (insn->op) { + case OP_ABS: + case OP_NEG: + case OP_CVT: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_AND: + case OP_OR: + case OP_XOR: + break; + case OP_SET: + if (insn->sType != TYPE_F32) + return false; + break; + case OP_ADD: + if (mod.abs()) + return false; + if (insn->src(s ? 0 : 1).mod.neg()) + return false; + break; + case OP_SUB: + if (s == 0) + return insn->src(1).mod.neg() ? 
false : true; + break; + default: + return false; + } + } + if (s > 3) + return false; + return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod; +} + +bool +TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const +{ + if (insn->getPredicate()) + return false; + return opInfo[insn->op].predicate; +} + +bool +TargetNVC0::isSatSupported(const Instruction *insn) const +{ + if (insn->op == OP_CVT) + return true; + if (!(opInfo[insn->op].dstMods & NV50_IR_MOD_SAT)) + return false; + + if (insn->dType == TYPE_U32) + return (insn->op == OP_ADD) || (insn->op == OP_MAD); + + // add f32 LIMM cannot saturate + if (insn->op == OP_ADD && insn->sType == TYPE_F32) { + if (insn->getSrc(1)->asImm() && + insn->getSrc(1)->reg.data.u32 & 0xfff) + return false; + } + + return insn->dType == TYPE_F32; +} + +bool +TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const +{ + if (op != OP_MUL) + return false; + f = fabsf(f); + e = static_cast<int>(log2f(f)); + if (e < -3 || e > 3) + return false; + return f == exp2f(static_cast<float>(e)); +} + +// TODO: better values +// this could be more precise, e.g. depending on the issue-to-read/write delay +// of the depending instruction, but it's good enough +int TargetNVC0::getLatency(const Instruction *i) const +{ + if (chipset >= 0xe4) { + if (i->dType == TYPE_F64 || i->sType == TYPE_F64) + return 20; + switch (i->op) { + case OP_LINTERP: + case OP_PINTERP: + return 15; + case OP_LOAD: + if (i->src(0).getFile() == FILE_MEMORY_CONST) + return 9; + // fall through + case OP_VFETCH: + return 24; + default: + if (Target::getOpClass(i->op) == OPCLASS_TEXTURE) + return 17; + if (i->op == OP_MUL && i->dType != TYPE_F32) + return 15; + return 9; + } + } else { + if (i->op == OP_LOAD) { + if (i->cache == CACHE_CV) + return 700; + return 48; + } + return 24; + } + return 32; +} + +// These are "inverse" throughput values, i.e. 
the number of cycles required +// to issue a specific instruction for a full warp (32 threads). +// +// Assuming we have more than 1 warp in flight, a higher issue latency results +// in a lower result latency since the MP will have spent more time with other +// warps. +// This also helps to determine the number of cycles between instructions in +// a single warp. +// +int TargetNVC0::getThroughput(const Instruction *i) const +{ + // TODO: better values + if (i->dType == TYPE_F32) { + switch (i->op) { + case OP_ADD: + case OP_MUL: + case OP_MAD: + case OP_FMA: + return 1; + case OP_CVT: + case OP_CEIL: + case OP_FLOOR: + case OP_TRUNC: + case OP_SET: + case OP_SLCT: + case OP_MIN: + case OP_MAX: + return 2; + case OP_RCP: + case OP_RSQ: + case OP_LG2: + case OP_SIN: + case OP_COS: + case OP_PRESIN: + case OP_PREEX2: + default: + return 8; + } + } else + if (i->dType == TYPE_U32 || i->dType == TYPE_S32) { + switch (i->op) { + case OP_ADD: + case OP_AND: + case OP_OR: + case OP_XOR: + case OP_NOT: + return 1; + case OP_MUL: + case OP_MAD: + case OP_CVT: + case OP_SET: + case OP_SLCT: + case OP_SHL: + case OP_SHR: + case OP_NEG: + case OP_ABS: + case OP_MIN: + case OP_MAX: + default: + return 2; + } + } else + if (i->dType == TYPE_F64) { + return 2; + } else { + return 1; + } +} + +bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const +{ + const OpClass clA = operationClass[a->op]; + const OpClass clB = operationClass[b->op]; + + if (getChipset() >= 0xe4) { + // not texturing + // not if the 2nd instruction isn't necessarily executed + if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW) + return false; + // anything with MOV + if (a->op == OP_MOV || b->op == OP_MOV) + return true; + if (clA == clB) { + // only F32 arith or integer additions + if (clA != OPCLASS_ARITH) + return false; + return (a->dType == TYPE_F32 || a->op == OP_ADD || + b->dType == TYPE_F32 || b->op == OP_ADD); + } + // nothing with TEXBAR + if (a->op == OP_TEXBAR || b->op 
== OP_TEXBAR) + return false; + // no loads and stores accessing the the same space + if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) || + (clB == OPCLASS_LOAD && clA == OPCLASS_STORE)) + if (a->src(0).getFile() == b->src(0).getFile()) + return false; + // no > 32-bit ops + if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 || + typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4) + return false; + return true; + } else { + return false; // info not needed (yet) + } +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h new file mode 100644 index 0000000..7831af5 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h @@ -0,0 +1,74 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "codegen/nv50_ir_target.h" + +namespace nv50_ir { + +#define NVC0_BUILTIN_DIV_U32 0 +#define NVC0_BUILTIN_DIV_S32 1 +#define NVC0_BUILTIN_RCP_F64 2 +#define NVC0_BUILTIN_RSQ_F64 3 + +#define NVC0_BUILTIN_COUNT 4 + +class TargetNVC0 : public Target +{ +public: + TargetNVC0(unsigned int chipset); + + virtual CodeEmitter *getCodeEmitter(Program::Type); + + CodeEmitter *createCodeEmitterNVC0(Program::Type); + CodeEmitter *createCodeEmitterGK110(Program::Type); + + virtual bool runLegalizePass(Program *, CGStage stage) const; + + virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const; + + virtual bool insnCanLoad(const Instruction *insn, int s, + const Instruction *ld) const; + virtual bool isOpSupported(operation, DataType) const; + virtual bool isAccessSupported(DataFile, DataType) const; + virtual bool isModSupported(const Instruction *, int s, Modifier) const; + virtual bool isSatSupported(const Instruction *) const; + virtual bool isPostMultiplySupported(operation, float, int& e) const; + virtual bool mayPredicate(const Instruction *, const Value *) const; + + virtual bool canDualIssue(const Instruction *, const Instruction *) const; + virtual int getLatency(const Instruction *) const; + virtual int getThroughput(const Instruction *) const; + + virtual unsigned int getFileSize(DataFile) const; + virtual unsigned int getFileUnit(DataFile) const; + + virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const; + + uint32_t getBuiltinOffset(int builtin) const; + +private: + void initOpInfo(); +}; + +bool calculateSchedDataNVC0(const Target *, Function *); + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp new file mode 100644 index 0000000..8959777 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp @@ -0,0 +1,390 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of 
charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#include "codegen/nv50_ir_util.h" + +namespace nv50_ir { + +void DLList::clear() +{ + for (Item *next, *item = head.next; item != &head; item = next) { + next = item->next; + delete item; + } + head.next = head.prev = &head; +} + +void +DLList::Iterator::erase() +{ + Item *rem = pos; + + if (rem == term) + return; + pos = pos->next; + + DLLIST_DEL(rem); + delete rem; +} + +void DLList::Iterator::moveToList(DLList& dest) +{ + Item *item = pos; + + assert(term != &dest.head); + assert(pos != term); + + pos = pos->next; + + DLLIST_DEL(item); + DLLIST_ADDHEAD(&dest.head, item); +} + +bool +DLList::Iterator::insert(void *data) +{ + Item *ins = new Item(data); + + ins->next = pos->next; + ins->prev = pos; + pos->next->prev = ins; + pos->next = ins; + + if (pos == term) + term = ins; + + return true; +} + +void +Stack::moveTo(Stack& that) +{ + unsigned int newSize = this->size + that.size; + + while (newSize > that.limit) + that.resize(); + memcpy(&that.array[that.size], &array[0], this->size * sizeof(Item)); + + that.size = newSize; + this->size = 0; +} + +Interval::Interval(const Interval& that) : head(NULL), tail(NULL) +{ + this->insert(that); +} + +Interval::~Interval() +{ + clear(); +} + +void +Interval::clear() +{ + for (Range *next, *r = head; r; r = next) { + next = r->next; + delete r; + } + head = tail = NULL; +} + +bool +Interval::extend(int a, int b) +{ + Range *r, **nextp = &head; + + // NOTE: we need empty intervals for fixed registers + // if (a == b) + // return false; + assert(a <= b); + + for (r = head; r; r = r->next) { + if (b < r->bgn) + break; // insert before + if (a > r->end) { + // insert after + nextp = &r->next; + continue; + } + + // overlap + if (a < r->bgn) { + r->bgn = a; + if (b > r->end) + r->end = b; + r->coalesce(&tail); + return true; + } + if (b > r->end) { + r->end = b; + r->coalesce(&tail); + return true; + } + assert(a >= r->bgn); + assert(b <= r->end); + return true; + } + + (*nextp) = new Range(a, b); + (*nextp)->next = r; 
+ + for (r = (*nextp); r->next; r = r->next); + tail = r; + return true; +} + +bool Interval::contains(int pos) const +{ + for (Range *r = head; r && r->bgn <= pos; r = r->next) + if (r->end > pos) + return true; + return false; +} + +bool Interval::overlaps(const Interval &that) const +{ +#if 1 + Range *a = this->head; + Range *b = that.head; + + while (a && b) { + if (b->bgn < a->end && + b->end > a->bgn) + return true; + if (a->end <= b->bgn) + a = a->next; + else + b = b->next; + } +#else + for (Range *rA = this->head; rA; rA = rA->next) + for (Range *rB = iv.head; rB; rB = rB->next) + if (rB->bgn < rA->end && + rB->end > rA->bgn) + return true; +#endif + return false; +} + +void Interval::insert(const Interval &that) +{ + for (Range *r = that.head; r; r = r->next) + this->extend(r->bgn, r->end); +} + +void Interval::unify(Interval &that) +{ + assert(this != &that); + for (Range *next, *r = that.head; r; r = next) { + next = r->next; + this->extend(r->bgn, r->end); + delete r; + } + that.head = NULL; +} + +int Interval::length() const +{ + int len = 0; + for (Range *r = head; r; r = r->next) + len += r->bgn - r->end; + return len; +} + +void Interval::print() const +{ + if (!head) + return; + INFO("[%i %i)", head->bgn, head->end); + for (const Range *r = head->next; r; r = r->next) + INFO(" [%i %i)", r->bgn, r->end); + INFO("\n"); +} + +void +BitSet::andNot(const BitSet &set) +{ + assert(data && set.data); + assert(size >= set.size); + for (unsigned int i = 0; i < (set.size + 31) / 32; ++i) + data[i] &= ~set.data[i]; +} + +BitSet& BitSet::operator|=(const BitSet &set) +{ + assert(data && set.data); + assert(size >= set.size); + for (unsigned int i = 0; i < (set.size + 31) / 32; ++i) + data[i] |= set.data[i]; + return *this; +} + +bool BitSet::resize(unsigned int nBits) +{ + if (!data || !nBits) + return allocate(nBits, true); + const unsigned int p = (size + 31) / 32; + const unsigned int n = (nBits + 31) / 32; + if (n == p) + return true; + + data = (uint32_t 
*)REALLOC(data, 4 * p, 4 * n); + if (!data) { + size = 0; + return false; + } + if (n > p) + memset(&data[4 * p + 4], 0, (n - p) * 4); + + size = nBits; + return true; +} + +bool BitSet::allocate(unsigned int nBits, bool zero) +{ + if (data && size < nBits) { + FREE(data); + data = NULL; + } + size = nBits; + + if (!data) + data = reinterpret_cast<uint32_t *>(CALLOC((size + 31) / 32, 4)); + + if (zero) + memset(data, 0, (size + 7) / 8); + else + if (nBits) + data[(size + 31) / 32 - 1] = 0; // clear unused bits (e.g. for popCount) + + return data; +} + +unsigned int BitSet::popCount() const +{ + unsigned int count = 0; + + for (unsigned int i = 0; i < (size + 31) / 32; ++i) + if (data[i]) + count += util_bitcount(data[i]); + return count; +} + +void BitSet::fill(uint32_t val) +{ + unsigned int i; + for (i = 0; i < (size + 31) / 32; ++i) + data[i] = val; + if (val) + data[i] &= ~(0xffffffff << (size % 32)); // BE ? +} + +void BitSet::setOr(BitSet *pA, BitSet *pB) +{ + if (!pB) { + *this = *pA; + } else { + for (unsigned int i = 0; i < (size + 31) / 32; ++i) + data[i] = pA->data[i] | pB->data[i]; + } +} + +int BitSet::findFreeRange(unsigned int count) const +{ + const uint32_t m = (1 << count) - 1; + int pos = size; + unsigned int i; + const unsigned int end = (size + 31) / 32; + + if (count == 1) { + for (i = 0; i < end; ++i) { + pos = ffs(~data[i]) - 1; + if (pos >= 0) + break; + } + } else + if (count == 2) { + for (i = 0; i < end; ++i) { + if (data[i] != 0xffffffff) { + uint32_t b = data[i] | (data[i] >> 1) | 0xaaaaaaaa; + pos = ffs(~b) - 1; + if (pos >= 0) + break; + } + } + } else + if (count == 4 || count == 3) { + for (i = 0; i < end; ++i) { + if (data[i] != 0xffffffff) { + uint32_t b = + (data[i] >> 0) | (data[i] >> 1) | + (data[i] >> 2) | (data[i] >> 3) | 0xeeeeeeee; + pos = ffs(~b) - 1; + if (pos >= 0) + break; + } + } + } else { + if (count <= 8) + count = 8; + else + if (count <= 16) + count = 16; + else + count = 32; + + for (i = 0; i < end; ++i) { + if 
(data[i] != 0xffffffff) { + for (pos = 0; pos < 32; pos += count) + if (!(data[i] & (m << pos))) + break; + if (pos < 32) + break; + } + } + } + pos += i * 32; + + return ((pos + count) <= size) ? pos : -1; +} + +void BitSet::print() const +{ + unsigned int n = 0; + INFO("BitSet of size %u:\n", size); + for (unsigned int i = 0; i < (size + 31) / 32; ++i) { + uint32_t bits = data[i]; + while (bits) { + int pos = ffs(bits) - 1; + bits &= ~(1 << pos); + INFO(" %i", i * 32 + pos); + ++n; + if ((n % 16) == 0) + INFO("\n"); + } + } + if (n % 16) + INFO("\n"); +} + +} // namespace nv50_ir diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h new file mode 100644 index 0000000..a4ea9d9 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h @@ -0,0 +1,788 @@ +/* + * Copyright 2011 Christoph Bumiller + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR + * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, + * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifndef __NV50_IR_UTIL_H__ +#define __NV50_IR_UTIL_H__ + +#include <new> +#include <assert.h> +#include <stdio.h> +#include <memory> +#include <map> + +#ifndef NDEBUG +# include <typeinfo> +#endif + +#include "util/u_inlines.h" +#include "util/u_memory.h" + +#define ERROR(args...) debug_printf("ERROR: " args) +#define WARN(args...) debug_printf("WARNING: " args) +#define INFO(args...) debug_printf(args) + +#define INFO_DBG(m, f, args...) \ + do { \ + if (m & NV50_IR_DEBUG_##f) \ + debug_printf(args); \ + } while(0) + +#define FATAL(args...) \ + do { \ + fprintf(stderr, args); \ + abort(); \ + } while(0) + + +#define NV50_IR_FUNC_ALLOC_OBJ_DEF(obj, f, args...) \ + new ((f)->getProgram()->mem_##obj.allocate()) obj(f, args) + +#define new_Instruction(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(Instruction, f, args) +#define new_CmpInstruction(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(CmpInstruction, f, args) +#define new_TexInstruction(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(TexInstruction, f, args) +#define new_FlowInstruction(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(FlowInstruction, f, args) + +#define new_LValue(f, args...) \ + NV50_IR_FUNC_ALLOC_OBJ_DEF(LValue, f, args) + + +#define NV50_IR_PROG_ALLOC_OBJ_DEF(obj, p, args...) \ + new ((p)->mem_##obj.allocate()) obj(p, args) + +#define new_Symbol(p, args...) \ + NV50_IR_PROG_ALLOC_OBJ_DEF(Symbol, p, args) +#define new_ImmediateValue(p, args...) 
\ + NV50_IR_PROG_ALLOC_OBJ_DEF(ImmediateValue, p, args) + + +#define delete_Instruction(p, insn) (p)->releaseInstruction(insn) +#define delete_Value(p, val) (p)->releaseValue(val) + + +namespace nv50_ir { + +class Iterator +{ +public: + virtual ~Iterator() { }; + virtual void next() = 0; + virtual void *get() const = 0; + virtual bool end() const = 0; // if true, get will return 0 + virtual void reset() { assert(0); } // only for graph iterators +}; + +typedef std::auto_ptr<Iterator> IteratorRef; + +class ManipIterator : public Iterator +{ +public: + virtual bool insert(void *) = 0; // insert after current position + virtual void erase() = 0; +}; + +// WARNING: do not use a->prev/next for __item or __list + +#define DLLIST_DEL(__item) \ + do { \ + (__item)->prev->next = (__item)->next; \ + (__item)->next->prev = (__item)->prev; \ + (__item)->next = (__item); \ + (__item)->prev = (__item); \ + } while(0) + +#define DLLIST_ADDTAIL(__list, __item) \ + do { \ + (__item)->next = (__list); \ + (__item)->prev = (__list)->prev; \ + (__list)->prev->next = (__item); \ + (__list)->prev = (__item); \ + } while(0) + +#define DLLIST_ADDHEAD(__list, __item) \ + do { \ + (__item)->prev = (__list); \ + (__item)->next = (__list)->next; \ + (__list)->next->prev = (__item); \ + (__list)->next = (__item); \ + } while(0) + +#define DLLIST_MERGE(__listA, __listB, ty) \ + do { \ + ty prevB = (__listB)->prev; \ + (__listA)->prev->next = (__listB); \ + (__listB)->prev->next = (__listA); \ + (__listB)->prev = (__listA)->prev; \ + (__listA)->prev = prevB; \ + } while(0) + +#define DLLIST_EMPTY(__list) ((__list)->next == (__list)) + +#define DLLIST_FOR_EACH(list, it) \ + for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next()) + +class DLList +{ +public: + class Item + { + public: + Item(void *priv) : next(this), prev(this), data(priv) { } + + public: + Item *next; + Item *prev; + void *data; + }; + + DLList() : head(0) { } + ~DLList() { clear(); } + + inline void 
insertHead(void *data) + { + Item *item = new Item(data); + + assert(data); + + item->prev = &head; + item->next = head.next; + head.next->prev = item; + head.next = item; + } + + inline void insertTail(void *data) + { + Item *item = new Item(data); + + assert(data); + + DLLIST_ADDTAIL(&head, item); + } + + inline void insert(void *data) { insertTail(data); } + + void clear(); + + class Iterator : public ManipIterator + { + public: + Iterator(Item *head, bool r) : rev(r), pos(r ? head->prev : head->next), + term(head) { } + + virtual void next() { if (!end()) pos = rev ? pos->prev : pos->next; } + virtual void *get() const { return pos->data; } + virtual bool end() const { return pos == term; } + + // caution: if you're at end-2 and erase it, then do next, you're at end + virtual void erase(); + virtual bool insert(void *data); + + // move item to a another list, no consistency with its iterators though + void moveToList(DLList&); + + private: + const bool rev; + Item *pos; + Item *term; + + friend class DLList; + }; + + inline void erase(Iterator& pos) + { + pos.erase(); + } + + Iterator iterator() + { + return Iterator(&head, false); + } + + Iterator revIterator() + { + return Iterator(&head, true); + } + +private: + Item head; +}; + +class Stack +{ +public: + class Item { + public: + union { + void *p; + int i; + unsigned int u; + float f; + double d; + } u; + + Item() { memset(&u, 0, sizeof(u)); } + }; + + Stack() : size(0), limit(0), array(0) { } + ~Stack() { if (array) FREE(array); } + + inline void push(int i) { Item data; data.u.i = i; push(data); } + inline void push(unsigned int u) { Item data; data.u.u = u; push(data); } + inline void push(void *p) { Item data; data.u.p = p; push(data); } + inline void push(float f) { Item data; data.u.f = f; push(data); } + + inline void push(Item data) + { + if (size == limit) + resize(); + array[size++] = data; + } + + inline Item pop() + { + if (!size) { + Item data; + assert(0); + return data; + } + return 
array[--size]; + } + + inline unsigned int getSize() { return size; } + + inline Item& peek() { assert(size); return array[size - 1]; } + + void clear(bool releaseStorage = false) + { + if (releaseStorage && array) + FREE(array); + size = limit = 0; + } + + void moveTo(Stack&); // move all items to target (not like push(pop())) + +private: + void resize() + { + unsigned int sizeOld, sizeNew; + + sizeOld = limit * sizeof(Item); + limit = MAX2(4, limit + limit); + sizeNew = limit * sizeof(Item); + + array = (Item *)REALLOC(array, sizeOld, sizeNew); + } + + unsigned int size; + unsigned int limit; + Item *array; +}; + +class DynArray +{ +public: + class Item + { + public: + union { + uint32_t u32; + void *p; + }; + }; + + DynArray() : data(NULL), size(0) { } + + ~DynArray() { if (data) FREE(data); } + + inline Item& operator[](unsigned int i) + { + if (i >= size) + resize(i); + return data[i]; + } + + inline const Item operator[](unsigned int i) const + { + return data[i]; + } + + void resize(unsigned int index) + { + const unsigned int oldSize = size * sizeof(Item); + + if (!size) + size = 8; + while (size <= index) + size <<= 1; + + data = (Item *)REALLOC(data, oldSize, size * sizeof(Item)); + } + + void clear() + { + FREE(data); + data = NULL; + size = 0; + } + +private: + Item *data; + unsigned int size; +}; + +class ArrayList +{ +public: + ArrayList() : size(0) { } + + void insert(void *item, int& id) + { + id = ids.getSize() ? 
ids.pop().u.i : size++; + data[id].p = item; + } + + void remove(int& id) + { + const unsigned int uid = id; + assert(uid < size && data[id].p); + ids.push(uid); + data[uid].p = NULL; + id = -1; + } + + inline int getSize() const { return size; } + + inline void *get(unsigned int id) { assert(id < size); return data[id].p; } + + class Iterator : public nv50_ir::Iterator + { + public: + Iterator(const ArrayList *array) : pos(0), data(array->data) + { + size = array->getSize(); + if (size) + nextValid(); + } + + void nextValid() { while ((pos < size) && !data[pos].p) ++pos; } + + void next() { if (pos < size) { ++pos; nextValid(); } } + void *get() const { assert(pos < size); return data[pos].p; } + bool end() const { return pos >= size; } + + private: + unsigned int pos; + unsigned int size; + const DynArray& data; + + friend class ArrayList; + }; + + Iterator iterator() const { return Iterator(this); } + + void clear() + { + data.clear(); + ids.clear(true); + size = 0; + } + +private: + DynArray data; + Stack ids; + unsigned int size; +}; + +class Interval +{ +public: + Interval() : head(0), tail(0) { } + Interval(const Interval&); + ~Interval(); + + bool extend(int, int); + void insert(const Interval&); + void unify(Interval&); // clears source interval + void clear(); + + inline int begin() const { return head ? head->bgn : -1; } + inline int end() const { checkTail(); return tail ? 
tail->end : -1; } + inline bool isEmpty() const { return !head; } + bool overlaps(const Interval&) const; + bool contains(int pos) const; + + inline int extent() const { return end() - begin(); } + int length() const; + + void print() const; + + inline void checkTail() const; + +private: + class Range + { + public: + Range(int a, int b) : next(0), bgn(a), end(b) { } + + Range *next; + int bgn; + int end; + + void coalesce(Range **ptail) + { + Range *rnn; + + while (next && end >= next->bgn) { + assert(bgn <= next->bgn); + rnn = next->next; + end = MAX2(end, next->end); + delete next; + next = rnn; + } + if (!next) + *ptail = this; + } + }; + + Range *head; + Range *tail; +}; + +class BitSet +{ +public: + BitSet() : marker(false), data(0), size(0) { } + BitSet(unsigned int nBits, bool zero) : marker(false), data(0), size(0) + { + allocate(nBits, zero); + } + ~BitSet() + { + if (data) + FREE(data); + } + + bool allocate(unsigned int nBits, bool zero); + bool resize(unsigned int nBits); // keep old data, zero additional bits + + inline unsigned int getSize() const { return size; } + + void fill(uint32_t val); + + void setOr(BitSet *, BitSet *); // second BitSet may be NULL + + inline void set(unsigned int i) + { + assert(i < size); + data[i / 32] |= 1 << (i % 32); + } + // NOTE: range may not cross 32 bit boundary (implies n <= 32) + inline void setRange(unsigned int i, unsigned int n) + { + assert((i + n) <= size && (((i % 32) + n) <= 32)); + data[i / 32] |= ((1 << n) - 1) << (i % 32); + } + inline void setMask(unsigned int i, uint32_t m) + { + assert(i < size); + data[i / 32] |= m; + } + + inline void clr(unsigned int i) + { + assert(i < size); + data[i / 32] &= ~(1 << (i % 32)); + } + // NOTE: range may not cross 32 bit boundary (implies n <= 32) + inline void clrRange(unsigned int i, unsigned int n) + { + assert((i + n) <= size && (((i % 32) + n) <= 32)); + data[i / 32] &= ~(((1 << n) - 1) << (i % 32)); + } + + inline bool test(unsigned int i) const + { + assert(i 
< size); + return data[i / 32] & (1 << (i % 32)); + } + // NOTE: range may not cross 32 bit boundary (implies n <= 32) + inline bool testRange(unsigned int i, unsigned int n) const + { + assert((i + n) <= size && (((i % 32) + n) <= 32)); + return data[i / 32] & (((1 << n) - 1) << (i % 32)); + } + + // Find a range of size (<= 32) clear bits aligned to roundup_pow2(size). + int findFreeRange(unsigned int size) const; + + BitSet& operator|=(const BitSet&); + + BitSet& operator=(const BitSet& set) + { + assert(data && set.data); + assert(size == set.size); + memcpy(data, set.data, (set.size + 7) / 8); + return *this; + } + + void andNot(const BitSet&); + + // bits = (bits | setMask) & ~clrMask + inline void periodicMask32(uint32_t setMask, uint32_t clrMask) + { + for (unsigned int i = 0; i < (size + 31) / 32; ++i) + data[i] = (data[i] | setMask) & ~clrMask; + } + + unsigned int popCount() const; + + void print() const; + +public: + bool marker; // for user + +private: + uint32_t *data; + unsigned int size; +}; + +void Interval::checkTail() const +{ +#if NV50_DEBUG & NV50_DEBUG_PROG_RA + Range *r = head; + while (r->next) + r = r->next; + assert(tail == r); +#endif +} + +class MemoryPool +{ +private: + inline bool enlargeAllocationsArray(const unsigned int id, unsigned int nr) + { + const unsigned int size = sizeof(uint8_t *) * id; + const unsigned int incr = sizeof(uint8_t *) * nr; + + uint8_t **alloc = (uint8_t **)REALLOC(allocArray, size, size + incr); + if (!alloc) + return false; + allocArray = alloc; + return true; + } + + inline bool enlargeCapacity() + { + const unsigned int id = count >> objStepLog2; + + uint8_t *const mem = (uint8_t *)MALLOC(objSize << objStepLog2); + if (!mem) + return false; + + if (!(id % 32)) { + if (!enlargeAllocationsArray(id, 32)) { + FREE(mem); + return false; + } + } + allocArray[id] = mem; + return true; + } + +public: + MemoryPool(unsigned int size, unsigned int incr) : objSize(size), + objStepLog2(incr) + { + allocArray = NULL; + 
released = NULL; + count = 0; + } + + ~MemoryPool() + { + unsigned int allocCount = (count + (1 << objStepLog2) - 1) >> objStepLog2; + for (unsigned int i = 0; i < allocCount && allocArray[i]; ++i) + FREE(allocArray[i]); + if (allocArray) + FREE(allocArray); + } + + void *allocate() + { + void *ret; + const unsigned int mask = (1 << objStepLog2) - 1; + + if (released) { + ret = released; + released = *(void **)released; + return ret; + } + + if (!(count & mask)) + if (!enlargeCapacity()) + return NULL; + + ret = allocArray[count >> objStepLog2] + (count & mask) * objSize; + ++count; + return ret; + } + + void release(void *ptr) + { + *(void **)ptr = released; + released = ptr; + } + +private: + uint8_t **allocArray; // array (list) of MALLOC allocations + + void *released; // list of released objects + + unsigned int count; // highest allocated object + + const unsigned int objSize; + const unsigned int objStepLog2; +}; + +/** + * Composite object cloning policy. + * + * Encapsulates how sub-objects are to be handled (if at all) when a + * composite object is being cloned. + */ +template<typename C> +class ClonePolicy +{ +protected: + C *c; + +public: + ClonePolicy(C *c) : c(c) {} + + C *context() { return c; } + + template<typename T> T *get(T *obj) + { + void *clone = lookup(obj); + if (!clone) + clone = obj->clone(*this); + return reinterpret_cast<T *>(clone); + } + + template<typename T> void set(const T *obj, T *clone) + { + insert(obj, clone); + } + +protected: + virtual void *lookup(void *obj) = 0; + virtual void insert(const void *obj, void *clone) = 0; +}; + +/** + * Shallow non-recursive cloning policy. + * + * Objects cloned with the "shallow" policy don't clone their + * children recursively, instead, the new copy shares its children + * with the original object. 
+ */ +template<typename C> +class ShallowClonePolicy : public ClonePolicy<C> +{ +public: + ShallowClonePolicy(C *c) : ClonePolicy<C>(c) {} + +protected: + virtual void *lookup(void *obj) + { + return obj; + } + + virtual void insert(const void *obj, void *clone) + { + } +}; + +template<typename C, typename T> +inline T *cloneShallow(C *c, T *obj) +{ + ShallowClonePolicy<C> pol(c); + return obj->clone(pol); +} + +/** + * Recursive cloning policy. + * + * Objects cloned with the "deep" policy clone their children + * recursively, keeping track of what has already been cloned to + * avoid making several new copies of the same object. + */ +template<typename C> +class DeepClonePolicy : public ClonePolicy<C> +{ +public: + DeepClonePolicy(C *c) : ClonePolicy<C>(c) {} + +private: + std::map<const void *, void *> map; + +protected: + virtual void *lookup(void *obj) + { + return map[obj]; + } + + virtual void insert(const void *obj, void *clone) + { + map[obj] = clone; + } +}; + +template<typename S, typename T> +struct bimap +{ + std::map<S, T> forth; + std::map<T, S> back; + +public: + bimap() : l(back), r(forth) { } + bimap(const bimap<S, T> &m) + : forth(m.forth), back(m.back), l(back), r(forth) { } + + void insert(const S &s, const T &t) + { + forth.insert(std::make_pair(s, t)); + back.insert(std::make_pair(t, s)); + } + + typedef typename std::map<T, S>::const_iterator l_iterator; + const std::map<T, S> &l; + typedef typename std::map<S, T>::const_iterator r_iterator; + const std::map<S, T> &r; +}; + +} // namespace nv50_ir + +#endif // __NV50_IR_UTIL_H__ diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm new file mode 100644 index 0000000..f40becc --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm @@ -0,0 +1,96 @@ +// +// DIV U32 +// +// UNR recurrence (q = a / b): +// look for z such that 2^32 - b <= b * z < 2^32 +// then q - 1 <= (a * z) / 2^32 <= q +// +// INPUT: 
$r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p1 +// SIZE: 22 / 14 * 8 bytes +// +bfind u32 $r2 $r1 +xor b32 $r2 $r2 0x1f +mov b32 $r3 0x1 +shl b32 $r2 $r3 clamp $r2 +cvt u32 $r1 neg u32 $r1 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mov b32 $r3 $r0 +mul high $r0 u32 $r0 u32 $r2 +cvt u32 $r2 neg u32 $r1 +add $r1 (mul u32 $r1 u32 $r0) $r3 +set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +$p0 add b32 $r0 $r0 0x1 +$p0 set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +$p0 add b32 $r0 $r0 0x1 +ret +// +// DIV S32, like DIV U32 after taking ABS(inputs) +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p3 +// +set $p2 0x1 lt s32 $r0 0x0 +set $p3 0x1 lt s32 $r1 0x0 xor $p2 +cvt s32 $r0 abs s32 $r0 +cvt s32 $r1 abs s32 $r1 +bfind u32 $r2 $r1 +xor b32 $r2 $r2 0x1f +mov b32 $r3 0x1 +shl b32 $r2 $r3 clamp $r2 +cvt u32 $r1 neg u32 $r1 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mov b32 $r3 $r0 +mul high $r0 u32 $r0 u32 $r2 +cvt u32 $r2 neg u32 $r1 +add $r1 (mul u32 $r1 u32 $r0) $r3 +set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +$p0 add b32 $r0 $r0 0x1 +$p0 set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +$p0 add b32 $r0 $r0 0x1 +$p3 cvt s32 $r0 neg s32 $r0 +$p2 cvt s32 $r1 neg s32 $r1 +ret +// +// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) +// +// INPUT: $r0d (x) +// 
OUTPUT: $r0d (rcp(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 9 * 8 bytes +// +nop +ret +// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rsqrt(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 14 * 8 bytes +// +nop +ret diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h new file mode 100644 index 0000000..3790504 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h @@ -0,0 +1,112 @@ + +static const uint32_t nvc0_builtin_code[] = +{ + 0x04009c03, + 0x78000000, + 0x7c209cdd, + 0x0010dd18, + 0x08309c03, + 0x60000000, + 0x05605c18, + 0x0810dc2a, + 0x0c209c43, + 0x20040000, + 0x0810dc03, + 0x50000000, + 0x0c209c43, + 0x20040000, + 0x0810dc03, + 0x50000000, + 0x0c209c43, + 0x20040000, + 0x0810dc03, + 0x50000000, + 0x0c209c43, + 0x20040000, + 0x0810dc03, + 0x50000000, + 0x0c209c43, + 0x20040000, + 0x0000dde4, + 0x28000000, + 0x08001c43, + 0x50000000, + 0x05609c18, + 0x0010430d, + 0x0811dc03, + 0x1b0e0000, + 0x08104103, + 0x48000000, + 0x04000002, + 0x08000000, + 0x0811c003, + 0x1b0e0000, + 0x08104103, + 0x48000000, + 0x040000ac, + 0x90001dff, + 0xfc05dc23, + 0x188e0000, + 0xfc17dc23, + 0x18c40000, + 0x03301e18, + 0x07305e18, + 0x04009c03, + 0x78000000, + 0x7c209cdd, + 0x0010dd18, + 0x08309c03, + 0x60000000, + 0x05605c18, + 0x0810dc2a, + 0x0c209c43, + 0x20040000, + 0x0810dc03, + 0x50000000, + 0x0c209c43, + 0x20040000, + 0x0810dc03, + 0x50000000, + 0x0c209c43, + 0x20040000, + 0x0810dc03, + 0x50000000, + 0x0c209c43, + 0x20040000, + 0x0810dc03, + 0x50000000, + 0x0c209c43, + 0x20040000, + 0x0000dde4, + 0x28000000, + 0x08001c43, + 0x50000000, + 0x05609c18, + 0x0010430d, + 0x0811dc03, + 0x1b0e0000, + 0x08104103, + 0x48000000, + 0x04000002, + 0x08000000, + 0x0811c003, + 0x1b0e0000, + 0x08104103, + 0x48000000, + 0x040000ac, + 0x01700e18, + 0x05704a18, + 0x90001dff, + 0x00001c08, + 0x90001dff, + 0x00001c08, + 0x90001dff, +}; 
+ +static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] = +{ + 0x0000, + 0x00b0, + 0x0180, + 0x0188 +}; diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm new file mode 100644 index 0000000..5adc9ff --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm @@ -0,0 +1,698 @@ +// +// DIV U32 +// +// UNR recurrence (q = a / b): +// look for z such that 2^32 - b <= b * z < 2^32 +// then q - 1 <= (a * z) / 2^32 <= q +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p1 +// SIZE: 22 / 14 * 8 bytes +// +sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28 +bfind u32 $r2 $r1 +long xor b32 $r2 $r2 0x1f +long mov b32 $r3 0x1 +shl b32 $r2 $r3 clamp $r2 +long cvt u32 $r1 neg u32 $r1 +long mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mov b32 $r3 $r0 +mul high $r0 u32 $r0 u32 $r2 +long cvt u32 $r2 neg u32 $r1 +long add $r1 (mul u32 $r1 u32 $r0) $r3 +set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20 +$p0 add b32 $r0 $r0 0x1 +$p0 set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +$p0 add b32 $r0 $r0 0x1 +long ret +// +// DIV S32, like DIV U32 after taking ABS(inputs) +// +// INPUT: $r0: dividend, $r1: divisor +// OUTPUT: $r0: result, $r1: modulus +// CLOBBER: $r2 - $r3, $p0 - $p3 +// +set $p2 0x1 lt s32 $r0 0x0 +set $p3 0x1 lt s32 $r1 0x0 xor $p2 +sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28 +long cvt s32 $r0 abs s32 $r0 +long cvt s32 $r1 abs s32 $r1 +bfind u32 $r2 $r1 +long xor b32 $r2 $r2 0x1f +long mov b32 $r3 0x1 +shl b32 $r2 $r3 clamp 
$r2 +cvt u32 $r1 neg u32 $r1 +sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mul $r3 u32 $r1 u32 $r2 +add $r2 (mul high u32 $r2 u32 $r3) $r2 +mov b32 $r3 $r0 +mul high $r0 u32 $r0 u32 $r2 +long cvt u32 $r2 neg u32 $r1 +long add $r1 (mul u32 $r1 u32 $r0) $r3 +sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20 +set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +$p0 add b32 $r0 $r0 0x1 +$p0 set $p0 0x1 ge u32 $r1 $r2 +$p0 sub b32 $r1 $r1 $r2 +long $p0 add b32 $r0 $r0 0x1 +long $p3 cvt s32 $r0 neg s32 $r0 +sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c +$p2 cvt s32 $r1 neg s32 $r1 +long ret +// +// SULDP [for each format] +// $r4d: address +// $r2: surface info (format) +// $p0: access predicate +// $p1, $p2: caching predicate (00: cv, 01: ca, 10: cg) +// +// RGBA32 +$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0 +long ret +// RGBA16_UNORM +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r3 u16 1 $r1 +cvt rn f32 $r2 u16 0 $r1 +mul f32 $r3 $r3 0x37800074 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt rn f32 $r1 u16 1 $r0 +mul f32 $r2 $r2 0x37800074 +cvt rn f32 $r0 u16 0 $r0 +mul f32 $r1 $r1 0x37800074 +mul f32 $r0 $r0 0x37800074 +long ret +// RGBA16_SNORM +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r3 s16 1 
$r1 +cvt rn f32 $r2 s16 0 $r1 +mul f32 $r3 $r3 0x38000187 +cvt rn f32 $r1 s16 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r2 $r2 0x38000187 +cvt rn f32 $r0 s16 0 $r0 +mul f32 $r1 $r1 0x38000187 +mul f32 $r0 $r0 0x38000187 +long ret +// RGBA16_SINT +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +cvt s32 $r3 s16 1 $r1 +cvt s32 $r2 s16 0 $r1 +cvt s32 $r1 s16 1 $r0 +cvt s32 $r0 s16 0 $r0 +long ret +// RGBA16_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +cvt u32 $r3 u16 1 $r1 +cvt u32 $r2 u16 0 $r1 +cvt u32 $r1 u16 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt u32 $r0 u16 0 $r0 +long ret +// RGBA16_FLOAT +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +cvt f32 $r3 f16 $r1 1 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt f32 $r2 f16 $r1 0 +cvt f32 $r1 f16 $r0 1 +cvt f32 $r0 f16 $r0 0 +long ret +// RG32_FLOAT +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r2 0x00000000 +long mov b32 $r3 0x3f800000 +long ret +// RG32_xINT +$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r2 0x00000000 +long mov b32 $r3 0x00000001 +long ret +// RGB10A2_UNORM +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg 
zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +ext u32 $r1 $r0 0x0a0a +long mov b32 $r3 0x3f800000 +ext u32 $r2 $r0 0x0a14 +long and b32 $r0 $r0 0x3ff +cvt rn f32 $r2 u16 0 $r2 +cvt rn f32 $r1 u16 0 $r1 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r2 $r2 0x3a802007 +cvt rn f32 $r0 u16 0 $r0 +mul f32 $r1 $r1 0x3a802007 +mul f32 $r0 $r0 0x3a802007 +long ret +// RGB10A2_UINT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +ext u32 $r1 $r0 0x0a0a +long mov b32 $r3 0x00000001 +ext u32 $r2 $r0 0x0a14 +long and b32 $r0 $r0 0x3ff +long ret +// RGBA8_UNORM +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r3 u8 3 $r0 +cvt rn f32 $r2 u8 2 $r0 +mul f32 $r3 $r3 0x3b808081 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt rn f32 $r1 u8 1 $r0 +mul f32 $r2 $r2 0x3b808081 +cvt rn f32 $r0 u8 0 $r0 +mul f32 $r1 $r1 0x3b808081 +mul f32 $r0 $r0 0x3b808081 +long ret +// RGBA8_SNORM +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r3 s8 3 $r0 +cvt rn f32 $r2 s8 2 $r0 +mul f32 $r3 $r3 0x3c010204 +cvt rn f32 $r1 s8 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r2 $r2 0x3c010204 +cvt rn f32 $r0 s8 0 $r0 +mul f32 $r1 $r1 0x3c010204 +mul f32 $r0 $r0 0x3c010204 +long ret +// RGBA8_SINT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt s32 $r3 
s8 3 $r0 +cvt s32 $r2 s8 2 $r0 +cvt s32 $r1 s8 1 $r0 +cvt s32 $r0 s8 0 $r0 +long ret +// RGBA8_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt u32 $r3 u8 3 $r0 +cvt u32 $r2 u8 2 $r0 +cvt u32 $r1 u8 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt u32 $r0 u8 0 $r0 +long ret +// R5G6B5_UNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +ext u32 $r1 $r0 0x0605 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long mov b32 $r3 0x3f800000 +ext u32 $r2 $r0 0x050b +long and b32 $r0 $r0 0x1f +cvt rn f32 $r2 u8 0 $r2 +cvt rn f32 $r1 u8 0 $r1 +mul f32 $r2 $r2 0x3d042108 +cvt rn f32 $r0 u8 0 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r1 $r1 0x3c820821 +mul f32 $r0 $r0 0x3d042108 +long ret +// R5G5B5X1_UNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +ext u32 $r1 $r0 0x0505 +ext u32 $r2 $r0 0x050a +long and b32 $r0 $r0 0x1f +long mov b32 $r3 0x3f800000 +cvt rn f32 $r2 u8 0 $r2 +cvt rn f32 $r1 u8 0 $r1 +cvt rn f32 $r0 u8 0 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r2 $r2 0x3d042108 +mul f32 $r1 $r1 0x3d042108 +mul f32 $r0 $r0 0x3d042108 +long ret +// RG16_UNORM +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +cvt rn f32 $r1 u16 1 $r0 +cvt rn f32 $r0 u16 0 $r0 +mul f32 $r1 $r1 0x37800074 +mul f32 $r0 $r0 0x37800074 +long mov b32 $r2 0x00000000 +long mov b32 $r3 0x3f800000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long ret +// 
RG16_SNORM +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +cvt rn f32 $r1 s16 1 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mov b32 $r2 0x00000000 +cvt rn f32 $r0 s16 0 $r0 +mul f32 $r1 $r1 0x38000187 +mul f32 $r0 $r0 0x38000187 +long ret +// RG16_SINT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x00000001 +cvt s32 $r1 s16 1 $r0 +mov b32 $r2 0x00000000 +cvt s32 $r0 s16 0 $r0 +long ret +// RG16_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x00000001 +cvt u32 $r1 u16 1 $r0 +mov b32 $r2 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt u32 $r0 u16 0 $r0 +long ret +// RG16_FLOAT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt f32 $r1 f16 $r0 1 +mov b32 $r2 0x00000000 +cvt f32 $r0 f16 $r0 0 +long ret +// R32_FLOAT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x3f800000 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// R32_xINT +$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 
+long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// RG8_UNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +cvt rn f32 $r1 u8 1 $r0 +mov b32 $r2 0x00000000 +cvt rn f32 $r0 u8 0 $r0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r1 $r1 0x3b808081 +mul f32 $r0 $r0 0x3b808081 +long ret +// RG8_SNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long mov b32 $r3 0x3f800000 +cvt rn f32 $r1 s8 1 $r0 +long mov b32 $r2 0x00000000 +cvt rn f32 $r0 s8 0 $r0 +mul f32 $r1 $r1 0x3c010204 +mul f32 $r0 $r0 0x3c010204 +long ret +// RG8_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +cvt u32 $r1 u8 1 $r0 +long mov b32 $r2 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt u32 $r0 u8 0 $r0 +long ret +// RG8_SINT +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +cvt s32 $r1 s8 1 $r0 +long mov b32 $r2 0x00000000 +cvt s32 $r0 s8 0 $r0 +long ret +// R16_UNORM +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x3f800000 +cvt rn f32 $r0 u16 0 $r0 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +mul f32 $r0 $r0 0x37800074 +long ret +// 
R16_SNORM +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +cvt rn f32 $r0 s16 0 $r0 +long mov b32 $r2 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long mov b32 $r1 0x00000000 +mul f32 $r0 $r0 0x38000187 +long ret +// R16_SINT +$p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long mov b32 $r3 0x00000001 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// R16_UINT +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// R16_FLOAT +$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x3f800000 +long mov b32 $r2 0x00000000 +cvt f32 $r0 f16 $r0 0 +mov b32 $r1 0x00000000 +long ret +// R8_UNORM +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0 +mov b32 $r3 0x3f800000 +cvt rn f32 $r0 u8 0 $r0 +mov b32 $r2 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +mul f32 $r0 $r0 0x3b808081 +mov b32 $r1 0x00000000 +long ret +// R8_SNORM +$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 
0x00 0x00 0x00 0x00 +mov b32 $r3 0x3f800000 +cvt rn f32 $r0 s8 0 $r0 +mov b32 $r2 0x00000000 +mul f32 $r0 $r0 0x3c010204 +mov b32 $r1 0x00000000 +long ret +// R8_SINT +$p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +long ret +// R8_UINT +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x00000001 +long mov b32 $r2 0x00000000 +long mov b32 $r1 0x00000000 +sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00 +long ret +// R11G11B10_FLOAT TODO +$p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0 +set $p1 0x1 $p1 xor not $p2 +$p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0 +$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0 +long mov b32 $r3 0x3f800000 +long nop +long ret +// +// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rcp(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 9 * 8 bytes +// +long nop +long ret +// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i) +// +// INPUT: $r0d (x) +// OUTPUT: $r0d (rsqrt(x)) +// CLOBBER: $r2 - $r7 +// SIZE: 14 * 8 bytes +// +long nop +long ret +// +// Trap handler. +// Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs. +// Low 32 bytes of l[] memory shouldn't be used if resumeability is required. 
+// +// Trap info: +// 0x000: mutex +// 0x004: PC +// 0x008: trapstat +// 0x00c: warperr +// 0x010: tidx +// 0x014: tidy +// 0x018: tidz +// 0x01c: ctaidx +// 0x020: ctaidy +// 0x024: ctaidz +// 0x030: $r0q +// 0x130: $flags +// 0x140: s[] +// +st b128 wb l[0x00] $r0q +// check state of the warp and continue if it didn't cause the trap +long mov b32 $r1 $trapstat +long mov b32 $r3 $warperr +mov $r2 $flags mask 0xffff +and b32 0 $c $r1 $r3 +e $c bra #end_cont +// spill control flow stack to l[] +long mov b32 $r3 16 +spill_cfstack: +preret #end_exit +sub b32 $r3 $c $r3 0x1 +lg $c bra #spill_cfstack +// retrieve pointer to trap info +mov b32 $r0 c0[0x1900] +mov b32 $r1 c0[0x1904] +// we only let a single faulting thread store its state +mov b32 $r3 0x1 +exch b32 $r3 g[$r0d] $r3 +joinat #end_exit +set $p0 0x1 eq u32 $r3 0x1 +join $p0 nop +// store $c and $p registers +st b32 wb g[$r0d+0x130] $r2 +// store $trapstat and $warperr +long mov b32 $r2 $trapstat +long mov b32 $r3 $warperr +st b64 wb g[$r0d+0x8] $r2d +// store registers +st b128 wb g[$r0d+0x40] $r4q +st b128 wb g[$r0d+0x50] $r8q +st b128 wb g[$r0d+0x60] $r12q +st b128 wb g[$r0d+0x70] $r16q +st b128 wb g[$r0d+0x80] $r20q +st b128 wb g[$r0d+0x90] $r24q +st b128 wb g[$r0d+0xa0] $r28q +st b128 wb g[$r0d+0xb0] $r32q +st b128 wb g[$r0d+0xc0] $r36q +st b128 wb g[$r0d+0xd0] $r40q +st b128 wb g[$r0d+0xe0] $r44q +st b128 wb g[$r0d+0xf0] $r48q +st b128 wb g[$r0d+0x100] $r52q +st b128 wb g[$r0d+0x110] $r56q +st b128 wb g[$r0d+0x120] $r60q +ld b64 $r2d cs l[0x0] +st b64 wb g[$r0d+0x30] $r2d +ld b64 $r2d cs l[0x8] +st b64 wb g[$r0d+0x38] $r2d +// store thread id +long mov b32 $r2 $tidx +long mov b32 $r3 $tidy +st b64 wb g[$r0d+0x10] $r2d +long mov b32 $r2 $tidz +long mov b32 $r3 $ctaidx +st b64 wb g[$r0d+0x18] $r2d +long mov b32 $r2 $ctaidy +long mov b32 $r3 $ctaidz +st b64 wb g[$r0d+0x20] $r2d +// store shared memory (in reverse order so $r0d is base again at the end) +long mov b32 $r3 $smemsz +sub b32 $r3 $c $r3 0x4 +s $c 
bra #shared_done +add b32 $r0 $c $r0 $r3 +add b32 $r1 $r1 0x0 $c +shared_loop: +long ld b32 $r2 s[$r3] +long st b32 wb g[$r0d+0x140] $r2 +sub b32 $r0 $c $r0 0x4 +sub b32 $r1 $r1 0x0 $c +sub b32 $r3 $c $r3 0x4 +lg $c bra #shared_loop +shared_done: +// search the stack for trap entry to retrieve PC +mov b32 $r0 c0[0x1908] +mov b32 $r1 c0[0x190c] +membar sys +// invalidate caches so we can read stack entries via g[] +cctl ivall 0 l[0] +cctl ivall 0 g[$r0d] +// get offsets +mov b32 $r2 $physid +ext u32 $r3 $r2 0x0814 // MP id +ext u32 $r2 $r2 0x0608 // warp id +mul $r2 u32 $r2 u32 c0[0x1914] // warp offset +mul $r3 u32 $r3 u32 c0[0x1910] // MP offset +add b32 $r2 $r2 $r3 // MP + warp offset +add b32 $r0 $c $r0 $r2 +add b32 $r1 $r1 0x0 $c +search_cstack: +mov b32 $r3 c0[0x1918] // cstack size +ld u8 $r2 cv g[$r0d+0x8] +set $p0 0x1 eq u32 $r2 0xa +$p0 bra #entry_found +add b32 $r0 $c $r0 0x10 +add b32 $r1 $r1 0x0 $c +sub b32 $r3 $c $r3 0x10 +lg $c bra #search_cstack +bra #end_exit +entry_found: +// load PC (may be unaligned and spread out) +ld b32 $r2 cv g[$r0d] +mov b32 $r0 c0[0x1900] +mov b32 $r1 c0[0x1904] +st b32 wb g[$r0d+0x4] $r2 +join nop +// invalidate caches and exit +end_exit: +cctl ivall 0 g[0] +bpt pause 0x0 +rtt terminate +end_cont: +bpt pause 0x0 +mov $flags $r2 mask 0xffff +ld b128 $r0q cs l[0x00] +rtt diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h new file mode 100644 index 0000000..53fa12c --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h @@ -0,0 +1,592 @@ + +// Assembled from target_lib_nve4.asm by envyas -m nvc0 -V nve4 -W. 
+ +static const uint64_t nve4_builtin_code[] = +{ + 0x2282828042804287ULL, + 0x7800000004009c03ULL, + 0x380000007c209c82ULL, + 0x180000000400dde2ULL, + 0x6000000008309c03ULL, + 0x1c00000005205d04ULL, + 0x500000000810dc03ULL, + 0x200400000c209c43ULL, + 0x2282828282828287ULL, + 0x500000000810dc03ULL, + 0x200400000c209c43ULL, + 0x500000000810dc03ULL, + 0x200400000c209c43ULL, + 0x500000000810dc03ULL, + 0x200400000c209c43ULL, + 0x500000000810dc03ULL, + 0x2042c28280428047ULL, + 0x200400000c209c43ULL, + 0x280000000000dde4ULL, + 0x5000000008001c43ULL, + 0x1c00000005209d04ULL, + 0x2006000000105c03ULL, + 0x1b0e00000811dc03ULL, + 0x4800000008104103ULL, + 0x220282e20042c287ULL, + 0x0800000004000002ULL, + 0x1b0e00000811c003ULL, + 0x4800000008104103ULL, + 0x0800000004000002ULL, + 0x9000000000001de7ULL, + 0x188e0000fc05dc23ULL, + 0x18c40000fc17dc23ULL, + 0x2280428042828207ULL, + 0x1c00000001201ec4ULL, + 0x1c00000005205ec4ULL, + 0x7800000004009c03ULL, + 0x380000007c209c82ULL, + 0x180000000400dde2ULL, + 0x6000000008309c03ULL, + 0x1c00000005205d04ULL, + 0x2282828282828287ULL, + 0x500000000810dc03ULL, + 0x200400000c209c43ULL, + 0x500000000810dc03ULL, + 0x200400000c209c43ULL, + 0x500000000810dc03ULL, + 0x200400000c209c43ULL, + 0x500000000810dc03ULL, + 0x2282804280428287ULL, + 0x200400000c209c43ULL, + 0x500000000810dc03ULL, + 0x200400000c209c43ULL, + 0x280000000000dde4ULL, + 0x5000000008001c43ULL, + 0x1c00000005209d04ULL, + 0x2006000000105c03ULL, + 0x22028042c28042c7ULL, + 0x1b0e00000811dc03ULL, + 0x4800000008104103ULL, + 0x0800000004000002ULL, + 0x1b0e00000811c003ULL, + 0x4800000008104103ULL, + 0x0800000004000002ULL, + 0x1c00000001200f84ULL, + 0x22c200428042e047ULL, + 0x1c00000005204b84ULL, + 0x9000000000001de7ULL, + 0xd4004000084004c5ULL, + 0x0c5400000013dc04ULL, + 0xd4004000084009c5ULL, + 0xd4004000084007c5ULL, + 0x9000000000001de7ULL, + 0x2000000000000007ULL, + 0xd4004000084004c5ULL, + 0x0c5400000013dc04ULL, + 0xd4004000084009c5ULL, + 0xd4004000084007c5ULL, + 0x1900000004a0dc04ULL, 
+ 0x1800000004a09c04ULL, + 0x30de0001d030dc02ULL, + 0x2000000000000007ULL, + 0x1900000000a05c04ULL, + 0x30de0001d0209c02ULL, + 0x1800000000a01c04ULL, + 0x30de0001d0105c02ULL, + 0x30de0001d0001c02ULL, + 0x9000000000001de7ULL, + 0xd4004000084004a5ULL, + 0x2000000000000007ULL, + 0x0c5400000013dc04ULL, + 0xd4004000084009a5ULL, + 0xd4004000084007a5ULL, + 0x1900000004a0de04ULL, + 0x1800000004a09e04ULL, + 0x30e000061c30dc02ULL, + 0x1900000000a05e04ULL, + 0x2000000000000007ULL, + 0x30e000061c209c02ULL, + 0x1800000000a01e04ULL, + 0x30e000061c105c02ULL, + 0x30e000061c001c02ULL, + 0x9000000000001de7ULL, + 0xd4004000084004a5ULL, + 0x0c5400000013dc04ULL, + 0x2000000000000007ULL, + 0xd4004000084009a5ULL, + 0xd4004000084007a5ULL, + 0x1d00000004a0de84ULL, + 0x1c00000004a09e84ULL, + 0x1d00000000a05e84ULL, + 0x1c00000000a01e84ULL, + 0x9000000000001de7ULL, + 0x2000000000000007ULL, + 0xd4004000084004a5ULL, + 0x0c5400000013dc04ULL, + 0xd4004000084009a5ULL, + 0xd4004000084007a5ULL, + 0x1d00000004a0dc04ULL, + 0x1c00000004a09c04ULL, + 0x1d00000000a05c04ULL, + 0x2000000000000007ULL, + 0x1c00000000a01c04ULL, + 0x9000000000001de7ULL, + 0xd4004000084004a5ULL, + 0x0c5400000013dc04ULL, + 0xd4004000084009a5ULL, + 0xd4004000084007a5ULL, + 0x1100000004a0dc04ULL, + 0x2000000000000007ULL, + 0x1000000004a09c04ULL, + 0x1100000000a05c04ULL, + 0x1000000000a01c04ULL, + 0x9000000000001de7ULL, + 0xd4004000084004a5ULL, + 0x0c5400000013dc04ULL, + 0xd4004000084009a5ULL, + 0x2000000000000007ULL, + 0xd4004000084007a5ULL, + 0x1800000000009de2ULL, + 0x18fe00000000dde2ULL, + 0x9000000000001de7ULL, + 0xd4004000084004a5ULL, + 0x0c5400000013dc04ULL, + 0xd4004000084009a5ULL, + 0x2000000000000007ULL, + 0xd4004000084007a5ULL, + 0x1800000000009de2ULL, + 0x180000000400dde2ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400985ULL, + 0x2000000000000007ULL, + 0xd400400008400785ULL, + 0x7000c02828005c03ULL, + 0x18fe00000000dde2ULL, + 0x7000c02850009c03ULL, + 0x3800000ffc001c02ULL, 
+ 0x1800000008a09c04ULL, + 0x1800000004a05c04ULL, + 0x2000000000000007ULL, + 0x30ea00801c209c02ULL, + 0x1800000000a01c04ULL, + 0x30ea00801c105c02ULL, + 0x30ea00801c001c02ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0x2000000000000007ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x7000c02828005c03ULL, + 0x180000000400dde2ULL, + 0x7000c02850009c03ULL, + 0x3800000ffc001c02ULL, + 0x9000000000001de7ULL, + 0x2000000000000007ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x198000000020dc04ULL, + 0x1900000000209c04ULL, + 0x30ee02020430dc02ULL, + 0x2000000000000007ULL, + 0x1880000000205c04ULL, + 0x30ee020204209c02ULL, + 0x1800000000201c04ULL, + 0x30ee020204105c02ULL, + 0x30ee020204001c02ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, + 0x2000000000000007ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x198000000020de04ULL, + 0x1900000000209e04ULL, + 0x30f004081030dc02ULL, + 0x1880000000205e04ULL, + 0x2000000000000007ULL, + 0x30f0040810209c02ULL, + 0x1800000000201e04ULL, + 0x30f0040810105c02ULL, + 0x30f0040810001c02ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0x2000000000000007ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x1d8000000020de84ULL, + 0x1d00000000209e84ULL, + 0x1c80000000205e84ULL, + 0x1c00000000201e84ULL, + 0x9000000000001de7ULL, + 0x2000000000000007ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x1d8000000020dc04ULL, + 0x1d00000000209c04ULL, + 0x1c80000000205c04ULL, + 0x2000000000000007ULL, + 0x1c00000000201c04ULL, + 0x9000000000001de7ULL, + 0xd400400008400445ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400945ULL, + 0xd400400008400745ULL, + 0x7000c01814005c03ULL, + 0x2000000000000007ULL, + 0x18fe00000000dde2ULL, + 0x7000c0142c009c03ULL, + 0x380000007c001c02ULL, + 0x1800000008209c04ULL, + 0x1800000004205c04ULL, 
+ 0x30f4108420209c02ULL, + 0x1800000000201c04ULL, + 0x2000000000000007ULL, + 0x30f2082084105c02ULL, + 0x30f4108420001c02ULL, + 0x9000000000001de7ULL, + 0xd400400008400445ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400945ULL, + 0xd400400008400745ULL, + 0x2000000000000007ULL, + 0x7000c01414005c03ULL, + 0x7000c01428009c03ULL, + 0x380000007c001c02ULL, + 0x18fe00000000dde2ULL, + 0x1800000008209c04ULL, + 0x1800000004205c04ULL, + 0x1800000000201c04ULL, + 0x2000000000000007ULL, + 0x30f4108420209c02ULL, + 0x30f4108420105c02ULL, + 0x30f4108420001c02ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400985ULL, + 0x2000000000000007ULL, + 0xd400400008400785ULL, + 0x1900000000a05c04ULL, + 0x1800000000a01c04ULL, + 0x30de0001d0105c02ULL, + 0x30de0001d0001c02ULL, + 0x1800000000009de2ULL, + 0x18fe00000000dde2ULL, + 0x2000000000000007ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x18fe00000000dde2ULL, + 0x1900000000a05e04ULL, + 0x2000000000000007ULL, + 0x1800000000009de2ULL, + 0x1800000000a01e04ULL, + 0x30e000061c105c02ULL, + 0x30e000061c001c02ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0x2000000000000007ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x180000000400dde2ULL, + 0x1d00000000a05e84ULL, + 0x1800000000009de2ULL, + 0x1c00000000a01e84ULL, + 0x9000000000001de7ULL, + 0x2000000000000007ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x180000000400dde2ULL, + 0x1d00000000a05c04ULL, + 0x1800000000009de2ULL, + 0x2000000000000007ULL, + 0x1c00000000a01c04ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x18fe00000000dde2ULL, + 0x2000000000000007ULL, + 0x1100000000a05c04ULL, + 0x1800000000009de2ULL, + 0x1000000000a01c04ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, 
+ 0x0c5400000013dc04ULL, + 0xd400400008400985ULL, + 0x2000000000000007ULL, + 0xd400400008400785ULL, + 0x18fe00000000dde2ULL, + 0x1800000000009de2ULL, + 0x1800000000005de2ULL, + 0x9000000000001de7ULL, + 0xd400400008400485ULL, + 0x0c5400000013dc04ULL, + 0x2000000000000007ULL, + 0xd400400008400985ULL, + 0xd400400008400785ULL, + 0x180000000400dde2ULL, + 0x1800000000009de2ULL, + 0x1800000000005de2ULL, + 0x9000000000001de7ULL, + 0xd400400008400445ULL, + 0x2000000000000007ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400945ULL, + 0xd400400008400745ULL, + 0x18fe00000000dde2ULL, + 0x1880000000205c04ULL, + 0x1800000000009de2ULL, + 0x1800000000201c04ULL, + 0x2000000000000007ULL, + 0x30ee020204105c02ULL, + 0x30ee020204001c02ULL, + 0x9000000000001de7ULL, + 0xd400400008400445ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400945ULL, + 0xd400400008400745ULL, + 0x2000000000000007ULL, + 0x18fe00000000dde2ULL, + 0x1880000000205e04ULL, + 0x1800000000009de2ULL, + 0x1800000000201e04ULL, + 0x30f0040810105c02ULL, + 0x30f0040810001c02ULL, + 0x9000000000001de7ULL, + 0x2000000000000007ULL, + 0xd400400008400445ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400945ULL, + 0xd400400008400745ULL, + 0x180000000400dde2ULL, + 0x1c80000000205c04ULL, + 0x1800000000009de2ULL, + 0x2000000000000007ULL, + 0x1c00000000201c04ULL, + 0x9000000000001de7ULL, + 0xd400400008400445ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400945ULL, + 0xd400400008400745ULL, + 0x180000000400dde2ULL, + 0x2000000000000007ULL, + 0x1c80000000205e84ULL, + 0x1800000000009de2ULL, + 0x1c00000000201e84ULL, + 0x9000000000001de7ULL, + 0xd400400008400445ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400945ULL, + 0x2000000000000007ULL, + 0xd400400008400745ULL, + 0x18fe00000000dde2ULL, + 0x1800000000a01c04ULL, + 0x1800000000009de2ULL, + 0x1800000000005de2ULL, + 0x30de0001d0001c02ULL, + 0x9000000000001de7ULL, + 0x2000000000000007ULL, + 0xd400400008400445ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400945ULL, + 0xd400400008400745ULL, + 0x18fe00000000dde2ULL, 
+ 0x1800000000a01e04ULL, + 0x1800000000009de2ULL, + 0x2000000000000007ULL, + 0x1800000000005de2ULL, + 0x30e000061c001c02ULL, + 0x9000000000001de7ULL, + 0xd400400008400465ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400965ULL, + 0xd400400008400765ULL, + 0x2000000000000007ULL, + 0x180000000400dde2ULL, + 0x1800000000009de2ULL, + 0x1800000000005de2ULL, + 0x9000000000001de7ULL, + 0xd400400008400445ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400945ULL, + 0x2000000000000007ULL, + 0xd400400008400745ULL, + 0x180000000400dde2ULL, + 0x1800000000009de2ULL, + 0x1800000000005de2ULL, + 0x9000000000001de7ULL, + 0xd400400008400445ULL, + 0x0c5400000013dc04ULL, + 0x2000000000000007ULL, + 0xd400400008400945ULL, + 0xd400400008400745ULL, + 0x18fe00000000dde2ULL, + 0x1800000000009de2ULL, + 0x1000000000a01c04ULL, + 0x1800000000005de2ULL, + 0x9000000000001de7ULL, + 0x2000000000000007ULL, + 0xd400400008400405ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400905ULL, + 0xd400400008400705ULL, + 0x18fe00000000dde2ULL, + 0x1800000000201c04ULL, + 0x1800000000009de2ULL, + 0x2000000000000007ULL, + 0x30ee020204001c02ULL, + 0x1800000000005de2ULL, + 0x9000000000001de7ULL, + 0xd400400008400405ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400905ULL, + 0xd400400008400705ULL, + 0x2000000000000007ULL, + 0x18fe00000000dde2ULL, + 0x1800000000201e04ULL, + 0x1800000000009de2ULL, + 0x30f0040810001c02ULL, + 0x1800000000005de2ULL, + 0x9000000000001de7ULL, + 0xd400400008400425ULL, + 0x2000000000000007ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400925ULL, + 0xd400400008400725ULL, + 0x180000000400dde2ULL, + 0x1800000000009de2ULL, + 0x1800000000005de2ULL, + 0x9000000000001de7ULL, + 0x2000000000000007ULL, + 0xd400400008400405ULL, + 0x0c5400000013dc04ULL, + 0xd400400008400905ULL, + 0xd400400008400705ULL, + 0x180000000400dde2ULL, + 0x1800000000009de2ULL, + 0x1800000000005de2ULL, + 0x2000000000000007ULL, + 0x9000000000001de7ULL, + 0xd40040000840c485ULL, + 0x0c5400000013dc04ULL, + 0xd40040000840c985ULL, + 0xd40040000840c785ULL, 
+ 0x18fe00000000dde2ULL, + 0x4000000000001de4ULL, + 0x9000000000001de7ULL, + 0x4000000000001de4ULL, + 0x9000000000001de7ULL, + 0x4000000000001de4ULL, + 0x9000000000001de7ULL, + 0xc800000003f01cc5ULL, + 0x2c00000100005c04ULL, + 0x2c0000010800dc04ULL, + 0x3000c3fffff09c04ULL, + 0x680100000c1fdc03ULL, + 0x4000000a60001c47ULL, + 0x180000004000dde2ULL, + 0x78000009c0000007ULL, + 0x0c0000000430dd02ULL, + 0x4003ffffa0001ca7ULL, + 0x2800406400001de4ULL, + 0x2800406410005de4ULL, + 0x180000000400dde2ULL, + 0x547e18000000dd05ULL, + 0x60000008e0000007ULL, + 0x190ec0000431dc03ULL, + 0x40000000000001f4ULL, + 0x94000004c0009c85ULL, + 0x2c00000100009c04ULL, + 0x2c0000010800dc04ULL, + 0x9400000020009ca5ULL, + 0x9400000100011cc5ULL, + 0x9400000140021cc5ULL, + 0x9400000180031cc5ULL, + 0x94000001c0041cc5ULL, + 0x9400000200051cc5ULL, + 0x9400000240061cc5ULL, + 0x9400000280071cc5ULL, + 0x94000002c0081cc5ULL, + 0x9400000300091cc5ULL, + 0x94000003400a1cc5ULL, + 0x94000003800b1cc5ULL, + 0x94000003c00c1cc5ULL, + 0x94000004000d1cc5ULL, + 0x94000004400e1cc5ULL, + 0x94000004800f1cc5ULL, + 0xc000000003f09ea5ULL, + 0x94000000c0009ca5ULL, + 0xc000000023f09ea5ULL, + 0x94000000e0009ca5ULL, + 0x2c00000084009c04ULL, + 0x2c0000008800dc04ULL, + 0x9400000040009ca5ULL, + 0x2c0000008c009c04ULL, + 0x2c0000009400dc04ULL, + 0x9400000060009ca5ULL, + 0x2c00000098009c04ULL, + 0x2c0000009c00dc04ULL, + 0x9400000080009ca5ULL, + 0x2c000000c800dc04ULL, + 0x0c0000001030dd02ULL, + 0x4000000100001ea7ULL, + 0x480100000c001c03ULL, + 0x0800000000105c42ULL, + 0xc100000000309c85ULL, + 0x9400000500009c85ULL, + 0x0c00000010001d02ULL, + 0x0800000000105d42ULL, + 0x0c0000001030dd02ULL, + 0x4003ffff40001ca7ULL, + 0x2800406420001de4ULL, + 0x2800406430005de4ULL, + 0xe000000000001c45ULL, + 0xd000000003ffdcc5ULL, + 0x9c000000000fdcc5ULL, + 0x2c0000000c009c04ULL, + 0x7000c0205020dc03ULL, + 0x7000c01820209c03ULL, + 0x5000406450209c03ULL, + 0x500040644030dc03ULL, + 0x480000000c209c03ULL, + 0x4801000008001c03ULL, + 0x0800000000105c42ULL, 
+ 0x280040646000dde4ULL, + 0x8400000020009f05ULL, + 0x190ec0002821dc03ULL, + 0x40000000800001e7ULL, + 0x0c00000040001c02ULL, + 0x0800000000105c42ULL, + 0x0c0000004030dd02ULL, + 0x00029dff0ffc5cbfULL, + 0x8400000000009f85ULL, + 0x2800406400001de4ULL, + 0x2800406410005de4ULL, + 0x9400000010009c85ULL, + 0x4000000000001df4ULL, + 0x9800000003ffdcc5ULL, + 0xd000000000008007ULL, + 0xa000000000004007ULL, + 0xd000000000008007ULL, + 0x3400c3fffc201c04ULL, + 0xc000000003f01ec5ULL, + 0xa000000000000007ULL +}; + +static const uint16_t nve4_builtin_offsets[NVC0_BUILTIN_COUNT] = +{ + 0x0000, + 0x00f0, + 0x0f08, + 0x0f18, +}; diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h new file mode 100644 index 0000000..d10b6b0 --- /dev/null +++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h @@ -0,0 +1,13 @@ + +static const uint64_t nvf0_builtin_code[] = +{ + 0x19000000001c003cULL, +}; + +static const uint16_t nvf0_builtin_offsets[NVC0_BUILTIN_COUNT] = +{ + 0, + 0, + 0, + 0 +}; |