31 files changed, 388 insertions, 206 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 216e119..c126c08 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -1124,12 +1124,15 @@ CodeEmitterNV50::emitIMUL(const Instruction *i)
 {
    code[0] = 0x40000000;
 
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      if (i->sType == TYPE_S16)
+         code[0] |= 0x8100;
+      code[1] = 0;
+      emitForm_IMM(i);
+   } else
    if (i->encSize == 8) {
       code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
-      if (i->src(1).getFile() == FILE_IMMEDIATE)
-         emitForm_IMM(i);
-      else
-         emitForm_MAD(i);
+      emitForm_MAD(i);
    } else {
       if (i->sType == TYPE_S16)
          code[0] |= 0x8100;
@@ -1190,29 +1193,45 @@ CodeEmitterNV50::emitDMUL(const Instruction *i)
 void
 CodeEmitterNV50::emitIMAD(const Instruction *i)
 {
+   int mode;
    code[0] = 0x60000000;
-   if (isSignedType(i->sType))
-      code[1] = i->saturate ? 0x40000000 : 0x20000000;
-   else
-      code[1] = 0x00000000;
 
-   int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg();
-   int neg2 = i->src(2).mod.neg();
-
-   assert(!(neg1 & neg2));
-   code[1] |= neg1 << 27;
-   code[1] |= neg2 << 26;
+   assert(!i->src(0).mod && !i->src(1).mod && !i->src(2).mod);
+   if (!isSignedType(i->sType))
+      mode = 0;
+   else if (i->saturate)
+      mode = 2;
+   else
+      mode = 1;
 
-   if (i->src(1).getFile() == FILE_IMMEDIATE)
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
       emitForm_IMM(i);
-   else
+      code[0] |= (mode & 1) << 8 | (mode & 2) << 14;
+      if (i->flagsSrc >= 0) {
+         assert(!(code[0] & 0x10400000));
+         assert(SDATA(i->src(i->flagsSrc)).id == 0);
+         code[0] |= 0x10400000;
+      }
+   } else
+   if (i->encSize == 4) {
+      emitForm_MUL(i);
+      code[0] |= (mode & 1) << 8 | (mode & 2) << 14;
+      if (i->flagsSrc >= 0) {
+         assert(!(code[0] & 0x10400000));
+         assert(SDATA(i->src(i->flagsSrc)).id == 0);
+         code[0] |= 0x10400000;
+      }
+   } else {
+      code[1] = mode << 29;
       emitForm_MAD(i);
 
-   if (i->flagsSrc >= 0) {
-      // add with carry from $cX
-      assert(!(code[1] & 0x0c000000) && !i->getPredicate());
-      code[1] |= 0xc << 24;
-      srcId(i->src(i->flagsSrc), 32 + 12);
+      if (i->flagsSrc >= 0) {
+         // add with carry from $cX
+         assert(!(code[1] & 0x0c000000) && !i->getPredicate());
+         code[1] |= 0xc << 24;
+         srcId(i->src(i->flagsSrc), 32 + 12);
+      }
    }
 }
 
@@ -2054,8 +2073,9 @@ CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
 
    // check constraints on short MAD
    if (info.srcNr >= 2 && i->srcExists(2)) {
-      if (!i->defExists(0) || !isFloatType(i->dType) ||
-          i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id)
+      if (!i->defExists(0) ||
+          (i->flagsSrc >= 0 && SDATA(i->src(i->flagsSrc)).id > 0) ||
+          DDATA(i->def(0)).id != SDATA(i->src(2)).id)
          return 8;
    }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 1d2caab..b233860 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1897,7 +1897,7 @@ Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
       shd = fetchSrc(C >> 4, C & 3);
 
    if (texi->op == OP_TXD) {
-      for (c = 0; c < tgt.getDim(); ++c) {
+      for (c = 0; c < tgt.getDim() + tgt.isCube(); ++c) {
          texi->dPdx[c].set(fetchSrc(Dx >> 4, (Dx & 3) + c));
          texi->dPdy[c].set(fetchSrc(Dy >> 4, (Dy & 3) + c));
       }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 420cc4e..0b90378 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -57,7 +57,7 @@ GM107LoweringPass::handleManualTXD(TexInstruction *i)
    Instruction *tex, *add;
    Value *zero = bld.loadImm(bld.getSSA(), 0);
    int l, c;
-   const int dim = i->tex.target.getDim();
+   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
    const int array = i->tex.target.isArray();
 
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 64f5fc0..8752b0c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -44,6 +44,8 @@ static bool
 expandIntegerMUL(BuildUtil *bld, Instruction *mul)
 {
    const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
+   ImmediateValue src1;
+   bool src1imm = mul->src(1).getImmediate(src1);
 
    DataType fTy; // full type
    switch (mul->sType) {
@@ -72,24 +74,41 @@ expandIntegerMUL(BuildUtil *bld, Instruction *mul)
    for (int j = 0; j < 4; ++j)
       t[j] = bld->getSSA(fullSize);
 
-   s[0] = mul->getSrc(0);
-   s[1] = mul->getSrc(1);
-
    if (isSignedType(mul->sType) && highResult) {
       s[0] = bld->getSSA(fullSize);
       s[1] = bld->getSSA(fullSize);
       bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
       bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
+      src1.reg.data.s32 = abs(src1.reg.data.s32);
+   } else {
+      s[0] = mul->getSrc(0);
+      s[1] = mul->getSrc(1);
    }
 
    // split sources into halves
    i[0] = bld->mkSplit(a, halfSize, s[0]);
    i[1] = bld->mkSplit(b, halfSize, s[1]);
 
-   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
-   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+   if (src1imm && (src1.reg.data.u32 & 0xffff0000) == 0) {
+      i[2] = i[3] = bld->mkOp2(OP_MUL, fTy, t[1], a[1],
+                               bld->mkImm(src1.reg.data.u32 & 0xffff));
+   } else {
+      i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0],
+                        src1imm ? bld->mkImm(src1.reg.data.u32 >> 16) : b[1]);
+      if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
+         i[3] = i[2];
+         t[1] = t[0];
+      } else {
+         i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+      }
+   }
    i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
-   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+   if (src1imm && (src1.reg.data.u32 & 0x0000ffff) == 0) {
+      i[4] = i[3];
+      t[3] = t[2];
+   } else {
+      i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+   }
 
    if (highResult) {
       Value *c[2];
@@ -911,7 +930,7 @@ NV50LoweringPreSSA::handleTXD(TexInstruction *i)
    Instruction *tex;
    Value *zero = bld.loadImm(bld.getSSA(), 0);
    int l, c;
-   const int dim = i->tex.target.getDim();
+   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
 
    handleTEX(i);
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
@@ -1225,7 +1244,7 @@ NV50LoweringPreSSA::handleEXPORT(Instruction *i)
          i->setDef(0, new_LValue(func, FILE_GPR));
          i->getDef(0)->reg.data.id = id;
 
-         prog->maxGPR = MAX2(prog->maxGPR, id);
+         prog->maxGPR = MAX2(prog->maxGPR, id * 2);
       }
    }
    return true;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 0f575f2..e67bf3e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -186,92 +186,68 @@ NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
       uses.push_back(TexUse(usei, texi));
 }
 
+// While it might be tempting to use an algorithm that just looks at tex
+// uses, not all texture results are guaranteed to be used on all paths. In
+// the case where along some control flow path a texture result is never used,
+// we might reuse that register for something else, creating a
+// write-after-write hazard. So we have to manually look through all
+// instructions looking for ones that reference the registers in question.
 void
-NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
-                                        Instruction *insn,
-                                        const BasicBlock *term,
-                                        std::list<TexUse> &uses)
+NVC0LegalizePostRA::findFirstUses(
+   Instruction *texi, std::list<TexUse> &uses)
 {
-   while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
-      insn = insn->getSrc(0)->getUniqueInsn();
-
-   // NOTE: the tex itself is, of course, not an overwriting definition
-   if (insn == texi || !insn->bb->reachableBy(texi->bb, term))
-      return;
+   int minGPR = texi->def(0).rep()->reg.data.id;
+   int maxGPR = minGPR + texi->def(0).rep()->reg.size / 4 - 1;
 
-   switch (insn->op) {
-   /* Values not connected to the tex's definition through any of these should
-    * not be conflicting.
-    */
-   case OP_SPLIT:
-   case OP_MERGE:
-   case OP_PHI:
-   case OP_UNION:
-      /* recurse again */
-      for (int s = 0; insn->srcExists(s); ++s)
-         findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
-                             uses);
-      break;
-   default:
-      // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
-      addTexUse(uses, insn, texi);
-      break;
-   }
+   unordered_set<const BasicBlock *> visited;
+   findFirstUsesBB(minGPR, maxGPR, texi->next, texi, uses, visited);
 }
 
 void
-NVC0LegalizePostRA::findFirstUses(
-      const Instruction *texi,
-      const Instruction *insn,
-      std::list<TexUse> &uses,
-      unordered_set<const Instruction *>& visited)
+NVC0LegalizePostRA::findFirstUsesBB(
+   int minGPR, int maxGPR, Instruction *start,
+   const Instruction *texi, std::list<TexUse> &uses,
+   unordered_set<const BasicBlock *> &visited)
 {
-   for (int d = 0; insn->defExists(d); ++d) {
-      Value *v = insn->getDef(d);
-      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
-         Instruction *usei = (*u)->getInsn();
-
-         // NOTE: In case of a loop that overwrites a value but never uses
-         // it, it can happen that we have a cycle of uses that consists only
-         // of phis and no-op moves and will thus cause an infinite loop here
-         // since these are not considered actual uses.
-         // The most obvious (and perhaps the only) way to prevent this is to
-         // remember which instructions we've already visited.
-
-         if (visited.find(usei) != visited.end())
-            continue;
+   const BasicBlock *bb = start->bb;
+
+   // We don't process the whole bb the first time around. This is correct,
+   // however we might be in a loop and hit this BB again, and need to process
+   // the full thing. So only mark a bb as visited if we processed it from the
+   // beginning.
+   if (start == bb->getEntry()) {
+      if (visited.find(bb) != visited.end())
+         return;
+      visited.insert(bb);
+   }
 
-         visited.insert(usei);
-
-         if (usei->op == OP_PHI || usei->op == OP_UNION) {
-            // need a barrier before WAW cases, like:
-            //   %r0 = tex
-            //   if ...
-            //     texbar <- is required or tex might replace x again
-            //     %r1 = x <- overwriting def
-            //   %r2 = phi %r0, %r1
-            for (int s = 0; usei->srcExists(s); ++s) {
-               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
-               if (defi && &usei->src(s) != *u)
-                  findOverwritingDefs(texi, defi, usei->bb, uses);
-            }
-         }
+   for (Instruction *insn = start; insn != bb->getExit(); insn = insn->next) {
+      if (insn->isNop())
+         continue;
 
-         if (usei->op == OP_SPLIT ||
-             usei->op == OP_MERGE ||
-             usei->op == OP_PHI ||
-             usei->op == OP_UNION) {
-            // these uses don't manifest in the machine code
-            findFirstUses(texi, usei, uses, visited);
-         } else
-         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
-             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
-            findFirstUses(texi, usei, uses, visited);
-         } else {
-            addTexUse(uses, usei, texi);
-         }
+      for (int d = 0; insn->defExists(d); ++d) {
+         if (insn->def(d).getFile() != FILE_GPR ||
+             insn->def(d).rep()->reg.data.id < minGPR ||
+             insn->def(d).rep()->reg.data.id > maxGPR)
+            continue;
+         addTexUse(uses, insn, texi);
+         return;
+      }
+
+      for (int s = 0; insn->srcExists(s); ++s) {
+         if (insn->src(s).getFile() != FILE_GPR ||
+             insn->src(s).rep()->reg.data.id < minGPR ||
+             insn->src(s).rep()->reg.data.id > maxGPR)
+            continue;
+         addTexUse(uses, insn, texi);
+         return;
       }
    }
+
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      findFirstUsesBB(minGPR, maxGPR, BasicBlock::get(ei.getNode())->getEntry(),
+                      texi, uses, visited);
+   }
 }
 
 // Texture barriers:
@@ -323,8 +299,7 @@ NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
    if (!uses)
       return false;
    for (size_t i = 0; i < texes.size(); ++i) {
-      unordered_set<const Instruction *> visited;
-      findFirstUses(texes[i], texes[i], uses[i], visited);
+      findFirstUses(texes[i], uses[i]);
    }
 
    // determine the barrier level at each use
@@ -870,7 +845,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
    Instruction *tex;
    Value *zero = bld.loadImm(bld.getSSA(), 0);
    int l, c;
-   const int dim = i->tex.target.getDim();
+   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
    const int array = i->tex.target.isArray();
 
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
@@ -917,7 +892,7 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
 bool
 NVC0LoweringPass::handleTXD(TexInstruction *txd)
 {
-   int dim = txd->tex.target.getDim();
+   int dim = txd->tex.target.getDim() + txd->tex.target.isCube();
    unsigned arg = txd->tex.target.getArgCount();
    unsigned expected_args = arg;
    const int chipset = prog->getTarget()->getChipset();
@@ -937,8 +912,7 @@ NVC0LoweringPass::handleTXD(TexInstruction *txd)
 
    if (expected_args > 4 ||
        dim > 2 ||
-       txd->tex.target.isShadow() ||
-       txd->tex.target.isCube())
+       txd->tex.target.isShadow())
       txd->op = OP_TEX;
 
    handleTEX(txd);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 2ce52e5..adb400a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -69,12 +69,10 @@ private:
    };
    bool insertTextureBarriers(Function *);
    inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
-   void findFirstUses(const Instruction *tex, const Instruction *def,
-                      std::list<TexUse>&,
-                      unordered_set<const Instruction *>&);
-   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
-                            const BasicBlock *term,
-                            std::list<TexUse>&);
+   void findFirstUses(Instruction *texi, std::list<TexUse> &uses);
+   void findFirstUsesBB(int minGPR, int maxGPR, Instruction *start,
+                        const Instruction *texi, std::list<TexUse> &uses,
+                        unordered_set<const BasicBlock *> &visited);
    void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
    const Instruction *recurseDef(const Instruction *);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 805be5f..022626c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -1501,6 +1501,7 @@ private:
    void handleSLCT(Instruction *);
    void handleLOGOP(Instruction *);
    void handleCVT_NEG(Instruction *);
+   void handleCVT_CVT(Instruction *);
    void handleCVT_EXTBF(Instruction *);
    void handleSUCLAMP(Instruction *);
 
@@ -1792,6 +1793,47 @@ AlgebraicOpt::handleCVT_NEG(Instruction *cvt)
    delete_Instruction(prog, cvt);
 }
 
+// F2I(TRUNC()) and so on can be expressed as a single CVT. If the earlier CVT
+// does a type conversion, this becomes trickier as there might be range
+// changes/etc. We could handle those in theory as long as the range was being
+// reduced or kept the same.
+void
+AlgebraicOpt::handleCVT_CVT(Instruction *cvt)
+{
+   Instruction *insn = cvt->getSrc(0)->getInsn();
+
+   // The source may have no defining instruction (getInsn() == NULL).
+   if (!insn || insn->saturate || insn->subOp ||
+       insn->dType != insn->sType ||
+       insn->dType != cvt->sType)
+      return;
+   RoundMode rnd = insn->rnd;
+
+   switch (insn->op) {
+   case OP_CEIL:
+      rnd = ROUND_PI;
+      break;
+   case OP_FLOOR:
+      rnd = ROUND_MI;
+      break;
+   case OP_TRUNC:
+      rnd = ROUND_ZI;
+      break;
+   case OP_CVT:
+      break;
+   default:
+      return;
+   }
+   // Unless both cvt's dType and insn's sType are float, keep only rnd's low 2 bits.
+   if (!isFloatType(cvt->dType) || !isFloatType(insn->sType))
+      rnd = (RoundMode)(rnd & 3);
+   // Fold: cvt now reads insn's source directly, with the modifiers combined.
+   cvt->rnd = rnd;
+   cvt->setSrc(0, insn->getSrc(0));
+   cvt->src(0).mod *= insn->src(0).mod;
+   cvt->sType = insn->sType;
+}
+
 // Some shaders extract packed bytes out of words and convert them to
 // e.g. float. The Fermi+ CVT instruction can extract those directly, as can
 // nv50 for word sizes.
@@ -1961,6 +2003,7 @@ AlgebraicOpt::visit(BasicBlock *bb)
          break;
       case OP_CVT:
          handleCVT_NEG(i);
+         handleCVT_CVT(i);
          if (prog->getTarget()->isOpSupported(OP_EXTBF, TYPE_U32))
              handleCVT_EXTBF(i);
          break;
@@ -2532,6 +2575,7 @@ MemoryOpt::runOpt(BasicBlock *bb)
 class FlatteningPass : public Pass
 {
 private:
+   virtual bool visit(Function *);
    virtual bool visit(BasicBlock *);
 
    bool tryPredicateConditional(BasicBlock *);
@@ -2540,6 +2584,8 @@ private:
    inline bool isConstantCondition(Value *pred);
    inline bool mayPredicate(const Instruction *, const Value *pred) const;
    inline void removeFlow(Instruction *);
+
+   uint8_t gpr_unit;
 };
 
 bool
@@ -2561,9 +2607,15 @@ FlatteningPass::isConstantCondition(Value *pred)
          file = ld->src(0).getFile();
       } else {
          file = insn->src(s).getFile();
-         // catch $r63 on NVC0
-         if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
-            file = FILE_IMMEDIATE;
+         // catch $r63 on NVC0 and $r63/$r127 on NV50. Unfortunately maxGPR is
+         // in register "units", which can vary between targets.
+         if (file == FILE_GPR) {
+            Value *v = insn->getSrc(s);
+            int bytes = v->reg.data.id * MIN2(v->reg.size, 4);
+            int units = bytes >> gpr_unit;
+            if (units > prog->maxGPR)
+               file = FILE_IMMEDIATE;
+         }
       }
       if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
          return false;
@@ -2669,6 +2721,14 @@ FlatteningPass::tryPropagateBranch(BasicBlock *bb)
 }
 
 bool
+FlatteningPass::visit(Function *fn)
+{
+   gpr_unit = prog->getTarget()->getFileUnit(FILE_GPR);
+   // gpr_unit is the shift isConstantCondition() applies to convert bytes to maxGPR units.
+   return true;
+}
+
+bool
 FlatteningPass::visit(BasicBlock *bb)
 {
    if (tryPredicateConditional(bb))
@@ -2774,6 +2834,15 @@ private:
    virtual bool visit(BasicBlock *);
 };
 
+static bool
+post_ra_dead(Instruction *i)
+{
+   int d = 0; // ends up at the first def that is still referenced, if any
+   while (i->defExists(d) && !i->getDef(d)->refCount())
+      ++d;
+   return !i->defExists(d);
+}
+
 bool
 NV50PostRaConstantFolding::visit(BasicBlock *bb)
 {
@@ -2787,24 +2856,48 @@ NV50PostRaConstantFolding::visit(BasicBlock *bb)
              i->src(0).getFile() != FILE_GPR ||
              i->src(1).getFile() != FILE_GPR ||
              i->src(2).getFile() != FILE_GPR ||
-             i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id ||
-             !isFloatType(i->dType))
+             i->getDef(0)->reg.data.id != i->getSrc(2)->reg.data.id)
             break;
 
          if (i->getDef(0)->reg.data.id >= 64 ||
              i->getSrc(0)->reg.data.id >= 64)
             break;
 
+         if (i->flagsSrc >= 0 && i->getSrc(i->flagsSrc)->reg.data.id != 0)
+            break;
+
+         if (i->getPredicate())
+            break;
+
          def = i->getSrc(1)->getInsn();
+         if (def && def->op == OP_SPLIT && typeSizeof(def->sType) == 4)
+            def = def->getSrc(0)->getInsn();
          if (def && def->op == OP_MOV && def->src(0).getFile() == FILE_IMMEDIATE) {
             vtmp = i->getSrc(1);
-            i->setSrc(1, def->getSrc(0));
+            if (isFloatType(i->sType)) {
+               i->setSrc(1, def->getSrc(0));
+            } else {
+               ImmediateValue val;
+               bool ret = def->src(0).getImmediate(val);
+               assert(ret);
+               if (i->getSrc(1)->reg.data.id & 1)
+                  val.reg.data.u32 >>= 16;
+               val.reg.data.u32 &= 0xffff;
+               i->setSrc(1, new_ImmediateValue(bb->getProgram(), val.reg.data.u32));
+            }
 
             /* There's no post-RA dead code elimination, so do it here
              * XXX: if we add more code-removing post-RA passes, we might
              *      want to create a post-RA dead-code elim pass */
-            if (vtmp->refCount() == 0)
-               delete_Instruction(bb->getProgram(), def);
+            if (post_ra_dead(vtmp->getInsn())) {
+               Value *src = vtmp->getInsn()->getSrc(0);
+               // Careful -- splits will have already been removed from the
+               // functions. Don't double-delete.
+               if (vtmp->getInsn()->bb)
+                  delete_Instruction(prog, vtmp->getInsn());
+               if (src->getInsn() && post_ra_dead(src->getInsn()))
+                  delete_Instruction(prog, src->getInsn());
+            }
 
             break;
          }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index b32bc13..cd8c42c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1473,7 +1473,6 @@ GCRA::allocateRegisters(ArrayList& insns)
                // Short encoding only possible if they're all GPRs, no need to
                // affect them otherwise.
                if (insn->flagsDef < 0 &&
-                   isFloatType(insn->dType) &&
                    insn->src(0).getFile() == FILE_GPR &&
                    insn->src(1).getFile() == FILE_GPR &&
                    insn->src(2).getFile() == FILE_GPR)
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index a6065e4..4ca9e5c 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -147,6 +147,12 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
    if (nv_dbg)
       nouveau_mesa_debug = atoi(nv_dbg);
 
+   /* These must be set before any failure is possible, as the cleanup
+    * paths assume they're responsible for deleting them.
+    */
+   screen->drm = nouveau_drm(&dev->object);
+   screen->device = dev;
+
    /*
     * this is initialized to 1 in nouveau_drm_screen_create after screen
     * is fully constructed and added to the global screen list.
@@ -175,7 +181,6 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
                             data, size, &screen->channel);
    if (ret)
       return ret;
-   screen->device = dev;
 
    ret = nouveau_client_new(screen->device, &screen->client);
    if (ret)
@@ -229,6 +234,8 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 void
 nouveau_screen_fini(struct nouveau_screen *screen)
 {
+   int fd = screen->drm->fd;
+
    nouveau_mm_destroy(screen->mm_GART);
    nouveau_mm_destroy(screen->mm_VRAM);
 
@@ -238,6 +245,8 @@ nouveau_screen_fini(struct nouveau_screen *screen)
    nouveau_object_del(&screen->channel);
 
    nouveau_device_del(&screen->device);
+   nouveau_drm_del(&screen->drm);
+   close(fd);
 }
 
 static void
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index 328646f..28c4760 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -17,6 +17,7 @@ struct nouveau_bo;
 
 struct nouveau_screen {
    struct pipe_screen base;
+   struct nouveau_drm *drm;
    struct nouveau_device *device;
    struct nouveau_object *channel;
    struct nouveau_client *client;
diff --git a/src/gallium/drivers/nouveau/nouveau_winsys.h b/src/gallium/drivers/nouveau/nouveau_winsys.h
index 1319c32..f13988e 100644
--- a/src/gallium/drivers/nouveau/nouveau_winsys.h
+++ b/src/gallium/drivers/nouveau/nouveau_winsys.h
@@ -6,6 +6,7 @@
 
 #include "pipe/p_defines.h"
 
+#include <drm.h>
 #include <nouveau.h>
 
 #ifndef NV04_PFIFO_MAX_PACKET_LEN
@@ -79,13 +80,13 @@ nouveau_screen_transfer_flags(unsigned pipe)
    return flags;
 }
 
-extern struct pipe_screen *
+extern struct nouveau_screen *
 nv30_screen_create(struct nouveau_device *);
 
-extern struct pipe_screen *
+extern struct nouveau_screen *
 nv50_screen_create(struct nouveau_device *);
 
-extern struct pipe_screen *
+extern struct nouveau_screen *
 nvc0_screen_create(struct nouveau_device *);
 
 #endif
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index 154c3d3..854f70c 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -413,23 +413,20 @@ nv30_screen_destroy(struct pipe_screen *pscreen)
 #define FAIL_SCREEN_INIT(str, err)                    \
    do {                                               \
       NOUVEAU_ERR(str, err);                          \
-      nv30_screen_destroy(pscreen);                   \
-      return NULL;                                    \
+      screen->base.base.context_create = NULL;        \
+      return &screen->base;                           \
    } while(0)
 
-struct pipe_screen *
+struct nouveau_screen *
 nv30_screen_create(struct nouveau_device *dev)
 {
-   struct nv30_screen *screen = CALLOC_STRUCT(nv30_screen);
+   struct nv30_screen *screen;
    struct pipe_screen *pscreen;
    struct nouveau_pushbuf *push;
    struct nv04_fifo *fifo;
    unsigned oclass = 0;
    int ret, i;
 
-   if (!screen)
-      return NULL;
-
    switch (dev->chipset & 0xf0) {
    case 0x30:
       if (RANKINE_0397_CHIPSET & (1 << (dev->chipset & 0x0f)))
@@ -458,10 +455,16 @@ nv30_screen_create(struct nouveau_device *dev)
 
    if (!oclass) {
       NOUVEAU_ERR("unknown 3d class for 0x%02x\n", dev->chipset);
-      FREE(screen);
       return NULL;
    }
 
+   screen = CALLOC_STRUCT(nv30_screen);
+   if (!screen)
+      return NULL;
+
+   pscreen = &screen->base.base;
+   pscreen->destroy = nv30_screen_destroy;
+
    /*
     * Some modern apps try to use msaa without keeping in mind the
     * restrictions on videomem of older cards. Resulting in dmesg saying:
@@ -479,8 +482,6 @@ nv30_screen_create(struct nouveau_device *dev)
    if (screen->max_sample_count > 4)
       screen->max_sample_count = 4;
 
-   pscreen = &screen->base.base;
-   pscreen->destroy = nv30_screen_destroy;
    pscreen->get_param = nv30_screen_get_param;
    pscreen->get_paramf = nv30_screen_get_paramf;
    pscreen->get_shader_param = nv30_screen_get_shader_param;
@@ -693,5 +694,5 @@ nv30_screen_create(struct nouveau_device *dev)
    nouveau_pushbuf_kick(push, push->channel);
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
-   return pscreen;
+   return &screen->base;
 }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
index 812d10c..7450119 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
@@ -336,9 +336,10 @@ nv50_miptree_create(struct pipe_screen *pscreen,
                     const struct pipe_resource *templ)
 {
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+   struct nouveau_drm *drm = nouveau_screen(pscreen)->drm;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   bool compressed = dev->drm_version >= 0x01000101;
+   bool compressed = drm->version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
index b6ebbbf..cccd3b7 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw.c
@@ -113,6 +113,12 @@ static void
 nv50_hw_destroy_query(struct nv50_context *nv50, struct nv50_query *q)
 {
    struct nv50_hw_query *hq = nv50_hw_query(q);
+
+   if (hq->funcs && hq->funcs->destroy_query) {
+      hq->funcs->destroy_query(nv50, hq);
+      return;
+   }
+
    nv50_hw_query_allocate(nv50, q, 0);
    nouveau_fence_ref(NULL, &hq->fence);
    FREE(hq);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
index d1bccb9..4a605f2 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_metric.c
@@ -71,7 +71,8 @@ nv50_hw_metric_destroy_query(struct nv50_context *nv50,
    unsigned i;
 
    for (i = 0; i < hmq->num_queries; i++)
-      hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]);
+      if (hmq->queries[i]->funcs->destroy_query)
+         hmq->queries[i]->funcs->destroy_query(nv50, hmq->queries[i]);
    FREE(hmq);
 }
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
index 8453ce7..79c7023 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query_hw_sm.c
@@ -153,7 +153,9 @@ static void
 nv50_hw_sm_destroy_query(struct nv50_context *nv50, struct nv50_hw_query *hq)
 {
    struct nv50_query *q = &hq->base;
-   q->funcs->destroy_query(nv50, q);
+   nv50_hw_query_allocate(nv50, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
 }
 
 static boolean
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 1e4b75f..272e1d4 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -405,6 +405,11 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
 
    if (screen->blitter)
       nv50_blitter_destroy(screen);
+   if (screen->pm.prog) {
+      screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
+      nv50_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
+   }
 
    nouveau_bo_ref(NULL, &screen->code);
    nouveau_bo_ref(NULL, &screen->tls_bo);
@@ -518,11 +523,11 @@ nv50_screen_init_hwctx(struct nv50_screen *screen)
    }
 
    BEGIN_NV04(push, NV50_3D(ZETA_COMP_ENABLE), 1);
-   PUSH_DATA(push, screen->base.device->drm_version >= 0x01000101);
+   PUSH_DATA(push, screen->base.drm->version >= 0x01000101);
 
    BEGIN_NV04(push, NV50_3D(RT_COMP_ENABLE(0)), 8);
    for (i = 0; i < 8; ++i)
-      PUSH_DATA(push, screen->base.device->drm_version >= 0x01000101);
+      PUSH_DATA(push, screen->base.drm->version >= 0x01000101);
 
    BEGIN_NV04(push, NV50_3D(RT_CONTROL), 1);
    PUSH_DATA (push, 1);
@@ -747,7 +752,7 @@ int nv50_tls_realloc(struct nv50_screen *screen, unsigned tls_space)
    return 1;
 }
 
-struct pipe_screen *
+struct nouveau_screen *
 nv50_screen_create(struct nouveau_device *dev)
 {
    struct nv50_screen *screen;
@@ -762,6 +767,7 @@ nv50_screen_create(struct nouveau_device *dev)
    if (!screen)
       return NULL;
    pscreen = &screen->base.base;
+   pscreen->destroy = nv50_screen_destroy;
 
    ret = nouveau_screen_init(&screen->base, dev);
    if (ret) {
@@ -782,7 +788,6 @@ nv50_screen_create(struct nouveau_device *dev)
 
    chan = screen->base.channel;
 
-   pscreen->destroy = nv50_screen_destroy;
    pscreen->context_create = nv50_create;
    pscreen->is_format_supported = nv50_screen_is_format_supported;
    pscreen->get_param = nv50_screen_get_param;
@@ -961,11 +966,11 @@ nv50_screen_create(struct nouveau_device *dev)
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
-   return pscreen;
+   return &screen->base;
 
 fail:
-   nv50_screen_destroy(pscreen);
-   return NULL;
+   screen->base.base.context_create = NULL;
+   return &screen->base;
 }
 
 int
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_tex.c b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
index 6083ea9..c3f4336 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_tex.c
@@ -192,8 +192,7 @@ nv50_create_texture_view(struct pipe_context *pipe,
       tic[2] |= NV50_TIC_2_TARGET_BUFFER | NV50_TIC_2_LINEAR;
       break;
    default:
-      NOUVEAU_ERR("invalid texture target: %d\n", mt->base.base.target);
-      return false;
+      unreachable("unexpected/invalid texture target");
    }
 
    tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index 85878d5..7de2f1f 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -91,6 +91,9 @@ nv50_vertex_state_create(struct pipe_context *pipe,
             }
             so->element[i].state = nv50_format_table[fmt].vtx;
             so->need_conversion = true;
+            pipe_debug_message(&nouveau_context(pipe)->debug, FALLBACK,
+                               "Converting vertex element %d, no hw format %s",
+                               i, util_format_name(ve->src_format));
         }
         so->element[i].state |= i;
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv84_video.c b/src/gallium/drivers/nouveau/nv50/nv84_video.c
index 7a4670f..88655db 100644
--- a/src/gallium/drivers/nouveau/nv50/nv84_video.c
+++ b/src/gallium/drivers/nouveau/nv50/nv84_video.c
@@ -756,8 +756,8 @@ firmware_present(struct pipe_screen *pscreen, enum pipe_video_format codec)
    int present, ret;
 
    if (!FIRMWARE_PRESENT(checked, VP_KERN)) {
-      nouveau_object_new(screen->channel, 0, 0x7476, NULL, 0, &obj);
-      if (obj)
+      ret = nouveau_object_new(screen->channel, 0, 0x7476, NULL, 0, &obj);
+      if (!ret)
          screen->firmware_info.profiles_present |= FIRMWARE_VP_KERN;
       nouveau_object_del(&obj);
       screen->firmware_info.profiles_checked |= FIRMWARE_VP_KERN;
@@ -765,8 +765,8 @@ firmware_present(struct pipe_screen *pscreen, enum pipe_video_format codec)
 
    if (codec == PIPE_VIDEO_FORMAT_MPEG4_AVC) {
       if (!FIRMWARE_PRESENT(checked, BSP_KERN)) {
-         nouveau_object_new(screen->channel, 0, 0x74b0, NULL, 0, &obj);
-         if (obj)
+         ret = nouveau_object_new(screen->channel, 0, 0x74b0, NULL, 0, &obj);
+         if (!ret)
             screen->firmware_info.profiles_present |= FIRMWARE_BSP_KERN;
          nouveau_object_del(&obj);
          screen->firmware_info.profiles_checked |= FIRMWARE_BSP_KERN;
diff --git a/src/gallium/drivers/nouveau/nv50/nv98_video.c b/src/gallium/drivers/nouveau/nv50/nv98_video.c
index 20ea547..177a7e0 100644
--- a/src/gallium/drivers/nouveau/nv50/nv98_video.c
+++ b/src/gallium/drivers/nouveau/nv50/nv98_video.c
@@ -25,6 +25,8 @@
 #include "util/u_sampler.h"
 #include "util/u_format.h"
 
+#include <nvif/class.h>
+
 static void
 nv98_decoder_decode_bitstream(struct pipe_video_codec *decoder,
                               struct pipe_video_buffer *video_target,
@@ -56,6 +58,28 @@ nv98_decoder_decode_bitstream(struct pipe_video_codec *decoder,
    nv98_decoder_ppp(dec, desc, target, comm_seq);
 }
 
+static const struct nouveau_mclass
+nv98_decoder_msvld[] = {
+   { G98_MSVLD, -1 },
+   { IGT21A_MSVLD, -1 },
+   { GT212_MSVLD, -1 },
+   {}
+};
+
+static const struct nouveau_mclass
+nv98_decoder_mspdec[] = {
+   { G98_MSPDEC, -1 },
+   { GT212_MSPDEC, -1 },
+   {}
+};
+
+static const struct nouveau_mclass
+nv98_decoder_msppp[] = {
+   { G98_MSPPP, -1 },
+   { GT212_MSPPP, -1 },
+   {}
+};
+
 struct pipe_video_codec *
 nv98_create_decoder(struct pipe_context *context,
                     const struct pipe_video_codec *templ)
@@ -103,12 +127,33 @@ nv98_create_decoder(struct pipe_context *context,
    }
    push = dec->pushbuf;
 
-   if (!ret)
-      ret = nouveau_object_new(dec->channel[0], 0x390b1, 0x85b1, NULL, 0, &dec->bsp);
-   if (!ret)
-      ret = nouveau_object_new(dec->channel[1], 0x190b2, 0x85b2, NULL, 0, &dec->vp);
-   if (!ret)
-      ret = nouveau_object_new(dec->channel[2], 0x290b3, 0x85b3, NULL, 0, &dec->ppp);
+   if (!ret) {
+      ret = nouveau_object_mclass(dec->channel[0], nv98_decoder_msvld);
+      if (ret >= 0) {
+         ret = nouveau_object_new(dec->channel[0], 0xbeef85b1,
+                                  nv98_decoder_msvld[ret].oclass, NULL, 0,
+                                  &dec->bsp);
+      }
+   }
+
+   if (!ret) {
+      ret = nouveau_object_mclass(dec->channel[1], nv98_decoder_mspdec);
+      if (ret >= 0) {
+         ret = nouveau_object_new(dec->channel[1], 0xbeef85b2,
+                                  nv98_decoder_mspdec[ret].oclass, NULL, 0,
+                                  &dec->vp);
+      }
+   }
+
+   if (!ret) {
+      ret = nouveau_object_mclass(dec->channel[2], nv98_decoder_msppp);
+      if (ret >= 0) {
+         ret = nouveau_object_new(dec->channel[2], 0xbeef85b3,
+                                  nv98_decoder_msppp[ret].oclass, NULL, 0,
+                                  &dec->ppp);
+      }
+   }
+
    if (ret)
       goto fail;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
index 15991c3..ed1ac48 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
@@ -248,9 +248,10 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
                     const struct pipe_resource *templ)
 {
    struct nouveau_device *dev = nouveau_screen(pscreen)->device;
+   struct nouveau_drm *drm = nouveau_screen(pscreen)->drm;
    struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
    struct pipe_resource *pt = &mt->base.base;
-   bool compressed = dev->drm_version >= 0x01000101;
+   bool compressed = drm->version >= 0x01000101;
    int ret;
    union nouveau_bo_config bo_config;
    uint32_t bo_flags;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index 3845d61..7497317 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -184,7 +184,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
    count++;
 #endif
 
-   if (screen->base.device->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             count += 2;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
index 90ee82f..a70d524 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw.c
@@ -116,6 +116,12 @@ static void
 nvc0_hw_destroy_query(struct nvc0_context *nvc0, struct nvc0_query *q)
 {
    struct nvc0_hw_query *hq = nvc0_hw_query(q);
+
+   if (hq->funcs && hq->funcs->destroy_query) {
+      hq->funcs->destroy_query(nvc0, hq);
+      return;
+   }
+
    nvc0_hw_query_allocate(nvc0, q, 0);
    nouveau_fence_ref(NULL, &hq->fence);
    FREE(hq);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
index 12fb609..7a64b69 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c
@@ -293,7 +293,8 @@ nvc0_hw_metric_destroy_query(struct nvc0_context *nvc0,
    unsigned i;
 
    for (i = 0; i < hmq->num_queries; i++)
-      hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
+      if (hmq->queries[i]->funcs->destroy_query)
+         hmq->queries[i]->funcs->destroy_query(nvc0, hmq->queries[i]);
    FREE(hmq);
 }
 
@@ -420,7 +421,10 @@ sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8])
 {
    switch (hq->base.type - NVE4_HW_METRIC_QUERY(0)) {
    case NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY:
-      return sm20_hw_metric_calc_result(hq, res64);
+      /* (active_warps / active_cycles) / max. number of warps on a MP */
+      if (res64[1])
+         return (res64[0] / (double)res64[1]) / 64;
+      break;
    case NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY:
       return sm20_hw_metric_calc_result(hq, res64);
    case NVE4_HW_METRIC_QUERY_INST_ISSUED:
@@ -561,7 +565,7 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
    uint16_t class_3d = screen->base.class_3d;
    int count = 0;
 
-   if (screen->base.device->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             count += NVE4_HW_METRIC_QUERY_COUNT;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
index 7d1e75f..721857e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c
@@ -782,7 +782,9 @@ static void
 nvc0_hw_sm_destroy_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq)
 {
    struct nvc0_query *q = &hq->base;
-   q->funcs->destroy_query(nvc0, q);
+   nvc0_hw_query_allocate(nvc0, q, 0);
+   nouveau_fence_ref(NULL, &hq->fence);
+   FREE(hq);
 }
 
 static boolean
@@ -1075,17 +1077,6 @@ nve4_hw_sm_query_read_data(uint32_t count[32][8],
    return true;
 }
 
-/* Metric calculations:
- * sum(x) ... sum of x over all MPs
- * avg(x) ... average of x over all MPs
- *
- * IPC              : sum(inst_executed) / clock
- * INST_REPLAY_OHEAD: (sum(inst_issued) - sum(inst_executed)) / sum(inst_issued)
- * MP_OCCUPANCY     : avg((active_warps / 64) / active_cycles)
- * MP_EFFICIENCY    : avg(active_cycles / clock)
- *
- * NOTE: Interpretation of IPC requires knowledge of MP count.
- */
 static boolean
 nvc0_hw_sm_get_query_result(struct nvc0_context *nvc0, struct nvc0_hw_query *hq,
                             boolean wait, union pipe_query_result *result)
@@ -1130,7 +1121,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type)
    struct nvc0_hw_query *hq;
    unsigned space;
 
-   if (nvc0->screen->base.device->drm_version < 0x01000101)
+   if (nvc0->screen->base.drm->version < 0x01000101)
       return NULL;
 
    if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) &&
@@ -1225,7 +1216,7 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id,
 {
    int count = 0;
 
-   if (screen->base.device->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       if (screen->compute) {
          if (screen->base.class_3d == NVE4_3D_CLASS) {
             count += NVE4_HW_SM_QUERY_COUNT;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 461fcaa..3995446 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -22,6 +22,7 @@
 
 #include <xf86drm.h>
 #include <nouveau_drm.h>
+#include <nvif/class.h>
 #include "util/u_format.h"
 #include "util/u_format_s3tc.h"
 #include "pipe/p_screen.h"
@@ -428,6 +429,7 @@ nvc0_screen_destroy(struct pipe_screen *pscreen)
    if (screen->pm.prog) {
       screen->pm.prog->code = NULL; /* hardcoded, don't FREE */
       nvc0_program_destroy(NULL, screen->pm.prog);
+      FREE(screen->pm.prog);
    }
 
    nouveau_bo_ref(NULL, &screen->text);
@@ -617,11 +619,10 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
 #define FAIL_SCREEN_INIT(str, err)                    \
    do {                                               \
       NOUVEAU_ERR(str, err);                          \
-      nvc0_screen_destroy(pscreen);                   \
-      return NULL;                                    \
+      goto fail;                                      \
    } while(0)
 
-struct pipe_screen *
+struct nouveau_screen *
 nvc0_screen_create(struct nouveau_device *dev)
 {
    struct nvc0_screen *screen;
@@ -650,6 +651,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    if (!screen)
       return NULL;
    pscreen = &screen->base.base;
+   pscreen->destroy = nvc0_screen_destroy;
 
    ret = nouveau_screen_init(&screen->base, dev);
    if (ret) {
@@ -672,7 +674,6 @@ nvc0_screen_create(struct nouveau_device *dev)
       screen->base.vidmem_bindings = 0;
    }
 
-   pscreen->destroy = nvc0_screen_destroy;
    pscreen->context_create = nvc0_create;
    pscreen->is_format_supported = nvc0_screen_is_format_supported;
    pscreen->get_param = nvc0_screen_get_param;
@@ -687,7 +688,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    screen->base.base.is_video_format_supported = nouveau_vp3_screen_video_supported;
 
    flags = NOUVEAU_BO_GART | NOUVEAU_BO_MAP;
-   if (dev->drm_version >= 0x01000202)
+   if (screen->base.drm->version >= 0x01000202)
       flags |= NOUVEAU_BO_COHERENT;
 
    ret = nouveau_bo_new(dev, flags, 0, 4096, NULL, &screen->fence.bo);
@@ -699,12 +700,13 @@ nvc0_screen_create(struct nouveau_device *dev)
    screen->base.fence.update = nvc0_screen_fence_update;
 
 
-   ret = nouveau_object_new(chan,
-                            (dev->chipset < 0xe0) ? 0x1f906e : 0x906e, 0x906e,
-                            NULL, 0, &screen->nvsw);
+   ret = nouveau_object_new(chan, (dev->chipset < 0xe0) ? 0x1f906e : 0x906e,
+                            NVIF_CLASS_SW_GF100, NULL, 0, &screen->nvsw);
    if (ret)
       FAIL_SCREEN_INIT("Error creating SW object: %d\n", ret);
 
+   BEGIN_NVC0(push, SUBC_SW(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->nvsw->handle);
 
    switch (dev->chipset & ~0xf) {
    case 0x110:
@@ -811,10 +813,11 @@ nvc0_screen_create(struct nouveau_device *dev)
       PUSH_DATA (push, 0x17);
    }
 
-   IMMED_NVC0(push, NVC0_3D(ZETA_COMP_ENABLE), dev->drm_version >= 0x01000101);
+   IMMED_NVC0(push, NVC0_3D(ZETA_COMP_ENABLE),
+                    screen->base.drm->version >= 0x01000101);
    BEGIN_NVC0(push, NVC0_3D(RT_COMP_ENABLE(0)), 8);
    for (i = 0; i < 8; ++i)
-           PUSH_DATA(push, dev->drm_version >= 0x01000101);
+           PUSH_DATA(push, screen->base.drm->version >= 0x01000101);
 
    BEGIN_NVC0(push, NVC0_3D(RT_CONTROL), 1);
    PUSH_DATA (push, 1);
@@ -910,7 +913,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    PUSH_DATAh(push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
    PUSH_DATA (push, screen->uniform_bo->offset + (5 << 16) + (6 << 9));
 
-   if (dev->drm_version >= 0x01000101) {
+   if (screen->base.drm->version >= 0x01000101) {
       ret = nouveau_getparam(dev, NOUVEAU_GETPARAM_GRAPH_UNITS, &value);
       if (ret) {
          NOUVEAU_ERR("NOUVEAU_GETPARAM_GRAPH_UNITS failed.\n");
@@ -1061,11 +1064,11 @@ nvc0_screen_create(struct nouveau_device *dev)
 
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
-   return pscreen;
+   return &screen->base;
 
 fail:
-   nvc0_screen_destroy(pscreen);
-   return NULL;
+   screen->base.base.context_create = NULL;
+   return &screen->base;
 }
 
 int
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 7e2e999..5e84ca9 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -236,11 +236,8 @@ nvc0_gmtyprog_validate(struct nvc0_context *nvc0)
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_program *gp = nvc0->gmtyprog;
 
-   if (gp)
-      nvc0_program_validate(nvc0, gp);
-
    /* we allow GPs with no code for specifying stream output state only */
-   if (gp && gp->code_size) {
+   if (gp && nvc0_program_validate(nvc0, gp) && gp->code_size) {
       const bool gp_selects_layer = !!(gp->hdr[13] & (1 << 9));
 
       BEGIN_NVC0(push, NVC0_3D(MACRO_GP_SELECT), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index f8e1efb..4e43c4e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1030,9 +1030,11 @@ nvc0_blitctx_post_blit(struct nvc0_blitctx *blit)
       nvc0->base.pipe.render_condition(&nvc0->base.pipe, nvc0->cond_query,
                                        nvc0->cond_cond, nvc0->cond_mode);
 
+   nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_VTX_TMP);
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_FB);
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 0));
    nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_TEX(4, 1));
+   nouveau_scratch_done(&nvc0->base);
 
    nvc0->dirty = blit->saved.dirty |
       (NVC0_NEW_FRAMEBUFFER | NVC0_NEW_SCISSOR | NVC0_NEW_SAMPLE_MASK |
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 2dd100f..74090ce 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -193,9 +193,7 @@ nvc0_create_texture_view(struct pipe_context *pipe,
       tic[2] |= NV50_TIC_2_TARGET_CUBE_ARRAY;
       break;
    default:
-      NOUVEAU_ERR("unexpected/invalid texture target: %d\n",
-                  mt->base.base.target);
-      return false;
+      unreachable("unexpected/invalid texture target");
    }
 
    tic[3] = (flags & NV50_TEXVIEW_FILTER_MSAA8) ? 0x20000000 : 0x00300000;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index c464904..54443bd 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -95,6 +95,9 @@ nvc0_vertex_state_create(struct pipe_context *pipe,
             }
             so->element[i].state = nvc0_format_table[fmt].vtx;
             so->need_conversion = true;
+            pipe_debug_message(&nouveau_context(pipe)->debug, FALLBACK,
+                               "Converting vertex element %d, no hw format %s",
+                               i, util_format_name(ve->src_format));
         }
         size = util_format_get_blocksize(fmt);