summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/nouveau/codegen
diff options
context:
space:
mode:
Diffstat (limited to 'src/gallium/drivers/nouveau/codegen')
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir.cpp1231
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir.h1197
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp550
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp614
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h324
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h220
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp1682
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp1962
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp2988
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp2852
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp436
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h228
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h420
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp1101
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp1597
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp2464
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp698
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp2050
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp552
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp469
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target.h235
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp552
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h72
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp604
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h74
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp390
-rw-r--r--src/gallium/drivers/nouveau/codegen/nv50_ir_util.h788
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm96
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h112
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm698
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h592
-rw-r--r--src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h13
32 files changed, 27861 insertions, 0 deletions
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
new file mode 100644
index 0000000..90fb51c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.cpp
@@ -0,0 +1,1231 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+#include "codegen/nv50_ir_driver.h"
+
+extern "C" {
+#include "nv50/nv50_program.h"
+#include "nv50/nv50_debug.h"
+}
+
+namespace nv50_ir {
+
+Modifier::Modifier(operation op)
+{
+ switch (op) {
+ case OP_NEG: bits = NV50_IR_MOD_NEG; break;
+ case OP_ABS: bits = NV50_IR_MOD_ABS; break;
+ case OP_SAT: bits = NV50_IR_MOD_SAT; break;
+ case OP_NOT: bits = NV50_IR_MOD_NOT; break;
+ default:
+ bits = 0;
+ break;
+ }
+}
+
// Compose two modifiers: the result is equivalent to applying @m first
// and then *this (i.e. this(m(x))).
Modifier Modifier::operator*(const Modifier m) const
{
   unsigned int a, b, c;

   b = m.bits;
   if (this->bits & NV50_IR_MOD_ABS)
      b &= ~NV50_IR_MOD_NEG; // outer ABS makes the inner negation irrelevant

   // NOT and NEG are self-inverse, so they toggle (XOR);
   // ABS and SAT are idempotent/sticky, so they accumulate (OR)
   a = (this->bits ^ b) & (NV50_IR_MOD_NOT | NV50_IR_MOD_NEG);
   c = (this->bits | m.bits) & (NV50_IR_MOD_ABS | NV50_IR_MOD_SAT);

   return Modifier(a | c);
}
+
+ValueRef::ValueRef(Value *v) : value(NULL), insn(NULL)
+{
+ indirect[0] = -1;
+ indirect[1] = -1;
+ usedAsPtr = false;
+ set(v);
+}
+
+ValueRef::ValueRef(const ValueRef& ref) : value(NULL), insn(ref.insn)
+{
+ set(ref);
+ usedAsPtr = ref.usedAsPtr;
+}
+
+ValueRef::~ValueRef()
+{
+ this->set(NULL);
+}
+
// Try to resolve this reference to a compile-time constant: follow chains
// of OP_MOVs back to an immediate, accumulating source modifiers along the
// way. On success, @imm receives the modifier-applied constant; returns
// whether the resolution succeeded.
bool ValueRef::getImmediate(ImmediateValue &imm) const
{
   const ValueRef *src = this;
   Modifier m;
   DataType type = src->insn->sType;

   while (src) {
      if (src->mod) {
         if (src->insn->sType != type)
            break; // modifier semantics depend on type; don't mix
         m *= src->mod;
      }
      if (src->getFile() == FILE_IMMEDIATE) {
         imm = *(src->value->asImm());
         // The immediate's type isn't required to match its use, it's
         // more of a hint; applying a modifier makes use of that hint.
         imm.reg.type = type;
         m.applyTo(imm);
         return true;
      }

      Instruction *insn = src->value->getUniqueInsn();

      if (insn && insn->op == OP_MOV) {
         src = &insn->src(0); // keep walking through the copy
         if (src->mod)
            WARN("OP_MOV with modifier encountered !\n");
      } else {
         src = NULL; // no unique plain-copy def: give up
      }
   }
   return false;
}
+
+ValueDef::ValueDef(Value *v) : value(NULL), insn(NULL)
+{
+ set(v);
+}
+
+ValueDef::ValueDef(const ValueDef& def) : value(NULL), insn(NULL)
+{
+ set(def.get());
+}
+
+ValueDef::~ValueDef()
+{
+ this->set(NULL);
+}
+
+void
+ValueRef::set(const ValueRef &ref)
+{
+ this->set(ref.get());
+ mod = ref.mod;
+ indirect[0] = ref.indirect[0];
+ indirect[1] = ref.indirect[1];
+}
+
+void
+ValueRef::set(Value *refVal)
+{
+ if (value == refVal)
+ return;
+ if (value)
+ value->uses.remove(this);
+ if (refVal)
+ refVal->uses.push_back(this);
+
+ value = refVal;
+}
+
+void
+ValueDef::set(Value *defVal)
+{
+ if (value == defVal)
+ return;
+ if (value)
+ value->defs.remove(this);
+ if (defVal)
+ defVal->defs.push_back(this);
+
+ value = defVal;
+}
+
+// Check if we can replace this definition's value by the value in @rep,
+// including the source modifiers, i.e. make sure that all uses support
+// @rep.mod.
+bool
+ValueDef::mayReplace(const ValueRef &rep)
+{
+ if (!rep.mod)
+ return true;
+
+ if (!insn || !insn->bb) // Unbound instruction ?
+ return false;
+
+ const Target *target = insn->bb->getProgram()->getTarget();
+
+ for (Value::UseIterator it = value->uses.begin(); it != value->uses.end();
+ ++it) {
+ Instruction *insn = (*it)->getInsn();
+ int s = -1;
+
+ for (int i = 0; insn->srcExists(i); ++i) {
+ if (insn->src(i).get() == value) {
+ // If there are multiple references to us we'd have to check if the
+ // combination of mods is still supported, but just bail for now.
+ if (&insn->src(i) != (*it))
+ return false;
+ s = i;
+ }
+ }
+ assert(s >= 0); // integrity of uses list
+
+ if (!target->isModSupported(insn, s, rep.mod))
+ return false;
+ }
+ return true;
+}
+
// Redirect every use of our value to @repVal's value, merging @repVal's
// modifier into each use; if @doSet, also make this definition define
// repVal's value.
void
ValueDef::replace(const ValueRef &repVal, bool doSet)
{
   assert(mayReplace(repVal));

   if (value == repVal.get())
      return;

   // ref->set() unlinks ref from value->uses, so the list shrinks by one
   // element each iteration and the loop terminates
   while (!value->uses.empty()) {
      ValueRef *ref = value->uses.front();
      ref->set(repVal.get());
      ref->mod *= repVal.mod;
   }

   if (doSet)
      set(repVal.get());
}
+
+Value::Value()
+{
+ join = this;
+ memset(&reg, 0, sizeof(reg));
+ reg.size = 4;
+}
+
+LValue::LValue(Function *fn, DataFile file)
+{
+ reg.file = file;
+ reg.size = (file != FILE_PREDICATE) ? 4 : 1;
+ reg.data.id = -1;
+
+ compMask = 0;
+ compound = 0;
+ ssa = 0;
+ fixedReg = 0;
+ noSpill = 0;
+
+ fn->add(this, this->id);
+}
+
+LValue::LValue(Function *fn, LValue *lval)
+{
+ assert(lval);
+
+ reg.file = lval->reg.file;
+ reg.size = lval->reg.size;
+ reg.data.id = -1;
+
+ compMask = 0;
+ compound = 0;
+ ssa = 0;
+ fixedReg = 0;
+ noSpill = 0;
+
+ fn->add(this, this->id);
+}
+
+LValue *
+LValue::clone(ClonePolicy<Function>& pol) const
+{
+ LValue *that = new_LValue(pol.context(), reg.file);
+
+ pol.set<Value>(this, that);
+
+ that->reg.size = this->reg.size;
+ that->reg.type = this->reg.type;
+ that->reg.data = this->reg.data;
+
+ return that;
+}
+
+bool
+LValue::isUniform() const
+{
+ if (defs.size() > 1)
+ return false;
+ Instruction *insn = getInsn();
+ // let's not try too hard here for now ...
+ return !insn->srcExists(1) && insn->getSrc(0)->isUniform();
+}
+
+Symbol::Symbol(Program *prog, DataFile f, ubyte fidx)
+{
+ baseSym = NULL;
+
+ reg.file = f;
+ reg.fileIndex = fidx;
+ reg.data.offset = 0;
+
+ prog->add(this, this->id);
+}
+
+Symbol *
+Symbol::clone(ClonePolicy<Function>& pol) const
+{
+ Program *prog = pol.context()->getProgram();
+
+ Symbol *that = new_Symbol(prog, reg.file, reg.fileIndex);
+
+ pol.set<Value>(this, that);
+
+ that->reg.size = this->reg.size;
+ that->reg.type = this->reg.type;
+ that->reg.data = this->reg.data;
+
+ that->baseSym = this->baseSym;
+
+ return that;
+}
+
+bool
+Symbol::isUniform() const
+{
+ return
+ reg.file != FILE_SYSTEM_VALUE &&
+ reg.file != FILE_MEMORY_LOCAL &&
+ reg.file != FILE_SHADER_INPUT;
+}
+
+ImmediateValue::ImmediateValue(Program *prog, uint32_t uval)
+{
+ memset(&reg, 0, sizeof(reg));
+
+ reg.file = FILE_IMMEDIATE;
+ reg.size = 4;
+ reg.type = TYPE_U32;
+
+ reg.data.u32 = uval;
+
+ prog->add(this, this->id);
+}
+
+ImmediateValue::ImmediateValue(Program *prog, float fval)
+{
+ memset(&reg, 0, sizeof(reg));
+
+ reg.file = FILE_IMMEDIATE;
+ reg.size = 4;
+ reg.type = TYPE_F32;
+
+ reg.data.f32 = fval;
+
+ prog->add(this, this->id);
+}
+
+ImmediateValue::ImmediateValue(Program *prog, double dval)
+{
+ memset(&reg, 0, sizeof(reg));
+
+ reg.file = FILE_IMMEDIATE;
+ reg.size = 8;
+ reg.type = TYPE_F64;
+
+ reg.data.f64 = dval;
+
+ prog->add(this, this->id);
+}
+
+ImmediateValue::ImmediateValue(const ImmediateValue *proto, DataType ty)
+{
+ reg = proto->reg;
+
+ reg.type = ty;
+ reg.size = typeSizeof(ty);
+}
+
+ImmediateValue *
+ImmediateValue::clone(ClonePolicy<Function>& pol) const
+{
+ Program *prog = pol.context()->getProgram();
+ ImmediateValue *that = new_ImmediateValue(prog, 0u);
+
+ pol.set<Value>(this, that);
+
+ that->reg.size = this->reg.size;
+ that->reg.type = this->reg.type;
+ that->reg.data = this->reg.data;
+
+ return that;
+}
+
// Whether this immediate equals the integer @i, interpreted under the
// immediate's own type (floats compare against the converted value).
bool
ImmediateValue::isInteger(const int i) const
{
   switch (reg.type) {
   case TYPE_S8:
      return reg.data.s8 == i;
   case TYPE_U8:
      return reg.data.u8 == i;
   case TYPE_S16:
      return reg.data.s16 == i;
   case TYPE_U16:
      return reg.data.u16 == i;
   case TYPE_S32:
   case TYPE_U32:
      return reg.data.s32 == i; // as if ...
   case TYPE_F32:
      return reg.data.f32 == static_cast<float>(i);
   case TYPE_F64:
      return reg.data.f64 == static_cast<double>(i);
   default:
      return false;
   }
}
+
// Whether the value is negative. Floats test the raw sign bit, so -0.0
// (and negative NaNs) count as negative; unsigned 8/16-bit types fall to
// the default and are never negative.
bool
ImmediateValue::isNegative() const
{
   switch (reg.type) {
   case TYPE_S8: return reg.data.s8 < 0;
   case TYPE_S16: return reg.data.s16 < 0;
   case TYPE_S32:
   case TYPE_U32: return reg.data.s32 < 0;
   case TYPE_F32: return reg.data.u32 & (1 << 31);
   case TYPE_F64: return reg.data.u64 & (1ULL << 63);
   default:
      return false;
   }
}
+
+bool
+ImmediateValue::isPow2() const
+{
+ switch (reg.type) {
+ case TYPE_U8:
+ case TYPE_U16:
+ case TYPE_U32: return util_is_power_of_two(reg.data.u32);
+ default:
+ return false;
+ }
+}
+
// Replace the value with its base-2 logarithm in place.
// For integer types the caller must ensure the value is a power of two
// (see isPow2()); signed types additionally assert non-negativity.
void
ImmediateValue::applyLog2()
{
   switch (reg.type) {
   case TYPE_S8:
   case TYPE_S16:
   case TYPE_S32:
      assert(!this->isNegative());
      // fall through
   case TYPE_U8:
   case TYPE_U16:
   case TYPE_U32:
      reg.data.u32 = util_logbase2(reg.data.u32);
      break;
   case TYPE_F32:
      reg.data.f32 = log2f(reg.data.f32);
      break;
   case TYPE_F64:
      reg.data.f64 = log2(reg.data.f64);
      break;
   default:
      assert(0); // no log2 semantics for this type
      break;
   }
}
+
// Evaluate condition code @cc with this immediate as left operand and
// @fval as right operand.
// NOTE(review): on a non-f32 immediate this only logs an error and then
// still compares reg.data.f32, whose contents are meaningless -- callers
// are expected to pass f32 immediates only; confirm before relying on it.
bool
ImmediateValue::compare(CondCode cc, float fval) const
{
   if (reg.type != TYPE_F32)
      ERROR("immediate value is not of type f32");

   // mask to the base condition bits (ordered comparisons)
   switch (static_cast<CondCode>(cc & 7)) {
   case CC_TR: return true;
   case CC_FL: return false;
   case CC_LT: return reg.data.f32 < fval;
   case CC_LE: return reg.data.f32 <= fval;
   case CC_GT: return reg.data.f32 > fval;
   case CC_GE: return reg.data.f32 >= fval;
   case CC_EQ: return reg.data.f32 == fval;
   case CC_NE: return reg.data.f32 != fval;
   default:
      assert(0);
      return false;
   }
}
+
+ImmediateValue&
+ImmediateValue::operator=(const ImmediateValue &that)
+{
+ this->reg = that.reg;
+ return (*this);
+}
+
// Whether the storage of @that may overlap ours, judged on the coalesced
// representatives (join). (Name is a historical misspelling of
// "interferes", kept for API compatibility.)
bool
Value::interfers(const Value *that) const
{
   uint32_t idA, idB;

   if (that->reg.file != reg.file || that->reg.fileIndex != reg.fileIndex)
      return false;
   if (this->asImm())
      return false; // immediates occupy no storage

   if (this->asSym()) {
      // symbols are located by byte offset
      idA = this->join->reg.data.offset;
      idB = that->join->reg.data.offset;
   } else {
      // registers: convert the id to a byte position; sizes > 4 still
      // advance in 4-byte register units
      idA = this->join->reg.data.id * MIN2(this->reg.size, 4);
      idB = that->join->reg.data.id * MIN2(that->reg.size, 4);
   }

   // overlap test on [id, id + size) ranges
   if (idA < idB)
      return (idA + this->reg.size > idB);
   else
   if (idA > idB)
      return (idB + that->reg.size > idA);
   else
      return (idA == idB);
}
+
+bool
+Value::equals(const Value *that, bool strict) const
+{
+ if (strict)
+ return this == that;
+
+ if (that->reg.file != reg.file || that->reg.fileIndex != reg.fileIndex)
+ return false;
+ if (that->reg.size != this->reg.size)
+ return false;
+
+ if (that->reg.data.id != this->reg.data.id)
+ return false;
+
+ return true;
+}
+
// Immediates are equal iff their raw 64-bit payloads match; @strict is
// ignored. Comparing the full u64 is safe for 32-bit immediates because
// the constructors memset the whole register descriptor first.
bool
ImmediateValue::equals(const Value *that, bool strict) const
{
   const ImmediateValue *imm = that->asImm();
   if (!imm)
      return false;
   return reg.data.u64 == imm->reg.data.u64;
}
+
// Symbols are equal when they share file, file index and base symbol, and
// refer to the same offset (or, for system values, the same sv + index).
// @strict is not used for symbols.
bool
Symbol::equals(const Value *that, bool strict) const
{
   if (reg.file != that->reg.file || reg.fileIndex != that->reg.fileIndex)
      return false;
   assert(that->asSym()); // same file as ours implies it is a symbol

   if (this->baseSym != that->asSym()->baseSym)
      return false;

   if (reg.file == FILE_SYSTEM_VALUE)
      return (this->reg.data.sv.sv == that->reg.data.sv.sv &&
              this->reg.data.sv.index == that->reg.data.sv.index);
   return this->reg.data.offset == that->reg.data.offset;
}
+
+void Instruction::init()
+{
+ next = prev = 0;
+
+ cc = CC_ALWAYS;
+ rnd = ROUND_N;
+ cache = CACHE_CA;
+ subOp = 0;
+
+ saturate = 0;
+ join = 0;
+ exit = 0;
+ terminator = 0;
+ ftz = 0;
+ dnz = 0;
+ perPatch = 0;
+ fixed = 0;
+ encSize = 0;
+ ipa = 0;
+ mask = 0;
+
+ lanes = 0xf;
+
+ postFactor = 0;
+
+ predSrc = -1;
+ flagsDef = -1;
+ flagsSrc = -1;
+}
+
+Instruction::Instruction()
+{
+ init();
+
+ op = OP_NOP;
+ dType = sType = TYPE_F32;
+
+ id = -1;
+ bb = 0;
+}
+
+Instruction::Instruction(Function *fn, operation opr, DataType ty)
+{
+ init();
+
+ op = opr;
+ dType = sType = ty;
+
+ fn->add(this, id);
+}
+
// Detach from the containing block/function and drop all def/use links.
Instruction::~Instruction()
{
   if (bb) {
      Function *fn = bb->getFunction();
      bb->remove(this);
      fn->allInsns.remove(id);
   }

   for (int s = 0; srcExists(s); ++s)
      setSrc(s, NULL);
   // must unlink defs too since the list pointers will get deallocated
   for (int d = 0; defExists(d); ++d)
      setDef(d, NULL);
}
+
+void
+Instruction::setDef(int i, Value *val)
+{
+ int size = defs.size();
+ if (i >= size) {
+ defs.resize(i + 1);
+ while (size <= i)
+ defs[size++].setInsn(this);
+ }
+ defs[i].set(val);
+}
+
+void
+Instruction::setSrc(int s, Value *val)
+{
+ int size = srcs.size();
+ if (s >= size) {
+ srcs.resize(s + 1);
+ while (size <= s)
+ srcs[size++].setInsn(this);
+ }
+ srcs[s].set(val);
+}
+
// Set source @s from an existing reference, copying value and modifier.
void
Instruction::setSrc(int s, const ValueRef& ref)
{
   setSrc(s, ref.get());
   srcs[s].mod = ref.mod;
}
+
+void
+Instruction::swapSources(int a, int b)
+{
+ Value *value = srcs[a].get();
+ Modifier m = srcs[a].mod;
+
+ setSrc(a, srcs[b]);
+
+ srcs[b].set(value);
+ srcs[b].mod = m;
+}
+
// Fix up a stored source index after sources [s, last] were shifted by
// @delta: indices at or past @s move with the shift; an index inside a
// range erased by a negative shift becomes invalid (-1).
static inline void moveSourcesAdjustIndex(int8_t &index, int s, int delta)
{
   if (index >= s) {
      index += delta;
   } else if (delta < 0 && index >= s + delta) {
      index = -1;
   }
}
+
// Moves sources [@s,last_source] by @delta.
// If @delta < 0, sources [@s - abs(@delta), @s) are erased.
void
Instruction::moveSources(const int s, const int delta)
{
   if (delta == 0)
      return;
   assert(s + delta >= 0);

   int k;

   // first adjust every stored source index that might point into the
   // moved range: indirects, predicate, flags and texture handles
   for (k = 0; srcExists(k); ++k) {
      for (int i = 0; i < 2; ++i)
         moveSourcesAdjustIndex(src(k).indirect[i], s, delta);
   }
   moveSourcesAdjustIndex(predSrc, s, delta);
   moveSourcesAdjustIndex(flagsSrc, s, delta);
   if (asTex()) {
      TexInstruction *tex = asTex();
      moveSourcesAdjustIndex(tex->tex.rIndirectSrc, s, delta);
      moveSourcesAdjustIndex(tex->tex.sIndirectSrc, s, delta);
   }

   // then move the sources themselves; when growing, copy from the back
   // so a source is never overwritten before it has been copied
   if (delta > 0) {
      --k;
      for (int p = k + delta; k >= s; --k, --p)
         setSrc(p, src(k));
   } else {
      int p;
      for (p = s; p < k; ++p)
         setSrc(p + delta, src(p));
      // clear the trailing slots left over by the shrink
      for (; (p + delta) < k; ++p)
         setSrc(p + delta, NULL);
   }
}
+
+void
+Instruction::takeExtraSources(int s, Value *values[3])
+{
+ values[0] = getIndirect(s, 0);
+ if (values[0])
+ setIndirect(s, 0, NULL);
+
+ values[1] = getIndirect(s, 1);
+ if (values[1])
+ setIndirect(s, 1, NULL);
+
+ values[2] = getPredicate();
+ if (values[2])
+ setPredicate(cc, NULL);
+}
+
+void
+Instruction::putExtraSources(int s, Value *values[3])
+{
+ if (values[0])
+ setIndirect(s, 0, values[0]);
+ if (values[1])
+ setIndirect(s, 1, values[1]);
+ if (values[2])
+ setPredicate(cc, values[2]);
+}
+
// Copy all instruction state into @i (allocating a plain Instruction if
// @i is NULL); defs and srcs are mapped through the clone policy so the
// copy refers to the cloned values.
Instruction *
Instruction::clone(ClonePolicy<Function>& pol, Instruction *i) const
{
   if (!i)
      i = new_Instruction(pol.context(), op, dType);
#ifndef NDEBUG // non-conformant assert, so this is required
   assert(typeid(*i) == typeid(*this));
#endif

   pol.set<Instruction>(this, i);

   i->sType = sType;

   i->rnd = rnd;
   i->cache = cache;
   i->subOp = subOp;

   i->saturate = saturate;
   i->join = join;
   i->exit = exit;
   i->mask = mask;
   i->ftz = ftz;
   i->dnz = dnz;
   i->ipa = ipa;
   i->lanes = lanes;
   i->perPatch = perPatch;

   i->postFactor = postFactor;

   // map defs/srcs through the policy to their cloned counterparts
   for (int d = 0; defExists(d); ++d)
      i->setDef(d, pol.get(getDef(d)));

   for (int s = 0; srcExists(s); ++s) {
      i->setSrc(s, pol.get(getSrc(s)));
      i->src(s).mod = src(s).mod;
   }

   i->cc = cc;
   i->predSrc = predSrc;
   i->flagsDef = flagsDef;
   i->flagsSrc = flagsSrc;

   return i;
}
+
// Count the defs selected by bit mask @mask; with @singleFile, defs in a
// different register file than the first selected def are excluded.
unsigned int
Instruction::defCount(unsigned int mask, bool singleFile) const
{
   unsigned int i, n;

   if (singleFile) {
      unsigned int d = ffs(mask); // 1-based index of first selected def
      if (!d)
         return 0;
      for (i = d--; defExists(i); ++i)
         if (getDef(i)->reg.file != getDef(d)->reg.file)
            mask &= ~(1 << i);
   }

   // count mask bits that correspond to existing defs
   for (n = 0, i = 0; this->defExists(i); ++i, mask >>= 1)
      n += mask & 1;
   return n;
}
+
// Count the sources selected by bit mask @mask; with @singleFile, sources
// in a different register file than the first selected source are excluded.
unsigned int
Instruction::srcCount(unsigned int mask, bool singleFile) const
{
   unsigned int i, n;

   if (singleFile) {
      unsigned int s = ffs(mask); // 1-based index of first selected source
      if (!s)
         return 0;
      for (i = s--; srcExists(i); ++i)
         if (getSrc(i)->reg.file != getSrc(s)->reg.file)
            mask &= ~(1 << i);
   }

   // count mask bits that correspond to existing sources
   for (n = 0, i = 0; this->srcExists(i); ++i, mask >>= 1)
      n += mask & 1;
   return n;
}
+
// Attach (@value != NULL) or detach (@value == NULL) an address value for
// source @s in dimension @dim; the address itself occupies an extra source
// slot, which is found or allocated here.
bool
Instruction::setIndirect(int s, int dim, Value *value)
{
   assert(this->srcExists(s));

   int p = srcs[s].indirect[dim];
   if (p < 0) {
      if (!value)
         return true; // nothing attached, nothing to remove
      // find the first unused slot at the end of the source array
      p = srcs.size();
      while (p > 0 && !srcExists(p - 1))
         --p;
   }
   setSrc(p, value);
   srcs[p].usedAsPtr = (value != 0);
   srcs[s].indirect[dim] = value ? p : -1;
   return true;
}
+
// Set the condition code and the predicate value (@value == NULL removes
// the predicate); the predicate is kept in an extra source slot tracked
// by predSrc.
bool
Instruction::setPredicate(CondCode ccode, Value *value)
{
   cc = ccode;

   if (!value) {
      if (predSrc >= 0) {
         srcs[predSrc].set(NULL);
         predSrc = -1;
      }
      return true;
   }

   if (predSrc < 0) {
      // find the first unused slot at the end of the source array
      predSrc = srcs.size();
      while (predSrc > 0 && !srcExists(predSrc - 1))
         --predSrc;
   }

   setSrc(predSrc, value);
   return true;
}
+
+bool
+Instruction::writesPredicate() const
+{
+ for (int d = 0; defExists(d); ++d)
+ if (getDef(d)->inFile(FILE_PREDICATE) || getDef(d)->inFile(FILE_FLAGS))
+ return true;
+ return false;
+}
+
// True if none of @a's results overlaps anything @b reads, i.e. @b does
// not depend on @a through storage.
static bool
insnCheckCommutationDefSrc(const Instruction *a, const Instruction *b)
{
   for (int d = 0; a->defExists(d); ++d)
      for (int s = 0; b->srcExists(s); ++s)
         if (a->getDef(d)->interfers(b->getSrc(s)))
            return false;
   return true;
}
+
// True if no result of @a overlaps any result of @b (no output conflict).
static bool
insnCheckCommutationDefDef(const Instruction *a, const Instruction *b)
{
   for (int d = 0; a->defExists(d); ++d)
      for (int c = 0; b->defExists(c); ++c)
         if (a->getDef(d)->interfers(b->getDef(c)))
            return false;
   return true;
}
+
+bool
+Instruction::isCommutationLegal(const Instruction *i) const
+{
+ bool ret = insnCheckCommutationDefDef(this, i);
+ ret = ret && insnCheckCommutationDefSrc(this, i);
+ ret = ret && insnCheckCommutationDefSrc(i, this);
+ return ret;
+}
+
+TexInstruction::TexInstruction(Function *fn, operation op)
+ : Instruction(fn, op, TYPE_F32)
+{
+ memset(&tex, 0, sizeof(tex));
+
+ tex.rIndirectSrc = -1;
+ tex.sIndirectSrc = -1;
+}
+
// Unlink the derivative references so the values' use lists stay valid.
TexInstruction::~TexInstruction()
{
   for (int c = 0; c < 3; ++c) {
      dPdx[c].set(NULL);
      dPdy[c].set(NULL);
   }
}
+
// Clone this texture instruction, copying the tex descriptor and, for
// OP_TXD, the per-coordinate derivative references.
TexInstruction *
TexInstruction::clone(ClonePolicy<Function>& pol, Instruction *i) const
{
   TexInstruction *tex = (i ? static_cast<TexInstruction *>(i) :
                          new_TexInstruction(pol.context(), op));

   Instruction::clone(pol, tex);

   tex->tex = this->tex;

   if (op == OP_TXD) {
      for (unsigned int c = 0; c < tex->tex.target.getDim(); ++c) {
         tex->dPdx[c].set(dPdx[c]);
         tex->dPdy[c].set(dPdy[c]);
      }
   }

   return tex;
}
+
// Texture target descriptions.
// Columns (judging from the entries): name, dimensionality, coordinate
// argument count, isArray, isCube, isShadow -- confirm against Desc.
const struct TexInstruction::Target::Desc TexInstruction::Target::descTable[] =
{
   { "1D", 1, 1, false, false, false },
   { "2D", 2, 2, false, false, false },
   { "2D_MS", 2, 3, false, false, false },
   { "3D", 3, 3, false, false, false },
   { "CUBE", 2, 3, false, true, false },
   { "1D_SHADOW", 1, 1, false, false, true },
   { "2D_SHADOW", 2, 2, false, false, true },
   { "CUBE_SHADOW", 2, 3, false, true, true },
   { "1D_ARRAY", 1, 2, true, false, false },
   { "2D_ARRAY", 2, 3, true, false, false },
   { "2D_MS_ARRAY", 2, 4, true, false, false },
   { "CUBE_ARRAY", 2, 4, true, true, false },
   { "1D_ARRAY_SHADOW", 1, 2, true, false, true },
   { "2D_ARRAY_SHADOW", 2, 3, true, false, true },
   { "RECT", 2, 2, false, false, false },
   { "RECT_SHADOW", 2, 2, false, false, true },
   { "CUBE_ARRAY_SHADOW", 2, 4, true, true, true },
   { "BUFFER", 1, 1, false, false, false },
};
+
// Set (or clear with @v == NULL) the indirect texture-resource handle;
// a new trailing source slot is appended the first time one is set.
void
TexInstruction::setIndirectR(Value *v)
{
   int p = ((tex.rIndirectSrc < 0) && v) ? srcs.size() : tex.rIndirectSrc;
   if (p >= 0) {
      tex.rIndirectSrc = p;
      setSrc(p, v);
      srcs[p].usedAsPtr = !!v;
   }
}
+
// Set (or clear with @v == NULL) the indirect sampler handle; mirrors
// setIndirectR() for the sampler source.
void
TexInstruction::setIndirectS(Value *v)
{
   int p = ((tex.sIndirectSrc < 0) && v) ? srcs.size() : tex.sIndirectSrc;
   if (p >= 0) {
      tex.sIndirectSrc = p;
      setSrc(p, v);
      srcs[p].usedAsPtr = !!v;
   }
}
+
// Comparison instruction; the condition to set starts as CC_ALWAYS.
CmpInstruction::CmpInstruction(Function *fn, operation op)
   : Instruction(fn, op, TYPE_F32)
{
   setCond = CC_ALWAYS;
}
+
// Clone this comparison. dType is copied explicitly before the base clone
// because Instruction::clone() only passes dType to the constructor when
// it allocates a new instruction itself (i.e. when @i was NULL).
CmpInstruction *
CmpInstruction::clone(ClonePolicy<Function>& pol, Instruction *i) const
{
   CmpInstruction *cmp = (i ? static_cast<CmpInstruction *>(i) :
                          new_CmpInstruction(pol.context(), op));
   cmp->dType = dType;
   Instruction::clone(pol, cmp);
   cmp->setCond = setCond;
   return cmp;
}
+
// Control-flow instruction. @targ is a Function* for OP_CALL and a
// BasicBlock* for everything else (hence the void* + reinterpret_cast).
FlowInstruction::FlowInstruction(Function *fn, operation op, void *targ)
   : Instruction(fn, op, TYPE_NONE)
{
   if (op == OP_CALL)
      target.fn = reinterpret_cast<Function *>(targ);
   else
      target.bb = reinterpret_cast<BasicBlock *>(targ);

   // branches and returns end a basic block; a JOIN only does so when it
   // actually has a target
   if (op == OP_BRA ||
       op == OP_CONT || op == OP_BREAK ||
       op == OP_RET || op == OP_EXIT)
      terminator = 1;
   else
   if (op == OP_JOIN)
      terminator = targ ? 1 : 0;

   allWarp = absolute = limit = builtin = indirect = 0;
}
+
// Clone this flow instruction; basic-block targets are remapped through
// the clone policy, builtin and function targets are copied verbatim.
FlowInstruction *
FlowInstruction::clone(ClonePolicy<Function>& pol, Instruction *i) const
{
   FlowInstruction *flow = (i ? static_cast<FlowInstruction *>(i) :
                            new_FlowInstruction(pol.context(), op, NULL));

   Instruction::clone(pol, flow);
   flow->allWarp = allWarp;
   flow->absolute = absolute;
   flow->limit = limit;
   flow->builtin = builtin;

   if (builtin)
      flow->target.builtin = target.builtin;
   else
   if (op == OP_CALL)
      flow->target.fn = target.fn;
   else
   if (target.bb)
      flow->target.bb = pol.get<BasicBlock>(target.bb);

   return flow;
}
+
// Program of shader @type for target architecture @arch; sets up the
// fixed-size memory pools for IR objects (args: object size and a pool
// sizing parameter -- see MemoryPool) and creates the MAIN function.
Program::Program(Type type, Target *arch)
   : progType(type),
     target(arch),
     mem_Instruction(sizeof(Instruction), 6),
     mem_CmpInstruction(sizeof(CmpInstruction), 4),
     mem_TexInstruction(sizeof(TexInstruction), 4),
     mem_FlowInstruction(sizeof(FlowInstruction), 4),
     mem_LValue(sizeof(LValue), 8),
     mem_Symbol(sizeof(Symbol), 7),
     mem_ImmediateValue(sizeof(ImmediateValue), 7)
{
   code = NULL;
   binSize = 0;

   maxGPR = -1;

   main = new Function(this, "MAIN", ~0);
   calls.insert(&main->call); // MAIN is the root of the call graph

   dbgFlags = 0;
   optLevel = 0;

   targetPriv = NULL;
}
+
// Destroy all functions, then return the remaining values to their pools.
Program::~Program()
{
   for (ArrayList::Iterator it = allFuncs.iterator(); !it.end(); it.next())
      delete reinterpret_cast<Function *>(it.get());

   for (ArrayList::Iterator it = allRValues.iterator(); !it.end(); it.next())
      releaseValue(reinterpret_cast<Value *>(it.get()));
}
+
// Destroy @insn and hand its storage back to the matching pool.
// NOTE(review): the as*() dispatch runs AFTER the explicit destructor
// call, i.e. on an object whose lifetime has ended -- this works in
// practice here but is formally undefined behavior; worth fixing.
void Program::releaseInstruction(Instruction *insn)
{
   // TODO: make this not suck so much

   insn->~Instruction();

   if (insn->asCmp())
      mem_CmpInstruction.release(insn);
   else
   if (insn->asTex())
      mem_TexInstruction.release(insn);
   else
   if (insn->asFlow())
      mem_FlowInstruction.release(insn);
   else
      mem_Instruction.release(insn);
}
+
// Destroy @value and hand its storage back to the matching pool.
// NOTE(review): like releaseInstruction(), the as*() dispatch runs on the
// already-destroyed object -- formally undefined behavior.
void Program::releaseValue(Value *value)
{
   value->~Value();

   if (value->asLValue())
      mem_LValue.release(value);
   else
   if (value->asImm())
      mem_ImmediateValue.release(value);
   else
   if (value->asSym())
      mem_Symbol.release(value);
}
+
+
+} // namespace nv50_ir
+
+extern "C" {
+
// Initialize driver-visible defaults in @info; an i/o slot of 0xff means
// "not assigned / unused".
static void
nv50_ir_init_prog_info(struct nv50_ir_prog_info *info)
{
#if defined(PIPE_SHADER_HULL) && defined(PIPE_SHADER_DOMAIN)
   if (info->type == PIPE_SHADER_HULL || info->type == PIPE_SHADER_DOMAIN) {
      info->prop.tp.domain = PIPE_PRIM_MAX;
      info->prop.tp.outputPrim = PIPE_PRIM_MAX;
   }
#endif
   if (info->type == PIPE_SHADER_GEOMETRY) {
      info->prop.gp.instanceCount = 1;
      info->prop.gp.maxVertices = 1;
   }
   info->io.clipDistance = 0xff;
   info->io.pointSize = 0xff;
   info->io.instanceId = 0xff;
   info->io.vertexId = 0xff;
   info->io.edgeFlagIn = 0xff;
   info->io.edgeFlagOut = 0xff;
   info->io.fragDepth = 0xff;
   info->io.sampleMask = 0xff;
   info->io.backFaceColor[0] = info->io.backFaceColor[1] = 0xff;
}
+
// Driver entry point: translate the shader described by @info into machine
// code for its target chipset. Returns 0 on success, a negative code on
// failure (-1 setup, -2 translation, -4 register allocation, -5 emission).
// The binary and statistics are written back into info->bin.
int
nv50_ir_generate_code(struct nv50_ir_prog_info *info)
{
   int ret = 0;

   nv50_ir::Program::Type type;

   nv50_ir_init_prog_info(info);

#define PROG_TYPE_CASE(a, b) \
   case PIPE_SHADER_##a: type = nv50_ir::Program::TYPE_##b; break

   switch (info->type) {
   PROG_TYPE_CASE(VERTEX, VERTEX);
// PROG_TYPE_CASE(HULL, TESSELLATION_CONTROL);
// PROG_TYPE_CASE(DOMAIN, TESSELLATION_EVAL);
   PROG_TYPE_CASE(GEOMETRY, GEOMETRY);
   PROG_TYPE_CASE(FRAGMENT, FRAGMENT);
   PROG_TYPE_CASE(COMPUTE, COMPUTE);
   default:
      type = nv50_ir::Program::TYPE_COMPUTE;
      break;
   }
   INFO_DBG(info->dbgFlags, VERBOSE, "translating program of type %u\n", type);

   nv50_ir::Target *targ = nv50_ir::Target::create(info->target);
   if (!targ)
      return -1;

   nv50_ir::Program *prog = new nv50_ir::Program(type, targ);
   if (!prog) // NOTE(review): operator new throws; this check never fires
      return -1;
   prog->driver = info;
   prog->dbgFlags = info->dbgFlags;
   prog->optLevel = info->optLevel;

   // only the TGSI front end is wired up for now
   switch (info->bin.sourceRep) {
#if 0
   case PIPE_IR_LLVM:
   case PIPE_IR_GLSL:
      return -1;
   case PIPE_IR_SM4:
      ret = prog->makeFromSM4(info) ? 0 : -2;
      break;
   case PIPE_IR_TGSI:
#endif
   default:
      ret = prog->makeFromTGSI(info) ? 0 : -2;
      break;
   }
   if (ret < 0)
      goto out;
   if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
      prog->print();

   targ->parseDriverInfo(info);
   prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_PRE_SSA);

   prog->convertToSSA();

   if (prog->dbgFlags & NV50_IR_DEBUG_VERBOSE)
      prog->print();

   prog->optimizeSSA(info->optLevel);
   prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_SSA);

   if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
      prog->print();

   if (!prog->registerAllocation()) {
      ret = -4;
      goto out;
   }
   prog->getTarget()->runLegalizePass(prog, nv50_ir::CG_STAGE_POST_RA);

   prog->optimizePostRA(info->optLevel);

   if (!prog->emitBinary(info)) {
      ret = -5;
      goto out;
   }

out:
   INFO_DBG(prog->dbgFlags, VERBOSE, "nv50_ir_generate_code: ret = %i\n", ret);

   // report results (also on failure, so the driver sees zero sizes)
   info->bin.maxGPR = prog->maxGPR;
   info->bin.code = prog->code;
   info->bin.codeSize = prog->binSize;
   info->bin.tlsSpace = prog->tlsSize;

   delete prog;
   nv50_ir::Target::destroy(targ);

   return ret;
}
+
+} // extern "C"
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir.h b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
new file mode 100644
index 0000000..68c76e5
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir.h
@@ -0,0 +1,1197 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_H__
+#define __NV50_IR_H__
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <deque>
+#include <list>
+#include <vector>
+
+#include "codegen/nv50_ir_util.h"
+#include "codegen/nv50_ir_graph.h"
+
+#include "codegen/nv50_ir_driver.h"
+
+namespace nv50_ir {
+
+// All IR opcodes. Enumerators before OP_MOV are pseudo instructions
+// (see Instruction::isPseudo(), which tests op < OP_MOV) used only for
+// SSA construction / register allocation bookkeeping.
+enum operation
+{
+ OP_NOP = 0,
+ OP_PHI,
+ OP_UNION, // unify a new definition and several source values
+ OP_SPLIT, // $r0d -> { $r0, $r1 } ($r0d and $r0/$r1 will be coalesced)
+ OP_MERGE, // opposite of split, e.g. combine 2 32 bit into a 64 bit value
+ OP_CONSTRAINT, // copy values into consecutive registers
+ OP_MOV, // simple copy, no modifiers allowed
+ OP_LOAD,
+ OP_STORE,
+ OP_ADD, // NOTE: add u64 + u32 is legal for targets w/o 64-bit integer adds
+ OP_SUB,
+ OP_MUL,
+ OP_DIV,
+ OP_MOD,
+ OP_MAD,
+ OP_FMA,
+ OP_SAD, // abs(src0 - src1) + src2
+ OP_ABS,
+ OP_NEG,
+ OP_NOT,
+ OP_AND,
+ OP_OR,
+ OP_XOR,
+ OP_SHL,
+ OP_SHR,
+ OP_MAX,
+ OP_MIN,
+ OP_SAT, // CLAMP(f32, 0.0, 1.0)
+ OP_CEIL,
+ OP_FLOOR,
+ OP_TRUNC,
+ OP_CVT,
+ OP_SET_AND, // dst = (src0 CMP src1) & src2
+ OP_SET_OR,
+ OP_SET_XOR,
+ OP_SET,
+ OP_SELP, // dst = src2 ? src0 : src1
+ OP_SLCT, // dst = (src2 CMP 0) ? src0 : src1
+ OP_RCP,
+ OP_RSQ,
+ OP_LG2,
+ OP_SIN,
+ OP_COS,
+ OP_EX2,
+ OP_EXP, // exponential (base M_E)
+ OP_LOG, // natural logarithm
+ OP_PRESIN,
+ OP_PREEX2,
+ OP_SQRT,
+ OP_POW,
+ OP_BRA,
+ OP_CALL,
+ OP_RET,
+ OP_CONT,
+ OP_BREAK,
+ OP_PRERET,
+ OP_PRECONT,
+ OP_PREBREAK,
+ OP_BRKPT, // breakpoint (not related to loops)
+ OP_JOINAT, // push control flow convergence point
+ OP_JOIN, // converge
+ OP_DISCARD,
+ OP_EXIT,
+ OP_MEMBAR, // memory barrier (mfence, lfence, sfence)
+ OP_VFETCH, // indirection 0 in attribute space, indirection 1 is vertex base
+ OP_PFETCH, // fetch base address of vertex src0 (immediate) [+ src1]
+ OP_EXPORT,
+ OP_LINTERP,
+ OP_PINTERP,
+ OP_EMIT, // emit vertex
+ OP_RESTART, // restart primitive
+ OP_TEX,
+ OP_TXB, // texture bias
+ OP_TXL, // texture lod
+ OP_TXF, // texel fetch
+ OP_TXQ, // texture size query
+ OP_TXD, // texture derivatives
+ OP_TXG, // texture gather
+ OP_TEXCSAA, // texture op for coverage sampling
+ OP_TEXPREP, // turn cube map array into 2d array coordinates
+ OP_SULDB, // surface load (raw)
+ OP_SULDP, // surface load (formatted)
+ OP_SUSTB, // surface store (raw)
+ OP_SUSTP, // surface store (formatted)
+ OP_SUREDB,
+ OP_SUREDP, // surface reduction (atomic op)
+ OP_SULEA, // surface load effective address
+ OP_SUBFM, // surface bitfield manipulation
+ OP_SUCLAMP, // clamp surface coordinates
+ OP_SUEAU, // surface effective address
+ OP_MADSP, // special integer multiply-add
+ OP_TEXBAR, // texture dependency barrier
+ OP_DFDX,
+ OP_DFDY,
+ OP_RDSV, // read system value
+ OP_WRSV, // write system value
+ OP_QUADOP,
+ OP_QUADON,
+ OP_QUADPOP,
+ OP_POPCNT, // bitcount(src0 & src1)
+ OP_INSBF, // insert first src1[8:15] bits of src0 into src2 at src1[0:7]
+ OP_EXTBF, // place bits [K,K+N) of src0 into dst, src1 = 0xNNKK
+ OP_PERMT, // dst = bytes from src2,src0 selected by src1 (nvc0's src order)
+ OP_ATOM,
+ OP_BAR, // execution barrier, sources = { id, thread count, predicate }
+ OP_VADD, // byte/word vector operations
+ OP_VAVG,
+ OP_VMIN,
+ OP_VMAX,
+ OP_VSAD,
+ OP_VSET,
+ OP_VSHR,
+ OP_VSHL,
+ OP_VSEL,
+ OP_CCTL, // cache control
+ OP_LAST
+};
+
+// various instruction-specific modifier definitions Instruction::subOp
+// MOV_FINAL marks a MOV originating from an EXPORT (used for placing TEXBARs)
+#define NV50_IR_SUBOP_MUL_HIGH 1
+#define NV50_IR_SUBOP_EMIT_RESTART 1
+#define NV50_IR_SUBOP_LDC_IL 1
+#define NV50_IR_SUBOP_LDC_IS 2
+#define NV50_IR_SUBOP_LDC_ISL 3
+#define NV50_IR_SUBOP_SHIFT_WRAP 1
+#define NV50_IR_SUBOP_EMU_PRERET 1
+#define NV50_IR_SUBOP_TEXBAR(n) n
+#define NV50_IR_SUBOP_MOV_FINAL 1
+#define NV50_IR_SUBOP_EXTBF_REV 1
+// OP_PERMT sub-operations
+#define NV50_IR_SUBOP_PERMT_F4E 1
+#define NV50_IR_SUBOP_PERMT_B4E 2
+#define NV50_IR_SUBOP_PERMT_RC8 3
+#define NV50_IR_SUBOP_PERMT_ECL 4
+#define NV50_IR_SUBOP_PERMT_ECR 5
+#define NV50_IR_SUBOP_PERMT_RC16 6
+// OP_BAR sub-operations
+#define NV50_IR_SUBOP_BAR_SYNC 0
+#define NV50_IR_SUBOP_BAR_ARRIVE 1
+#define NV50_IR_SUBOP_BAR_RED_AND 2
+#define NV50_IR_SUBOP_BAR_RED_OR 3
+#define NV50_IR_SUBOP_BAR_RED_POPC 4
+// OP_MEMBAR: bits [1:0] = direction (L/S/M), bits [3:2] = scope
+// (CTA/GL/SYS); combine with the NV50_IR_SUBOP_MEMBAR(d,s) macro below
+#define NV50_IR_SUBOP_MEMBAR_L 1
+#define NV50_IR_SUBOP_MEMBAR_S 2
+#define NV50_IR_SUBOP_MEMBAR_M 3
+#define NV50_IR_SUBOP_MEMBAR_CTA (0 << 2)
+#define NV50_IR_SUBOP_MEMBAR_GL (1 << 2)
+#define NV50_IR_SUBOP_MEMBAR_SYS (2 << 2)
+#define NV50_IR_SUBOP_MEMBAR_DIR(m) ((m) & 0x3)
+#define NV50_IR_SUBOP_MEMBAR_SCOPE(m) ((m) & ~0x3)
+#define NV50_IR_SUBOP_MEMBAR(d,s) \
+ (NV50_IR_SUBOP_MEMBAR_##d | NV50_IR_SUBOP_MEMBAR_##s)
+// OP_ATOM sub-operations
+#define NV50_IR_SUBOP_ATOM_ADD 0
+#define NV50_IR_SUBOP_ATOM_MIN 1
+#define NV50_IR_SUBOP_ATOM_MAX 2
+#define NV50_IR_SUBOP_ATOM_INC 3
+#define NV50_IR_SUBOP_ATOM_DEC 4
+#define NV50_IR_SUBOP_ATOM_AND 5
+#define NV50_IR_SUBOP_ATOM_OR 6
+#define NV50_IR_SUBOP_ATOM_XOR 7
+#define NV50_IR_SUBOP_ATOM_CAS 8
+#define NV50_IR_SUBOP_ATOM_EXCH 9
+#define NV50_IR_SUBOP_CCTL_IV 5
+#define NV50_IR_SUBOP_CCTL_IVALL 6
+// OP_SUST / OP_SULD sub-operations
+#define NV50_IR_SUBOP_SUST_IGN 0
+#define NV50_IR_SUBOP_SUST_TRAP 1
+#define NV50_IR_SUBOP_SUST_SDCL 3
+#define NV50_IR_SUBOP_SULD_ZERO 0
+#define NV50_IR_SUBOP_SULD_TRAP 1
+#define NV50_IR_SUBOP_SULD_SDCL 3
+#define NV50_IR_SUBOP_SUBFM_3D 1
+#define NV50_IR_SUBOP_SUCLAMP_2D 0x10
+#define NV50_IR_SUBOP_SUCLAMP_SD(r, d) (( 0 + (r)) | ((d == 2) ? 0x10 : 0))
+#define NV50_IR_SUBOP_SUCLAMP_PL(r, d) (( 5 + (r)) | ((d == 2) ? 0x10 : 0))
+#define NV50_IR_SUBOP_SUCLAMP_BL(r, d) ((10 + (r)) | ((d == 2) ? 0x10 : 0))
+#define NV50_IR_SUBOP_MADSP_SD 0xffff
+// Yes, we could represent those with DataType.
+// Or put the type into operation and have a couple 1000 values in that enum.
+// This will have to do for now.
+// The bitfields are supposed to correspond to nve4 ISA.
+#define NV50_IR_SUBOP_MADSP(a,b,c) (((c) << 8) | ((b) << 4) | (a))
+#define NV50_IR_SUBOP_V1(d,a,b) (((d) << 10) | ((b) << 5) | (a) | 0x0000)
+#define NV50_IR_SUBOP_V2(d,a,b) (((d) << 10) | ((b) << 5) | (a) | 0x4000)
+#define NV50_IR_SUBOP_V4(d,a,b) (((d) << 10) | ((b) << 5) | (a) | 0x8000)
+#define NV50_IR_SUBOP_Vn(n) ((n) >> 14)
+
+// Data types used for Instruction::dType/sType and Storage::type.
+enum DataType
+{
+ TYPE_NONE,
+ TYPE_U8,
+ TYPE_S8,
+ TYPE_U16,
+ TYPE_S16,
+ TYPE_U32,
+ TYPE_S32,
+ TYPE_U64, // 64 bit operations are only lowered after register allocation
+ TYPE_S64,
+ TYPE_F16,
+ TYPE_F32,
+ TYPE_F64,
+ TYPE_B96, // untyped 96-bit data (e.g. wide memory/surface accesses)
+ TYPE_B128 // untyped 128-bit data
+};
+
+// Comparison condition codes (Instruction::cc, CmpInstruction::setCond).
+// Values 8..14 (CC_U and *U variants) presumably add the floating-point
+// "unordered" case to CC_LT..CC_GE — TODO confirm against the emitters.
+// Values 0x10+ are used with FILE_FLAGS condition registers.
+enum CondCode
+{
+ CC_FL = 0,
+ CC_NEVER = CC_FL, // when used with FILE_FLAGS
+ CC_LT = 1,
+ CC_EQ = 2,
+ CC_NOT_P = CC_EQ, // when used with FILE_PREDICATE
+ CC_LE = 3,
+ CC_GT = 4,
+ CC_NE = 5,
+ CC_P = CC_NE,
+ CC_GE = 6,
+ CC_TR = 7,
+ CC_ALWAYS = CC_TR,
+ CC_U = 8,
+ CC_LTU = 9,
+ CC_EQU = 10,
+ CC_LEU = 11,
+ CC_GTU = 12,
+ CC_NEU = 13,
+ CC_GEU = 14,
+ CC_NO = 0x10,
+ CC_NC = 0x11,
+ CC_NS = 0x12,
+ CC_NA = 0x13,
+ CC_A = 0x14,
+ CC_S = 0x15,
+ CC_C = 0x16,
+ CC_O = 0x17
+};
+
+// Rounding modes (Instruction::rnd); the *I variants round to integer.
+enum RoundMode
+{
+ ROUND_N, // nearest
+ ROUND_M, // towards -inf
+ ROUND_Z, // towards 0
+ ROUND_P, // towards +inf
+ ROUND_NI, // nearest integer
+ ROUND_MI, // to integer towards -inf
+ ROUND_ZI, // to integer towards 0
+ ROUND_PI, // to integer towards +inf
+};
+
+// Cache policies for memory operations (Instruction::cache).
+enum CacheMode
+{
+ CACHE_CA, // cache at all levels
+ CACHE_WB = CACHE_CA, // cache write back
+ CACHE_CG, // cache at global level
+ CACHE_CS, // cache streaming
+ CACHE_CV, // cache as volatile
+ CACHE_WT = CACHE_CV // cache write-through
+};
+
+// Storage files a Value can live in. Register files come first;
+// LAST_REGISTER_FILE marks the boundary between register files and the
+// immediate/memory/symbolic files below it.
+enum DataFile
+{
+ FILE_NULL = 0,
+ FILE_GPR,
+ FILE_PREDICATE, // boolean predicate
+ FILE_FLAGS, // zero/sign/carry/overflow bits
+ FILE_ADDRESS,
+ LAST_REGISTER_FILE = FILE_ADDRESS,
+ FILE_IMMEDIATE,
+ FILE_MEMORY_CONST,
+ FILE_SHADER_INPUT,
+ FILE_SHADER_OUTPUT,
+ FILE_MEMORY_GLOBAL,
+ FILE_MEMORY_SHARED,
+ FILE_MEMORY_LOCAL,
+ FILE_SYSTEM_VALUE,
+ DATA_FILE_COUNT
+};
+
+// Texture target kinds; per-target properties (name, arg count, dim,
+// array/cube/shadow flags) are looked up via TexInstruction::Target's
+// descTable.
+enum TexTarget
+{
+ TEX_TARGET_1D,
+ TEX_TARGET_2D,
+ TEX_TARGET_2D_MS,
+ TEX_TARGET_3D,
+ TEX_TARGET_CUBE,
+ TEX_TARGET_1D_SHADOW,
+ TEX_TARGET_2D_SHADOW,
+ TEX_TARGET_CUBE_SHADOW,
+ TEX_TARGET_1D_ARRAY,
+ TEX_TARGET_2D_ARRAY,
+ TEX_TARGET_2D_MS_ARRAY,
+ TEX_TARGET_CUBE_ARRAY,
+ TEX_TARGET_1D_ARRAY_SHADOW,
+ TEX_TARGET_2D_ARRAY_SHADOW,
+ TEX_TARGET_RECT,
+ TEX_TARGET_RECT_SHADOW,
+ TEX_TARGET_CUBE_ARRAY_SHADOW,
+ TEX_TARGET_BUFFER,
+ TEX_TARGET_COUNT
+};
+
+// System-value semantics, read via OP_RDSV and written via OP_WRSV
+// (stored in Storage::data.sv together with a component index).
+enum SVSemantic
+{
+ SV_POSITION, // WPOS
+ SV_VERTEX_ID,
+ SV_INSTANCE_ID,
+ SV_INVOCATION_ID,
+ SV_PRIMITIVE_ID,
+ SV_VERTEX_COUNT, // gl_PatchVerticesIn
+ SV_LAYER,
+ SV_VIEWPORT_INDEX,
+ SV_YDIR,
+ SV_FACE,
+ SV_POINT_SIZE,
+ SV_POINT_COORD,
+ SV_CLIP_DISTANCE,
+ SV_SAMPLE_INDEX,
+ SV_TESS_FACTOR,
+ SV_TESS_COORD,
+ SV_TID,
+ SV_CTAID,
+ SV_NTID,
+ SV_GRIDID,
+ SV_NCTAID,
+ SV_LANEID,
+ SV_PHYSID,
+ SV_NPHYSID,
+ SV_CLOCK,
+ SV_LBASE,
+ SV_SBASE,
+ SV_UNDEFINED,
+ SV_LAST
+};
+
+// Forward declarations of the core IR classes defined below.
+class Program;
+class Function;
+class BasicBlock;
+
+class Target;
+
+class Instruction;
+class CmpInstruction;
+class TexInstruction;
+class FlowInstruction;
+
+class Value;
+class LValue;
+class Symbol;
+class ImmediateValue;
+
+// Physical location / contents of a Value. Depending on 'file', 'data'
+// holds immediate bits, a memory offset, a register id, or a system
+// value descriptor.
+struct Storage
+{
+ DataFile file;
+ int8_t fileIndex; // signed, may be indirect for CONST[]
+ uint8_t size; // this should match the Instruction type's size
+ DataType type; // mainly for pretty printing
+ union {
+ uint64_t u64; // immediate values
+ uint32_t u32;
+ uint16_t u16;
+ uint8_t u8;
+ int64_t s64;
+ int32_t s32;
+ int16_t s16;
+ int8_t s8;
+ float f32;
+ double f64;
+ int32_t offset; // offset from 0 (base of address space)
+ int32_t id; // register id (< 0 if virtual/unassigned, in units <= 4)
+ struct {
+ SVSemantic sv;
+ int index;
+ } sv;
+ } data;
+};
+
+// Source-modifier bits stored in a Modifier (see class below).
+// precedence: NOT after SAT after NEG after ABS
+#define NV50_IR_MOD_ABS (1 << 0)
+#define NV50_IR_MOD_NEG (1 << 1)
+#define NV50_IR_MOD_SAT (1 << 2)
+#define NV50_IR_MOD_NOT (1 << 3)
+#define NV50_IR_MOD_NEG_ABS (NV50_IR_MOD_NEG | NV50_IR_MOD_ABS)
+
+// Interpolation encodings stored in Instruction::ipa: bits [1:0] are the
+// mode (cf. getInterpMode), bits [3:2] the sample placement
+// (cf. getSampleMode).
+#define NV50_IR_INTERP_MODE_MASK 0x3
+#define NV50_IR_INTERP_LINEAR (0 << 0)
+#define NV50_IR_INTERP_PERSPECTIVE (1 << 0)
+#define NV50_IR_INTERP_FLAT (2 << 0)
+#define NV50_IR_INTERP_SC (3 << 0) // what exactly is that ?
+#define NV50_IR_INTERP_SAMPLE_MASK 0xc
+#define NV50_IR_INTERP_DEFAULT (0 << 2)
+#define NV50_IR_INTERP_CENTROID (1 << 2)
+#define NV50_IR_INTERP_OFFSET (2 << 2)
+#define NV50_IR_INTERP_SAMPLEID (3 << 2)
+
+// Wrapper around the NV50_IR_MOD_* bit set applied to a source operand.
+// do we really want this to be a class ?
+class Modifier
+{
+public:
+ Modifier() : bits(0) { }
+ Modifier(unsigned int m) : bits(m) { }
+ Modifier(operation op);
+
+ // @return new Modifier applying a after b (asserts if unrepresentable)
+ Modifier operator*(const Modifier) const;
+ Modifier operator*=(const Modifier m) { *this = *this * m; return *this; }
+ // NOTE(review): operator== / operator!= return a Modifier (the bool
+ // result is converted through the unsigned-int constructor), not bool;
+ // the result is only meaningful through operator bool() below.
+ Modifier operator==(const Modifier m) const { return m.bits == bits; }
+ Modifier operator!=(const Modifier m) const { return m.bits != bits; }
+
+ inline Modifier operator&(const Modifier m) const { return bits & m.bits; }
+ inline Modifier operator|(const Modifier m) const { return bits | m.bits; }
+ inline Modifier operator^(const Modifier m) const { return bits ^ m.bits; }
+
+ operation getOp() const;
+
+ inline int neg() const { return (bits & NV50_IR_MOD_NEG) ? 1 : 0; }
+ inline int abs() const { return (bits & NV50_IR_MOD_ABS) ? 1 : 0; }
+
+ // true iff any modifier bit is set
+ inline operator bool() const { return bits ? true : false; }
+
+ // fold this modifier into an immediate operand
+ void applyTo(ImmediateValue &imm) const;
+
+ int print(char *buf, size_t size) const;
+
+private:
+ uint8_t bits;
+};
+
+// A use of a Value as a source operand of an Instruction, carrying the
+// source modifier, optional indirect-address source indices and swizzle.
+class ValueRef
+{
+public:
+ ValueRef(Value * = NULL);
+ ValueRef(const ValueRef&);
+ ~ValueRef();
+
+ inline bool exists() const { return value != NULL; }
+
+ void set(Value *);
+ void set(const ValueRef&);
+ inline Value *get() const { return value; }
+ inline Value *rep() const;
+
+ // instruction this operand belongs to
+ inline Instruction *getInsn() const { return insn; }
+ inline void setInsn(Instruction *inst) { insn = inst; }
+
+ inline bool isIndirect(int dim) const { return indirect[dim] >= 0; }
+ inline const ValueRef *getIndirect(int dim) const;
+
+ inline DataFile getFile() const;
+ inline unsigned getSize() const;
+
+ // SSA: return eventual (traverse MOVs) literal value, if it exists
+ bool getImmediate(ImmediateValue&) const;
+
+public:
+ Modifier mod;
+ int8_t indirect[2]; // >= 0 if relative to lvalue in insn->src(indirect[i])
+ uint8_t swizzle;
+
+ bool usedAsPtr; // for printing
+
+private:
+ Value *value;
+ Instruction *insn;
+};
+
+// A definition of a Value as a destination operand of an Instruction;
+// also remembers the pre-SSA value ('origin') once SSA form is built.
+class ValueDef
+{
+public:
+ ValueDef(Value * = NULL);
+ ValueDef(const ValueDef&);
+ ~ValueDef();
+
+ inline bool exists() const { return value != NULL; }
+
+ inline Value *get() const { return value; }
+ inline Value *rep() const;
+ void set(Value *);
+ bool mayReplace(const ValueRef &);
+ void replace(const ValueRef &, bool doSet); // replace all uses of the old value
+
+ // instruction this operand belongs to
+ inline Instruction *getInsn() const { return insn; }
+ inline void setInsn(Instruction *inst) { insn = inst; }
+
+ inline DataFile getFile() const;
+ inline unsigned getSize() const;
+
+ inline void setSSA(LValue *);
+ inline const LValue *preSSA() const;
+
+private:
+ Value *value; // should make this LValue * ...
+ LValue *origin; // pre SSA value
+ Instruction *insn;
+};
+
+// Abstract base of all IR values (LValue, Symbol, ImmediateValue).
+// Tracks every use (ValueRef) and definition (ValueDef), plus register
+// allocation state (live interval, coalescing representative 'join').
+class Value
+{
+public:
+ Value();
+ virtual ~Value() { }
+
+ virtual Value *clone(ClonePolicy<Function>&) const = 0;
+
+ virtual int print(char *, size_t, DataType ty = TYPE_NONE) const = 0;
+
+ virtual bool equals(const Value *, bool strict = false) const;
+ // NOTE(review): 'interfers' is a (long-standing) misspelling of
+ // 'interferes'; renaming it would touch every call site.
+ virtual bool interfers(const Value *) const;
+ virtual bool isUniform() const { return true; }
+
+ // coalescing representative (== join)
+ inline Value *rep() const { return join; }
+
+ inline Instruction *getUniqueInsn() const;
+ inline Instruction *getInsn() const; // use when uniqueness is certain
+
+ inline int refCount() { return uses.size(); }
+
+ inline LValue *asLValue();
+ inline Symbol *asSym();
+ inline ImmediateValue *asImm();
+ inline const Symbol *asSym() const;
+ inline const ImmediateValue *asImm() const;
+
+ inline bool inFile(DataFile f) { return reg.file == f; }
+
+ static inline Value *get(Iterator&);
+
+ std::list<ValueRef *> uses;
+ std::list<ValueDef *> defs;
+ typedef std::list<ValueRef *>::iterator UseIterator;
+ typedef std::list<ValueRef *>::const_iterator UseCIterator;
+ typedef std::list<ValueDef *>::iterator DefIterator;
+ typedef std::list<ValueDef *>::const_iterator DefCIterator;
+
+ int id;
+ Storage reg;
+
+ // TODO: these should be in LValue:
+ Interval livei;
+ Value *join;
+};
+
+// A (virtual) register value belonging to a Function, the only Value
+// kind subject to register allocation.
+class LValue : public Value
+{
+public:
+ LValue(Function *, DataFile file);
+ LValue(Function *, LValue *);
+ ~LValue() { }
+
+ virtual bool isUniform() const;
+
+ virtual LValue *clone(ClonePolicy<Function>&) const;
+
+ virtual int print(char *, size_t, DataType ty = TYPE_NONE) const;
+
+public:
+ unsigned compMask : 8; // compound/component mask
+ unsigned compound : 1; // used by RA, value involved in split/merge
+ unsigned ssa : 1;
+ unsigned fixedReg : 1; // set & used by RA, earlier just use (id < 0)
+ unsigned noSpill : 1; // do not spill (e.g. if spill temporary already)
+};
+
+// A symbolic address into one of the memory/IO files (const buffers,
+// shader inputs/outputs, global/shared/local memory, system values);
+// may be relative to a base symbol for array elements.
+class Symbol : public Value
+{
+public:
+ Symbol(Program *, DataFile file = FILE_MEMORY_CONST, ubyte fileIdx = 0);
+ ~Symbol() { }
+
+ virtual Symbol *clone(ClonePolicy<Function>&) const;
+
+ virtual bool equals(const Value *that, bool strict) const;
+
+ virtual bool isUniform() const;
+
+ virtual int print(char *, size_t, DataType ty = TYPE_NONE) const;
+
+ // print with indirect values
+ int print(char *, size_t, Value *, Value *, DataType ty = TYPE_NONE) const;
+
+ inline void setFile(DataFile file, ubyte fileIndex = 0)
+ {
+ reg.file = file;
+ reg.fileIndex = fileIndex;
+ }
+
+ inline void setOffset(int32_t offset);
+ inline void setAddress(Symbol *base, int32_t offset);
+ inline void setSV(SVSemantic sv, uint32_t idx = 0);
+
+ inline const Symbol *getBase() const { return baseSym; }
+
+private:
+ Symbol *baseSym; // array base for Symbols representing array elements
+};
+
+// A literal constant; the bits live in Storage::data (reg). Provides
+// arithmetic operators for constant folding in the peephole passes.
+class ImmediateValue : public Value
+{
+public:
+ ImmediateValue() { }
+ ImmediateValue(Program *, uint32_t);
+ ImmediateValue(Program *, float);
+ ImmediateValue(Program *, double);
+ // NOTE: unlike the constructors above, this converting constructor
+ // takes no Program and thus is not added to a program's value list
+ ImmediateValue(const ImmediateValue *, DataType ty);
+ ~ImmediateValue() { };
+
+ virtual ImmediateValue *clone(ClonePolicy<Function>&) const;
+
+ virtual bool equals(const Value *that, bool strict) const;
+
+ // these only work if 'type' is valid (we mostly use untyped literals):
+ bool isInteger(const int ival) const; // ival is cast to this' type
+ bool isNegative() const;
+ bool isPow2() const;
+
+ void applyLog2();
+
+ // for constant folding:
+ ImmediateValue operator+(const ImmediateValue&) const;
+ ImmediateValue operator-(const ImmediateValue&) const;
+ ImmediateValue operator*(const ImmediateValue&) const;
+ ImmediateValue operator/(const ImmediateValue&) const;
+
+ ImmediateValue& operator=(const ImmediateValue&); // only sets value !
+
+ bool compare(CondCode cc, float fval) const;
+
+ virtual int print(char *, size_t, DataType ty = TYPE_NONE) const;
+};
+
+// A single IR instruction: an operation with typed destinations (defs)
+// and sources (srcs) kept gap-free in deques, linked into its basic
+// block's list via prev/next. Specialized behaviour lives in the
+// Cmp/Tex/FlowInstruction subclasses, reachable via the as*() casts.
+class Instruction
+{
+public:
+ Instruction();
+ Instruction(Function *, operation, DataType);
+ virtual ~Instruction();
+
+ virtual Instruction *clone(ClonePolicy<Function>&,
+ Instruction * = NULL) const;
+
+ void setDef(int i, Value *);
+ void setSrc(int s, Value *);
+ void setSrc(int s, const ValueRef&);
+ void swapSources(int a, int b);
+ void moveSources(int s, int delta);
+ bool setIndirect(int s, int dim, Value *);
+
+ inline ValueRef& src(int s) { return srcs[s]; }
+ inline ValueDef& def(int s) { return defs[s]; }
+ inline const ValueRef& src(int s) const { return srcs[s]; }
+ inline const ValueDef& def(int s) const { return defs[s]; }
+
+ inline Value *getDef(int d) const { return defs[d].get(); }
+ inline Value *getSrc(int s) const { return srcs[s].get(); }
+ inline Value *getIndirect(int s, int dim) const;
+
+ // bounds-checked existence tests for operand slots
+ inline bool defExists(unsigned d) const
+ {
+ return d < defs.size() && defs[d].exists();
+ }
+ inline bool srcExists(unsigned s) const
+ {
+ return s < srcs.size() && srcs[s].exists();
+ }
+
+ inline bool constrainedDefs() const;
+
+ bool setPredicate(CondCode ccode, Value *);
+ inline Value *getPredicate() const;
+ bool writesPredicate() const;
+ inline bool isPredicated() const { return predSrc >= 0; }
+
+ inline void setFlagsSrc(int s, Value *);
+ inline void setFlagsDef(int d, Value *);
+ inline bool usesFlags() const { return flagsSrc >= 0; }
+
+ unsigned int defCount() const { return defs.size(); };
+ unsigned int defCount(unsigned int mask, bool singleFile = false) const;
+ unsigned int srcCount() const { return srcs.size(); };
+ unsigned int srcCount(unsigned int mask, bool singleFile = false) const;
+
+ // save & remove / set indirect[0,1] and predicate source
+ void takeExtraSources(int s, Value *[3]);
+ void putExtraSources(int s, Value *[3]);
+
+ inline void setType(DataType type) { dType = sType = type; }
+
+ inline void setType(DataType dtype, DataType stype)
+ {
+ dType = dtype;
+ sType = stype;
+ }
+
+ // pseudo ops (PHI/UNION/SPLIT/...) precede OP_MOV in the enum
+ inline bool isPseudo() const { return op < OP_MOV; }
+ bool isDead() const;
+ bool isNop() const;
+ bool isCommutationLegal(const Instruction *) const; // must be adjacent !
+ bool isActionEqual(const Instruction *) const;
+ bool isResultEqual(const Instruction *) const;
+
+ void print() const;
+
+ // checked downcasts (return NULL if this is not the requested kind)
+ inline CmpInstruction *asCmp();
+ inline TexInstruction *asTex();
+ inline FlowInstruction *asFlow();
+ inline const TexInstruction *asTex() const;
+ inline const CmpInstruction *asCmp() const;
+ inline const FlowInstruction *asFlow() const;
+
+public:
+ Instruction *next;
+ Instruction *prev;
+ int id;
+ int serial; // CFG order
+
+ operation op;
+ DataType dType; // destination or defining type
+ DataType sType; // source or secondary type
+ CondCode cc;
+ RoundMode rnd;
+ CacheMode cache;
+
+ uint16_t subOp; // quadop, 1 for mul-high, etc.
+
+ unsigned encSize : 4; // encoding size in bytes
+ unsigned saturate : 1; // to [0.0f, 1.0f]
+ unsigned join : 1; // converge control flow (use OP_JOIN until end)
+ unsigned fixed : 1; // prevent dead code elimination
+ unsigned terminator : 1; // end of basic block
+ unsigned ftz : 1; // flush denormal to zero
+ unsigned dnz : 1; // denormals, NaN are zero
+ unsigned ipa : 4; // interpolation mode
+ unsigned lanes : 4;
+ unsigned perPatch : 1;
+ unsigned exit : 1; // terminate program after insn
+ unsigned mask : 4; // for vector ops
+
+ int8_t postFactor; // MUL/DIV(if < 0) by 1 << postFactor
+
+ int8_t predSrc; // source index of the predicate (< 0 if not predicated)
+ int8_t flagsDef; // def index of condition flags (< 0 if none)
+ int8_t flagsSrc; // source index of condition flags (< 0 if none)
+
+ uint8_t sched; // scheduling data (NOTE: maybe move to separate storage)
+
+ BasicBlock *bb;
+
+protected:
+ std::deque<ValueDef> defs; // no gaps !
+ std::deque<ValueRef> srcs; // no gaps !
+
+ // instruction specific methods:
+ // (don't want to subclass, would need more constructors and memory pools)
+public:
+ inline void setInterpolate(unsigned int mode) { ipa = mode; }
+
+ unsigned int getInterpMode() const { return ipa & 0x3; }
+ unsigned int getSampleMode() const { return ipa & 0xc; }
+
+private:
+ void init();
+};
+
+// Query kinds for OP_TXQ (stored in TexInstruction's tex.query).
+enum TexQuery
+{
+ TXQ_DIMS,
+ TXQ_TYPE,
+ TXQ_SAMPLE_POSITION,
+ TXQ_FILTER,
+ TXQ_LOD,
+ TXQ_WRAP,
+ TXQ_BORDER_COLOUR
+};
+
+// Texture / surface instruction. Carries the texture target description,
+// texture (r) and sampler (s) indices with optional indirect sources,
+// gather/offset parameters, and explicit derivatives (dPdx/dPdy) for
+// OP_TXD.
+class TexInstruction : public Instruction
+{
+public:
+ // Wrapper around a TexTarget value with table-driven property lookup.
+ class Target
+ {
+ public:
+ Target(TexTarget targ = TEX_TARGET_2D) : target(targ) { }
+
+ const char *getName() const { return descTable[target].name; }
+ unsigned int getArgCount() const { return descTable[target].argc; }
+ unsigned int getDim() const { return descTable[target].dim; }
+ int isArray() const { return descTable[target].array ? 1 : 0; }
+ int isCube() const { return descTable[target].cube ? 1 : 0; }
+ int isShadow() const { return descTable[target].shadow ? 1 : 0; }
+ int isMS() const {
+ return target == TEX_TARGET_2D_MS || target == TEX_TARGET_2D_MS_ARRAY; }
+
+ Target& operator=(TexTarget targ)
+ {
+ assert(targ < TEX_TARGET_COUNT);
+ target = targ;
+ return *this;
+ }
+
+ inline bool operator==(TexTarget targ) const { return target == targ; }
+ inline bool operator!=(TexTarget targ) const { return target != targ; }
+
+ enum TexTarget getEnum() const { return target; }
+
+ private:
+ // per-target property record; one entry per TexTarget in descTable
+ struct Desc
+ {
+ char name[19];
+ uint8_t dim;
+ uint8_t argc;
+ bool array;
+ bool cube;
+ bool shadow;
+ };
+
+ static const struct Desc descTable[TEX_TARGET_COUNT];
+
+ private:
+ enum TexTarget target;
+ };
+
+public:
+ TexInstruction(Function *, operation);
+ virtual ~TexInstruction();
+
+ virtual TexInstruction *clone(ClonePolicy<Function>&,
+ Instruction * = NULL) const;
+
+ inline void setTexture(Target targ, uint8_t r, uint8_t s)
+ {
+ tex.r = r;
+ tex.s = s;
+ tex.target = targ;
+ }
+
+ void setIndirectR(Value *);
+ void setIndirectS(Value *);
+ inline Value *getIndirectR() const;
+ inline Value *getIndirectS() const;
+
+public:
+ struct {
+ Target target;
+
+ uint16_t r; // texture index (cf. setTexture)
+ uint16_t s; // sampler index (cf. setTexture)
+ int8_t rIndirectSrc; // source index of indirect r (< 0 if none)
+ int8_t sIndirectSrc; // source index of indirect s (< 0 if none)
+
+ uint8_t mask;
+ uint8_t gatherComp;
+
+ bool liveOnly; // only execute on live pixels of a quad (optimization)
+ bool levelZero;
+ bool derivAll;
+
+ int8_t useOffsets; // 0, 1, or 4 for textureGatherOffsets
+ int8_t offset[4][3];
+
+ enum TexQuery query; // only meaningful for OP_TXQ
+ } tex;
+
+ ValueRef dPdx[3];
+ ValueRef dPdy[3];
+};
+
+// Comparison instruction; stores the comparison's CondCode in setCond
+// (cf. Instruction::asCmp()).
+class CmpInstruction : public Instruction
+{
+public:
+ CmpInstruction(Function *, operation);
+
+ virtual CmpInstruction *clone(ClonePolicy<Function>&,
+ Instruction * = NULL) const;
+
+ void setCondition(CondCode cond) { setCond = cond; }
+ CondCode getCondition() const { return setCond; }
+
+public:
+ CondCode setCond;
+};
+
+// Control-flow instruction (branches, calls, returns, ...). Which member
+// of the 'target' union is valid depends on the operation and the
+// builtin/indirect flags.
+class FlowInstruction : public Instruction
+{
+public:
+ FlowInstruction(Function *, operation, void *target);
+
+ virtual FlowInstruction *clone(ClonePolicy<Function>&,
+ Instruction * = NULL) const;
+
+public:
+ unsigned allWarp : 1;
+ unsigned absolute : 1;
+ unsigned limit : 1;
+ unsigned builtin : 1; // true for calls to emulation code
+ unsigned indirect : 1; // target in src(0)
+
+ union {
+ BasicBlock *bb;
+ int builtin;
+ Function *fn;
+ } target;
+};
+
+// A basic block: a doubly-linked list of instructions (phis first, then
+// entry..exit), with nodes in the function's CFG and dominator tree and
+// live/def bit sets for data-flow analysis.
+class BasicBlock
+{
+public:
+ BasicBlock(Function *);
+ ~BasicBlock();
+
+ BasicBlock *clone(ClonePolicy<Function>&) const;
+
+ inline int getId() const { return id; }
+ inline unsigned int getInsnCount() const { return numInsns; }
+ inline bool isTerminated() const { return exit && exit->terminator; }
+
+ bool dominatedBy(BasicBlock *bb);
+ inline bool reachableBy(const BasicBlock *by, const BasicBlock *term);
+
+ // returns mask of conditional out blocks
+ // e.g. 3 for IF { .. } ELSE { .. } ENDIF, 1 for IF { .. } ENDIF
+ unsigned int initiatesSimpleConditional() const;
+
+public:
+ Function *getFunction() const { return func; }
+ Program *getProgram() const { return program; }
+
+ Instruction *getEntry() const { return entry; } // first non-phi instruction
+ Instruction *getPhi() const { return phi; }
+ Instruction *getFirst() const { return phi ? phi : entry; }
+ Instruction *getExit() const { return exit; }
+
+ void insertHead(Instruction *);
+ void insertTail(Instruction *);
+ void insertBefore(Instruction *, Instruction *);
+ void insertAfter(Instruction *, Instruction *);
+ void remove(Instruction *);
+ void permuteAdjacent(Instruction *, Instruction *);
+
+ // immediate dominator
+ BasicBlock *idom() const;
+
+ // NOTE: currently does not rebuild the dominator tree
+ BasicBlock *splitBefore(Instruction *, bool attach = true);
+ BasicBlock *splitAfter(Instruction *, bool attach = true);
+
+ // df: presumably the dominance frontier (used by SSA construction) —
+ // NOTE(review): confirm against nv50_ir_ssa.cpp
+ DLList& getDF() { return df; }
+ DLList::Iterator iterDF() { return df.iterator(); }
+
+ static inline BasicBlock *get(Iterator&);
+ static inline BasicBlock *get(Graph::Node *);
+
+public:
+ Graph::Node cfg; // first edge is branch *taken* (the ELSE branch)
+ Graph::Node dom;
+
+ BitSet liveSet;
+ BitSet defSet;
+
+ uint32_t binPos;
+ uint32_t binSize;
+
+ Instruction *joinAt; // for quick reference
+
+ bool explicitCont; // loop headers: true if loop contains continue stmts
+
+private:
+ int id;
+ DLList df;
+
+ Instruction *phi;
+ Instruction *entry;
+ Instruction *exit;
+
+ unsigned int numInsns;
+
+private:
+ Function *func;
+ Program *program;
+
+ void splitCommon(Instruction *, BasicBlock *, bool attach);
+};
+
+// A function: owns its basic blocks, instructions and local values
+// (allBBlocks / allInsns / allLValues), its CFG and dominator tree, and
+// drives liveness analysis and SSA conversion.
+class Function
+{
+public:
+ Function(Program *, const char *name, uint32_t label);
+ ~Function();
+
+ static inline Function *get(Graph::Node *node);
+
+ inline Program *getProgram() const { return prog; }
+ inline const char *getName() const { return name; }
+ inline int getId() const { return id; }
+ inline uint32_t getLabel() const { return label; }
+
+ void print();
+ void printLiveIntervals() const;
+ void printCFGraph(const char *filePath);
+
+ bool setEntry(BasicBlock *);
+ bool setExit(BasicBlock *);
+
+ unsigned int orderInstructions(ArrayList&);
+
+ // register an entity with this function; 'id' receives its index
+ inline void add(BasicBlock *bb, int& id) { allBBlocks.insert(bb, id); }
+ inline void add(Instruction *insn, int& id) { allInsns.insert(insn, id); }
+ inline void add(LValue *lval, int& id) { allLValues.insert(lval, id); }
+
+ inline LValue *getLValue(int id);
+
+ void buildLiveSets();
+ void buildDefSets();
+ bool convertToSSA();
+
+public:
+ std::deque<ValueDef> ins;
+ std::deque<ValueRef> outs;
+ std::deque<Value *> clobbers;
+
+ Graph cfg;
+ Graph::Node *cfgExit;
+ Graph *domTree;
+ Graph::Node call; // node in the call graph
+
+ BasicBlock **bbArray; // BBs in emission order
+ int bbCount;
+
+ unsigned int loopNestingBound;
+ int regClobberMax;
+
+ uint32_t binPos;
+ uint32_t binSize;
+
+ Value *stackPtr;
+
+ uint32_t tlsBase; // base address for l[] space (if no stack pointer is used)
+ uint32_t tlsSize;
+
+ ArrayList allBBlocks;
+ ArrayList allInsns;
+ ArrayList allLValues;
+
+private:
+ void buildLiveSetsPreSSA(BasicBlock *, const int sequence);
+ void buildDefSetsPreSSA(BasicBlock *bb, const int seq);
+
+private:
+ uint32_t label;
+ int id;
+ const char *const name;
+ Program *prog;
+};
+
+// Code-generation stages at which Target::runLegalizePass is invoked.
+enum CGStage
+{
+ CG_STAGE_PRE_SSA,
+ CG_STAGE_SSA, // expected directly before register allocation
+ CG_STAGE_POST_RA
+};
+
+// A whole shader/compute program: its functions and values, the memory
+// pools IR objects are allocated from, and the emitted binary
+// (code/binSize) handed back to the driver.
+class Program
+{
+public:
+ enum Type
+ {
+ TYPE_VERTEX,
+ TYPE_TESSELLATION_CONTROL,
+ TYPE_TESSELLATION_EVAL,
+ TYPE_GEOMETRY,
+ TYPE_FRAGMENT,
+ TYPE_COMPUTE
+ };
+
+ Program(Type type, Target *targ);
+ ~Program();
+
+ void print();
+
+ Type getType() const { return progType; }
+
+ inline void add(Function *fn, int& id) { allFuncs.insert(fn, id); }
+ // NOTE(review): 'fn' is unused here; removal is purely by id
+ inline void del(Function *fn, int& id) { allFuncs.remove(id); }
+ inline void add(Value *rval, int& id) { allRValues.insert(rval, id); }
+
+ // compilation pipeline entry points (cf. nv50_ir_generate_code)
+ bool makeFromTGSI(struct nv50_ir_prog_info *);
+ bool makeFromSM4(struct nv50_ir_prog_info *);
+ bool convertToSSA();
+ bool optimizeSSA(int level);
+ bool optimizePostRA(int level);
+ bool registerAllocation();
+ bool emitBinary(struct nv50_ir_prog_info *);
+
+ const Target *getTarget() const { return target; }
+
+private:
+ void emitSymbolTable(struct nv50_ir_prog_info *);
+
+ Type progType;
+ Target *target;
+
+public:
+ Function *main;
+ Graph calls;
+
+ ArrayList allFuncs;
+ ArrayList allRValues;
+
+ uint32_t *code;
+ uint32_t binSize;
+ uint32_t tlsSize; // size required for FILE_MEMORY_LOCAL
+
+ int maxGPR;
+
+ // per-class allocation pools for IR objects
+ MemoryPool mem_Instruction;
+ MemoryPool mem_CmpInstruction;
+ MemoryPool mem_TexInstruction;
+ MemoryPool mem_FlowInstruction;
+ MemoryPool mem_LValue;
+ MemoryPool mem_Symbol;
+ MemoryPool mem_ImmediateValue;
+
+ uint32_t dbgFlags;
+ uint8_t optLevel;
+
+ void *targetPriv; // e.g. to carry information between passes
+
+ const struct nv50_ir_prog_info *driver; // for driver configuration
+
+ void releaseInstruction(Instruction *);
+ void releaseValue(Value *);
+};
+
+// Generic IR traversal: subclasses override the visit() hooks at
+// function / basic-block / instruction level, run() drives the walk
+// (optionally in ordered CFG sequence, optionally skipping phis).
+// TODO: add const version
+class Pass
+{
+public:
+ bool run(Program *, bool ordered = false, bool skipPhi = false);
+ bool run(Function *, bool ordered = false, bool skipPhi = false);
+
+private:
+ // return false to continue with next entity on next higher level
+ virtual bool visit(Function *) { return true; }
+ virtual bool visit(BasicBlock *) { return true; }
+ virtual bool visit(Instruction *) { return false; }
+
+ bool doRun(Program *, bool ordered, bool skipPhi);
+ bool doRun(Function *, bool ordered, bool skipPhi);
+
+protected:
+ bool err;
+ Function *func;
+ Program *prog;
+};
+
+// =============================================================================
+
+#include "codegen/nv50_ir_inlines.h"
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
new file mode 100644
index 0000000..51b9225
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_bb.cpp
@@ -0,0 +1,550 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+
+namespace nv50_ir {
+
+// Construct a function owned by program @p; registers itself with the
+// program (prog->add assigns this->id).
+Function::Function(Program *p, const char *fnName, uint32_t label)
+   : call(this),
+     label(label),
+     name(fnName),
+     prog(p)
+{
+   cfgExit = NULL;
+   domTree = NULL;
+
+   bbArray = NULL;
+   bbCount = 0;
+   loopNestingBound = 0;
+   regClobberMax = 0;
+
+   binPos = 0;
+   binSize = 0;
+
+   stackPtr = NULL;
+   tlsBase = 0;
+   tlsSize = 0;
+
+   prog->add(this, id);
+}
+
+// Unregister from the program and release everything the function owns:
+// value references, instructions, LValues and basic blocks.
+Function::~Function()
+{
+   prog->del(this, id);
+
+   if (domTree)
+      delete domTree;
+   if (bbArray)
+      delete[] bbArray;
+
+   // clear value refs and defs
+   ins.clear();
+   outs.clear();
+
+   // instructions and values come from the program's memory pools
+   for (ArrayList::Iterator it = allInsns.iterator(); !it.end(); it.next())
+      delete_Instruction(prog, reinterpret_cast<Instruction *>(it.get()));
+
+   for (ArrayList::Iterator it = allLValues.iterator(); !it.end(); it.next())
+      delete_Value(prog, reinterpret_cast<LValue *>(it.get()));
+
+   for (ArrayList::Iterator BBs = allBBlocks.iterator(); !BBs.end(); BBs.next())
+      delete reinterpret_cast<BasicBlock *>(BBs.get());
+}
+
+// Construct an empty basic block inside function @fn; registers itself
+// with the function (func->add assigns this->id).
+BasicBlock::BasicBlock(Function *fn) : cfg(this), dom(this), func(fn)
+{
+   program = func->getProgram();
+
+   // phi/entry/exit delimit the instruction list; all empty initially
+   joinAt = phi = entry = exit = NULL;
+
+   numInsns = 0;
+   binPos = 0;
+   binSize = 0;
+
+   explicitCont = false;
+
+   func->add(this, this->id);
+}
+
+BasicBlock::~BasicBlock()
+{
+   // nothing yet
+}
+
+// Deep-copy this block (instructions included) into the function given by
+// the clone policy; outgoing CFG edges are re-created between the clones.
+// Assumes pol.get() can already resolve all successor blocks.
+BasicBlock *
+BasicBlock::clone(ClonePolicy<Function>& pol) const
+{
+   BasicBlock *bb = new BasicBlock(pol.context());
+
+   // record mapping before cloning edges so self-loops resolve to the clone
+   pol.set(this, bb);
+
+   for (Instruction *i = getFirst(); i; i = i->next)
+      bb->insertTail(i->clone(pol));
+
+   pol.context()->cfg.insert(&bb->cfg);
+
+   for (Graph::EdgeIterator it = cfg.outgoing(); !it.end(); it.next()) {
+      BasicBlock *obb = BasicBlock::get(it.getNode());
+      bb->cfg.attach(&pol.get(obb)->cfg, it.getType());
+   }
+
+   return bb;
+}
+
+// Immediate dominator, or NULL for the root of the dominator tree.
+BasicBlock *
+BasicBlock::idom() const
+{
+   Graph::Node *dn = dom.parent();
+   return dn ? BasicBlock::get(dn) : NULL;
+}
+
+// Insert @inst at the head of the block. PHIs go before the first phi
+// (they always precede regular instructions); regular instructions go
+// before @entry. Maintains the phi/entry/exit markers and numInsns.
+void
+BasicBlock::insertHead(Instruction *inst)
+{
+   assert(inst->next == 0 && inst->prev == 0);
+
+   if (inst->op == OP_PHI) {
+      if (phi) {
+         insertBefore(phi, inst);
+      } else {
+         if (entry) {
+            insertBefore(entry, inst);
+         } else {
+            // block was empty
+            assert(!exit);
+            phi = exit = inst;
+            inst->bb = this;
+            ++numInsns;
+         }
+      }
+   } else {
+      if (entry) {
+         insertBefore(entry, inst);
+      } else {
+         if (phi) {
+            insertAfter(exit, inst); // after last phi
+         } else {
+            // block was empty
+            assert(!exit);
+            entry = exit = inst;
+            inst->bb = this;
+            ++numInsns;
+         }
+      }
+   }
+}
+
+// Insert @inst at the tail of the block; PHIs are kept in the leading phi
+// section (before @entry), regular instructions are appended after @exit.
+void
+BasicBlock::insertTail(Instruction *inst)
+{
+   assert(inst->next == 0 && inst->prev == 0);
+
+   if (inst->op == OP_PHI) {
+      if (entry) {
+         insertBefore(entry, inst);
+      } else
+      if (exit) {
+         // only phis present, append after the last one
+         assert(phi);
+         insertAfter(exit, inst);
+      } else {
+         // block was empty
+         assert(!phi);
+         phi = exit = inst;
+         inst->bb = this;
+         ++numInsns;
+      }
+   } else {
+      if (exit) {
+         insertAfter(exit, inst);
+      } else {
+         // block was empty
+         assert(!phi);
+         entry = exit = inst;
+         inst->bb = this;
+         ++numInsns;
+      }
+   }
+}
+
+// Link @p into the list immediately before @q, updating the phi/entry
+// markers if @q was one of them. @p must be unlinked.
+void
+BasicBlock::insertBefore(Instruction *q, Instruction *p)
+{
+   assert(p && q);
+
+   assert(p->next == 0 && p->prev == 0);
+
+   if (q == entry) {
+      if (p->op == OP_PHI) {
+         // a phi before the first non-phi starts the phi section
+         if (!phi)
+            phi = p;
+      } else {
+         entry = p;
+      }
+   } else
+   if (q == phi) {
+      assert(p->op == OP_PHI);
+      phi = p;
+   }
+
+   p->next = q;
+   p->prev = q->prev;
+   if (p->prev)
+      p->prev->next = p;
+   q->prev = p;
+
+   p->bb = this;
+   ++numInsns;
+}
+
+// Link @q into the list immediately after @p, updating the exit/entry
+// markers. A phi may only be inserted after another phi.
+void
+BasicBlock::insertAfter(Instruction *p, Instruction *q)
+{
+   assert(p && q);
+   assert(q->op != OP_PHI || p->op == OP_PHI);
+
+   assert(q->next == 0 && q->prev == 0);
+
+   if (p == exit)
+      exit = q;
+   if (p->op == OP_PHI && q->op != OP_PHI)
+      entry = q; // first non-phi after the phi section
+
+   q->prev = p;
+   q->next = p->next;
+   if (q->next)
+      q->next->prev = q;
+   p->next = q;
+
+   q->bb = this;
+   ++numInsns;
+}
+
+// Unlink @insn from this block, fixing up the phi/entry/exit markers.
+// The instruction is not freed; it leaves with bb/next/prev cleared.
+void
+BasicBlock::remove(Instruction *insn)
+{
+   assert(insn->bb == this);
+
+   if (insn->prev)
+      insn->prev->next = insn->next;
+
+   if (insn->next)
+      insn->next->prev = insn->prev;
+   else
+      exit = insn->prev; // removed the last instruction
+
+   if (insn == entry) {
+      // next non-phi becomes entry; a preceding phi cannot
+      if (insn->next)
+         entry = insn->next;
+      else
+      if (insn->prev && insn->prev->op != OP_PHI)
+         entry = insn->prev;
+      else
+         entry = NULL;
+   }
+
+   if (insn == phi)
+      phi = (insn->next && insn->next->op == OP_PHI) ? insn->next : 0;
+
+   --numInsns;
+   insn->bb = NULL;
+   insn->next =
+   insn->prev = NULL;
+}
+
+// Swap two adjacent non-phi instructions of the same block in place.
+void BasicBlock::permuteAdjacent(Instruction *a, Instruction *b)
+{
+   assert(a->bb == b->bb);
+
+   // normalize so that a precedes b
+   if (a->next != b) {
+      Instruction *i = a;
+      a = b;
+      b = i;
+   }
+   assert(a->next == b);
+   assert(a->op != OP_PHI && b->op != OP_PHI);
+
+   if (b == exit)
+      exit = a;
+   if (a == entry)
+      entry = b;
+
+   // relink: b takes a's old predecessor, a takes b's old successor
+   b->prev = a->prev;
+   a->next = b->next;
+   b->next = a;
+   a->prev = b;
+
+   if (b->prev)
+      b->prev->next = b;
+   // BUGFIX: was "if (a->prev) a->next->prev = a;" — a->prev is always b
+   // here, so the old code dereferenced a NULL a->next when b was the
+   // last instruction; guard and update the successor's back link instead.
+   if (a->next)
+      a->next->prev = a;
+}
+
+// Move instructions starting at @insn (may be NULL = split at end) and all
+// outgoing CFG edges into @bb; optionally attach this->bb edge.
+void
+BasicBlock::splitCommon(Instruction *insn, BasicBlock *bb, bool attach)
+{
+   bb->entry = insn;
+
+   if (insn) {
+      exit = insn->prev;
+      insn->prev = NULL;
+   }
+
+   if (exit)
+      exit->next = NULL;
+   else
+      entry = NULL; // everything moved away
+
+   // transfer all outgoing edges to the new block
+   while (!cfg.outgoing(true).end()) {
+      Graph::Edge *e = cfg.outgoing(true).getEdge();
+      bb->cfg.attach(e->getTarget(), e->getType());
+      this->cfg.detach(e->getTarget());
+   }
+
+   // re-home the moved instructions and fix the counters
+   for (; insn; insn = insn->next) {
+      this->numInsns--;
+      bb->numInsns++;
+      insn->bb = bb;
+      bb->exit = insn;
+   }
+   if (attach)
+      this->cfg.attach(&bb->cfg, Graph::Edge::TREE);
+}
+
+// Split so that @insn becomes the first instruction of the new block.
+BasicBlock *
+BasicBlock::splitBefore(Instruction *insn, bool attach)
+{
+   BasicBlock *bb = new BasicBlock(func);
+   assert(!insn || insn->op != OP_PHI);
+
+   splitCommon(insn, bb, attach);
+   return bb;
+}
+
+// Split so that @insn remains the last instruction of this block;
+// the join point moves to the new block.
+BasicBlock *
+BasicBlock::splitAfter(Instruction *insn, bool attach)
+{
+   BasicBlock *bb = new BasicBlock(func);
+   assert(!insn || insn->op != OP_PHI);
+
+   bb->joinAt = joinAt;
+   joinAt = NULL;
+
+   splitCommon(insn ? insn->next : NULL, bb, attach);
+   return bb;
+}
+
+// Return true if @that dominates this block, i.e. @that's dominator-tree
+// node lies on the path from this block's node up to the root (a block
+// dominates itself).
+bool
+BasicBlock::dominatedBy(BasicBlock *that)
+{
+   for (Graph::Node *node = &this->dom; node; node = node->parent())
+      if (node == &that->dom)
+         return true;
+
+   return false;
+}
+
+// Detect whether this block starts a simple IF/ELSE/ENDIF construct.
+// Returns a bitmask: 0x0 = not simple, bit 0 = out[0] is a conditional
+// body, bit 1 = out[1] is a conditional body (0x2: IF without ELSE,
+// 0x1/0x3: both paths reconverge immediately).
+unsigned int
+BasicBlock::initiatesSimpleConditional() const
+{
+   Graph::Node *out[2];
+   int n;
+   Graph::Edge::Type eR;
+
+   if (cfg.outgoingCount() != 2) // -> if and -> else/endif
+      return 0x0; // was "return false" — keep the mask-style returns consistent
+
+   n = 0;
+   for (Graph::EdgeIterator ei = cfg.outgoing(); !ei.end(); ei.next())
+      out[n++] = ei.getNode();
+   eR = out[1]->outgoing().getType();
+
+   // IF block is out edge to the right
+   if (eR == Graph::Edge::CROSS || eR == Graph::Edge::BACK)
+      return 0x2;
+
+   if (out[1]->outgoingCount() != 1) // 0 is IF { RET; }, >1 is more divergence
+      return 0x0;
+   // do they reconverge immediately ?
+   if (out[1]->outgoing().getNode() == out[0])
+      return 0x1;
+   if (out[0]->outgoingCount() == 1)
+      if (out[0]->outgoing().getNode() == out[1]->outgoing().getNode())
+         return 0x3;
+
+   return 0x0;
+}
+
+// Set the CFG root block; fails (returns false) if one is already set.
+bool
+Function::setEntry(BasicBlock *bb)
+{
+   if (cfg.getRoot())
+      return false;
+   cfg.insert(&bb->cfg);
+   return true;
+}
+
+// Set the unique exit block; fails (returns false) if one is already set.
+bool
+Function::setExit(BasicBlock *bb)
+{
+   if (cfgExit)
+      return false;
+   cfgExit = &bb->cfg;
+   return true;
+}
+
+// Collect all instructions in CFG order into @result, keyed by their
+// serial number; returns the number of instructions collected.
+unsigned int
+Function::orderInstructions(ArrayList &result)
+{
+   result.clear();
+
+   for (IteratorRef it = cfg.iteratorCFG(); !it->end(); it->next()) {
+      BasicBlock *bb =
+         BasicBlock::get(reinterpret_cast<Graph::Node *>(it->get()));
+
+      for (Instruction *insn = bb->getFirst(); insn; insn = insn->next)
+         result.insert(insn, insn->serial);
+   }
+
+   return result.getSize();
+}
+
+// Compute per-block live sets (pre-SSA); iterated once per loop nesting
+// level so liveness propagates around back edges.
+void
+Function::buildLiveSets()
+{
+   for (unsigned i = 0; i <= loopNestingBound; ++i)
+      buildLiveSetsPreSSA(BasicBlock::get(cfg.getRoot()), cfg.nextSequence());
+
+   for (ArrayList::Iterator bi = allBBlocks.iterator(); !bi.end(); bi.next())
+      BasicBlock::get(bi)->liveSet.marker = false;
+}
+
+// Compute per-block def sets (pre-SSA), walking backwards from the exit;
+// same per-loop-level iteration as buildLiveSets.
+void
+Function::buildDefSets()
+{
+   for (unsigned i = 0; i <= loopNestingBound; ++i)
+      buildDefSetsPreSSA(BasicBlock::get(cfgExit), cfg.nextSequence());
+
+   for (ArrayList::Iterator bi = allBBlocks.iterator(); !bi.end(); bi.next())
+      BasicBlock::get(bi)->liveSet.marker = false;
+}
+
+// Run this pass over every function of @prog (call-graph DFS order).
+// @ordered: visit blocks in CFG order instead of DFS order.
+// @skipPhi: start each block at the first non-phi instruction.
+bool
+Pass::run(Program *prog, bool ordered, bool skipPhi)
+{
+   this->prog = prog;
+   err = false;
+   return doRun(prog, ordered, skipPhi);
+}
+
+bool
+Pass::doRun(Program *prog, bool ordered, bool skipPhi)
+{
+   for (IteratorRef it = prog->calls.iteratorDFS(false);
+        !it->end(); it->next()) {
+      Graph::Node *n = reinterpret_cast<Graph::Node *>(it->get());
+      if (!doRun(Function::get(n), ordered, skipPhi))
+         return false;
+   }
+   return !err;
+}
+
+// Run this pass over a single function.
+bool
+Pass::run(Function *func, bool ordered, bool skipPhi)
+{
+   prog = func->getProgram();
+   err = false;
+   return doRun(func, ordered, skipPhi);
+}
+
+bool
+Pass::doRun(Function *func, bool ordered, bool skipPhi)
+{
+   IteratorRef bbIter;
+   BasicBlock *bb;
+   Instruction *insn, *next;
+
+   this->func = func;
+   if (!visit(func))
+      return false;
+
+   bbIter = ordered ? func->cfg.iteratorCFG() : func->cfg.iteratorDFS();
+
+   for (; !bbIter->end(); bbIter->next()) {
+      bb = BasicBlock::get(reinterpret_cast<Graph::Node *>(bbIter->get()));
+      if (!visit(bb))
+         break; // continue with the next function
+      // fetch next before visiting so visitors may remove/replace insn
+      for (insn = skipPhi ? bb->getEntry() : bb->getFirst(); insn != NULL;
+           insn = next) {
+         next = insn->next;
+         if (!visit(insn))
+            break;
+      }
+   }
+
+   return !err;
+}
+
+// Append this function's CFG to @filePath in graphviz "dot" format
+// (edge colours encode the edge classification).
+void
+Function::printCFGraph(const char *filePath)
+{
+   FILE *out = fopen(filePath, "a");
+   if (!out) {
+      ERROR("failed to open file: %s\n", filePath);
+      return;
+   }
+   INFO("printing control flow graph to: %s\n", filePath);
+
+   fprintf(out, "digraph G {\n");
+
+   for (IteratorRef it = cfg.iteratorDFS(); !it->end(); it->next()) {
+      BasicBlock *bb = BasicBlock::get(
+         reinterpret_cast<Graph::Node *>(it->get()));
+      int idA = bb->getId();
+      for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+         int idB = BasicBlock::get(ei.getNode())->getId();
+         switch (ei.getType()) {
+         case Graph::Edge::TREE:
+            fprintf(out, "\t%i -> %i;\n", idA, idB);
+            break;
+         case Graph::Edge::FORWARD:
+            fprintf(out, "\t%i -> %i [color=green];\n", idA, idB);
+            break;
+         case Graph::Edge::CROSS:
+            fprintf(out, "\t%i -> %i [color=red];\n", idA, idB);
+            break;
+         case Graph::Edge::BACK:
+            // NOTE(review): drawn like TREE edges; presumably intentional,
+            // but a distinct style would make loops visible — confirm
+            fprintf(out, "\t%i -> %i;\n", idA, idB);
+            break;
+         case Graph::Edge::DUMMY:
+            fprintf(out, "\t%i -> %i [style=dotted];\n", idA, idB);
+            break;
+         default:
+            assert(0);
+            break;
+         }
+      }
+   }
+
+   fprintf(out, "}\n");
+   fclose(out);
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
new file mode 100644
index 0000000..70e5e22
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp
@@ -0,0 +1,614 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+namespace nv50_ir {
+
+// Builder with no program attached yet; call setProgram/setPosition later.
+BuildUtil::BuildUtil()
+{
+   init(NULL);
+}
+
+BuildUtil::BuildUtil(Program *prog)
+{
+   init(prog);
+}
+
+// Reset all builder state; insertion position is unset and the
+// immediate-value hash table is emptied.
+void
+BuildUtil::init(Program *prog)
+{
+   this->prog = prog;
+
+   func = NULL;
+   bb = NULL;
+   pos = NULL;
+
+   memset(imms, 0, sizeof(imms));
+   immCount = 0;
+}
+
+// Cache @imm in the open-addressed (linear probing) immediate table so
+// mkImm can reuse it. Stops caching above 3/4 load to keep probe chains
+// short and guarantee the insertion loop terminates.
+void
+BuildUtil::addImmediate(ImmediateValue *imm)
+{
+   if (immCount > (NV50_IR_BUILD_IMM_HT_SIZE * 3) / 4)
+      return;
+
+   unsigned int pos = u32Hash(imm->reg.data.u32);
+
+   while (imms[pos])
+      pos = (pos + 1) % NV50_IR_BUILD_IMM_HT_SIZE;
+   imms[pos] = imm;
+   immCount++;
+}
+
+// Build a 1-source instruction and insert it at the current position.
+Instruction *
+BuildUtil::mkOp1(operation op, DataType ty, Value *dst, Value *src)
+{
+   Instruction *insn = new_Instruction(func, op, ty);
+
+   insn->setDef(0, dst);
+   insn->setSrc(0, src);
+
+   insert(insn);
+   return insn;
+}
+
+// Build a 2-source instruction and insert it at the current position.
+Instruction *
+BuildUtil::mkOp2(operation op, DataType ty, Value *dst,
+                 Value *src0, Value *src1)
+{
+   Instruction *insn = new_Instruction(func, op, ty);
+
+   insn->setDef(0, dst);
+   insn->setSrc(0, src0);
+   insn->setSrc(1, src1);
+
+   insert(insn);
+   return insn;
+}
+
+// Build a 3-source instruction and insert it at the current position.
+Instruction *
+BuildUtil::mkOp3(operation op, DataType ty, Value *dst,
+                 Value *src0, Value *src1, Value *src2)
+{
+   Instruction *insn = new_Instruction(func, op, ty);
+
+   insn->setDef(0, dst);
+   insn->setSrc(0, src0);
+   insn->setSrc(1, src1);
+   insn->setSrc(2, src2);
+
+   insert(insn);
+   return insn;
+}
+
+// Build an OP_LOAD of @mem into @dst, optionally indirect via @ptr.
+Instruction *
+BuildUtil::mkLoad(DataType ty, Value *dst, Symbol *mem, Value *ptr)
+{
+   Instruction *insn = new_Instruction(func, OP_LOAD, ty);
+
+   insn->setDef(0, dst);
+   insn->setSrc(0, mem);
+   if (ptr)
+      insn->setIndirect(0, 0, ptr);
+
+   insert(insn);
+   return insn;
+}
+
+// Build a store of @stVal to @mem (op may be OP_STORE/OP_EXPORT etc.),
+// optionally indirect via @ptr.
+Instruction *
+BuildUtil::mkStore(operation op, DataType ty, Symbol *mem, Value *ptr,
+                   Value *stVal)
+{
+   Instruction *insn = new_Instruction(func, op, ty);
+
+   insn->setSrc(0, mem);
+   insn->setSrc(1, stVal);
+   if (ptr)
+      insn->setIndirect(0, 0, ptr);
+
+   insert(insn);
+   return insn;
+}
+
+// Build an OP_VFETCH from @file at @offset with optional attribute and
+// primitive relative addressing.
+Instruction *
+BuildUtil::mkFetch(Value *dst, DataType ty, DataFile file, int32_t offset,
+                   Value *attrRel, Value *primRel)
+{
+   Symbol *sym = mkSymbol(file, 0, ty, offset);
+
+   Instruction *insn = mkOp1(OP_VFETCH, ty, dst, sym);
+
+   insn->setIndirect(0, 0, attrRel);
+   insn->setIndirect(0, 1, primRel);
+
+   // already inserted
+   return insn;
+}
+
+// Build an input interpolation: flat inputs become U32 LINTERP,
+// perspective-correct ones PINTERP; @rel is an optional address offset.
+Instruction *
+BuildUtil::mkInterp(unsigned mode, Value *dst, int32_t offset, Value *rel)
+{
+   operation op = OP_LINTERP;
+   DataType ty = TYPE_F32;
+
+   if ((mode & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_FLAT)
+      ty = TYPE_U32;
+   else
+   if ((mode & NV50_IR_INTERP_MODE_MASK) == NV50_IR_INTERP_PERSPECTIVE)
+      op = OP_PINTERP;
+
+   Symbol *sym = mkSymbol(FILE_SHADER_INPUT, 0, ty, offset);
+
+   Instruction *insn = mkOp1(op, ty, dst, sym);
+   insn->setIndirect(0, 0, rel);
+   return insn;
+}
+
+// Build a simple OP_MOV (defaults to 32-bit unsigned type).
+Instruction *
+BuildUtil::mkMov(Value *dst, Value *src, DataType ty)
+{
+   Instruction *insn = new_Instruction(func, OP_MOV, ty);
+
+   insn->setDef(0, dst);
+   insn->setSrc(0, src);
+
+   insert(insn);
+   return insn;
+}
+
+// Move @src into a new LValue pinned to physical GPR @id (for calls/ABI).
+Instruction *
+BuildUtil::mkMovToReg(int id, Value *src)
+{
+   Instruction *insn = new_Instruction(func, OP_MOV, typeOfSize(src->reg.size));
+
+   insn->setDef(0, new_LValue(func, FILE_GPR));
+   insn->getDef(0)->reg.data.id = id;
+   insn->setSrc(0, src);
+
+   insert(insn);
+   return insn;
+}
+
+// Move from physical GPR @id into @dst.
+Instruction *
+BuildUtil::mkMovFromReg(Value *dst, int id)
+{
+   Instruction *insn = new_Instruction(func, OP_MOV, typeOfSize(dst->reg.size));
+
+   insn->setDef(0, dst);
+   insn->setSrc(0, new_LValue(func, FILE_GPR));
+   insn->getSrc(0)->reg.data.id = id;
+
+   insert(insn);
+   return insn;
+}
+
+// Build a conversion (OP_CVT etc.) with explicit source and destination
+// types.
+Instruction *
+BuildUtil::mkCvt(operation op,
+                 DataType dstTy, Value *dst, DataType srcTy, Value *src)
+{
+   Instruction *insn = new_Instruction(func, op, dstTy);
+
+   insn->setType(dstTy, srcTy);
+   insn->setDef(0, dst);
+   insn->setSrc(0, src);
+
+   insert(insn);
+   return insn;
+}
+
+// Build a comparison/set instruction. The destination type follows the
+// destination file: predicates/flags get TYPE_U8, otherwise @ty.
+CmpInstruction *
+BuildUtil::mkCmp(operation op, CondCode cc, DataType ty, Value *dst,
+                 Value *src0, Value *src1, Value *src2)
+{
+   CmpInstruction *insn = new_CmpInstruction(func, op);
+
+   insn->setType((dst->reg.file == FILE_PREDICATE ||
+                  dst->reg.file == FILE_FLAGS) ? TYPE_U8 : ty, ty);
+   insn->setCondition(cc);
+   insn->setDef(0, dst);
+   insn->setSrc(0, src0);
+   insn->setSrc(1, src1);
+   if (src2)
+      insn->setSrc(2, src2);
+
+   // record that def 0 writes the condition-code register
+   if (dst->reg.file == FILE_FLAGS)
+      insn->flagsDef = 0;
+
+   insert(insn);
+   return insn;
+}
+
+// Build a texture instruction against texture @tic / sampler @tsc;
+// leading non-NULL entries of @def/@src become defs/sources.
+TexInstruction *
+BuildUtil::mkTex(operation op, TexTarget targ,
+                 uint16_t tic, uint16_t tsc,
+                 const std::vector<Value *> &def,
+                 const std::vector<Value *> &src)
+{
+   TexInstruction *tex = new_TexInstruction(func, op);
+
+   for (size_t d = 0; d < def.size() && def[d]; ++d)
+      tex->setDef(d, def[d]);
+   for (size_t s = 0; s < src.size() && src[s]; ++s)
+      tex->setSrc(s, src[s]);
+
+   tex->setTexture(targ, tic, tsc);
+
+   insert(tex);
+   return tex;
+}
+
+// Build an OP_QUADOP with sub-op @q operating on lane mask @l
+// (cross-lane arithmetic within a 2x2 pixel quad).
+Instruction *
+BuildUtil::mkQuadop(uint8_t q, Value *def, uint8_t l, Value *src0, Value *src1)
+{
+   Instruction *insn = mkOp2(OP_QUADOP, TYPE_F32, def, src0, src1);
+
+   insn->subOp = q;
+   insn->lanes = l;
+
+   return insn;
+}
+
+// Build a select: dst = pred ? trSrc : flSrc, expressed as two
+// complementarily predicated moves joined by OP_UNION.
+Instruction *
+BuildUtil::mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc)
+{
+   LValue *def0 = getSSA();
+   LValue *def1 = getSSA();
+
+   mkMov(def0, trSrc)->setPredicate(CC_P, pred);
+   mkMov(def1, flSrc)->setPredicate(CC_NOT_P, pred);
+
+   return mkOp2(OP_UNION, typeOfSize(dst->reg.size), dst, def0, def1);
+}
+
+// Split @val into two halves of @halfSize bytes, returned in h[0]/h[1].
+// Memory values are split by adjusting offsets (no instruction emitted,
+// returns NULL); register values get an OP_SPLIT. Immediates are first
+// moved into a register.
+Instruction *
+BuildUtil::mkSplit(Value *h[2], uint8_t halfSize, Value *val)
+{
+   Instruction *insn = NULL;
+
+   const DataType fTy = typeOfSize(halfSize * 2);
+
+   if (val->reg.file == FILE_IMMEDIATE)
+      val = mkMov(getSSA(halfSize * 2), val, fTy)->getDef(0);
+
+   if (isMemoryFile(val->reg.file)) {
+      h[0] = cloneShallow(getFunction(), val);
+      h[1] = cloneShallow(getFunction(), val);
+      h[0]->reg.size = halfSize;
+      h[1]->reg.size = halfSize;
+      h[1]->reg.data.offset += halfSize;
+   } else {
+      h[0] = getSSA(halfSize, val->reg.file);
+      h[1] = getSSA(halfSize, val->reg.file);
+      insn = mkOp1(OP_SPLIT, fTy, h[0], val);
+      insn->setDef(1, h[1]);
+   }
+   return insn;
+}
+
+// Build a flow instruction (branch/call/ret...) to @targ, optionally
+// predicated on (@cc, @pred).
+FlowInstruction *
+BuildUtil::mkFlow(operation op, void *targ, CondCode cc, Value *pred)
+{
+   FlowInstruction *insn = new_FlowInstruction(func, op, targ);
+
+   if (pred)
+      insn->setPredicate(cc, pred);
+
+   insert(insn);
+   return insn;
+}
+
+// Emit NOPs defining fake values to mark registers in @rMask as
+// clobbered (e.g. across calls). @unit is log2 of the register size.
+void
+BuildUtil::mkClobber(DataFile f, uint32_t rMask, int unit)
+{
+   // For each 4-bit mask value, encodes up to two (base, size) runs as
+   // nibbles: [size2|base2|size1|base1]. E.g. 0xf -> 0x0040: one run of
+   // 4 registers at base 0; 0x5 -> 0x0012: runs would not be contiguous.
+   static const uint16_t baseSize2[16] =
+   {
+      0x0000, 0x0010, 0x0011, 0x0020, 0x0012, 0x1210, 0x1211, 0x1220,
+      0x0013, 0x1310, 0x1311, 0x1320, 0x0022, 0x2210, 0x2211, 0x0040,
+   };
+
+   int base = 0;
+
+   // process the mask one nibble (4 registers) at a time
+   for (; rMask; rMask >>= 4, base += 4) {
+      const uint32_t mask = rMask & 0xf;
+      if (!mask)
+         continue;
+      int base1 = (baseSize2[mask] >> 0) & 0xf;
+      int size1 = (baseSize2[mask] >> 4) & 0xf;
+      int base2 = (baseSize2[mask] >> 8) & 0xf;
+      int size2 = (baseSize2[mask] >> 12) & 0xf;
+      Instruction *insn = mkOp(OP_NOP, TYPE_NONE, NULL);
+      if (1) { // size1 can't be 0
+         LValue *reg = new_LValue(func, f);
+         reg->reg.size = size1 << unit;
+         reg->reg.data.id = base + base1;
+         insn->setDef(0, reg);
+      }
+      if (size2) {
+         LValue *reg = new_LValue(func, f);
+         reg->reg.size = size2 << unit;
+         reg->reg.data.id = base + base2;
+         insn->setDef(1, reg);
+      }
+   }
+}
+
+// Get or create a 32-bit immediate; reuses a cached value from the
+// linear-probing hash table when possible.
+ImmediateValue *
+BuildUtil::mkImm(uint32_t u)
+{
+   unsigned int pos = u32Hash(u);
+
+   while (imms[pos] && imms[pos]->reg.data.u32 != u)
+      pos = (pos + 1) % NV50_IR_BUILD_IMM_HT_SIZE;
+
+   ImmediateValue *imm = imms[pos];
+   if (!imm) {
+      imm = new_ImmediateValue(prog, u);
+      addImmediate(imm);
+   }
+   return imm;
+}
+
+// Create a 64-bit immediate; not cached (the table only keys on u32).
+ImmediateValue *
+BuildUtil::mkImm(uint64_t u)
+{
+   ImmediateValue *imm = new_ImmediateValue(prog, (uint32_t)0);
+
+   imm->reg.size = 8;
+   imm->reg.type = TYPE_U64;
+   imm->reg.data.u64 = u;
+
+   return imm;
+}
+
+// Create a float immediate; shares the u32 cache via the bit pattern.
+ImmediateValue *
+BuildUtil::mkImm(float f)
+{
+   union {
+      float f32;
+      uint32_t u32;
+   } u;
+   u.f32 = f;
+   return mkImm(u.u32);
+}
+
+// Emit a MOV of an immediate into @dst (or a fresh scratch register if
+// @dst is NULL) and return the written value.
+Value *
+BuildUtil::loadImm(Value *dst, float f)
+{
+   return mkOp1v(OP_MOV, TYPE_F32, dst ? dst : getScratch(), mkImm(f));
+}
+
+Value *
+BuildUtil::loadImm(Value *dst, uint32_t u)
+{
+   return mkOp1v(OP_MOV, TYPE_U32, dst ? dst : getScratch(), mkImm(u));
+}
+
+Value *
+BuildUtil::loadImm(Value *dst, uint64_t u)
+{
+   // 64-bit immediates need an 8-byte scratch value
+   return mkOp1v(OP_MOV, TYPE_U64, dst ? dst : getScratch(8), mkImm(u));
+}
+
+// Create a symbol referencing @file[@fileIndex] at @baseAddr with type @ty.
+Symbol *
+BuildUtil::mkSymbol(DataFile file, int8_t fileIndex, DataType ty,
+                    uint32_t baseAddr)
+{
+   Symbol *sym = new_Symbol(prog, file, fileIndex);
+
+   sym->setOffset(baseAddr);
+   sym->reg.type = ty;
+   sym->reg.size = typeSizeof(ty);
+
+   return sym;
+}
+
+// Create a system-value symbol (e.g. position, vertex id). Only
+// clip-distance and tess-factor may use indices >= 4.
+Symbol *
+BuildUtil::mkSysVal(SVSemantic svName, uint32_t svIndex)
+{
+   Symbol *sym = new_Symbol(prog, FILE_SYSTEM_VALUE, 0);
+
+   assert(svIndex < 4 ||
+          (svName == SV_CLIP_DISTANCE || svName == SV_TESS_FACTOR));
+
+   // pick the register type matching the system value's semantics
+   switch (svName) {
+   case SV_POSITION:
+   case SV_FACE:
+   case SV_YDIR:
+   case SV_POINT_SIZE:
+   case SV_POINT_COORD:
+   case SV_CLIP_DISTANCE:
+   case SV_TESS_FACTOR:
+      sym->reg.type = TYPE_F32;
+      break;
+   default:
+      sym->reg.type = TYPE_U32;
+      break;
+   }
+   sym->reg.size = typeSizeof(sym->reg.type);
+
+   sym->reg.data.sv.sv = svName;
+   sym->reg.data.sv.index = svIndex;
+
+   return sym;
+}
+
+// Describe a (possibly memory-backed) array of @len elements of @vecDim
+// components, @eltSize bytes each, living in @file. Register-file arrays
+// are handled purely through the ValueMap; memory arrays get a base
+// symbol for address calculation.
+void
+BuildUtil::DataArray::setup(unsigned array, unsigned arrayIdx,
+                            uint32_t base, int len, int vecDim, int eltSize,
+                            DataFile file, int8_t fileIdx)
+{
+   this->array = array;
+   this->arrayIdx = arrayIdx;
+   this->baseAddr = base;
+   this->arrayLen = len;
+   this->vecDim = vecDim;
+   this->eltSize = eltSize;
+   this->file = file;
+   this->regOnly = !isMemoryFile(file);
+
+   if (!regOnly) {
+      baseSym = new_Symbol(up->getProgram(), file, fileIdx);
+      baseSym->setOffset(baseAddr);
+      baseSym->reg.size = eltSize;
+   } else {
+      baseSym = NULL;
+   }
+}
+
+// Get a value to write element (i, c) into: the mapped LValue for
+// register arrays, a throw-away scratch for memory arrays (the actual
+// store happens in store()).
+Value *
+BuildUtil::DataArray::acquire(ValueMap &m, int i, int c)
+{
+   if (regOnly) {
+      Value *v = lookup(m, i, c);
+      if (!v)
+         v = insert(m, i, c, new_LValue(up->getFunction(), file));
+
+      return v;
+   } else {
+      return up->getScratch();
+   }
+}
+
+// Read element (i, c): for register arrays return (or create) the mapped
+// LValue; for memory arrays emit a load through the cached symbol.
+Value *
+BuildUtil::DataArray::load(ValueMap &m, int i, int c, Value *ptr)
+{
+   if (regOnly) {
+      Value *v = lookup(m, i, c);
+      if (!v)
+         v = insert(m, i, c, new_LValue(up->getFunction(), file));
+
+      return v;
+   } else {
+      Value *sym = lookup(m, i, c);
+      if (!sym)
+         sym = insert(m, i, c, mkSymbol(i, c));
+
+      return up->mkLoadv(typeOfSize(eltSize), static_cast<Symbol *>(sym), ptr);
+   }
+}
+
+// Write element (i, c): register arrays just record @value in the map
+// (SSA-style, no instruction); memory arrays emit a store.
+void
+BuildUtil::DataArray::store(ValueMap &m, int i, int c, Value *ptr, Value *value)
+{
+   if (regOnly) {
+      assert(!ptr);
+      if (!lookup(m, i, c))
+         insert(m, i, c, value);
+
+      assert(lookup(m, i, c) == value);
+   } else {
+      Value *sym = lookup(m, i, c);
+      if (!sym)
+         sym = insert(m, i, c, mkSymbol(i, c));
+
+      const DataType stTy = typeOfSize(value->reg.size);
+
+      up->mkStore(OP_STORE, stTy, static_cast<Symbol *>(sym), ptr, value);
+   }
+}
+
+// Create the symbol addressing element (i, c) of a memory-backed array.
+Symbol *
+BuildUtil::DataArray::mkSymbol(int i, int c)
+{
+   const unsigned int idx = i * vecDim + c;
+   Symbol *sym = new_Symbol(up->getProgram(), file, 0);
+
+   assert(baseSym || (idx < arrayLen && c < vecDim));
+
+   sym->reg.size = eltSize;
+   sym->reg.type = typeOfSize(eltSize);
+   sym->setAddress(baseSym, baseAddr + idx * eltSize);
+   return sym;
+}
+
+
+// Split a 64-bit MOV/ADD/SUB into two 32-bit instructions after register
+// allocation. @i is rewritten to the low half; the returned instruction
+// is the newly inserted high half (NULL if @i cannot be split).
+// @zero substitutes for high halves of 32-bit sources; @carry links the
+// low half's carry-out to the high half (required for ADD/SUB).
+Instruction *
+BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i,
+                              Value *zero,
+                              Value *carry)
+{
+   DataType hTy;
+   int srcNr;
+
+   switch (i->dType) {
+   case TYPE_U64: hTy = TYPE_U32; break;
+   case TYPE_S64: hTy = TYPE_S32; break;
+   default:
+      return NULL;
+   }
+
+   switch (i->op) {
+   case OP_MOV: srcNr = 1; break;
+   case OP_ADD:
+   case OP_SUB:
+      if (!carry)
+         return NULL;
+      srcNr = 2;
+      break;
+   default:
+      // TODO when needed
+      return NULL;
+   }
+
+   // turn @i into the low-half op and clone it for the high half
+   i->setType(hTy);
+   i->setDef(0, cloneShallow(fn, i->getDef(0)));
+   i->getDef(0)->reg.size = 4;
+   Instruction *lo = i;
+   Instruction *hi = cloneForward(fn, i);
+   lo->bb->insertAfter(lo, hi);
+
+   // post-RA: the high half lives in the next consecutive register
+   hi->getDef(0)->reg.data.id++;
+
+   for (int s = 0; s < srcNr; ++s) {
+      if (lo->getSrc(s)->reg.size < 8) {
+         // 32-bit source: high half is zero
+         hi->setSrc(s, zero);
+      } else {
+         if (lo->getSrc(s)->refCount() > 1)
+            lo->setSrc(s, cloneShallow(fn, lo->getSrc(s)));
+         lo->getSrc(s)->reg.size /= 2;
+         hi->setSrc(s, cloneShallow(fn, lo->getSrc(s)));
+
+         // rebase the high-half source by storage class
+         switch (hi->src(s).getFile()) {
+         case FILE_IMMEDIATE:
+            hi->getSrc(s)->reg.data.u64 >>= 32;
+            break;
+         case FILE_MEMORY_CONST:
+         case FILE_MEMORY_SHARED:
+         case FILE_SHADER_INPUT:
+            hi->getSrc(s)->reg.data.offset += 4;
+            break;
+         default:
+            assert(hi->src(s).getFile() == FILE_GPR);
+            hi->getSrc(s)->reg.data.id++;
+            break;
+         }
+      }
+   }
+   if (srcNr == 2) {
+      // chain the carry from the low to the high half
+      lo->setDef(1, carry);
+      hi->setFlagsSrc(hi->srcCount(), carry);
+   }
+   return hi;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
new file mode 100644
index 0000000..2305a27
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.h
@@ -0,0 +1,324 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_BUILD_UTIL__
+#define __NV50_IR_BUILD_UTIL__
+
+namespace nv50_ir {
+
+// Helper for constructing IR: tracks an insertion position inside a
+// function and provides factories for instructions, immediates, symbols
+// and register/memory-backed data arrays.
+class BuildUtil
+{
+public:
+   BuildUtil();
+   BuildUtil(Program *);
+
+   inline void setProgram(Program *);
+   inline Program *getProgram() const { return prog; }
+   inline Function *getFunction() const { return func; }
+
+   // keeps inserting at head/tail of block
+   inline void setPosition(BasicBlock *, bool tail);
+   // position advances only if @after is true
+   inline void setPosition(Instruction *, bool after);
+
+   inline BasicBlock *getBB() { return bb; }
+
+   inline void insert(Instruction *);
+   inline void remove(Instruction *i) { assert(i->bb == bb); bb->remove(i); }
+
+   inline LValue *getScratch(int size = 4, DataFile = FILE_GPR);
+   // scratch value for a single assignment:
+   inline LValue *getSSA(int size = 4, DataFile = FILE_GPR);
+
+   // instruction factories; all insert at the current position
+   inline Instruction *mkOp(operation, DataType, Value *);
+   Instruction *mkOp1(operation, DataType, Value *, Value *);
+   Instruction *mkOp2(operation, DataType, Value *, Value *, Value *);
+   Instruction *mkOp3(operation, DataType, Value *, Value *, Value *, Value *);
+
+   // *v variants return the destination LValue instead of the instruction
+   LValue *mkOp1v(operation, DataType, Value *, Value *);
+   LValue *mkOp2v(operation, DataType, Value *, Value *, Value *);
+   LValue *mkOp3v(operation, DataType, Value *, Value *, Value *, Value *);
+
+   Instruction *mkLoad(DataType, Value *dst, Symbol *, Value *ptr);
+   Instruction *mkStore(operation, DataType, Symbol *, Value *ptr, Value *val);
+
+   LValue *mkLoadv(DataType, Symbol *, Value *ptr);
+
+   Instruction *mkMov(Value *, Value *, DataType = TYPE_U32);
+   Instruction *mkMovToReg(int id, Value *);
+   Instruction *mkMovFromReg(Value *, int id);
+
+   Instruction *mkInterp(unsigned mode, Value *, int32_t offset, Value *rel);
+   Instruction *mkFetch(Value *, DataType, DataFile, int32_t offset,
+                        Value *attrRel, Value *primRel);
+
+   Instruction *mkCvt(operation, DataType, Value *, DataType, Value *);
+   CmpInstruction *mkCmp(operation, CondCode, DataType,
+                         Value *,
+                         Value *, Value *, Value * = NULL);
+   TexInstruction *mkTex(operation, TexTarget,
+                         uint16_t tic, uint16_t tsc,
+                         const std::vector<Value *> &def,
+                         const std::vector<Value *> &src);
+   Instruction *mkQuadop(uint8_t qop, Value *, uint8_t l, Value *, Value *);
+
+   FlowInstruction *mkFlow(operation, void *target, CondCode, Value *pred);
+
+   Instruction *mkSelect(Value *pred, Value *dst, Value *trSrc, Value *flSrc);
+
+   Instruction *mkSplit(Value *half[2], uint8_t halfSize, Value *);
+
+   void mkClobber(DataFile file, uint32_t regMask, int regUnitLog2);
+
+   // immediates created through mkImm(uint32_t) are cached and shared
+   ImmediateValue *mkImm(float);
+   ImmediateValue *mkImm(uint32_t);
+   ImmediateValue *mkImm(uint64_t);
+
+   ImmediateValue *mkImm(int i) { return mkImm((uint32_t)i); }
+
+   Value *loadImm(Value *dst, float);
+   Value *loadImm(Value *dst, uint32_t);
+   Value *loadImm(Value *dst, uint64_t);
+
+   Value *loadImm(Value *dst, int i) { return loadImm(dst, (uint32_t)i); }
+
+   // returns high part of the operation
+   static Instruction *split64BitOpPostRA(Function *, Instruction *,
+                                          Value *zero, Value *carry);
+
+   // identifies one scalar component of one element of one array
+   struct Location
+   {
+      Location(unsigned array, unsigned arrayIdx, unsigned i, unsigned c)
+         : array(array), arrayIdx(arrayIdx), i(i), c(c) { }
+      Location(const Location &l)
+         : array(l.array), arrayIdx(l.arrayIdx), i(l.i), c(l.c) { }
+
+      bool operator==(const Location &l) const
+      {
+         return
+            array == l.array && arrayIdx == l.arrayIdx && i == l.i && c == l.c;
+      }
+
+      // lexicographic order (array, arrayIdx, i, c) for use as a map key
+      bool operator<(const Location &l) const
+      {
+         return array != l.array ? array < l.array :
+                arrayIdx != l.arrayIdx ? arrayIdx < l.arrayIdx :
+                i != l.i ? i < l.i :
+                c != l.c ? c < l.c :
+                false;
+      }
+
+      unsigned array, arrayIdx, i, c;
+   };
+
+   typedef bimap<Location, Value *> ValueMap;
+
+   // array of values, either in registers (tracked via a ValueMap) or in
+   // a memory file (accessed via load/store through symbols)
+   class DataArray
+   {
+   public:
+      DataArray(BuildUtil *bld) : up(bld) { }
+
+      void setup(unsigned array, unsigned arrayIdx,
+                 uint32_t base, int len, int vecDim, int eltSize,
+                 DataFile file, int8_t fileIdx);
+
+      inline bool exists(ValueMap&, unsigned int i, unsigned int c);
+
+      Value *load(ValueMap&, int i, int c, Value *ptr);
+      void store(ValueMap&, int i, int c, Value *ptr, Value *value);
+      Value *acquire(ValueMap&, int i, int c);
+
+   private:
+      inline Value *lookup(ValueMap&, unsigned i, unsigned c);
+      inline Value *insert(ValueMap&, unsigned i, unsigned c, Value *v);
+
+      Symbol *mkSymbol(int i, int c);
+
+   private:
+      BuildUtil *up;
+      unsigned array, arrayIdx;
+
+      uint32_t baseAddr;
+      uint32_t arrayLen;
+      Symbol *baseSym;   // base address symbol (memory-backed arrays only)
+
+      uint8_t vecDim;
+      uint8_t eltSize; // in bytes
+
+      DataFile file;
+      bool regOnly;    // true if the array lives in registers, not memory
+   };
+
+   Symbol *mkSymbol(DataFile file, int8_t fileIndex,
+                    DataType ty, uint32_t baseAddress);
+
+   Symbol *mkSysVal(SVSemantic svName, uint32_t svIndex);
+
+private:
+   void init(Program *);
+   void addImmediate(ImmediateValue *);
+   inline unsigned int u32Hash(uint32_t);
+
+protected:
+   Program *prog;
+   Function *func;
+   Instruction *pos;  // insertion reference point (NULL = head/tail of bb)
+   BasicBlock *bb;
+   bool tail;         // insert after @pos / at tail of @bb
+
+#define NV50_IR_BUILD_IMM_HT_SIZE 256
+
+   // linear-probing cache of 32-bit immediates
+   ImmediateValue *imms[NV50_IR_BUILD_IMM_HT_SIZE];
+   unsigned int immCount;
+};
+
+// Hash for the immediate cache; 273 spreads small values before folding
+// into the table size.
+unsigned int BuildUtil::u32Hash(uint32_t u)
+{
+   return (u % 273) % NV50_IR_BUILD_IMM_HT_SIZE;
+}
+
+void BuildUtil::setProgram(Program *program)
+{
+   prog = program;
+}
+
+// Position at the head or tail of @block; subsequent inserts keep
+// prepending/appending there.
+void
+BuildUtil::setPosition(BasicBlock *block, bool atTail)
+{
+   bb = block;
+   prog = bb->getProgram();
+   func = bb->getFunction();
+   pos = NULL;
+   tail = atTail;
+}
+
+// Position relative to instruction @i; with @after, the position
+// advances past each newly inserted instruction.
+void
+BuildUtil::setPosition(Instruction *i, bool after)
+{
+   bb = i->bb;
+   prog = bb->getProgram();
+   func = bb->getFunction();
+   pos = i;
+   tail = after;
+   assert(bb);
+}
+
+// New reusable temporary of @size bytes in file @f.
+LValue *
+BuildUtil::getScratch(int size, DataFile f)
+{
+   LValue *lval = new_LValue(func, f);
+   lval->reg.size = size;
+   return lval;
+}
+
+// New single-assignment temporary (flagged ssa).
+LValue *
+BuildUtil::getSSA(int size, DataFile f)
+{
+   LValue *lval = new_LValue(func, f);
+   lval->ssa = 1;
+   lval->reg.size = size;
+   return lval;
+}
+
+// Insert @i at the current position (see setPosition overloads).
+void BuildUtil::insert(Instruction *i)
+{
+   if (!pos) {
+      tail ? bb->insertTail(i) : bb->insertHead(i);
+   } else {
+      if (tail) {
+         bb->insertAfter(pos, i);
+         pos = i;
+      } else {
+         bb->insertBefore(pos, i);
+      }
+   }
+}
+
+// Build a 0-source instruction; control-flow-like ops are marked fixed
+// so optimization passes won't delete them.
+Instruction *
+BuildUtil::mkOp(operation op, DataType ty, Value *dst)
+{
+   Instruction *insn = new_Instruction(func, op, ty);
+   insn->setDef(0, dst);
+   insert(insn);
+   if (op == OP_DISCARD || op == OP_EXIT ||
+       op == OP_JOIN ||
+       op == OP_QUADON || op == OP_QUADPOP ||
+       op == OP_EMIT || op == OP_RESTART)
+      insn->fixed = 1;
+   return insn;
+}
+
+// mkOpNv: same as mkOpN but return the destination LValue.
+inline LValue *
+BuildUtil::mkOp1v(operation op, DataType ty, Value *dst, Value *src)
+{
+   mkOp1(op, ty, dst, src);
+   return dst->asLValue();
+}
+
+inline LValue *
+BuildUtil::mkOp2v(operation op, DataType ty, Value *dst,
+                  Value *src0, Value *src1)
+{
+   mkOp2(op, ty, dst, src0, src1);
+   return dst->asLValue();
+}
+
+inline LValue *
+BuildUtil::mkOp3v(operation op, DataType ty, Value *dst,
+                  Value *src0, Value *src1, Value *src2)
+{
+   mkOp3(op, ty, dst, src0, src1, src2);
+   return dst->asLValue();
+}
+
+// Load @mem into a fresh scratch register and return it.
+inline LValue *
+BuildUtil::mkLoadv(DataType ty, Symbol *mem, Value *ptr)
+{
+   LValue *dst = getScratch();
+   mkLoad(ty, dst, mem, ptr);
+   return dst;
+}
+
+// Memory-backed elements always "exist"; register elements only after
+// they have been recorded in the map.
+bool
+BuildUtil::DataArray::exists(ValueMap &m, unsigned int i, unsigned int c)
+{
+   assert(i < arrayLen && c < vecDim);
+   return !regOnly || m.r.count(Location(array, arrayIdx, i, c));
+}
+
+// Fetch the value mapped to element (i, c), or NULL if none.
+Value *
+BuildUtil::DataArray::lookup(ValueMap &m, unsigned i, unsigned c)
+{
+   ValueMap::r_iterator it = m.r.find(Location(array, arrayIdx, i, c));
+   return it != m.r.end() ? it->second : NULL;
+}
+
+// Record @v as the value of element (i, c) and return it.
+Value *
+BuildUtil::DataArray::insert(ValueMap &m, unsigned i, unsigned c, Value *v)
+{
+   m.insert(Location(array, arrayIdx, i, c), v);
+   return v;
+}
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_BUILD_UTIL__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
new file mode 100644
index 0000000..752bad3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -0,0 +1,220 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_DRIVER_H__
+#define __NV50_IR_DRIVER_H__
+
+#include "pipe/p_shader_tokens.h"
+
+#include "tgsi/tgsi_util.h"
+#include "tgsi/tgsi_parse.h"
+#include "tgsi/tgsi_scan.h"
+
+/*
+ * This struct constitutes linkage information in TGSI terminology.
+ *
+ * It is created by the code generator and handed to the pipe driver
+ * for input/output slot assignment.
+ */
+struct nv50_ir_varying
+{
+   uint8_t slot[4]; /* native slots for xyzw (addresses in 32-bit words) */
+
+   unsigned mask : 4; /* vec4 mask */
+   unsigned linear : 1; /* linearly interpolated if true (and not flat) */
+   unsigned flat : 1; /* flat-shaded, no interpolation */
+   unsigned sc : 1; /* special colour interpolation mode (SHADE_MODEL) */
+   unsigned centroid : 1; /* sample at centroid (multisampling) */
+   unsigned patch : 1; /* patch constant value */
+   unsigned regular : 1; /* driver-specific meaning (e.g. input in sreg) */
+   unsigned input : 1; /* indicates direction of system values */
+   unsigned oread : 1; /* true if output is read from parallel TCP */
+
+   ubyte id; /* TGSI register index */
+   ubyte sn; /* TGSI semantic name */
+   ubyte si; /* TGSI semantic index */
+};
+
+/* source IR of the program handed to the code generator */
+#define NV50_PROGRAM_IR_TGSI 0
+#define NV50_PROGRAM_IR_SM4 1
+#define NV50_PROGRAM_IR_GLSL 2
+#define NV50_PROGRAM_IR_LLVM 3
+
+/* debug flag bits for nv50_ir_prog_info::dbgFlags (all zero in release) */
+#ifdef DEBUG
+# define NV50_IR_DEBUG_BASIC (1 << 0)
+# define NV50_IR_DEBUG_VERBOSE (1 << 1) /* same value as before (2), spelled as bit 1 */
+# define NV50_IR_DEBUG_REG_ALLOC (1 << 2)
+#else
+# define NV50_IR_DEBUG_BASIC 0
+# define NV50_IR_DEBUG_VERBOSE 0
+# define NV50_IR_DEBUG_REG_ALLOC 0
+#endif
+
+/* extra semantic names beyond TGSI's; note the gap between +0 and +4,
+ * presumably reserving indices for additional clip distances -- confirm
+ * against users of NV50_SEMANTIC_CLIPDISTANCE
+ */
+#define NV50_SEMANTIC_CLIPDISTANCE (TGSI_SEMANTIC_COUNT + 0)
+#define NV50_SEMANTIC_VIEWPORTINDEX (TGSI_SEMANTIC_COUNT + 4)
+#define NV50_SEMANTIC_LAYER (TGSI_SEMANTIC_COUNT + 5)
+#define NV50_SEMANTIC_INVOCATIONID (TGSI_SEMANTIC_COUNT + 6)
+#define NV50_SEMANTIC_TESSFACTOR (TGSI_SEMANTIC_COUNT + 7)
+#define NV50_SEMANTIC_TESSCOORD (TGSI_SEMANTIC_COUNT + 8)
+#define NV50_SEMANTIC_SAMPLEMASK (TGSI_SEMANTIC_COUNT + 9)
+#define NV50_SEMANTIC_COUNT (TGSI_SEMANTIC_COUNT + 10)
+
+/* tessellation partitioning modes */
+#define NV50_TESS_PART_FRACT_ODD 0
+#define NV50_TESS_PART_FRACT_EVEN 1
+#define NV50_TESS_PART_POW2 2
+#define NV50_TESS_PART_INTEGER 3
+
+#define NV50_PRIM_PATCHES PIPE_PRIM_MAX
+
+/* maps a program-relative label to its byte offset in the emitted code */
+struct nv50_ir_prog_symbol
+{
+   uint32_t label;
+   uint32_t offset;
+};
+
+/* chipset identifiers used to select the target ISA */
+#define NVISA_GF100_CHIPSET_C0 0xc0
+#define NVISA_GF100_CHIPSET_D0 0xd0
+#define NVISA_GK104_CHIPSET 0xe0
+#define NVISA_GK110_CHIPSET 0xf0
+
+// All information exchanged between the pipe driver and the code generator
+// for a single shader program: source IR in, machine code + metadata out.
+struct nv50_ir_prog_info
+{
+   uint16_t target; /* chipset (0x50, 0x84, 0xc0, ...) */
+
+   uint8_t type; /* PIPE_SHADER */
+
+   uint8_t optLevel; /* optimization level (0 to 3) */
+   uint8_t dbgFlags; /* NV50_IR_DEBUG_* */
+
+   struct {
+      int16_t maxGPR; /* may be -1 if none used */
+      int16_t maxOutput;
+      uint32_t tlsSpace; /* required local memory per thread */
+      uint32_t *code;
+      uint32_t codeSize;
+      uint8_t sourceRep; /* NV50_PROGRAM_IR */
+      const void *source;
+      void *relocData;
+      struct nv50_ir_prog_symbol *syms;
+      uint16_t numSyms;
+   } bin; /* generated binary and relocation/symbol data */
+
+   struct nv50_ir_varying sv[PIPE_MAX_SHADER_INPUTS];
+   struct nv50_ir_varying in[PIPE_MAX_SHADER_INPUTS];
+   struct nv50_ir_varying out[PIPE_MAX_SHADER_OUTPUTS];
+   uint8_t numInputs;
+   uint8_t numOutputs;
+   uint8_t numPatchConstants; /* also included in numInputs/numOutputs */
+   uint8_t numSysVals;
+
+   struct {
+      uint32_t *buf; /* for IMMEDIATE_ARRAY */
+      uint16_t bufSize; /* size of immediate array */
+      uint16_t count; /* count of inline immediates */
+      uint32_t *data; /* inline immediate data */
+      uint8_t *type; /* for each vec4 (128 bit) */
+   } immd;
+
+   /* per-stage properties; the active member is selected by 'type' */
+   union {
+      struct {
+         uint32_t inputMask[4]; /* mask of attributes read (1 bit per scalar) */
+      } vp;
+      struct {
+         uint8_t inputPatchSize;
+         uint8_t outputPatchSize;
+         uint8_t partitioning; /* PIPE_TESS_PART */
+         int8_t winding; /* +1 (clockwise) / -1 (counter-clockwise) */
+         uint8_t domain; /* PIPE_PRIM_{QUADS,TRIANGLES,LINES} */
+         uint8_t outputPrim; /* PIPE_PRIM_{TRIANGLES,LINES,POINTS} */
+      } tp;
+      struct {
+         uint8_t inputPrim;
+         uint8_t outputPrim;
+         unsigned instanceCount;
+         unsigned maxVertices;
+      } gp;
+      struct {
+         unsigned numColourResults;
+         boolean writesDepth;
+         boolean earlyFragTests;
+         boolean separateFragData;
+         boolean usesDiscard;
+      } fp;
+      struct {
+         uint32_t inputOffset; /* base address for user args */
+         uint32_t sharedOffset; /* reserved space in s[] */
+         uint32_t gridInfoBase; /* base address for NTID,NCTAID */
+      } cp;
+   } prop;
+
+   uint8_t numBarriers;
+
+   /* slot/address assignments for special inputs and outputs */
+   struct {
+      uint8_t clipDistance; /* index of first clip distance output */
+      uint8_t clipDistanceMask; /* mask of clip distances defined */
+      uint8_t cullDistanceMask; /* clip distance mode (1 bit per output) */
+      int8_t genUserClip; /* request user clip planes for ClipVertex */
+      uint16_t ucpBase; /* base address for UCPs */
+      uint8_t ucpCBSlot; /* constant buffer index of UCP data */
+      uint8_t pointSize; /* output index for PointSize */
+      uint8_t instanceId; /* system value index of InstanceID */
+      uint8_t vertexId; /* system value index of VertexID */
+      uint8_t edgeFlagIn;
+      uint8_t edgeFlagOut;
+      uint8_t fragDepth; /* output index of FragDepth */
+      uint8_t sampleMask; /* output index of SampleMask */
+      uint8_t backFaceColor[2]; /* input/output indices of back face colour */
+      uint8_t globalAccess; /* 1 for read, 2 for wr, 3 for rw */
+      boolean nv50styleSurfaces; /* generate gX[] access for raw buffers */
+      uint8_t resInfoCBSlot; /* cX[] used for tex handles, surface info */
+      uint16_t texBindBase; /* base address for tex handles (nve4) */
+      uint16_t suInfoBase; /* base address for surface info (nve4) */
+      uint8_t msInfoCBSlot; /* cX[] used for multisample info */
+      uint16_t msInfoBase; /* base address for multisample info */
+   } io;
+
+   /* driver callback to assign input/output locations */
+   int (*assignSlots)(struct nv50_ir_prog_info *);
+
+   void *driverPriv;
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int nv50_ir_generate_code(struct nv50_ir_prog_info *);
+
+extern void nv50_ir_relocate_code(void *relocData, uint32_t *code,
+ uint32_t codePos,
+ uint32_t libPos,
+ uint32_t dataPos);
+
+/* obtain code that will be shared among programs */
+extern void nv50_ir_get_target_library(uint32_t chipset,
+ const uint32_t **code, uint32_t *size);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // __NV50_IR_DRIVER_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
new file mode 100644
index 0000000..ac59187
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -0,0 +1,1682 @@
+/*
+ * Copyright 2012 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+// CodeEmitter for GK110 encoding of the Fermi/Kepler ISA.
+
+namespace nv50_ir {
+
+// Emits GK110 (Kepler) machine code for the nv50 IR.  Each emit* method
+// writes one 64-bit instruction word pair into 'code'.
+class CodeEmitterGK110 : public CodeEmitter
+{
+public:
+   CodeEmitterGK110(const TargetNVC0 *);
+
+   virtual bool emitInstruction(Instruction *);
+   virtual uint32_t getMinEncodingSize(const Instruction *) const;
+   virtual void prepareEmission(Function *);
+
+   inline void setProgramType(Program::Type pType) { progType = pType; }
+
+private:
+   const TargetNVC0 *targNVC0;
+
+   Program::Type progType;
+
+   const bool writeIssueDelays;
+
+private:
+   // encoding-form helpers: select the instruction layout
+   void emitForm_21(const Instruction *, uint32_t opc2, uint32_t opc1);
+   void emitForm_C(const Instruction *, uint32_t opc, uint8_t ctg);
+   void emitForm_L(const Instruction *, uint32_t opc, uint8_t ctg, Modifier);
+
+   void emitPredicate(const Instruction *);
+
+   // operand-field helpers
+   void setCAddress14(const ValueRef&);
+   void setShortImmediate(const Instruction *, const int s);
+   void setImmediate32(const Instruction *, const int s, Modifier);
+
+   void modNegAbsF32_3b(const Instruction *, const int s);
+
+   void emitCondCode(CondCode cc, int pos, uint8_t mask);
+   void emitInterpMode(const Instruction *);
+   void emitLoadStoreType(DataType ty, const int pos);
+   void emitCachingMode(CacheMode c, const int pos);
+
+   inline uint8_t getSRegEncoding(const ValueRef&);
+
+   void emitRoundMode(RoundMode, const int pos, const int rintPos);
+   void emitRoundModeF(RoundMode, const int pos);
+   void emitRoundModeI(RoundMode, const int pos);
+
+   void emitNegAbs12(const Instruction *);
+
+   void emitNOP(const Instruction *);
+
+   // per-operation emitters
+   void emitLOAD(const Instruction *);
+   void emitSTORE(const Instruction *);
+   void emitMOV(const Instruction *);
+
+   void emitINTERP(const Instruction *);
+   void emitPFETCH(const Instruction *);
+   void emitVFETCH(const Instruction *);
+   void emitEXPORT(const Instruction *);
+   void emitOUT(const Instruction *);
+
+   void emitUADD(const Instruction *);
+   void emitFADD(const Instruction *);
+   void emitIMUL(const Instruction *);
+   void emitFMUL(const Instruction *);
+   void emitIMAD(const Instruction *);
+   void emitISAD(const Instruction *);
+   void emitFMAD(const Instruction *);
+
+   void emitNOT(const Instruction *);
+   void emitLogicOp(const Instruction *, uint8_t subOp);
+   void emitPOPC(const Instruction *);
+   void emitINSBF(const Instruction *);
+   void emitShift(const Instruction *);
+
+   void emitSFnOp(const Instruction *, uint8_t subOp);
+
+   void emitCVT(const Instruction *);
+   void emitMINMAX(const Instruction *);
+   void emitPreOp(const Instruction *);
+
+   void emitSET(const CmpInstruction *);
+   void emitSLCT(const CmpInstruction *);
+   void emitSELP(const Instruction *);
+
+   void emitTEXBAR(const Instruction *);
+   void emitTEX(const TexInstruction *);
+   void emitTEXCSAA(const TexInstruction *);
+   void emitTXQ(const TexInstruction *);
+
+   void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask);
+
+   void emitFlow(const Instruction *);
+
+   // register/operand field writers; 'pos' is the bit position in 'code'
+   inline void defId(const ValueDef&, const int pos);
+   inline void srcId(const ValueRef&, const int pos);
+   inline void srcId(const ValueRef *, const int pos);
+   inline void srcId(const Instruction *, int s, const int pos);
+
+   inline void srcAddr32(const ValueRef&, const int pos); // address / 4
+
+   inline bool isLIMM(const ValueRef&, DataType ty, bool mod = false);
+};
+
+#define GK110_GPR_ZERO 255 /* reading GPR 255 yields zero */
+
+// Modifier macros: 'b' is a bit position written WITHOUT the 0x prefix;
+// "0x##b" token-pastes it back into a hex literal so one argument selects
+// both the code word (pos / 32) and the bit within it (pos % 32).
+#define NEG_(b, s) \
+   if (i->src(s).mod.neg()) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+#define ABS_(b, s) \
+   if (i->src(s).mod.abs()) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+
+#define NOT_(b, s) if (i->src(s).mod & Modifier(NV50_IR_MOD_NOT)) \
+   code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+
+#define FTZ_(b) if (i->ftz) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+
+#define SAT_(b) if (i->saturate) code[(0x##b) / 32] |= 1 << ((0x##b) % 32)
+
+#define RND_(b, t) emitRoundMode##t(i->rnd, 0x##b)
+
+// access the register/symbol data of a source or destination value
+#define SDATA(a) ((a).rep()->reg.data)
+#define DDATA(a) ((a).rep()->reg.data)
+
+// Write the register id of a source into the code at bit position 'pos';
+// a missing source encodes as GPR 255 (reads zero).
+void CodeEmitterGK110::srcId(const ValueRef& src, const int pos)
+{
+   code[pos / 32] |= (src.get() ? SDATA(src).id : GK110_GPR_ZERO) << (pos % 32);
+}
+
+void CodeEmitterGK110::srcId(const ValueRef *src, const int pos)
+{
+   code[pos / 32] |= (src ? SDATA(*src).id : GK110_GPR_ZERO) << (pos % 32);
+}
+
+void CodeEmitterGK110::srcId(const Instruction *insn, int s, int pos)
+{
+   int r = insn->srcExists(s) ? SDATA(insn->src(s)).id : GK110_GPR_ZERO;
+   code[pos / 32] |= r << (pos % 32);
+}
+
+// Write a memory offset divided by 4 (32-bit word address) at 'pos'.
+void CodeEmitterGK110::srcAddr32(const ValueRef& src, const int pos)
+{
+   code[pos / 32] |= (SDATA(src).offset >> 2) << (pos % 32);
+}
+
+void CodeEmitterGK110::defId(const ValueDef& def, const int pos)
+{
+   code[pos / 32] |= (def.get() ? DDATA(def).id : GK110_GPR_ZERO) << (pos % 32);
+}
+
+// True if the immediate does not fit the short-immediate field and needs
+// the long-immediate (LIMM) form: F32 keeps the high bits, so any low 12
+// bits set force LIMM; integers must fit in (sign-extended) 20 bits.
+// NOTE(review): the 'mod' parameter is currently unused -- confirm intent.
+bool CodeEmitterGK110::isLIMM(const ValueRef& ref, DataType ty, bool mod)
+{
+   const ImmediateValue *imm = ref.get()->asImm();
+
+   return imm && (imm->reg.data.u32 & ((ty == TYPE_F32) ? 0xfff : 0xfff00000));
+}
+
+// Encode a rounding mode at bit 'pos' (2 bits); the *I variants of the
+// round mode additionally set the round-to-integer bit at 'rintPos'.
+void
+CodeEmitterGK110::emitRoundMode(RoundMode rnd, const int pos, const int rintPos)
+{
+   bool rint = false;
+   uint8_t n;
+
+   switch (rnd) {
+   case ROUND_MI: rint = true; /* fall through */ case ROUND_M: n = 1; break;
+   case ROUND_PI: rint = true; /* fall through */ case ROUND_P: n = 2; break;
+   case ROUND_ZI: rint = true; /* fall through */ case ROUND_Z: n = 3; break;
+   default:
+      rint = rnd == ROUND_NI;
+      n = 0;
+      assert(rnd == ROUND_N || rnd == ROUND_NI);
+      break;
+   }
+   code[pos / 32] |= n << (pos % 32);
+   if (rint && rintPos >= 0)
+      code[rintPos / 32] |= 1 << (rintPos % 32);
+}
+
+// Encode a plain float rounding mode (no round-to-integer variants).
+void
+CodeEmitterGK110::emitRoundModeF(RoundMode rnd, const int pos)
+{
+   uint8_t n;
+
+   switch (rnd) {
+   case ROUND_M: n = 1; break;
+   case ROUND_P: n = 2; break;
+   case ROUND_Z: n = 3; break;
+   default:
+      n = 0;
+      assert(rnd == ROUND_N);
+      break;
+   }
+   code[pos / 32] |= n << (pos % 32);
+}
+
+// Encode a round-to-integer rounding mode.
+void
+CodeEmitterGK110::emitRoundModeI(RoundMode rnd, const int pos)
+{
+   uint8_t n;
+
+   switch (rnd) {
+   case ROUND_MI: n = 1; break;
+   case ROUND_PI: n = 2; break;
+   case ROUND_ZI: n = 3; break;
+   default:
+      n = 0;
+      assert(rnd == ROUND_NI);
+      break;
+   }
+   code[pos / 32] |= n << (pos % 32);
+}
+
+// Encode an IR condition code at bit 'pos'; 'mask' restricts the encoding
+// to the width of the target field (e.g. 0x7 integer, 0xf float compares).
+void CodeEmitterGK110::emitCondCode(CondCode cc, int pos, uint8_t mask)
+{
+   uint8_t n;
+
+   switch (cc) {
+   case CC_FL: n = 0x00; break;
+   case CC_LT: n = 0x01; break;
+   case CC_EQ: n = 0x02; break;
+   case CC_LE: n = 0x03; break;
+   case CC_GT: n = 0x04; break;
+   case CC_NE: n = 0x05; break;
+   case CC_GE: n = 0x06; break;
+   case CC_LTU: n = 0x09; break;
+   case CC_EQU: n = 0x0a; break;
+   case CC_LEU: n = 0x0b; break;
+   case CC_GTU: n = 0x0c; break;
+   case CC_NEU: n = 0x0d; break;
+   case CC_GEU: n = 0x0e; break;
+   case CC_TR: n = 0x0f; break;
+   case CC_NO: n = 0x10; break;
+   case CC_NC: n = 0x11; break;
+   case CC_NS: n = 0x12; break;
+   case CC_NA: n = 0x13; break;
+   case CC_A: n = 0x14; break;
+   case CC_S: n = 0x15; break;
+   case CC_C: n = 0x16; break;
+   case CC_O: n = 0x17; break;
+   default:
+      n = 0;
+      assert(!"invalid condition code");
+      break;
+   }
+   code[pos / 32] |= (n & mask) << (pos % 32);
+}
+
+// Encode the instruction's guard predicate at bits 18..21 of code[0]:
+// predicate register id, bit 3 for negation, or 7 (PT) for "always".
+void
+CodeEmitterGK110::emitPredicate(const Instruction *i)
+{
+   if (i->predSrc >= 0) {
+      srcId(i->src(i->predSrc), 18);
+      if (i->cc == CC_NOT_P)
+         code[0] |= 8 << 18; // negate
+      assert(i->getPredicate()->reg.file == FILE_PREDICATE);
+   } else {
+      code[0] |= 7 << 18;
+   }
+}
+
+// Encode a 14-bit constant-buffer word address, split across code[0]/code[1].
+void
+CodeEmitterGK110::setCAddress14(const ValueRef& src)
+{
+   const int32_t addr = src.get()->asSym()->reg.data.offset / 4;
+
+   code[0] |= (addr & 0x01ff) << 23;
+   code[1] |= (addr & 0x3e00) >> 9;
+}
+
+// Encode a 20-bit short immediate for source s.  Floats keep the high bits
+// (low mantissa bits must already be zero, asserted); integers must fit in
+// a sign-extended 20-bit value.
+void
+CodeEmitterGK110::setShortImmediate(const Instruction *i, const int s)
+{
+   const uint32_t u32 = i->getSrc(s)->asImm()->reg.data.u32;
+   const uint64_t u64 = i->getSrc(s)->asImm()->reg.data.u64;
+
+   if (i->sType == TYPE_F32) {
+      assert(!(u32 & 0x00000fff));
+      code[0] |= ((u32 & 0x001ff000) >> 12) << 23;
+      code[1] |= ((u32 & 0x7fe00000) >> 21);
+      code[1] |= ((u32 & 0x80000000) >> 4);
+   } else
+   if (i->sType == TYPE_F64) {
+      assert(!(u64 & 0x00000fffffffffffULL));
+      code[0] |= ((u64 & 0x001ff00000000000ULL) >> 44) << 23;
+      code[1] |= ((u64 & 0x7fe0000000000000ULL) >> 53);
+      code[1] |= ((u64 & 0x8000000000000000ULL) >> 36);
+   } else {
+      assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000);
+      code[0] |= (u32 & 0x001ff) << 23;
+      code[1] |= (u32 & 0x7fe00) >> 9;
+      code[1] |= (u32 & 0x80000) << 8;
+   }
+}
+
+// Encode a full 32-bit (long) immediate for source s, applying 'mod'
+// (e.g. negation) to the value before splitting it across code[0]/code[1].
+void
+CodeEmitterGK110::setImmediate32(const Instruction *i, const int s,
+                                 Modifier mod)
+{
+   uint32_t u32 = i->getSrc(s)->asImm()->reg.data.u32;
+
+   if (mod) {
+      ImmediateValue imm(i->getSrc(s)->asImm(), i->sType);
+      mod.applyTo(imm);
+      u32 = imm.reg.data.u32;
+   }
+
+   code[0] |= u32 << 23;
+   code[1] |= u32 >> 9;
+}
+
+// Long-immediate form: dst + up to 3 sources, where an immediate source is
+// encoded as a full 32 bits via setImmediate32 (with modifier 'mod').
+void
+CodeEmitterGK110::emitForm_L(const Instruction *i, uint32_t opc, uint8_t ctg,
+                             Modifier mod)
+{
+   code[0] = ctg;
+   code[1] = opc << 20;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+
+   for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+      switch (i->src(s).getFile()) {
+      case FILE_GPR:
+         srcId(i->src(s), s ? 42 : 10);
+         break;
+      case FILE_IMMEDIATE:
+         setImmediate32(i, s, mod);
+         break;
+      default:
+         break;
+      }
+   }
+}
+
+
+// Single-source form for ops whose operand is a GPR or c[] address
+// (e.g. conversions); the operand kind selects bits 28..31 of code[1].
+void
+CodeEmitterGK110::emitForm_C(const Instruction *i, uint32_t opc, uint8_t ctg)
+{
+   code[0] = ctg;
+   code[1] = opc << 20;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+
+   switch (i->src(0).getFile()) {
+   case FILE_MEMORY_CONST:
+      code[1] |= 0x4 << 28;
+      setCAddress14(i->src(0));
+      break;
+   case FILE_GPR:
+      code[1] |= 0xc << 28;
+      srcId(i->src(0), 23);
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+// 0x2 for GPR, c[] and 0x1 for short immediate
+// General 2/3-source form: opc2 is the opcode for the register/c[] variant,
+// opc1 for the short-immediate variant (chosen by looking at src(1)).
+void
+CodeEmitterGK110::emitForm_21(const Instruction *i, uint32_t opc2,
+                              uint32_t opc1)
+{
+   const bool imm = i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE;
+
+   // src(1) moves to the c[] slot position when src(2) is in constant memory
+   int s1 = 23;
+   if (i->srcExists(2) && i->src(2).getFile() == FILE_MEMORY_CONST)
+      s1 = 42;
+
+   if (imm) {
+      code[0] = 0x1;
+      code[1] = opc1 << 20;
+   } else {
+      code[0] = 0x2;
+      code[1] = (0xc << 28) | (opc2 << 20);
+   }
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+
+   for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+      switch (i->src(s).getFile()) {
+      case FILE_MEMORY_CONST:
+         code[1] &= (s == 2) ? ~(0x4 << 28) : ~(0x8 << 28);
+         setCAddress14(i->src(s));
+         code[1] |= i->getSrc(s)->reg.fileIndex << 5;
+         break;
+      case FILE_IMMEDIATE:
+         setShortImmediate(i, s);
+         break;
+      case FILE_GPR:
+         srcId(i->src(s), s ? ((s == 2) ? 42 : s1) : 10);
+         break;
+      default:
+         // ignore here, can be predicate or flags, but must not be address
+         break;
+      }
+   }
+   // 0x0 = invalid
+   // 0xc = rrr
+   // 0x8 = rrc
+   // 0x4 = rcr
+   assert(imm || (code[1] & (0xc << 28)));
+}
+
+// Apply abs/neg modifiers of source s to the 3b (c[] operand) mod bits:
+// abs clears bit 59, neg toggles it.
+inline void
+CodeEmitterGK110::modNegAbsF32_3b(const Instruction *i, const int s)
+{
+   if (i->src(s).mod.abs()) code[1] &= ~(1 << 27);
+   if (i->src(s).mod.neg()) code[1] ^= (1 << 27);
+}
+
+// Emit a NOP; with an Instruction the predicate is honoured, with NULL a
+// fully unconditional filler NOP is produced.
+void
+CodeEmitterGK110::emitNOP(const Instruction *i)
+{
+   code[0] = 0x00003c02;
+   code[1] = 0x85800000;
+
+   if (i)
+      emitPredicate(i);
+   else
+      code[0] = 0x001c3c02;
+}
+
+// Emit fused multiply-add: dst = src0 * src1 + src2.
+void
+CodeEmitterGK110::emitFMAD(const Instruction *i)
+{
+   assert(!isLIMM(i->src(1), TYPE_F32));
+
+   emitForm_21(i, 0x0c0, 0x940);
+
+   NEG_(34, 2);
+   SAT_(35);
+   RND_(36, F);
+   FTZ_(38);
+
+   // negation of the product: the two source negations cancel out
+   bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   // (code[0] & 0x1) set means the short-immediate form was selected;
+   // the negate bit lives at a different position there
+   if (code[0] & 0x1) {
+      if (neg1)
+         code[1] ^= 1 << 27;
+   } else
+   if (neg1) {
+      code[1] |= 1 << 19;
+   }
+}
+
+// Emit floating-point multiply, using the long-immediate form when the
+// immediate does not fit the short field.
+void
+CodeEmitterGK110::emitFMUL(const Instruction *i)
+{
+   bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   assert(i->postFactor >= -3 && i->postFactor <= 3);
+
+   if (isLIMM(i->src(1), TYPE_F32)) {
+      emitForm_L(i, 0x200, 0x2, Modifier(0));
+
+      FTZ_(38);
+      SAT_(3a);
+      if (neg)
+         code[1] ^= 1 << 22;
+
+      // post-factor cannot be encoded in the LIMM form
+      assert(i->postFactor == 0);
+   } else {
+      emitForm_21(i, 0x234, 0xc34);
+
+      RND_(2a, F);
+      FTZ_(2f);
+      SAT_(35);
+
+      if (code[0] & 0x1) {
+         if (neg)
+            code[1] ^= 1 << 27;
+      } else
+      if (neg) {
+         code[1] |= 1 << 19;
+      }
+   }
+}
+
+// Emit integer multiply (optionally the high 32 bits of the product).
+void
+CodeEmitterGK110::emitIMUL(const Instruction *i)
+{
+   assert(!i->src(0).mod.neg() && !i->src(1).mod.neg());
+   assert(!i->src(0).mod.abs() && !i->src(1).mod.abs());
+
+   if (isLIMM(i->src(1), TYPE_S32)) {
+      emitForm_L(i, 0x280, 2, Modifier(0));
+
+      // MUL_HIGH cannot take a long immediate
+      assert(i->subOp != NV50_IR_SUBOP_MUL_HIGH);
+
+      if (i->sType == TYPE_S32)
+         code[1] |= 3 << 25;
+   } else {
+      emitForm_21(i, 0x21c, 0xc1c);
+
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[1] |= 1 << 10;
+      if (i->sType == TYPE_S32)
+         code[1] |= 3 << 11;
+   }
+}
+
+// Emit floating-point add; the src(1) abs/neg modifier position depends on
+// whether the short-immediate or register/c[] form was chosen.
+void
+CodeEmitterGK110::emitFADD(const Instruction *i)
+{
+   if (isLIMM(i->src(1), TYPE_F32)) {
+      // rounding and saturation are not encodable in the LIMM form
+      assert(i->rnd == ROUND_N);
+      assert(!i->saturate);
+
+      // src(1)'s modifier is folded into the 32-bit immediate itself
+      emitForm_L(i, 0x400, 0, i->src(1).mod);
+
+      FTZ_(3a);
+      NEG_(3b, 0);
+      ABS_(39, 0);
+   } else {
+      emitForm_21(i, 0x22c, 0xc2c);
+
+      FTZ_(2f);
+      RND_(2a, F);
+      ABS_(31, 0);
+      NEG_(33, 0);
+
+      if (code[0] & 0x1) {
+         modNegAbsF32_3b(i, 1);
+      } else {
+         ABS_(34, 1);
+         NEG_(30, 1);
+      }
+   }
+}
+
+// Emit integer add/subtract, including carry in/out handling.
+void
+CodeEmitterGK110::emitUADD(const Instruction *i)
+{
+   // addOp bit 0 = negate src1, bit 1 = negate src0
+   uint8_t addOp = (i->src(0).mod.neg() << 1) | i->src(1).mod.neg();
+
+   if (i->op == OP_SUB)
+      addOp ^= 1;
+
+   assert(!i->src(0).mod.abs() && !i->src(1).mod.abs());
+
+   if (isLIMM(i->src(1), TYPE_S32)) {
+      emitForm_L(i, 0x400, 1, Modifier((addOp & 1) ? NV50_IR_MOD_NEG : 0));
+
+      if (addOp & 2)
+         code[1] |= 1 << 27;
+
+      // carry out / carry in are not available in the LIMM form
+      assert(!i->defExists(1));
+      assert(i->flagsSrc < 0);
+
+      SAT_(39);
+   } else {
+      emitForm_21(i, 0x208, 0xc08);
+
+      assert(addOp != 3); // would be add-plus-one
+
+      code[1] |= addOp << 19;
+
+      if (i->defExists(1))
+         code[1] |= 1 << 18; // write carry
+      if (i->flagsSrc >= 0)
+         code[1] |= 1 << 14; // add carry
+
+      SAT_(35);
+   }
+}
+
+// TODO: shl-add
+// Emit integer multiply-add: dst = src0 * src1 + src2.
+void
+CodeEmitterGK110::emitIMAD(const Instruction *i)
+{
+   // bit 0: negate product (src0/src1 negations cancel), bit 1: negate src2
+   uint8_t addOp =
+      (i->src(2).mod.neg() << 1) | (i->src(0).mod.neg() ^ i->src(1).mod.neg());
+
+   emitForm_21(i, 0x100, 0xa00);
+
+   assert(addOp != 3);
+   code[1] |= addOp << 26;
+
+   if (i->sType == TYPE_S32)
+      code[1] |= (1 << 19) | (1 << 24);
+
+   if (code[0] & 0x1) {
+      // short-immediate form: MUL_HIGH not encodable here
+      assert(!i->subOp);
+      SAT_(39);
+   } else {
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[1] |= 1 << 25;
+      SAT_(35);
+   }
+}
+
+// Emit sum of absolute differences.
+void
+CodeEmitterGK110::emitISAD(const Instruction *i)
+{
+   assert(i->dType == TYPE_S32 || i->dType == TYPE_U32);
+
+   emitForm_21(i, 0x1fc, 0xb74);
+
+   if (i->dType == TYPE_S32)
+      code[1] |= 1 << 19;
+}
+
+// Emit bitwise NOT as a logic op with a zero first operand:
+// logop(mov2) dst, 0, not src.
+void
+CodeEmitterGK110::emitNOT(const Instruction *i)
+{
+   code[0] = 0x0003fc02; // logop(mov2) dst, 0, not src
+   code[1] = 0x22003800;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+
+   // the single operand may come from a GPR or from constant memory
+   switch (i->src(0).getFile()) {
+   case FILE_GPR:
+      code[1] |= 0xc << 28;
+      srcId(i->src(0), 23);
+      break;
+   case FILE_MEMORY_CONST:
+      code[1] |= 0x4 << 28;
+      setCAddress14(i->src(0)); // was src(1); OP_NOT only has src(0)
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+// Emit AND/OR/XOR; 'subOp' selects the operation.  The sub-op field moves
+// depending on whether the long-immediate form is used.
+void
+CodeEmitterGK110::emitLogicOp(const Instruction *i, uint8_t subOp)
+{
+   assert(!(i->src(0).mod & Modifier(NV50_IR_MOD_NOT))); // XXX: find me
+
+   if (isLIMM(i->src(1), TYPE_S32)) {
+      emitForm_L(i, 0x200, 0, i->src(1).mod);
+      code[1] |= subOp << 24;
+   } else {
+      emitForm_21(i, 0x220, 0xc20);
+      code[1] |= subOp << 12;
+      NOT_(2b, 1);
+   }
+   // short-immediate form cannot encode a NOT on src(1)
+   assert(!(code[0] & 0x1) || !(i->src(1).mod & Modifier(NV50_IR_MOD_NOT)));
+}
+
+// Emit population count.
+void
+CodeEmitterGK110::emitPOPC(const Instruction *i)
+{
+   assert(!isLIMM(i->src(1), TYPE_S32, true));
+
+   emitForm_21(i, 0x204, 0xc04);
+
+   NOT_(2a, 0);
+   if (!(code[0] & 0x1))
+      NOT_(2b, 1);
+}
+
+// Emit bitfield insert.
+void
+CodeEmitterGK110::emitINSBF(const Instruction *i)
+{
+   emitForm_21(i, 0x1f8, 0xb78);
+}
+
+// Emit shifts: arithmetic right shift for signed SHR, otherwise the
+// funnel-shift instructions (RSHF/LSHF) with a zero second source.
+void
+CodeEmitterGK110::emitShift(const Instruction *i)
+{
+   const bool sar = i->op == OP_SHR && isSignedType(i->sType);
+
+   if (sar) {
+      emitForm_21(i, 0x214, 0x014);
+      code[1] |= 1 << 19;
+   } else
+   if (i->op == OP_SHR) {
+      // this is actually RSHF
+      emitForm_21(i, 0x27c, 0x87c);
+      code[1] |= GK110_GPR_ZERO << 10;
+   } else {
+      // this is actually LSHF
+      emitForm_21(i, 0x1fc, 0xb7c);
+      code[1] |= GK110_GPR_ZERO << 10;
+   }
+
+   if (i->subOp == NV50_IR_SUBOP_SHIFT_WRAP) {
+      if (!sar)
+         code[1] |= 1 << 21;
+      // XXX: find wrap modifier for SHR S32
+   }
+}
+
+// Emit the range-reduction pre-ops (PRESIN/PREEX2) for the special
+// function unit.
+void
+CodeEmitterGK110::emitPreOp(const Instruction *i)
+{
+   emitForm_21(i, 0x248, -1);
+
+   if (i->op == OP_PREEX2)
+      code[1] |= 1 << 10;
+
+   NEG_(30, 0);
+   ABS_(34, 0);
+}
+
+// Emit a special-function op (rcp/rsq/sin/cos/ex2/lg2); 'subOp' selects
+// the function.
+void
+CodeEmitterGK110::emitSFnOp(const Instruction *i, uint8_t subOp)
+{
+   code[0] = 0x00000002 | (subOp << 23);
+   code[1] = 0x84000000;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+   srcId(i->src(0), 10);
+
+   NEG_(33, 0);
+   ABS_(31, 0);
+
+   // XXX: find saturate
+}
+
+// Emit MIN/MAX; the opcode pair is chosen by destination type and the
+// min-vs-max selection is encoded through the predicate field.
+void
+CodeEmitterGK110::emitMINMAX(const Instruction *i)
+{
+   uint32_t op2, op1;
+
+   switch (i->dType) {
+   case TYPE_U32:
+   case TYPE_S32:
+      op2 = 0x210;
+      op1 = 0xc10;
+      break;
+   case TYPE_F32:
+      op2 = 0x230;
+      op1 = 0xc30;
+      break;
+   case TYPE_F64:
+      op2 = 0x228;
+      op1 = 0xc28;
+      break;
+   default:
+      assert(0);
+      op2 = 0;
+      op1 = 0;
+      break;
+   }
+   emitForm_21(i, op2, op1);
+
+   if (i->dType == TYPE_S32)
+      code[1] |= 1 << 19;
+   code[1] |= (i->op == OP_MIN) ? 0x1c00 : 0x3c00; // [!]pt
+
+   FTZ_(2f);
+   ABS_(31, 0);
+   NEG_(33, 0);
+   if (code[0] & 0x1) {
+      modNegAbsF32_3b(i, 1);
+   } else {
+      ABS_(34, 1);
+      NEG_(30, 1);
+   }
+}
+
+// Emit conversions (F2F/F2I/I2F/I2I) and the ops lowered onto them
+// (CEIL/FLOOR/TRUNC/SAT/NEG/ABS), selecting rounding and modifier bits.
+void
+CodeEmitterGK110::emitCVT(const Instruction *i)
+{
+   const bool f2f = isFloatType(i->dType) && isFloatType(i->sType);
+   const bool f2i = !isFloatType(i->dType) && isFloatType(i->sType);
+   const bool i2f = isFloatType(i->dType) && !isFloatType(i->sType);
+
+   bool sat = i->saturate;
+   bool abs = i->src(0).mod.abs();
+   bool neg = i->src(0).mod.neg();
+
+   RoundMode rnd = i->rnd;
+
+   // fold unary ops into conversion modifiers
+   switch (i->op) {
+   case OP_CEIL: rnd = f2f ? ROUND_PI : ROUND_P; break;
+   case OP_FLOOR: rnd = f2f ? ROUND_MI : ROUND_M; break;
+   case OP_TRUNC: rnd = f2f ? ROUND_ZI : ROUND_Z; break;
+   case OP_SAT: sat = true; break;
+   case OP_NEG: neg = !neg; break;
+   case OP_ABS: abs = true; neg = false; break;
+   default:
+      break;
+   }
+
+   uint32_t op;
+
+   if (f2f) op = 0x254;
+   else if (f2i) op = 0x258;
+   else if (i2f) op = 0x25c;
+   else op = 0x260;
+
+   emitForm_C(i, op, 0x2);
+
+   FTZ_(2f);
+   if (neg) code[1] |= 1 << 16;
+   if (abs) code[1] |= 1 << 20;
+   if (sat) code[1] |= 1 << 21;
+
+   emitRoundMode(rnd, 32 + 10, f2f ? (32 + 13) : -1);
+
+   // operand sizes in log2 bytes, plus signedness flags
+   code[0] |= typeSizeofLog2(i->dType) << 10;
+   code[0] |= typeSizeofLog2(i->sType) << 12;
+
+   if (isSignedIntType(i->dType))
+      code[0] |= 0x4000;
+   if (isSignedIntType(i->sType))
+      code[0] |= 0x8000;
+}
+
+// Emit comparisons; writes either a predicate register or a GPR (0/~0 or
+// 1.0f), optionally combined with a third boolean source (SET_AND/OR/XOR).
+void
+CodeEmitterGK110::emitSET(const CmpInstruction *i)
+{
+   uint16_t op1, op2;
+
+   if (i->def(0).getFile() == FILE_PREDICATE) {
+      switch (i->sType) {
+      case TYPE_F32: op2 = 0x1d8; op1 = 0xb58; break;
+      case TYPE_F64: op2 = 0x1c0; op1 = 0xb40; break;
+      default:
+         op2 = 0x1b0;
+         op1 = 0xb30;
+         break;
+      }
+      emitForm_21(i, op2, op1);
+
+      NEG_(2e, 0);
+      ABS_(9, 0);
+      if (!(code[0] & 0x1)) {
+         NEG_(8, 1);
+         ABS_(2f, 1);
+      } else {
+         modNegAbsF32_3b(i, 1);
+      }
+      FTZ_(32);
+
+      // normal DST field is negated predicate result
+      code[0] = (code[0] & ~0xfc) | ((code[0] << 3) & 0xe0);
+      if (i->defExists(1))
+         defId(i->def(1), 2);
+      else
+         code[0] |= 0x1c;
+   } else {
+      switch (i->sType) {
+      case TYPE_F32: op2 = 0x000; op1 = 0x820; break;
+      case TYPE_F64: op2 = 0x080; op1 = 0x900; break;
+      default:
+         op2 = 0x1a8;
+         op1 = 0xb28;
+         break;
+      }
+      emitForm_21(i, op2, op1);
+
+      NEG_(2e, 0);
+      ABS_(39, 0);
+      if (!(code[0] & 0x1)) {
+         NEG_(38, 1);
+         ABS_(2f, 1);
+      } else {
+         modNegAbsF32_3b(i, 1);
+      }
+      FTZ_(3a);
+   }
+   if (i->sType == TYPE_S32)
+      code[1] |= 1 << 19;
+
+   // combine the compare result with src(2) for SET_AND/OR/XOR
+   if (i->op != OP_SET) {
+      switch (i->op) {
+      case OP_SET_AND: code[1] |= 0x0 << 16; break;
+      case OP_SET_OR: code[1] |= 0x1 << 16; break;
+      case OP_SET_XOR: code[1] |= 0x2 << 16; break;
+      default:
+         assert(0);
+         break;
+      }
+      srcId(i->src(2), 0x2a);
+   } else {
+      code[1] |= 0x7 << 10;
+   }
+   emitCondCode(i->setCond,
+                isFloatType(i->sType) ? 0x33 : 0x34,
+                isFloatType(i->sType) ? 0xf : 0x7);
+}
+
+// Emit SLCT: dst = (src2 <cc> 0) ? src0 : src1; a negated src2 is handled
+// by reversing the condition code.
+void
+CodeEmitterGK110::emitSLCT(const CmpInstruction *i)
+{
+   CondCode cc = i->setCond;
+   if (i->src(2).mod.neg())
+      cc = reverseCondCode(cc);
+
+   if (i->dType == TYPE_F32) {
+      emitForm_21(i, 0x1d0, 0xb50);
+      FTZ_(32);
+      emitCondCode(cc, 0x33, 0xf);
+   } else {
+      emitForm_21(i, 0x1a4, 0xb20);
+      emitCondCode(cc, 0x34, 0x7);
+   }
+}
+
+// Emit SELP: dst = predicate(src2) ? src0 : src1; CC_NOT_P and a NOT
+// modifier on src2 cancel each other out.
+void CodeEmitterGK110::emitSELP(const Instruction *i)
+{
+   emitForm_21(i, 0x250, 0x050);
+
+   if ((i->cc == CC_NOT_P) ^ (bool)(i->src(2).mod & Modifier(NV50_IR_MOD_NOT)))
+      code[1] |= 1 << 13;
+}
+
+// Emit a texture barrier; subOp is the number of outstanding tex results
+// to wait below.
+void CodeEmitterGK110::emitTEXBAR(const Instruction *i)
+{
+   code[0] = 0x00000002 | (i->subOp << 23);
+   code[1] = 0x77000000;
+
+   emitPredicate(i);
+}
+
+void CodeEmitterGK110::emitTEXCSAA(const TexInstruction *i)
+{
+   emitNOP(i); // TODO
+}
+
+// True if the following instruction is a texture op that does not read
+// this one's result, i.e. both can be issued without a dependency bar.
+static inline bool
+isNextIndependentTex(const TexInstruction *i)
+{
+   if (!i->next || !isTextureOp(i->next->op))
+      return false;
+   if (i->getDef(0)->interfers(i->next->getSrc(0)))
+      return false;
+   return !i->next->srcExists(1) || !i->getDef(0)->interfers(i->next->getSrc(1));
+}
+
+// Emit texture fetch instructions (TEX/TXB/TXL/TXD...); encoding differs
+// for indirect sampler access.  Several variants are still incomplete
+// (see the commented-out sections below).
+void
+CodeEmitterGK110::emitTEX(const TexInstruction *i)
+{
+   const bool ind = i->tex.rIndirectSrc >= 0;
+
+   if (ind) {
+      code[0] = 0x00000002;
+      switch (i->op) {
+      case OP_TXD:
+         code[1] = 0x7e000000;
+         break;
+      default:
+         code[1] = 0x7d800000;
+         break;
+      }
+   } else {
+      switch (i->op) {
+      case OP_TXD:
+         code[0] = 0x00000002;
+         code[1] = 0x76000000;
+         break;
+      default:
+         code[0] = 0x00000001;
+         code[1] = 0x60000000;
+         break;
+      }
+      code[1] |= i->tex.r << 15;
+   }
+
+   // issue mode: 't' if the next tex op is independent, else 'p'
+   code[1] |= isNextIndependentTex(i) ? 0x1 : 0x2; // t : p mode
+
+   // if (i->tex.liveOnly)
+   //    ?
+
+   switch (i->op) {
+   case OP_TEX: break;
+   case OP_TXB: code[1] |= 0x2000; break;
+   case OP_TXL: code[1] |= 0x3000; break;
+   case OP_TXF: break; // XXX
+   case OP_TXG: break; // XXX
+   case OP_TXD: break;
+   default:
+      assert(!"invalid texture op");
+      break;
+   }
+   /*
+   if (i->op == OP_TXF) {
+      if (!i->tex.levelZero)
+         code[1] |= 0x02000000;
+   } else */
+   if (i->tex.levelZero) {
+      code[1] |= 0x1000;
+   }
+
+   // if (i->op != OP_TXD && i->tex.derivAll)
+   //    code[1] |= 1 << 13;
+
+   emitPredicate(i);
+
+   code[1] |= i->tex.mask << 2;
+
+   const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2)
+
+   defId(i->def(0), 2);
+   srcId(i->src(0), 10);
+   srcId(i, src1, 23);
+
+   // if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5;
+
+   // texture target:
+   code[1] |= (i->tex.target.isCube() ? 3 : (i->tex.target.getDim() - 1)) << 7;
+   if (i->tex.target.isArray())
+      code[1] |= 0x40;
+   // if (i->tex.target.isShadow())
+   //    ?
+   // if (i->tex.target == TEX_TARGET_2D_MS ||
+   //     i->tex.target == TEX_TARGET_2D_MS_ARRAY)
+   //    ?
+
+   if (i->srcExists(src1) && i->src(src1).getFile() == FILE_IMMEDIATE) {
+      // ?
+   }
+
+   // if (i->tex.useOffsets)
+   //    ?
+}
+
+// Texture size query: not yet implemented, emitted as NOP.
+void
+CodeEmitterGK110::emitTXQ(const TexInstruction *i)
+{
+   emitNOP(i); // TODO
+}
+
+// Quad lane ops: not yet implemented, emitted as NOP.
+void
+CodeEmitterGK110::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask)
+{
+   emitNOP(i); // TODO
+}
+
+// Emit control-flow instructions (branch/call/exit/...).  Several opcodes
+// are marked as guesses pending hardware verification.
+void
+CodeEmitterGK110::emitFlow(const Instruction *i)
+{
+   const FlowInstruction *f = i->asFlow();
+
+   unsigned mask; // bit 0: predicate, bit 1: target
+
+   code[0] = 0x00000000;
+
+   switch (i->op) {
+   case OP_BRA:
+      code[1] = f->absolute ? 0x00000 : 0x12000000; // XXX
+      // if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST)
+      //    code[0] |= 0x4000;
+      mask = 3;
+      break;
+   case OP_CALL:
+      code[1] = f->absolute ? 0x00000 : 0x13000000; // XXX
+      // if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST)
+      //    code[0] |= 0x4000;
+      mask = 2;
+      break;
+
+   case OP_EXIT: code[1] = 0x18000000; mask = 1; break;
+   case OP_RET: code[1] = 0x19000000; mask = 1; break;
+   case OP_DISCARD: code[1] = 0x19800000; mask = 1; break; // XXX: guess
+   case OP_BREAK: code[1] = 0x1a800000; mask = 1; break; // XXX: guess
+   case OP_CONT: code[1] = 0x1b000000; mask = 1; break; // XXX: guess
+
+   case OP_JOINAT: code[1] = 0x14800000; mask = 2; break;
+   case OP_PREBREAK: code[1] = 0x15000000; mask = 2; break; // XXX: guess
+   case OP_PRECONT: code[1] = 0x15800000; mask = 2; break; // XXX: guess
+   case OP_PRERET: code[1] = 0x16000000; mask = 2; break; // XXX: guess
+
+   case OP_QUADON: code[1] = 0x1c000000; mask = 0; break; // XXX: guess
+   case OP_QUADPOP: code[1] = 0x1c800000; mask = 0; break; // XXX: guess
+   case OP_BRKPT: code[1] = 0x1d000000; mask = 0; break; // XXX: guess
+   default:
+      assert(!"invalid flow operation");
+      return;
+   }
+
+   if (mask & 1) {
+      emitPredicate(i);
+      if (i->flagsSrc < 0)
+         code[0] |= 0x3c;
+   }
+
+   if (!f)
+      return;
+
+   // TODO
+   /*
+   if (f->allWarp)
+      code[0] |= 1 << 15;
+   if (f->limit)
+      code[0] |= 1 << 16;
+   */
+
+   // encode the branch/call target as a 24-bit PC value split across words
+   if (f->op == OP_CALL) {
+      if (f->builtin) {
+         // library routine: resolved later through relocations
+         assert(f->absolute);
+         uint32_t pcAbs = targNVC0->getBuiltinOffset(f->target.builtin);
+         addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xff800000, 23);
+         addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x007fffff, -9);
+      } else {
+         assert(!f->absolute);
+         int32_t pcRel = f->target.fn->binPos - (codeSize + 8);
+         code[0] |= (pcRel & 0x1ff) << 23;
+         code[1] |= (pcRel >> 9) & 0x7fff;
+      }
+   } else
+   if (mask & 2) {
+      int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+      // currently we don't want absolute branches
+      assert(!f->absolute);
+      code[0] |= (pcRel & 0x1ff) << 23;
+      code[1] |= (pcRel >> 9) & 0x7fff;
+   }
+}
+
+// Primitive vertex fetch: GK110 encoding not implemented yet, emit a no-op.
+void
+CodeEmitterGK110::emitPFETCH(const Instruction *i)
+{
+   emitNOP(i); // TODO
+}
+
+// Fetch from a shader input/output attribute. The attribute offset is split
+// across both code words (low 9 bits in code[0], remainder in code[1]).
+void
+CodeEmitterGK110::emitVFETCH(const Instruction *i)
+{
+   uint32_t offset = i->src(0).get()->reg.data.offset;
+
+   code[0] = 0x00000002 | (offset << 23);
+   code[1] = 0x7ec00000 | (offset >> 9);
+
+#if 0
+   if (i->perPatch)
+      code[0] |= 0x100;
+   if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
+      code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads
+#endif
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+   srcId(i->src(0).getIndirect(0), 10);
+   srcId(i->src(0).getIndirect(1), 32 + 10); // vertex address
+}
+
+// Store to a shader output attribute; source 1 holds the value to export.
+// Offset encoding mirrors emitVFETCH (split across code[0]/code[1]).
+void
+CodeEmitterGK110::emitEXPORT(const Instruction *i)
+{
+   uint32_t offset = i->src(0).get()->reg.data.offset;
+
+   code[0] = 0x00000002 | (offset << 23);
+   code[1] = 0x7f000000 | (offset >> 9);
+
+#if 0
+   if (i->perPatch)
+      code[0] |= 0x100;
+#endif
+
+   emitPredicate(i);
+
+   assert(i->src(1).getFile() == FILE_GPR);
+
+   srcId(i->src(0).getIndirect(0), 10);
+   srcId(i->src(0).getIndirect(1), 32 + 10); // vertex base address
+   srcId(i->src(1), 2);
+}
+
+// Geometry shader EMIT/RESTART: GK110 encoding not implemented, emit a no-op.
+void
+CodeEmitterGK110::emitOUT(const Instruction *i)
+{
+   emitNOP(i); // TODO
+}
+
+// Encode the interpolation mode bits (i->ipa) into code[1] bits 21+.
+void
+CodeEmitterGK110::emitInterpMode(const Instruction *i)
+{
+   code[1] |= i->ipa << 21; // TODO: INTERP_SAMPLEID
+}
+
+// Emit an attribute interpolation (LINTERP/PINTERP). PINTERP carries a
+// perspective divisor in src(1); 0xff in a register field means "unused".
+void
+CodeEmitterGK110::emitINTERP(const Instruction *i)
+{
+   const uint32_t base = i->getSrc(0)->reg.data.offset;
+
+   code[0] = 0x00000002 | (base << 31);
+   code[1] = 0x74800000 | (base >> 1);
+
+   if (i->saturate)
+      code[1] |= 1 << 18;
+
+   if (i->op == OP_PINTERP)
+      srcId(i->src(1), 23);
+   else
+      code[0] |= 0xff << 23;
+
+   srcId(i->src(0).getIndirect(0), 10);
+   emitInterpMode(i);
+
+   emitPredicate(i);
+   defId(i->def(0), 2);
+
+   // offset-mode interpolation takes an extra register operand
+   if (i->getSampleMode() == NV50_IR_INTERP_OFFSET)
+      srcId(i->src(i->op == OP_PINTERP ? 2 : 1), 32 + 10);
+   else
+      code[1] |= 0xff << 10;
+}
+
+// Insert the 3-bit memory access size/type code for @ty at bit position @pos.
+void
+CodeEmitterGK110::emitLoadStoreType(DataType ty, const int pos)
+{
+   uint8_t enc;
+
+   switch (ty) {
+   case TYPE_U8:  enc = 0; break;
+   case TYPE_S8:  enc = 1; break;
+   case TYPE_U16: enc = 2; break;
+   case TYPE_S16: enc = 3; break;
+   case TYPE_F32:
+   case TYPE_U32:
+   case TYPE_S32:
+      enc = 4;
+      break;
+   case TYPE_F64:
+   case TYPE_U64:
+   case TYPE_S64:
+      enc = 5;
+      break;
+   case TYPE_B128:
+      enc = 6;
+      break;
+   default:
+      enc = 0;
+      assert(!"invalid ld/st type");
+      break;
+   }
+   code[pos / 32] |= enc << (pos % 32);
+}
+
+// Insert the 2-bit cache-op code for @c at bit position @pos.
+void
+CodeEmitterGK110::emitCachingMode(CacheMode c, const int pos)
+{
+   uint8_t enc;
+
+   switch (c) {
+   case CACHE_CA: enc = 0; break; // also covers CACHE_WB semantics
+   case CACHE_CG: enc = 1; break;
+   case CACHE_CS: enc = 2; break;
+   case CACHE_CV: enc = 3; break; // also covers CACHE_WT semantics
+   default:
+      enc = 0;
+      assert(!"invalid caching mode");
+      break;
+   }
+   code[pos / 32] |= enc << (pos % 32);
+}
+
+// Emit a store to global/local/shared memory. Global stores use a different
+// base opcode (code[0] bit 1 clear) and therefore different field positions
+// for the size and caching-mode bits.
+void
+CodeEmitterGK110::emitSTORE(const Instruction *i)
+{
+   int32_t offset = SDATA(i->src(0)).offset;
+
+   switch (i->src(0).getFile()) {
+   case FILE_MEMORY_GLOBAL: code[1] = 0xe0000000; code[0] = 0x00000000; break;
+   case FILE_MEMORY_LOCAL:  code[1] = 0x7a800000; code[0] = 0x00000002; break;
+   case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break;
+   default:
+      assert(!"invalid memory file");
+      break;
+   }
+
+   // local/shared offsets are limited to 24 bits
+   if (i->src(0).getFile() != FILE_MEMORY_GLOBAL)
+      offset &= 0xffffff;
+
+   if (code[0] & 0x2) {
+      emitLoadStoreType(i->dType, 0x33);
+      if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
+         emitCachingMode(i->cache, 0x2f);
+   } else {
+      emitLoadStoreType(i->dType, 0x38);
+      emitCachingMode(i->cache, 0x3b);
+   }
+   code[0] |= offset << 23;
+   code[1] |= offset >> 9;
+
+   emitPredicate(i);
+
+   srcId(i->src(1), 2);                    // value to store
+   srcId(i->src(0).getIndirect(0), 10);    // address register
+}
+
+// Emit a load from global/local/shared/const memory. Simple 32-bit direct
+// constant loads are turned into a MOV, which can encode c[] sources.
+void
+CodeEmitterGK110::emitLOAD(const Instruction *i)
+{
+   int32_t offset = SDATA(i->src(0)).offset;
+
+   switch (i->src(0).getFile()) {
+   case FILE_MEMORY_GLOBAL: code[1] = 0xc0000000; code[0] = 0x00000000; break;
+   case FILE_MEMORY_LOCAL:  code[1] = 0x7a000000; code[0] = 0x00000002; break;
+   case FILE_MEMORY_SHARED: code[1] = 0x7ac00000; code[0] = 0x00000002; break;
+   case FILE_MEMORY_CONST:
+      if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) {
+         emitMOV(i); // cheaper/more flexible than LDC
+         return;
+      }
+      offset &= 0xffff;
+      code[0] = 0x00000002;
+      // const buffer index goes into code[1] bits 7+
+      code[1] = 0x7c800000 | (i->src(0).get()->reg.fileIndex << 7);
+      break;
+   default:
+      assert(!"invalid memory file");
+      break;
+   }
+
+   if (code[0] & 0x2) {
+      offset &= 0xffffff;
+      emitLoadStoreType(i->dType, 0x33);
+      if (i->src(0).getFile() == FILE_MEMORY_LOCAL)
+         emitCachingMode(i->cache, 0x2f);
+   } else {
+      emitLoadStoreType(i->dType, 0x38);
+      emitCachingMode(i->cache, 0x3b);
+   }
+   code[0] |= offset << 23;
+   code[1] |= offset >> 9;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 2);
+   srcId(i->src(0).getIndirect(0), 10);
+}
+
+// Map a system-value source to its GK110 special register index (for S2R).
+uint8_t
+CodeEmitterGK110::getSRegEncoding(const ValueRef& ref)
+{
+   uint8_t enc = 0;
+
+   switch (SDATA(ref).sv.sv) {
+   case SV_LANEID:        enc = 0x00; break;
+   case SV_PHYSID:        enc = 0x03; break;
+   case SV_VERTEX_COUNT:  enc = 0x10; break;
+   case SV_INVOCATION_ID: enc = 0x11; break;
+   case SV_YDIR:          enc = 0x12; break;
+   // per-component registers: add the component index to the base
+   case SV_TID:    enc = 0x21 + SDATA(ref).sv.index; break;
+   case SV_CTAID:  enc = 0x25 + SDATA(ref).sv.index; break;
+   case SV_NTID:   enc = 0x29 + SDATA(ref).sv.index; break;
+   case SV_GRIDID: enc = 0x2c; break;
+   case SV_NCTAID: enc = 0x2d + SDATA(ref).sv.index; break;
+   case SV_LBASE:  enc = 0x34; break;
+   case SV_SBASE:  enc = 0x30; break;
+   case SV_CLOCK:  enc = 0x50 + SDATA(ref).sv.index; break;
+   default:
+      assert(!"no sreg for system value");
+      break;
+   }
+   return enc;
+}
+
+// Emit a register move; also handles system-value reads (S2R) and
+// immediate/constant-buffer sources.
+void
+CodeEmitterGK110::emitMOV(const Instruction *i)
+{
+   if (i->src(0).getFile() == FILE_SYSTEM_VALUE) {
+      code[0] = 0x00000002 | (getSRegEncoding(i->src(0)) << 23);
+      code[1] = 0x86400000;
+      emitPredicate(i);
+      defId(i->def(0), 2);
+   } else
+   if (i->src(0).getFile() == FILE_IMMEDIATE) {
+      code[0] = 0x00000002 | (i->lanes << 14);
+      code[1] = 0x74000000;
+      emitPredicate(i);
+      defId(i->def(0), 2);
+      setImmediate32(i, 0, Modifier(0));
+   } else
+   if (i->src(0).getFile() == FILE_PREDICATE) {
+      // TODO
+   } else {
+      // GPR or constant-buffer source: generic 3-operand form
+      emitForm_C(i, 0x24c, 2);
+      code[1] |= i->lanes << 10;
+   }
+}
+
+// Encode one instruction (always 8 bytes on GK110) into the output buffer,
+// advancing code/codeSize. With SW scheduling enabled, every 64-byte group
+// starts with an 8-byte control word holding the issue-delay codes of the
+// following seven instructions. Returns false if the instruction cannot be
+// encoded or the output buffer is exhausted.
+bool
+CodeEmitterGK110::emitInstruction(Instruction *insn)
+{
+   // crossing into a new 64-byte group costs an extra 8 bytes for the
+   // scheduling control word
+   const unsigned int size = (writeIssueDelays && !(codeSize & 0x3f)) ? 16 : 8;
+
+   if (insn->encSize != 8) {
+      ERROR("skipping unencodable instruction: ");
+      insn->print();
+      return false;
+   } else
+   if (codeSize + size > codeSizeLimit) {
+      ERROR("code emitter output buffer too small\n");
+      return false;
+   }
+
+   if (writeIssueDelays) {
+      // slot of this instruction within its 64-byte group (0..6)
+      int id = (codeSize & 0x3f) / 8 - 1;
+      if (id < 0) {
+         id += 1;
+         code[0] = 0x00000000; // cf issue delay "instruction"
+         code[1] = 0x08000000;
+         code += 2;
+         codeSize += 8;
+      }
+      // data points back at the group's control word
+      uint32_t *data = code - (id * 2 + 2);
+
+      // pack the 8-bit sched code into the slot's field of the control word
+      switch (id) {
+      case 0: data[0] |= insn->sched << 2; break;
+      case 1: data[0] |= insn->sched << 10; break;
+      case 2: data[0] |= insn->sched << 18; break;
+      case 3: data[0] |= insn->sched << 26; data[1] |= insn->sched >> 6; break;
+      case 4: data[1] |= insn->sched << 2; break; // was missing break (fell through into case 5)
+      case 5: data[1] |= insn->sched << 10; break;
+      case 6: data[1] |= insn->sched << 18; break;
+      default:
+         assert(0);
+         break;
+      }
+   }
+
+   // assert that instructions with multiple defs don't corrupt registers
+   for (int d = 0; insn->defExists(d); ++d)
+      assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0);
+
+   // dispatch to the per-operation emitter
+   switch (insn->op) {
+   case OP_MOV:
+   case OP_RDSV:
+      emitMOV(insn);
+      break;
+   case OP_NOP:
+      break;
+   case OP_LOAD:
+      emitLOAD(insn);
+      break;
+   case OP_STORE:
+      emitSTORE(insn);
+      break;
+   case OP_LINTERP:
+   case OP_PINTERP:
+      emitINTERP(insn);
+      break;
+   case OP_VFETCH:
+      emitVFETCH(insn);
+      break;
+   case OP_EXPORT:
+      emitEXPORT(insn);
+      break;
+   case OP_PFETCH:
+      emitPFETCH(insn);
+      break;
+   case OP_EMIT:
+   case OP_RESTART:
+      emitOUT(insn);
+      break;
+   case OP_ADD:
+   case OP_SUB:
+      if (isFloatType(insn->dType))
+         emitFADD(insn);
+      else
+         emitUADD(insn);
+      break;
+   case OP_MUL:
+      if (isFloatType(insn->dType))
+         emitFMUL(insn);
+      else
+         emitIMUL(insn);
+      break;
+   case OP_MAD:
+   case OP_FMA:
+      if (isFloatType(insn->dType))
+         emitFMAD(insn);
+      else
+         emitIMAD(insn);
+      break;
+   case OP_SAD:
+      emitISAD(insn);
+      break;
+   case OP_NOT:
+      emitNOT(insn);
+      break;
+   case OP_AND:
+      emitLogicOp(insn, 0);
+      break;
+   case OP_OR:
+      emitLogicOp(insn, 1);
+      break;
+   case OP_XOR:
+      emitLogicOp(insn, 2);
+      break;
+   case OP_SHL:
+   case OP_SHR:
+      emitShift(insn);
+      break;
+   case OP_SET:
+   case OP_SET_AND:
+   case OP_SET_OR:
+   case OP_SET_XOR:
+      emitSET(insn->asCmp());
+      break;
+   case OP_SELP:
+      emitSELP(insn);
+      break;
+   case OP_SLCT:
+      emitSLCT(insn->asCmp());
+      break;
+   case OP_MIN:
+   case OP_MAX:
+      emitMINMAX(insn);
+      break;
+   case OP_ABS:
+   case OP_NEG:
+   case OP_CEIL:
+   case OP_FLOOR:
+   case OP_TRUNC:
+   case OP_CVT:
+   case OP_SAT:
+      emitCVT(insn);
+      break;
+   case OP_RSQ:
+      emitSFnOp(insn, 5);
+      break;
+   case OP_RCP:
+      emitSFnOp(insn, 4);
+      break;
+   case OP_LG2:
+      emitSFnOp(insn, 3);
+      break;
+   case OP_EX2:
+      emitSFnOp(insn, 2);
+      break;
+   case OP_SIN:
+      emitSFnOp(insn, 1);
+      break;
+   case OP_COS:
+      emitSFnOp(insn, 0);
+      break;
+   case OP_PRESIN:
+   case OP_PREEX2:
+      emitPreOp(insn);
+      break;
+   case OP_TEX:
+   case OP_TXB:
+   case OP_TXL:
+   case OP_TXD:
+   case OP_TXF:
+      emitTEX(insn->asTex());
+      break;
+   case OP_TXQ:
+      emitTXQ(insn->asTex());
+      break;
+   case OP_TEXBAR:
+      emitTEXBAR(insn);
+      break;
+   case OP_BRA:
+   case OP_CALL:
+   case OP_PRERET:
+   case OP_RET:
+   case OP_DISCARD:
+   case OP_EXIT:
+   case OP_PRECONT:
+   case OP_CONT:
+   case OP_PREBREAK:
+   case OP_BREAK:
+   case OP_JOINAT:
+   case OP_BRKPT:
+   case OP_QUADON:
+   case OP_QUADPOP:
+      emitFlow(insn);
+      break;
+   case OP_QUADOP:
+      emitQUADOP(insn, insn->subOp, insn->lanes);
+      break;
+   case OP_DFDX:
+      emitQUADOP(insn, insn->src(0).mod.neg() ? 0x66 : 0x99, 0x4);
+      break;
+   case OP_DFDY:
+      emitQUADOP(insn, insn->src(0).mod.neg() ? 0x5a : 0xa5, 0x5);
+      break;
+   case OP_POPCNT:
+      emitPOPC(insn);
+      break;
+   case OP_JOIN:
+      emitNOP(insn);
+      insn->join = 1;
+      break;
+   case OP_PHI:
+   case OP_UNION:
+   case OP_CONSTRAINT:
+      ERROR("operation should have been eliminated\n");
+      return false;
+   case OP_EXP:
+   case OP_LOG:
+   case OP_SQRT:
+   case OP_POW:
+      ERROR("operation should have been lowered\n");
+      return false;
+   default:
+      ERROR("unknown op\n");
+      return false;
+   }
+
+   if (insn->join)
+      code[0] |= 1 << 22;
+
+   code += 2;
+   codeSize += 8;
+   return true;
+}
+
+// Minimum encoding size for any instruction on GK110.
+uint32_t
+CodeEmitterGK110::getMinEncodingSize(const Instruction *i) const
+{
+   // No more short instruction encodings.
+   return 8;
+}
+
+// Run generic pre-emission passes, then compute SW scheduling data if the
+// target requires software-controlled issue delays.
+void
+CodeEmitterGK110::prepareEmission(Function *func)
+{
+   const Target *targ = func->getProgram()->getTarget();
+
+   CodeEmitter::prepareEmission(func);
+
+   if (targ->hasSWSched)
+      calculateSchedDataNVC0(targ, func);
+}
+
+// Construct an emitter for the given target; the output buffer is attached
+// later (code/codeSizeLimit start out empty).
+CodeEmitterGK110::CodeEmitterGK110(const TargetNVC0 *target)
+   : CodeEmitter(target),
+     targNVC0(target),
+     writeIssueDelays(target->hasSWSched)
+{
+   code = NULL;
+   codeSize = codeSizeLimit = 0;
+   relocInfo = NULL;
+}
+
+// Factory: create a GK110 code emitter bound to this target and configured
+// for the given program type.
+CodeEmitter *
+TargetNVC0::createCodeEmitterGK110(Program::Type type)
+{
+   CodeEmitterGK110 *emitter = new CodeEmitterGK110(this);
+   emitter->setProgramType(type);
+   return emitter;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
new file mode 100644
index 0000000..3eca27d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -0,0 +1,1962 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+#define NV50_OP_ENC_LONG 0
+#define NV50_OP_ENC_SHORT 1
+#define NV50_OP_ENC_IMM 2
+#define NV50_OP_ENC_LONG_ALT 3
+
+// Binary code emitter for the original NV50 (Tesla) ISA. Instructions are
+// either 4 bytes (short form) or 8 bytes (long form); the helpers below
+// fill the two 32-bit words in this->code.
+class CodeEmitterNV50 : public CodeEmitter
+{
+public:
+   CodeEmitterNV50(const TargetNV50 *);
+
+   virtual bool emitInstruction(Instruction *);
+
+   virtual uint32_t getMinEncodingSize(const Instruction *) const;
+
+   inline void setProgramType(Program::Type pType) { progType = pType; }
+
+   virtual void prepareEmission(Function *);
+
+private:
+   Program::Type progType;
+
+   const TargetNV50 *targNV50;
+
+private:
+   // operand field encoders
+   inline void defId(const ValueDef&, const int pos);
+   inline void srcId(const ValueRef&, const int pos);
+   inline void srcId(const ValueRef *, const int pos);
+
+   inline void srcAddr16(const ValueRef&, bool adj, const int pos);
+   inline void srcAddr8(const ValueRef&, const int pos);
+
+   // condition-code / flags register handling
+   void emitFlagsRd(const Instruction *);
+   void emitFlagsWr(const Instruction *);
+
+   void emitCondCode(CondCode cc, DataType ty, int pos);
+
+   inline void setARegBits(unsigned int);
+
+   void setAReg16(const Instruction *, int s);
+   void setImmediate(const Instruction *, int s);
+
+   void setDst(const Value *);
+   void setDst(const Instruction *, int d);
+   void setSrcFileBits(const Instruction *, int enc);
+   void setSrc(const Instruction *, unsigned int s, int slot);
+
+   // generic encoding forms (see comments at their definitions)
+   void emitForm_MAD(const Instruction *);
+   void emitForm_ADD(const Instruction *);
+   void emitForm_MUL(const Instruction *);
+   void emitForm_IMM(const Instruction *);
+
+   void emitLoadStoreSizeLG(DataType ty, int pos);
+   void emitLoadStoreSizeCS(DataType ty);
+
+   void roundMode_MAD(const Instruction *);
+   void roundMode_CVT(RoundMode);
+
+   void emitMNeg12(const Instruction *);
+
+   // per-operation emitters
+   void emitLOAD(const Instruction *);
+   void emitSTORE(const Instruction *);
+   void emitMOV(const Instruction *);
+   void emitNOP();
+   void emitINTERP(const Instruction *);
+   void emitPFETCH(const Instruction *);
+   void emitOUT(const Instruction *);
+
+   void emitUADD(const Instruction *);
+   void emitAADD(const Instruction *);
+   void emitFADD(const Instruction *);
+   void emitIMUL(const Instruction *);
+   void emitFMUL(const Instruction *);
+   void emitFMAD(const Instruction *);
+   void emitIMAD(const Instruction *);
+   void emitISAD(const Instruction *);
+
+   void emitMINMAX(const Instruction *);
+
+   void emitPreOp(const Instruction *);
+   void emitSFnOp(const Instruction *, uint8_t subOp);
+
+   void emitShift(const Instruction *);
+   void emitARL(const Instruction *, unsigned int shl);
+   void emitLogicOp(const Instruction *);
+   void emitNOT(const Instruction *);
+
+   void emitCVT(const Instruction *);
+   void emitSET(const Instruction *);
+
+   void emitTEX(const TexInstruction *);
+   void emitTXQ(const TexInstruction *);
+   void emitTEXPREP(const TexInstruction *);
+
+   void emitQUADOP(const Instruction *, uint8_t lane, uint8_t quOp);
+
+   void emitFlow(const Instruction *, uint8_t flowOp);
+   void emitPRERETEmu(const FlowInstruction *);
+   void emitBAR(const Instruction *);
+
+   void emitATOM(const Instruction *);
+};
+
+#define SDATA(a) ((a).rep()->reg.data)
+#define DDATA(a) ((a).rep()->reg.data)
+
+// Encode the register id of source @src at bit position @pos.
+void CodeEmitterNV50::srcId(const ValueRef& src, const int pos)
+{
+   assert(src.get());
+   code[pos / 32] |= SDATA(src).id << (pos % 32);
+}
+
+// Pointer-taking convenience overload; forwards to the reference version.
+void CodeEmitterNV50::srcId(const ValueRef *src, const int pos)
+{
+   srcId(*src, pos);
+}
+
+// Encode a signed 16-bit address at bit @pos. If @adj is set the byte offset
+// is divided by the register size first (units of elements, <= 4 bytes);
+// negative offsets are masked to the correspondingly narrower field width.
+void CodeEmitterNV50::srcAddr16(const ValueRef& src, bool adj, const int pos)
+{
+   assert(src.get());
+
+   int32_t offset = SDATA(src).offset;
+
+   assert(!adj || src.get()->reg.size <= 4);
+   if (adj)
+      offset /= src.get()->reg.size;
+
+   assert(offset <= 0x7fff && offset >= (int32_t)-0x8000 && (pos % 32) <= 16);
+
+   if (offset < 0)
+      offset &= adj ? (0xffff >> (src.get()->reg.size >> 1)) : 0xffff;
+
+   code[pos / 32] |= offset << (pos % 32);
+}
+
+// Encode an 8-bit address field at bit @pos; the byte offset must be 4-byte
+// aligned and is stored as offset / 4.
+void CodeEmitterNV50::srcAddr8(const ValueRef& src, const int pos)
+{
+   assert(src.get());
+
+   uint32_t offset = SDATA(src).offset;
+
+   assert((offset <= 0x1fc || offset == 0x3fc) && !(offset & 0x3));
+
+   code[pos / 32] |= (offset >> 2) << (pos % 32);
+}
+
+// Encode the register id of definition @def at bit position @pos.
+// Shader outputs are handled separately (see setDst), not here.
+void CodeEmitterNV50::defId(const ValueDef& def, const int pos)
+{
+   assert(def.get() && def.getFile() != FILE_SHADER_OUTPUT);
+
+   code[pos / 32] |= DDATA(def).id << (pos % 32);
+}
+
+// Encode the rounding mode of a MAD-form instruction into code[1] bits
+// 22:23. ROUND_N (round to nearest) is the hardware default, i.e. no bits.
+void
+CodeEmitterNV50::roundMode_MAD(const Instruction *insn)
+{
+   switch (insn->rnd) {
+   case ROUND_M:
+      code[1] |= 1 << 22;
+      break;
+   case ROUND_P:
+      code[1] |= 2 << 22;
+      break;
+   case ROUND_Z:
+      code[1] |= 3 << 22;
+      break;
+   default:
+      assert(insn->rnd == ROUND_N);
+      break;
+   }
+}
+
+// Encode the negation modifiers of sources 0 and 1 into code[1] bits 26/27.
+void
+CodeEmitterNV50::emitMNeg12(const Instruction *i)
+{
+   code[1] |= i->src(0).mod.neg() << 26;
+   code[1] |= i->src(1).mod.neg() << 27;
+}
+
+// Encode condition code @cc at bit @pos. Bit 3 of the encoding selects the
+// "unordered" variant, which only exists for float comparisons and is
+// stripped for other types.
+void CodeEmitterNV50::emitCondCode(CondCode cc, DataType ty, int pos)
+{
+   uint8_t enc;
+
+   assert(pos >= 32 || pos <= 27);
+
+   switch (cc) {
+   case CC_LT:  enc = 0x1; break;
+   case CC_LTU: enc = 0x9; break;
+   case CC_EQ:  enc = 0x2; break;
+   case CC_EQU: enc = 0xa; break;
+   case CC_LE:  enc = 0x3; break;
+   case CC_LEU: enc = 0xb; break;
+   case CC_GT:  enc = 0x4; break;
+   case CC_GTU: enc = 0xc; break;
+   case CC_NE:  enc = 0x5; break;
+   case CC_NEU: enc = 0xd; break;
+   case CC_GE:  enc = 0x6; break;
+   case CC_GEU: enc = 0xe; break;
+   case CC_TR:  enc = 0xf; break;
+   case CC_FL:  enc = 0x0; break;
+
+   // flag-register tests (overflow/carry/above/sign and their negations)
+   case CC_O:  enc = 0x10; break;
+   case CC_C:  enc = 0x11; break;
+   case CC_A:  enc = 0x12; break;
+   case CC_S:  enc = 0x13; break;
+   case CC_NS: enc = 0x1c; break;
+   case CC_NA: enc = 0x1d; break;
+   case CC_NC: enc = 0x1e; break;
+   case CC_NO: enc = 0x1f; break;
+
+   default:
+      enc = 0;
+      assert(!"invalid condition code");
+      break;
+   }
+   if (ty != TYPE_NONE && !isFloatType(ty))
+      enc &= ~0x8; // unordered only exists for float types
+
+   code[pos / 32] |= enc << (pos % 32);
+}
+
+// Encode the predicate/flags read: condition code at code[1] bits 7-11 and
+// the $c register id at bits 12-13. Without a flags source, 0x0780 encodes
+// CC_TR (always true).
+void
+CodeEmitterNV50::emitFlagsRd(const Instruction *i)
+{
+   int s = (i->flagsSrc >= 0) ? i->flagsSrc : i->predSrc;
+
+   assert(!(code[1] & 0x00003f80));
+
+   if (s >= 0) {
+      assert(i->getSrc(s)->reg.file == FILE_FLAGS);
+      emitCondCode(i->cc, TYPE_NONE, 32 + 7);
+      srcId(i->src(s), 32 + 12);
+   } else {
+      code[1] |= 0x0780;
+   }
+}
+
+// Encode a flags ($c register) write, if this instruction defines one:
+// register id in code[1] bits 4-5, enable bit 6.
+void
+CodeEmitterNV50::emitFlagsWr(const Instruction *i)
+{
+   assert(!(code[1] & 0x70));
+
+   int flagsDef = i->flagsDef;
+
+   // find flags definition and check that it is the last def
+   if (flagsDef < 0) {
+      for (int d = 0; i->defExists(d); ++d)
+         if (i->def(d).getFile() == FILE_FLAGS)
+            flagsDef = d;
+      if (flagsDef >= 0 && 0) // TODO: enforce use of flagsDef at some point
+         WARN("Instruction::flagsDef was not set properly\n");
+   }
+   if (flagsDef == 0 && i->defExists(1))
+      WARN("flags def should not be the primary definition\n");
+
+   if (flagsDef >= 0)
+      code[1] |= (DDATA(i->def(flagsDef)).id << 4) | 0x40;
+
+}
+
+// Encode a 3-bit address register selector: low 2 bits go into code[0]
+// bits 26-27, the third bit into code[1] bit 2.
+void
+CodeEmitterNV50::setARegBits(unsigned int u)
+{
+   code[0] |= (u & 3) << 26;
+   code[1] |= (u & 4);
+}
+
+// If source @s is addressed indirectly, encode its address register as
+// (id + 1); the encoding 0 means "no address register used".
+void
+CodeEmitterNV50::setAReg16(const Instruction *i, int s)
+{
+   if (!i->srcExists(s))
+      return;
+   const int idx = i->src(s).indirect[0];
+   if (idx >= 0)
+      setARegBits(SDATA(i->src(idx)).id + 1);
+}
+
+// Encode a 32-bit immediate source: the low 6 bits go into code[0] bits
+// 16-21, the remaining 26 bits into code[1] starting at bit 2. A NOT
+// modifier is folded into the immediate value itself.
+void
+CodeEmitterNV50::setImmediate(const Instruction *i, int s)
+{
+   const ImmediateValue *imm = i->src(s).get()->asImm();
+   assert(imm);
+
+   uint32_t u = imm->reg.data.u32;
+
+   if (i->src(s).mod & Modifier(NV50_IR_MOD_NOT))
+      u = ~u;
+
+   code[1] |= 3; // marks the long-immediate form
+   code[0] |= (u & 0x3f) << 16;
+   code[1] |= (u >> 6) << 2;
+}
+
+// Encode the destination at code[0] bits 2-8. Unallocated or flags-only
+// results are written to $r127 (the bit bucket); shader outputs are
+// addressed by offset/4 with the output flag set in code[1].
+void
+CodeEmitterNV50::setDst(const Value *dst)
+{
+   const Storage *reg = &dst->join->reg;
+
+   assert(reg->file != FILE_ADDRESS);
+
+   if (reg->data.id < 0 || reg->file == FILE_FLAGS) {
+      code[0] |= (127 << 2) | 1;
+      code[1] |= 8;
+   } else {
+      int id;
+      if (reg->file == FILE_SHADER_OUTPUT) {
+         code[1] |= 8;
+         id = reg->data.offset / 4;
+      } else {
+         id = reg->data.id;
+      }
+      code[0] |= id << 2;
+   }
+}
+
+// Encode def @d if the instruction has one; a missing primary definition
+// is redirected to the bit bucket ($r127).
+void
+CodeEmitterNV50::setDst(const Instruction *i, int d)
+{
+   if (!i->defExists(d)) {
+      if (d == 0) {
+         code[0] |= 0x01fc; // bit bucket
+         code[1] |= 0x0008;
+      }
+      return;
+   }
+   setDst(i->getDef(d));
+}
+
+// Encode which register files the sources come from. The file of each of
+// up to 3 sources is folded into a 2-bit code (3 * 2 bits total):
+// 0: r  (GPR)
+// 1: a/s (attribute / shared-or-input)
+// 2: c  (const buffer)
+// 3: i  (immediate)
+// Only specific combinations are encodable; @enc selects which encoding
+// variant (long/short/immediate/alternate long) the caller is using, since
+// the bit positions differ between them.
+void
+CodeEmitterNV50::setSrcFileBits(const Instruction *i, int enc)
+{
+   uint8_t mode = 0;
+
+   for (unsigned int s = 0; s < Target::operationSrcNr[i->op]; ++s) {
+      switch (i->src(s).getFile()) {
+      case FILE_GPR:
+         break;
+      case FILE_MEMORY_SHARED:
+      case FILE_SHADER_INPUT:
+         mode |= 1 << (s * 2);
+         break;
+      case FILE_MEMORY_CONST:
+         mode |= 2 << (s * 2);
+         break;
+      case FILE_IMMEDIATE:
+         mode |= 3 << (s * 2);
+         break;
+      default:
+         ERROR("invalid file on source %i: %u\n", s, i->src(s).getFile());
+         assert(0);
+         break;
+      }
+   }
+   switch (mode) {
+   case 0x00: // rrr
+      break;
+   case 0x01: // arr/grr
+      // geometry shaders use 'g' (primitive input) instead of 'a'
+      if (progType == Program::TYPE_GEOMETRY) {
+         code[0] |= 0x01800000;
+         if (enc == NV50_OP_ENC_LONG || enc == NV50_OP_ENC_LONG_ALT)
+            code[1] |= 0x00200000;
+      } else {
+         if (enc == NV50_OP_ENC_SHORT)
+            code[0] |= 0x01000000;
+         else
+            code[1] |= 0x00200000;
+      }
+      break;
+   case 0x03: // irr
+      assert(i->op == OP_MOV);
+      return;
+   case 0x0c: // rir
+      break;
+   case 0x0d: // gir
+      code[0] |= 0x01000000;
+      assert(progType == Program::TYPE_GEOMETRY ||
+             progType == Program::TYPE_COMPUTE);
+      break;
+   case 0x08: // rcr
+      code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000;
+      code[1] |= (i->getSrc(1)->reg.fileIndex << 22);
+      break;
+   case 0x09: // acr/gcr
+      if (progType == Program::TYPE_GEOMETRY) {
+         code[0] |= 0x01800000;
+      } else {
+         code[0] |= (enc == NV50_OP_ENC_LONG_ALT) ? 0x01000000 : 0x00800000;
+         code[1] |= 0x00200000;
+      }
+      code[1] |= (i->getSrc(1)->reg.fileIndex << 22);
+      break;
+   case 0x20: // rrc
+      code[0] |= 0x01000000;
+      code[1] |= (i->getSrc(2)->reg.fileIndex << 22);
+      break;
+   case 0x21: // arc
+      code[0] |= 0x01000000;
+      code[1] |= 0x00200000 | (i->getSrc(2)->reg.fileIndex << 22);
+      assert(progType != Program::TYPE_GEOMETRY);
+      break;
+   default:
+      ERROR("not encodable: %x\n", mode);
+      assert(0);
+      break;
+   }
+   if (progType != Program::TYPE_COMPUTE)
+      return;
+
+   // compute programs additionally encode the element type of a shared
+   // memory source
+   if ((mode & 3) == 1) {
+      const int pos = i->src(1).getFile() == FILE_IMMEDIATE ? 13 : 14;
+
+      switch (i->getSrc(0)->reg.type) {
+      case TYPE_U8:
+         break;
+      case TYPE_U16:
+         code[0] |= 1 << pos;
+         break;
+      case TYPE_S16:
+         code[0] |= 2 << pos;
+         break;
+      default:
+         code[0] |= 3 << pos;
+         assert(i->getSrc(0)->reg.size == 4);
+         break;
+      }
+   }
+}
+
+// Encode source @s into operand slot @slot (0: code[0] bit 9, 1: code[0]
+// bit 16, 2: code[1] bit 14). Non-GPR sources are addressed by offset
+// scaled by their element size.
+void
+CodeEmitterNV50::setSrc(const Instruction *i, unsigned int s, int slot)
+{
+   if (Target::operationSrcNr[i->op] <= s)
+      return;
+   const Storage *reg = &i->src(s).rep()->reg;
+
+   unsigned int id = (reg->file == FILE_GPR) ?
+      reg->data.id :
+      reg->data.offset >> (reg->size >> 1); // no > 4 byte sources here
+
+   switch (slot) {
+   case 0: code[0] |= id << 9; break;
+   case 1: code[0] |= id << 16; break;
+   case 2: code[1] |= id << 14; break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+// the default form:
+//  - long instruction
+//  - 1 to 3 sources in slots 0, 1, 2 (rrr, arr, rcr, acr, rrc, arc, gcr, grr)
+//  - address & flags
+void
+CodeEmitterNV50::emitForm_MAD(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   code[0] |= 1; // long-instruction flag
+
+   emitFlagsRd(i);
+   emitFlagsWr(i);
+
+   setDst(i, 0);
+
+   setSrcFileBits(i, NV50_OP_ENC_LONG);
+   setSrc(i, 0, 0);
+   setSrc(i, 1, 1);
+   setSrc(i, 2, 2);
+
+   setAReg16(i, 1);
+}
+
+// like default form, but 2nd source in slot 2, and no 3rd source
+void
+CodeEmitterNV50::emitForm_ADD(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   code[0] |= 1; // long-instruction flag
+
+   emitFlagsRd(i);
+   emitFlagsWr(i);
+
+   setDst(i, 0);
+
+   setSrcFileBits(i, NV50_OP_ENC_LONG_ALT);
+   setSrc(i, 0, 0);
+   setSrc(i, 1, 2);
+
+   setAReg16(i, 1);
+}
+
+// default short form (rr, ar, rc, gr); 4-byte encoding, so no predicate,
+// flags or address fields are available
+void
+CodeEmitterNV50::emitForm_MUL(const Instruction *i)
+{
+   assert(i->encSize == 4 && !(code[0] & 1));
+   assert(i->defExists(0));
+   assert(!i->getPredicate());
+
+   setDst(i, 0);
+
+   setSrcFileBits(i, NV50_OP_ENC_SHORT);
+   setSrc(i, 0, 0);
+   setSrc(i, 1, 1);
+}
+
+// usual immediate form
+// - 1 to 3 sources where last is immediate (rir, gir)
+// - no address or predicate possible
+void
+CodeEmitterNV50::emitForm_IMM(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   code[0] |= 1; // long-instruction flag
+
+   assert(i->defExists(0) && i->srcExists(0));
+
+   setDst(i, 0);
+
+   setSrcFileBits(i, NV50_OP_ENC_IMM);
+   if (Target::operationSrcNr[i->op] > 1) {
+      setSrc(i, 0, 0);
+      setImmediate(i, 1);
+      setSrc(i, 2, 1);
+   } else {
+      setImmediate(i, 0);
+   }
+}
+
+// Insert the 3-bit size/type code used by local/global memory accesses at
+// bit position @pos.
+void
+CodeEmitterNV50::emitLoadStoreSizeLG(DataType ty, int pos)
+{
+   uint8_t enc;
+
+   switch (ty) {
+   case TYPE_U8:  enc = 0x0; break;
+   case TYPE_S8:  enc = 0x1; break;
+   case TYPE_U16: enc = 0x2; break;
+   case TYPE_S16: enc = 0x3; break;
+   case TYPE_F64:
+   case TYPE_S64:
+   case TYPE_U64: enc = 0x4; break;
+   case TYPE_B128: enc = 0x5; break;
+   case TYPE_F32:
+   case TYPE_S32:
+   case TYPE_U32: enc = 0x6; break;
+   default:
+      enc = 0;
+      assert(!"invalid load/store type");
+      break;
+   }
+   code[pos / 32] |= enc << (pos % 32);
+}
+
+// Insert the size code used by const/shared/input accesses into code[1]
+// bits 14-15 (U8 is the default, no bits set).
+void
+CodeEmitterNV50::emitLoadStoreSizeCS(DataType ty)
+{
+   switch (ty) {
+   case TYPE_U8:
+      break;
+   case TYPE_U16:
+      code[1] |= 0x4000;
+      break;
+   case TYPE_S16:
+      code[1] |= 0x8000;
+      break;
+   case TYPE_F32:
+   case TYPE_S32:
+   case TYPE_U32:
+      code[1] |= 0xc000;
+      break;
+   default:
+      assert(0);
+      break;
+   }
+}
+
+// Emit a load; the base opcode and offset encoding depend on the source
+// memory space. Shared-memory addressing differs before/after NV84.
+void
+CodeEmitterNV50::emitLOAD(const Instruction *i)
+{
+   DataFile sf = i->src(0).getFile();
+   int32_t offset = i->getSrc(0)->reg.data.offset;
+
+   switch (sf) {
+   case FILE_SHADER_INPUT:
+      // use 'mov' where we can
+      code[0] = i->src(0).isIndirect(0) ? 0x00000001 : 0x10000001;
+      code[1] = 0x00200000 | (i->lanes << 14);
+      if (typeSizeof(i->dType) == 4)
+         code[1] |= 0x04000000;
+      break;
+   case FILE_MEMORY_SHARED:
+      if (targ->getChipset() >= 0x84) {
+         // NV84+ supports 14-bit shared memory offsets
+         assert(offset <= (int32_t)(0x3fff * typeSizeof(i->sType)));
+         code[0] = 0x10000001;
+         code[1] = 0x40000000;
+
+         if (typeSizeof(i->dType) == 4)
+            code[1] |= 0x04000000;
+
+         emitLoadStoreSizeCS(i->sType);
+      } else {
+         // original NV50 only has a 5-bit offset field
+         assert(offset <= (int32_t)(0x1f * typeSizeof(i->sType)));
+         code[0] = 0x10000001;
+         code[1] = 0x00200000 | (i->lanes << 14);
+         emitLoadStoreSizeCS(i->sType);
+      }
+      break;
+   case FILE_MEMORY_CONST:
+      code[0] = 0x10000001;
+      code[1] = 0x20000000 | (i->getSrc(0)->reg.fileIndex << 22);
+      if (typeSizeof(i->dType) == 4)
+         code[1] |= 0x04000000;
+      emitLoadStoreSizeCS(i->sType);
+      break;
+   case FILE_MEMORY_LOCAL:
+      code[0] = 0xd0000001;
+      code[1] = 0x40000000;
+      break;
+   case FILE_MEMORY_GLOBAL:
+      code[0] = 0xd0000001 | (i->getSrc(0)->reg.fileIndex << 16);
+      code[1] = 0x80000000;
+      break;
+   default:
+      assert(!"invalid load source file");
+      break;
+   }
+   if (sf == FILE_MEMORY_LOCAL ||
+       sf == FILE_MEMORY_GLOBAL)
+      emitLoadStoreSizeLG(i->sType, 21 + 32);
+
+   setDst(i, 0);
+
+   emitFlagsRd(i);
+   emitFlagsWr(i);
+
+   if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+      // global accesses are addressed by register, not offset
+      srcId(*i->src(0).getIndirect(0), 9);
+   } else {
+      setAReg16(i, 0);
+      srcAddr16(i->src(0), i->src(0).getFile() != FILE_MEMORY_LOCAL, 9);
+   }
+}
+
+// Emit a store; opcode, offset scaling and value-register position all
+// depend on the destination memory space.
+void
+CodeEmitterNV50::emitSTORE(const Instruction *i)
+{
+   DataFile f = i->getSrc(0)->reg.file;
+   int32_t offset = i->getSrc(0)->reg.data.offset;
+
+   switch (f) {
+   case FILE_SHADER_OUTPUT:
+      code[0] = 0x00000001 | ((offset >> 2) << 9);
+      code[1] = 0x80c00000;
+      srcId(i->src(1), 32 + 14);
+      break;
+   case FILE_MEMORY_GLOBAL:
+      code[0] = 0xd0000001 | (i->getSrc(0)->reg.fileIndex << 16);
+      code[1] = 0xa0000000;
+      emitLoadStoreSizeLG(i->dType, 21 + 32);
+      srcId(i->src(1), 2);
+      break;
+   case FILE_MEMORY_LOCAL:
+      code[0] = 0xd0000001;
+      code[1] = 0x60000000;
+      emitLoadStoreSizeLG(i->dType, 21 + 32);
+      srcId(i->src(1), 2);
+      break;
+   case FILE_MEMORY_SHARED:
+      code[0] = 0x00000001;
+      code[1] = 0xe0000000;
+      // shared offsets are scaled by the access size
+      switch (typeSizeof(i->dType)) {
+      case 1:
+         code[0] |= offset << 9;
+         code[1] |= 0x00400000;
+         break;
+      case 2:
+         code[0] |= (offset >> 1) << 9;
+         break;
+      case 4:
+         code[0] |= (offset >> 2) << 9;
+         code[1] |= 0x04200000;
+         break;
+      default:
+         assert(0);
+         break;
+      }
+      srcId(i->src(1), 32 + 14);
+      break;
+   default:
+      assert(!"invalid store destination file");
+      break;
+   }
+
+   if (f == FILE_MEMORY_GLOBAL)
+      srcId(*i->src(0).getIndirect(0), 9);
+   else
+      setAReg16(i, 0);
+
+   if (f == FILE_MEMORY_LOCAL)
+      srcAddr16(i->src(0), false, 9);
+
+   emitFlagsRd(i);
+}
+
+// Emit a move; special encodings exist for moves involving the flags ($c)
+// and address ($a) register files, immediates, and shader outputs.
+void
+CodeEmitterNV50::emitMOV(const Instruction *i)
+{
+   DataFile sf = i->getSrc(0)->reg.file;
+   DataFile df = i->getDef(0)->reg.file;
+
+   assert(sf == FILE_GPR || df == FILE_GPR);
+
+   if (sf == FILE_FLAGS) {
+      code[0] = 0x00000001;
+      code[1] = 0x20000000;
+      defId(i->def(0), 2);
+      srcId(i->src(0), 12);
+      emitFlagsRd(i);
+   } else
+   if (sf == FILE_ADDRESS) {
+      code[0] = 0x00000001;
+      code[1] = 0x40000000;
+      defId(i->def(0), 2);
+      setARegBits(SDATA(i->src(0)).id + 1);
+      emitFlagsRd(i);
+   } else
+   if (df == FILE_FLAGS) {
+      code[0] = 0x00000001;
+      code[1] = 0xa0000000;
+      defId(i->def(0), 4);
+      srcId(i->src(0), 9);
+      emitFlagsRd(i);
+   } else
+   if (sf == FILE_IMMEDIATE) {
+      code[0] = 0x10008001;
+      code[1] = 0x00000003;
+      emitForm_IMM(i);
+   } else {
+      // plain GPR <-> GPR / output move; short or long form
+      if (i->encSize == 4) {
+         code[0] = 0x10008000;
+      } else {
+         code[0] = 0x10000001;
+         code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
+         code[1] |= (i->lanes << 14);
+         emitFlagsRd(i);
+      }
+      defId(i->def(0), 2);
+      srcId(i->src(0), 9);
+   }
+   if (df == FILE_SHADER_OUTPUT) {
+      assert(i->encSize == 8);
+      code[1] |= 0x8;
+   }
+}
+
+// Emit a long-form no-op.
+void
+CodeEmitterNV50::emitNOP()
+{
+   code[0] = 0xf0000001;
+   code[1] = 0xe0000000;
+}
+
+// Emit a quad operation: @lane selects the writing lanes, @quOp the
+// per-lane source-lane/op pattern, split across code[0]/code[1].
+void
+CodeEmitterNV50::emitQUADOP(const Instruction *i, uint8_t lane, uint8_t quOp)
+{
+   code[0] = 0xc0000000 | (lane << 16);
+   code[1] = 0x80000000;
+
+   code[0] |= (quOp & 0x03) << 20;
+   code[1] |= (quOp & 0xfc) << 20;
+
+   emitForm_ADD(i);
+
+   // unary quad ops replicate the single source into the second slot
+   if (!i->srcExists(1))
+      srcId(i->src(0), 32 + 14);
+}
+
+// Emit a primitive (vertex) fetch for geometry shaders; the source offset
+// selects the vertex slot, with optional address-register indirection.
+void
+CodeEmitterNV50::emitPFETCH(const Instruction *i)
+{
+   code[0] = 0x11800001;
+   code[1] = 0x04200000 | (0xf << 14);
+
+   defId(i->def(0), 2);
+   srcAddr8(i->src(0), 9);
+   setAReg16(i, 0);
+}
+
+// Emit an attribute interpolation (OP_LINTERP/OP_PINTERP). Mode bits are
+// first composed in the short-form positions; for the 8-byte encoding they
+// are then relocated into word 1.
+void
+CodeEmitterNV50::emitINTERP(const Instruction *i)
+{
+   code[0] = 0x80000000;
+
+   defId(i->def(0), 2);
+   srcAddr8(i->src(0), 16);
+
+   if (i->getInterpMode() == NV50_IR_INTERP_FLAT) {
+      code[0] |= 1 << 8;
+   } else {
+      // perspective interpolation takes the 1/w factor as 2nd source
+      if (i->op == OP_PINTERP) {
+         code[0] |= 1 << 25;
+         srcId(i->src(1), 9);
+      }
+      if (i->getSampleMode() == NV50_IR_INTERP_CENTROID)
+         code[0] |= 1 << 24;
+   }
+
+   if (i->encSize == 8) {
+      // move the mode bits from their short-form slots into word 1
+      code[1] =
+         (code[0] & (3 << 24)) >> (24 - 16) |
+         (code[0] & (1 << 8)) << (18 - 8);
+      code[0] &= ~0x03000100;
+      code[0] |= 1;
+      emitFlagsRd(i);
+   }
+}
+
+// Emit OP_MIN/OP_MAX. F64 has its own opcode; all other types share one
+// opcode with type-select bits, plus per-source absolute-value modifiers.
+void
+CodeEmitterNV50::emitMINMAX(const Instruction *i)
+{
+   if (i->dType == TYPE_F64) {
+      code[0] = 0xe0000000;
+      code[1] = (i->op == OP_MIN) ? 0xa0000000 : 0xc0000000;
+   } else {
+      code[0] = 0x30000000;
+      code[1] = 0x80000000;
+      if (i->op == OP_MIN)
+         code[1] |= 0x20000000;
+
+      switch (i->dType) {
+      case TYPE_F32: code[0] |= 0x80000000; break;
+      case TYPE_S32: code[1] |= 0x8c000000; break;
+      case TYPE_U32: code[1] |= 0x84000000; break;
+      case TYPE_S16: code[1] |= 0x80000000; break;
+      case TYPE_U16: break;
+      default:
+         assert(0);
+         break;
+      }
+      // source modifiers: |src0|, |src1|
+      code[1] |= i->src(0).mod.abs() << 20;
+      code[1] |= i->src(1).mod.abs() << 19;
+   }
+   emitForm_MAD(i);
+}
+
+// Emit a floating-point multiply-add. Negation of the product folds the
+// two multiplicand signs together; the short form cannot encode negation
+// or saturation.
+void
+CodeEmitterNV50::emitFMAD(const Instruction *i)
+{
+   const int neg_mul = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+   const int neg_add = i->src(2).mod.neg();
+
+   code[0] = 0xe0000000;
+
+   if (i->encSize == 4) {
+      emitForm_MUL(i);
+      assert(!neg_mul && !neg_add);
+   } else {
+      code[1] = neg_mul << 26;
+      code[1] |= neg_add << 27;
+      if (i->saturate)
+         code[1] |= 1 << 29;
+      emitForm_MAD(i);
+   }
+}
+
+// Emit a floating-point add/sub. OP_SUB is encoded as an add with the 2nd
+// source negated. Negation/saturate bits live in word 0 for the immediate
+// and short forms, in word 1 for the long form; abs modifiers are not
+// encodable here.
+void
+CodeEmitterNV50::emitFADD(const Instruction *i)
+{
+   const int neg0 = i->src(0).mod.neg();
+   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
+   code[0] = 0xb0000000;
+
+   assert(!(i->src(0).mod | i->src(1).mod).abs());
+
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
+      emitForm_IMM(i);
+      code[0] |= neg0 << 15;
+      code[0] |= neg1 << 22;
+      if (i->saturate)
+         code[0] |= 1 << 8;
+   } else
+   if (i->encSize == 8) {
+      code[1] = 0;
+      emitForm_ADD(i);
+      code[1] |= neg0 << 26;
+      code[1] |= neg1 << 27;
+      if (i->saturate)
+         code[1] |= 1 << 29;
+   } else {
+      emitForm_MUL(i);
+      code[0] |= neg0 << 15;
+      code[0] |= neg1 << 22;
+      if (i->saturate)
+         code[0] |= 1 << 8;
+   }
+}
+
+// Emit an integer add/sub on GPRs. OP_SUB negates the 2nd source; at most
+// one operand may be negated. A flagsSrc turns this into add-with-carry
+// reading $cX, which shares bits with sub/subr and excludes predication.
+void
+CodeEmitterNV50::emitUADD(const Instruction *i)
+{
+   const int neg0 = i->src(0).mod.neg();
+   const int neg1 = i->src(1).mod.neg() ^ ((i->op == OP_SUB) ? 1 : 0);
+
+   code[0] = 0x20008000;
+
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
+      emitForm_IMM(i);
+   } else
+   if (i->encSize == 8) {
+      code[0] = 0x20000000;
+      code[1] = (typeSizeof(i->dType) == 2) ? 0 : 0x04000000;
+      emitForm_ADD(i);
+   } else {
+      emitForm_MUL(i);
+   }
+   assert(!(neg0 && neg1));
+   code[0] |= neg0 << 28;
+   code[0] |= neg1 << 22;
+
+   if (i->flagsSrc >= 0) {
+      // addc == sub | subr
+      assert(!(code[0] & 0x10400000) && !i->getPredicate());
+      code[0] |= 0x10400000;
+      srcId(i->src(i->flagsSrc), 32 + 12);
+   }
+}
+
+// Emit an address-register add: $aX = $aY + immediate (or a plain move of
+// an immediate into $aX when op == OP_MOV). Address register ids are
+// biased by 1 in the encoding; id 0 means "no address register".
+void
+CodeEmitterNV50::emitAADD(const Instruction *i)
+{
+   const int s = (i->op == OP_MOV) ? 0 : 1;
+
+   code[0] = 0xd0000001 | (i->getSrc(s)->reg.data.u16 << 9);
+   code[1] = 0x20000000;
+
+   code[0] |= (DDATA(i->def(0)).id + 1) << 2;
+
+   emitFlagsRd(i);
+
+   if (s && i->srcExists(0))
+      setARegBits(SDATA(i->src(0)).id + 1);
+}
+
+// Emit an integer multiply; only the source-type signedness (s16 vs u16
+// halves) changes the encoding between the long and short forms.
+void
+CodeEmitterNV50::emitIMUL(const Instruction *i)
+{
+   code[0] = 0x40000000;
+
+   if (i->encSize == 8) {
+      code[1] = (i->sType == TYPE_S16) ? (0x8000 | 0x4000) : 0x0000;
+      emitForm_MAD(i);
+   } else {
+      if (i->sType == TYPE_S16)
+         code[0] |= 0x8100;
+      emitForm_MUL(i);
+   }
+}
+
+// Emit a floating-point multiply. The two source signs fold into a single
+// result-negate bit; the long form can additionally encode round-to-zero.
+void
+CodeEmitterNV50::emitFMUL(const Instruction *i)
+{
+   const int neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   code[0] = 0xc0000000;
+
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      code[1] = 0;
+      emitForm_IMM(i);
+      if (neg)
+         code[0] |= 0x8000;
+   } else
+   if (i->encSize == 8) {
+      code[1] = i->rnd == ROUND_Z ? 0x0000c000 : 0;
+      if (neg)
+         code[1] |= 0x08000000;
+      emitForm_MAD(i);
+   } else {
+      emitForm_MUL(i);
+      if (neg)
+         code[0] |= 0x8000;
+   }
+}
+
+// Emit an integer multiply-add. Signedness selects the base encoding
+// (saturate only exists for the signed variant). As in emitUADD, a
+// flagsSrc requests carry-in from $cX and excludes predication.
+void
+CodeEmitterNV50::emitIMAD(const Instruction *i)
+{
+   code[0] = 0x60000000;
+   if (isSignedType(i->sType))
+      code[1] = i->saturate ? 0x40000000 : 0x20000000;
+   else
+      code[1] = 0x00000000;
+
+   // product sign folds src0/src1 negation; src2 negates the addend
+   int neg1 = i->src(0).mod.neg() ^ i->src(1).mod.neg();
+   int neg2 = i->src(2).mod.neg();
+
+   assert(!(neg1 & neg2));
+   code[1] |= neg1 << 27;
+   code[1] |= neg2 << 26;
+
+   emitForm_MAD(i);
+
+   if (i->flagsSrc >= 0) {
+      // add with carry from $cX
+      assert(!(code[1] & 0x0c000000) && !i->getPredicate());
+      code[1] |= 0xc << 24;
+      srcId(i->src(i->flagsSrc), 32 + 12);
+   }
+}
+
+// Emit a sum-of-absolute-differences (OP_SAD); the source type picks the
+// width/signedness bits in either the long or the short encoding.
+void
+CodeEmitterNV50::emitISAD(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      code[0] = 0x50000000;
+      switch (i->sType) {
+      case TYPE_U32: code[1] = 0x04000000; break;
+      case TYPE_S32: code[1] = 0x0c000000; break;
+      case TYPE_U16: code[1] = 0x00000000; break;
+      case TYPE_S16: code[1] = 0x08000000; break;
+      default:
+         assert(0);
+         break;
+      }
+      emitForm_MAD(i);
+   } else {
+      switch (i->sType) {
+      case TYPE_U32: code[0] = 0x50008000; break;
+      case TYPE_S32: code[0] = 0x50008100; break;
+      case TYPE_U16: code[0] = 0x50000000; break;
+      case TYPE_S16: code[0] = 0x50000100; break;
+      default:
+         assert(0);
+         break;
+      }
+      emitForm_MUL(i);
+   }
+}
+
+// Emit a comparison (OP_SET): the condition code goes into word 1, then
+// type-select bits and per-source negate/abs modifiers are ORed in.
+void
+CodeEmitterNV50::emitSET(const Instruction *i)
+{
+   code[0] = 0x30000000;
+   code[1] = 0x60000000;
+
+   emitCondCode(i->asCmp()->setCond, i->sType, 32 + 14);
+
+   switch (i->sType) {
+   case TYPE_F32: code[0] |= 0x80000000; break;
+   case TYPE_S32: code[1] |= 0x0c000000; break;
+   case TYPE_U32: code[1] |= 0x04000000; break;
+   case TYPE_S16: code[1] |= 0x08000000; break;
+   case TYPE_U16: break;
+   default:
+      assert(0);
+      break;
+   }
+   if (i->src(0).mod.neg()) code[1] |= 0x04000000;
+   if (i->src(1).mod.neg()) code[1] |= 0x08000000;
+   if (i->src(0).mod.abs()) code[1] |= 0x00100000;
+   if (i->src(1).mod.abs()) code[1] |= 0x00080000;
+
+   emitForm_MAD(i);
+}
+
+// OR the CVT rounding-mode bits into word 1. ROUND_N (round to nearest)
+// is the all-zero default; the *I variants round to integer.
+void
+CodeEmitterNV50::roundMode_CVT(RoundMode rnd)
+{
+   switch (rnd) {
+   case ROUND_NI: code[1] |= 0x08000000; break;
+   case ROUND_M:  code[1] |= 0x00020000; break;
+   case ROUND_MI: code[1] |= 0x08020000; break;
+   case ROUND_P:  code[1] |= 0x00040000; break;
+   case ROUND_PI: code[1] |= 0x08040000; break;
+   case ROUND_Z:  code[1] |= 0x00060000; break;
+   case ROUND_ZI: code[1] |= 0x08060000; break;
+   default:
+      assert(rnd == ROUND_N);
+      break;
+   }
+}
+
+// Emit a conversion (OP_CVT and the unary ops lowered onto it: CEIL/FLOOR/
+// TRUNC/ABS/NEG/SAT). Word 1 is chosen from a dst-type x src-type table of
+// encodings; rounding and modifier bits are ORed in afterwards.
+void
+CodeEmitterNV50::emitCVT(const Instruction *i)
+{
+   const bool f2f = isFloatType(i->dType) && isFloatType(i->sType);
+   RoundMode rnd;
+
+   // CEIL/FLOOR/TRUNC imply a rounding mode; float->float additionally
+   // needs the round-to-integer variant
+   switch (i->op) {
+   case OP_CEIL:  rnd = f2f ? ROUND_PI : ROUND_P; break;
+   case OP_FLOOR: rnd = f2f ? ROUND_MI : ROUND_M; break;
+   case OP_TRUNC: rnd = f2f ? ROUND_ZI : ROUND_Z; break;
+   default:
+      rnd = i->rnd;
+      break;
+   }
+
+   code[0] = 0xa0000000;
+
+   // (dType, sType) -> word-1 base encoding
+   switch (i->dType) {
+   case TYPE_F64:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0xc4404000; break;
+      case TYPE_S64: code[1] = 0x44414000; break;
+      case TYPE_U64: code[1] = 0x44404000; break;
+      case TYPE_F32: code[1] = 0xc4400000; break;
+      case TYPE_S32: code[1] = 0x44410000; break;
+      case TYPE_U32: code[1] = 0x44400000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_S64:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0x8c404000; break;
+      case TYPE_F32: code[1] = 0x8c400000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_U64:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0x84404000; break;
+      case TYPE_F32: code[1] = 0x84400000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_F32:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0xc0404000; break;
+      case TYPE_S64: code[1] = 0x40414000; break;
+      case TYPE_U64: code[1] = 0x40404000; break;
+      case TYPE_F32: code[1] = 0xc4004000; break;
+      case TYPE_S32: code[1] = 0x44014000; break;
+      case TYPE_U32: code[1] = 0x44004000; break;
+      case TYPE_F16: code[1] = 0xc4000000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_S32:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0x88404000; break;
+      case TYPE_F32: code[1] = 0x8c004000; break;
+      case TYPE_S32: code[1] = 0x0c014000; break;
+      case TYPE_U32: code[1] = 0x0c004000; break;
+      case TYPE_F16: code[1] = 0x8c000000; break;
+      case TYPE_S16: code[1] = 0x0c010000; break;
+      case TYPE_U16: code[1] = 0x0c000000; break;
+      case TYPE_S8:  code[1] = 0x0c018000; break;
+      case TYPE_U8:  code[1] = 0x0c008000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_U32:
+      switch (i->sType) {
+      case TYPE_F64: code[1] = 0x80404000; break;
+      case TYPE_F32: code[1] = 0x84004000; break;
+      case TYPE_S32: code[1] = 0x04014000; break;
+      case TYPE_U32: code[1] = 0x04004000; break;
+      case TYPE_F16: code[1] = 0x84000000; break;
+      case TYPE_S16: code[1] = 0x04010000; break;
+      case TYPE_U16: code[1] = 0x04000000; break;
+      case TYPE_S8:  code[1] = 0x04018000; break;
+      case TYPE_U8:  code[1] = 0x04008000; break;
+      default:
+         assert(0);
+         break;
+      }
+      break;
+   case TYPE_S16:
+   case TYPE_U16:
+   case TYPE_S8:
+   case TYPE_U8:
+   default:
+      assert(0);
+      break;
+   }
+   // byte-sized source stored in a 32-bit register
+   if (typeSizeof(i->sType) == 1 && i->getSrc(0)->reg.size == 4)
+      code[1] |= 0x00004000;
+
+   roundMode_CVT(rnd);
+
+   // unary ops carried by CVT map to modifier bits
+   switch (i->op) {
+   case OP_ABS: code[1] |= 1 << 20; break;
+   case OP_SAT: code[1] |= 1 << 19; break;
+   case OP_NEG: code[1] |= 1 << 29; break;
+   default:
+      break;
+   }
+   // source modifiers: neg toggles (XOR) the result-negate bit, abs sets it
+   code[1] ^= i->src(0).mod.neg() << 29;
+   code[1] |= i->src(0).mod.abs() << 20;
+   if (i->saturate)
+      code[1] |= 1 << 19;
+
+   assert(i->op != OP_ABS || !i->src(0).mod.neg());
+
+   emitForm_MAD(i);
+}
+
+// Emit an argument-reduction op (OP_PRESIN/OP_PREEX2) that preconditions
+// the operand of the special-function unit.
+void
+CodeEmitterNV50::emitPreOp(const Instruction *i)
+{
+   code[0] = 0xb0000000;
+   code[1] = (i->op == OP_PREEX2) ? 0xc0004000 : 0xc0000000;
+
+   code[1] |= i->src(0).mod.abs() << 20;
+   code[1] |= i->src(0).mod.neg() << 26;
+
+   emitForm_MAD(i);
+}
+
+// Emit a special-function op (rcp/rsq/lg2/sin/cos/ex2); @subOp selects the
+// function in the long form. Only OP_RCP has a short encoding.
+void
+CodeEmitterNV50::emitSFnOp(const Instruction *i, uint8_t subOp)
+{
+   code[0] = 0x90000000;
+
+   if (i->encSize == 4) {
+      assert(i->op == OP_RCP);
+      code[0] |= i->src(0).mod.abs() << 15;
+      code[0] |= i->src(0).mod.neg() << 22;
+      emitForm_MUL(i);
+   } else {
+      code[1] = subOp << 29;
+      code[1] |= i->src(0).mod.abs() << 20;
+      code[1] |= i->src(0).mod.neg() << 26;
+      emitForm_MAD(i);
+   }
+}
+
+// Emit a bitwise NOT, encoded as a logic op with the operand duplicated
+// into the second source slot (setSrc after emitForm_MAD).
+void
+CodeEmitterNV50::emitNOT(const Instruction *i)
+{
+   code[0] = 0xd0000000;
+   code[1] = 0x0002c000;
+
+   switch (i->sType) {
+   case TYPE_U32:
+   case TYPE_S32:
+      code[1] |= 0x04000000;
+      break;
+   default:
+      break;
+   }
+   emitForm_MAD(i);
+   setSrc(i, 0, 1);
+}
+
+// Emit AND/OR/XOR. The immediate form selects the op in word 0 and can
+// only complement src0; the register form selects it in word 1 and can
+// complement either source.
+void
+CodeEmitterNV50::emitLogicOp(const Instruction *i)
+{
+   code[0] = 0xd0000000;
+   code[1] = 0;
+
+   if (i->src(1).getFile() == FILE_IMMEDIATE) {
+      switch (i->op) {
+      case OP_OR:  code[0] |= 0x0100; break;
+      case OP_XOR: code[0] |= 0x8000; break;
+      default:
+         assert(i->op == OP_AND);
+         break;
+      }
+      if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
+         code[0] |= 1 << 22;
+
+      emitForm_IMM(i);
+   } else {
+      switch (i->op) {
+      case OP_AND: code[1] = 0x04000000; break;
+      case OP_OR:  code[1] = 0x04004000; break;
+      case OP_XOR: code[1] = 0x04008000; break;
+      default:
+         assert(0);
+         break;
+      }
+      if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
+         code[1] |= 1 << 16;
+      if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT))
+         code[1] |= 1 << 17;
+
+      emitForm_MAD(i);
+   }
+}
+
+// Emit a "shift into address register" (ARL): $aX = src0 << shl.
+// Destination address-register ids are biased by 1 in the encoding.
+void
+CodeEmitterNV50::emitARL(const Instruction *i, unsigned int shl)
+{
+   code[0] = 0x00000001 | (shl << 16);
+   code[1] = 0xc0000000;
+
+   code[0] |= (DDATA(i->def(0)).id + 1) << 2;
+
+   setSrcFileBits(i, NV50_OP_ENC_IMM);
+   setSrc(i, 0, 0);
+   emitFlagsRd(i);
+}
+
+// Emit SHL/SHR. A shift whose destination is an address register becomes
+// an ARL; otherwise an immediate shift count is embedded in word 0, and a
+// register count uses the generic MAD form.
+void
+CodeEmitterNV50::emitShift(const Instruction *i)
+{
+   if (i->def(0).getFile() == FILE_ADDRESS) {
+      assert(i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE);
+      emitARL(i, i->getSrc(1)->reg.data.u32 & 0x3f);
+   } else {
+      code[0] = 0x30000001;
+      code[1] = (i->op == OP_SHR) ? 0xe4000000 : 0xc4000000;
+      // arithmetic (sign-extending) right shift
+      if (i->op == OP_SHR && isSignedType(i->sType))
+          code[1] |= 1 << 27;
+
+      if (i->src(1).getFile() == FILE_IMMEDIATE) {
+         code[1] |= 1 << 20;
+         code[0] |= (i->getSrc(1)->reg.data.u32 & 0x7f) << 16;
+         defId(i->def(0), 2);
+         srcId(i->src(0), 9);
+         emitFlagsRd(i);
+      } else {
+         emitForm_MAD(i);
+      }
+   }
+}
+
+// Emit a geometry-shader stream output op: OP_EMIT emits a vertex,
+// otherwise (OP_RESTART) the primitive is restarted.
+void
+CodeEmitterNV50::emitOUT(const Instruction *i)
+{
+   code[0] = (i->op == OP_EMIT) ? 0xf0000200 : 0xf0000400;
+   code[1] = 0xc0000001;
+
+   emitFlagsRd(i);
+}
+
+// Emit a texture fetch (TEX/TXB/TXL/TXF/TXG). Encodes texture/sampler
+// indices, the source argument count, cube/offset handling, and the
+// 4-bit write mask split across both words.
+void
+CodeEmitterNV50::emitTEX(const TexInstruction *i)
+{
+   code[0] = 0xf0000001;
+   code[1] = 0x00000000;
+
+   switch (i->op) {
+   case OP_TXB:
+      code[1] = 0x20000000;
+      break;
+   case OP_TXL:
+      code[1] = 0x40000000;
+      break;
+   case OP_TXF:
+      code[0] |= 0x01000000;
+      break;
+   case OP_TXG:
+      code[0] = 0x01000000;
+      code[1] = 0x80000000;
+      break;
+   default:
+      assert(i->op == OP_TEX);
+      break;
+   }
+
+   code[0] |= i->tex.r << 9;
+   code[0] |= i->tex.s << 17;
+
+   int argc = i->tex.target.getArgCount();
+
+   // bias/lod/fetch index and shadow reference are extra sources
+   if (i->op == OP_TXB || i->op == OP_TXL || i->op == OP_TXF)
+      argc += 1;
+   if (i->tex.target.isShadow())
+      argc += 1;
+   assert(argc <= 4);
+
+   code[0] |= (argc - 1) << 22;
+
+   if (i->tex.target.isCube()) {
+      code[0] |= 0x08000000;
+   } else
+   if (i->tex.useOffsets) {
+      // 4-bit signed texel offsets per coordinate
+      code[1] |= (i->tex.offset[0][0] & 0xf) << 24;
+      code[1] |= (i->tex.offset[0][1] & 0xf) << 20;
+      code[1] |= (i->tex.offset[0][2] & 0xf) << 16;
+   }
+
+   code[0] |= (i->tex.mask & 0x3) << 25;
+   code[1] |= (i->tex.mask & 0xc) << 12;
+
+   if (i->tex.liveOnly)
+      code[1] |= 4;
+
+   defId(i->def(0), 2);
+
+   emitFlagsRd(i);
+}
+
+// Emit a texture size query; only the dimensions query (TXQ_DIMS) is
+// supported on this chipset.
+void
+CodeEmitterNV50::emitTXQ(const TexInstruction *i)
+{
+   assert(i->tex.query == TXQ_DIMS);
+
+   code[0] = 0xf0000001;
+   code[1] = 0x60000000;
+
+   code[0] |= i->tex.r << 9;
+   code[0] |= i->tex.s << 17;
+
+   code[0] |= (i->tex.mask & 0x3) << 25;
+   code[1] |= (i->tex.mask & 0xc) << 12;
+
+   defId(i->def(0), 2);
+
+   emitFlagsRd(i);
+}
+
+// Emit a TEXPREP op (coordinate preparation for unnormalized/rect
+// textures, emitted by the lowering pass).
+void
+CodeEmitterNV50::emitTEXPREP(const TexInstruction *i)
+{
+   code[0] = 0xf8000001 | (3 << 22) | (i->tex.s << 17) | (i->tex.r << 9);
+   code[1] = 0x60010000;
+
+   code[0] |= (i->tex.mask & 0x3) << 25;
+   code[1] |= (i->tex.mask & 0xc) << 12;
+   defId(i->def(0), 2);
+
+   emitFlagsRd(i);
+}
+
+// Emulate PRERET with a bra/call sequence (for chips lacking native
+// preret): subOp +0 branches to the call, +1 branches over it, +2 is the
+// call itself. Target addresses are patched in via relocations.
+void
+CodeEmitterNV50::emitPRERETEmu(const FlowInstruction *i)
+{
+   uint32_t pos = i->target.bb->binPos + 8; // +8 to skip an op */
+
+   code[0] = 0x10000003; // bra
+   code[1] = 0x00000780; // always
+
+   switch (i->subOp) {
+   case NV50_IR_SUBOP_EMU_PRERET + 0: // bra to the call
+      break;
+   case NV50_IR_SUBOP_EMU_PRERET + 1: // bra to skip the call
+      pos += 8;
+      break;
+   default:
+      assert(i->subOp == (NV50_IR_SUBOP_EMU_PRERET + 2));
+      code[0] = 0x20000003; // call
+      code[1] = 0x00000000; // no predicate
+      break;
+   }
+   addReloc(RelocEntry::TYPE_CODE, 0, pos, 0x07fff800, 9);
+   addReloc(RelocEntry::TYPE_CODE, 1, pos, 0x000fc000, -4);
+}
+
+// Emit a control-flow instruction. @flowOp is the 4-bit opcode; per-op we
+// decide whether a predicate and/or a target address is encoded. Targets
+// (BB, function, or builtin) get code relocations so they can be patched
+// after final placement.
+void
+CodeEmitterNV50::emitFlow(const Instruction *i, uint8_t flowOp)
+{
+   const FlowInstruction *f = i->asFlow();
+   bool hasPred = false;
+   bool hasTarg = false;
+
+   code[0] = 0x00000003 | (flowOp << 28);
+   code[1] = 0x00000000;
+
+   switch (i->op) {
+   case OP_BRA:
+      hasPred = true;
+      hasTarg = true;
+      break;
+   case OP_BREAK:
+   case OP_BRKPT:
+   case OP_DISCARD:
+   case OP_RET:
+      hasPred = true;
+      break;
+   case OP_CALL:
+   case OP_PREBREAK:
+   case OP_JOINAT:
+      hasTarg = true;
+      break;
+   case OP_PRERET:
+      hasTarg = true;
+      // emulated preret uses a separate bra/call encoding
+      if (i->subOp >= NV50_IR_SUBOP_EMU_PRERET) {
+         emitPRERETEmu(f);
+         return;
+      }
+      break;
+   default:
+      break;
+   }
+
+   if (hasPred)
+      emitFlagsRd(i);
+
+   if (hasTarg && f) {
+      uint32_t pos;
+
+      if (f->op == OP_CALL) {
+         if (f->builtin) {
+            pos = targNV50->getBuiltinOffset(f->target.builtin);
+         } else {
+            pos = f->target.fn->binPos;
+         }
+      } else {
+         pos = f->target.bb->binPos;
+      }
+
+      // address split: low 16 bits of (pos >> 2) in word 0, rest in word 1
+      code[0] |= ((pos >>  2) & 0xffff) << 11;
+      code[1] |= ((pos >> 18) & 0x003f) << 14;
+
+      RelocEntry::Type relocTy;
+
+      relocTy = f->builtin ? RelocEntry::TYPE_BUILTIN : RelocEntry::TYPE_CODE;
+
+      addReloc(relocTy, 0, pos, 0x07fff800, 9);
+      addReloc(relocTy, 1, pos, 0x000fc000, -4);
+   }
+}
+
+// Emit a barrier; the barrier index must be an immediate. The SYNC subop
+// sets an extra bit — presumably wait-for-all — TODO confirm against ISA docs.
+void
+CodeEmitterNV50::emitBAR(const Instruction *i)
+{
+   ImmediateValue *barId = i->getSrc(0)->asImm();
+   assert(barId);
+
+   code[0] = 0x82000003 | (barId->reg.data.u32 << 21);
+   code[1] = 0x00004000;
+
+   if (i->subOp == NV50_IR_SUBOP_BAR_SYNC)
+      code[0] |= 1 << 26;
+}
+
+// Emit an atomic op on global memory: map the IR subop to the hardware
+// sub-opcode, set signedness, then encode destination, value operand(s)
+// (CAS takes a compare value too), and the g[] indirect pointer.
+void
+CodeEmitterNV50::emitATOM(const Instruction *i)
+{
+   uint8_t subOp;
+   switch (i->subOp) {
+   case NV50_IR_SUBOP_ATOM_ADD:  subOp = 0x0; break;
+   case NV50_IR_SUBOP_ATOM_MIN:  subOp = 0x7; break;
+   case NV50_IR_SUBOP_ATOM_MAX:  subOp = 0x6; break;
+   case NV50_IR_SUBOP_ATOM_INC:  subOp = 0x4; break;
+   case NV50_IR_SUBOP_ATOM_DEC:  subOp = 0x5; break;
+   case NV50_IR_SUBOP_ATOM_AND:  subOp = 0xa; break;
+   case NV50_IR_SUBOP_ATOM_OR:   subOp = 0xb; break;
+   case NV50_IR_SUBOP_ATOM_XOR:  subOp = 0xc; break;
+   case NV50_IR_SUBOP_ATOM_CAS:  subOp = 0x2; break;
+   case NV50_IR_SUBOP_ATOM_EXCH: subOp = 0x1; break;
+   default:
+      assert(!"invalid subop");
+      return;
+   }
+   code[0] = 0xd0000001;
+   code[1] = 0xe0c00000 | (subOp << 2);
+   if (isSignedType(i->dType))
+      code[1] |= 1 << 21;
+
+   // args
+   emitFlagsRd(i);
+   setDst(i, 0);
+   setSrc(i, 1, 1);
+   if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
+      setSrc(i, 2, 2);
+
+   // g[] pointer
+   code[0] |= i->getSrc(0)->reg.fileIndex << 23;
+   srcId(i->getIndirect(0, 0), 9);
+}
+
+// Encode one instruction into the output buffer: dispatch on the opcode to
+// the specific emitter, then OR in the join/exit bits and advance the code
+// pointer / size counters. Returns false on unencodable ops or buffer
+// overflow.
+bool
+CodeEmitterNV50::emitInstruction(Instruction *insn)
+{
+   if (!insn->encSize) {
+      ERROR("skipping unencodable instruction: "); insn->print();
+      return false;
+   } else
+   if (codeSize + insn->encSize > codeSizeLimit) {
+      ERROR("code emitter output buffer too small\n");
+      return false;
+   }
+
+   if (insn->bb->getProgram()->dbgFlags & NV50_IR_DEBUG_BASIC) {
+      INFO("EMIT: "); insn->print();
+   }
+
+   switch (insn->op) {
+   case OP_MOV:
+      emitMOV(insn);
+      break;
+   case OP_EXIT:
+   case OP_NOP:
+   case OP_JOIN:
+      // NOP body; exit/join are just the modifier bits set below
+      emitNOP();
+      break;
+   case OP_VFETCH:
+   case OP_LOAD:
+      emitLOAD(insn);
+      break;
+   case OP_EXPORT:
+   case OP_STORE:
+      emitSTORE(insn);
+      break;
+   case OP_PFETCH:
+      emitPFETCH(insn);
+      break;
+   case OP_LINTERP:
+   case OP_PINTERP:
+      emitINTERP(insn);
+      break;
+   case OP_ADD:
+   case OP_SUB:
+      // float / address-register / integer adds use different encoders
+      if (isFloatType(insn->dType))
+         emitFADD(insn);
+      else if (insn->getDef(0)->reg.file == FILE_ADDRESS)
+         emitAADD(insn);
+      else
+         emitUADD(insn);
+      break;
+   case OP_MUL:
+      if (isFloatType(insn->dType))
+         emitFMUL(insn);
+      else
+         emitIMUL(insn);
+      break;
+   case OP_MAD:
+   case OP_FMA:
+      if (isFloatType(insn->dType))
+         emitFMAD(insn);
+      else
+         emitIMAD(insn);
+      break;
+   case OP_SAD:
+      emitISAD(insn);
+      break;
+   case OP_NOT:
+      emitNOT(insn);
+      break;
+   case OP_AND:
+   case OP_OR:
+   case OP_XOR:
+      emitLogicOp(insn);
+      break;
+   case OP_SHL:
+   case OP_SHR:
+      emitShift(insn);
+      break;
+   case OP_SET:
+      emitSET(insn);
+      break;
+   case OP_MIN:
+   case OP_MAX:
+      emitMINMAX(insn);
+      break;
+   case OP_CEIL:
+   case OP_FLOOR:
+   case OP_TRUNC:
+   case OP_ABS:
+   case OP_NEG:
+   case OP_SAT:
+      emitCVT(insn);
+      break;
+   case OP_CVT:
+      // conversions touching $aX/$cX are really moves/ARL, not CVT
+      if (insn->def(0).getFile() == FILE_ADDRESS)
+         emitARL(insn, 0);
+      else
+      if (insn->def(0).getFile() == FILE_FLAGS ||
+          insn->src(0).getFile() == FILE_FLAGS ||
+          insn->src(0).getFile() == FILE_ADDRESS)
+         emitMOV(insn);
+      else
+         emitCVT(insn);
+      break;
+   case OP_RCP:
+      emitSFnOp(insn, 0);
+      break;
+   case OP_RSQ:
+      emitSFnOp(insn, 2);
+      break;
+   case OP_LG2:
+      emitSFnOp(insn, 3);
+      break;
+   case OP_SIN:
+      emitSFnOp(insn, 4);
+      break;
+   case OP_COS:
+      emitSFnOp(insn, 5);
+      break;
+   case OP_EX2:
+      emitSFnOp(insn, 6);
+      break;
+   case OP_PRESIN:
+   case OP_PREEX2:
+      emitPreOp(insn);
+      break;
+   case OP_TEX:
+   case OP_TXB:
+   case OP_TXL:
+   case OP_TXF:
+      emitTEX(insn->asTex());
+      break;
+   case OP_TXQ:
+      emitTXQ(insn->asTex());
+      break;
+   case OP_TEXPREP:
+      emitTEXPREP(insn->asTex());
+      break;
+   case OP_EMIT:
+   case OP_RESTART:
+      emitOUT(insn);
+      break;
+   case OP_DISCARD:
+      emitFlow(insn, 0x0);
+      break;
+   case OP_BRA:
+      emitFlow(insn, 0x1);
+      break;
+   case OP_CALL:
+      emitFlow(insn, 0x2);
+      break;
+   case OP_RET:
+      emitFlow(insn, 0x3);
+      break;
+   case OP_PREBREAK:
+      emitFlow(insn, 0x4);
+      break;
+   case OP_BREAK:
+      emitFlow(insn, 0x5);
+      break;
+   case OP_QUADON:
+      emitFlow(insn, 0x6);
+      break;
+   case OP_QUADPOP:
+      emitFlow(insn, 0x7);
+      break;
+   case OP_JOINAT:
+      emitFlow(insn, 0xa);
+      break;
+   case OP_PRERET:
+      emitFlow(insn, 0xd);
+      break;
+   case OP_QUADOP:
+      emitQUADOP(insn, insn->lanes, insn->subOp);
+      break;
+   case OP_DFDX:
+      emitQUADOP(insn, 4, insn->src(0).mod.neg() ? 0x66 : 0x99);
+      break;
+   case OP_DFDY:
+      emitQUADOP(insn, 5, insn->src(0).mod.neg() ? 0x5a : 0xa5);
+      break;
+   case OP_ATOM:
+      emitATOM(insn);
+      break;
+   case OP_BAR:
+      emitBAR(insn);
+      break;
+   case OP_PHI:
+   case OP_UNION:
+   case OP_CONSTRAINT:
+      // pseudo-ops; RA should have removed these before emission
+      ERROR("operation should have been eliminated\n");
+      return false;
+   case OP_EXP:
+   case OP_LOG:
+   case OP_SQRT:
+   case OP_POW:
+   case OP_SELP:
+   case OP_SLCT:
+   case OP_TXD:
+   case OP_PRECONT:
+   case OP_CONT:
+   case OP_POPCNT:
+   case OP_INSBF:
+   case OP_EXTBF:
+      // no native encoding on nv50; the lowering pass must expand these
+      ERROR("operation should have been lowered\n");
+      return false;
+   default:
+      ERROR("unknown op: %u\n", insn->op);
+      return false;
+   }
+   // join/exit modifier bits, mutually exclusive
+   if (insn->join || insn->op == OP_JOIN)
+      code[1] |= 0x2;
+   else
+   if (insn->exit || insn->op == OP_EXIT)
+      code[1] |= 0x1;
+
+   // bit 0 of word 0 distinguishes long (8-byte) from short encodings
+   assert((insn->encSize == 8) == (code[0] & 1));
+
+   code += insn->encSize / 4;
+   codeSize += insn->encSize;
+   return true;
+}
+
+// Return the smallest encoding size (4 or 8 bytes) this instruction can
+// use: the short form restricts operand files and register ids, forbids
+// most modifiers, and constrains MAD to dst == src2.
+uint32_t
+CodeEmitterNV50::getMinEncodingSize(const Instruction *i) const
+{
+   const Target::OpInfo &info = targ->getOpInfo(i);
+
+   if (info.minEncSize > 4)
+      return 8;
+
+   // check constraints on dst and src operands
+   for (int d = 0; i->defExists(d); ++d) {
+      if (i->def(d).rep()->reg.data.id > 63 ||
+          i->def(d).rep()->reg.file != FILE_GPR)
+         return 8;
+   }
+
+   for (int s = 0; i->srcExists(s); ++s) {
+      DataFile sf = i->src(s).getFile();
+      if (sf != FILE_GPR)
+         // short form allows shader inputs only in fragment programs
+         if (sf != FILE_SHADER_INPUT || progType != Program::TYPE_FRAGMENT)
+            return 8;
+      if (i->src(s).rep()->reg.data.id > 63)
+         return 8;
+   }
+
+   // check modifiers & rounding
+   if (i->join || i->lanes != 0xf || i->exit)
+      return 8;
+   if (i->op == OP_MUL && i->rnd != ROUND_N)
+      return 8;
+
+   if (i->asTex())
+      return 8; // TODO: short tex encoding
+
+   // check constraints on short MAD
+   if (info.srcNr >= 2 && i->srcExists(2)) {
+      if (i->saturate || i->src(2).mod)
+         return 8;
+      if ((i->src(0).mod ^ i->src(1).mod) ||
+          (i->src(0).mod | i->src(1).mod).abs())
+         return 8;
+      if (!i->defExists(0) ||
+          i->def(0).rep()->reg.data.id != i->src(2).rep()->reg.data.id)
+         return 8;
+   }
+
+   return info.minEncSize;
+}
+
+// Change the encoding size of an instruction after BBs have been scheduled.
+// Grows @insn from 4 to 8 bytes. To keep 8-byte alignment of long ops, a
+// neighbouring short instruction may also be widened (then the block grows
+// by 8 instead of 4), and the positions of all following BBs are shifted.
+static void
+makeInstructionLong(Instruction *insn)
+{
+   if (insn->encSize == 8)
+      return;
+   Function *fn = insn->bb->getFunction();
+   int n = 0;
+   int adj = 4;
+
+   // count trailing short instructions after insn
+   for (Instruction *i = insn->next; i && i->encSize == 4; ++n, i = i->next);
+
+   if (n & 1) {
+      // widen a neighbour too so the pairing of short ops stays even
+      adj = 8;
+      insn->next->encSize = 8;
+   } else
+   if (insn->prev && insn->prev->encSize == 4) {
+      adj = 8;
+      insn->prev->encSize = 8;
+   }
+   insn->encSize = 8;
+
+   // shift binary positions of all BBs that come after insn's block
+   for (int i = fn->bbCount - 1; i >= 0 && fn->bbArray[i] != insn->bb; --i) {
+      fn->bbArray[i]->binPos += 4;
+   }
+   fn->binSize += adj;
+   insn->bb->binSize += adj;
+}
+
+// Try to mark @insn with the exit modifier bit so a trailing OP_EXIT can
+// be dropped. Fails for ops where that is unsafe (discard/quad ops,
+// immediate operands, calls, predicated flow). A flow op is itself turned
+// into the exit. Forces the long encoding, which carries the bit.
+static bool
+trySetExitModifier(Instruction *insn)
+{
+   if (insn->op == OP_DISCARD ||
+       insn->op == OP_QUADON ||
+       insn->op == OP_QUADPOP)
+      return false;
+   for (int s = 0; insn->srcExists(s); ++s)
+      if (insn->src(s).getFile() == FILE_IMMEDIATE)
+         return false;
+   if (insn->asFlow()) {
+      if (insn->op == OP_CALL) // side effects !
+         return false;
+      if (insn->getPredicate()) // cannot do conditional exit (or can we ?)
+         return false;
+      insn->op = OP_EXIT;
+   }
+   insn->exit = 1;
+   makeInstructionLong(insn);
+   return true;
+}
+
+// Replace a final standalone OP_EXIT with the exit modifier on the
+// preceding instruction(s): either on the op just before the exit, or —
+// when the epilogue is only the exit — on the terminator of every
+// predecessor block. On success the 8-byte exit op is deleted.
+static void
+replaceExitWithModifier(Function *func)
+{
+   BasicBlock *epilogue = BasicBlock::get(func->cfgExit);
+
+   if (!epilogue->getExit() ||
+       epilogue->getExit()->op != OP_EXIT) // only main will use OP_EXIT
+      return;
+
+   if (epilogue->getEntry()->op != OP_EXIT) {
+      Instruction *insn = epilogue->getExit()->prev;
+      if (!insn || !trySetExitModifier(insn))
+         return;
+      insn->exit = 1;
+   } else {
+      // epilogue is just the exit: tag all predecessors instead
+      for (Graph::EdgeIterator ei = func->cfgExit->incident();
+           !ei.end(); ei.next()) {
+         BasicBlock *bb = BasicBlock::get(ei.getNode());
+         Instruction *i = bb->getExit();
+
+         if (!i || !trySetExitModifier(i))
+            return;
+      }
+   }
+   epilogue->binSize -= 8;
+   func->binSize -= 8;
+   delete_Instruction(func->getProgram(), epilogue->getExit());
+}
+
+// Per-function pre-pass before emission: run the generic preparation,
+// then fold the final OP_EXIT into an exit modifier where possible.
+void
+CodeEmitterNV50::prepareEmission(Function *func)
+{
+   CodeEmitter::prepareEmission(func);
+
+   replaceExitWithModifier(func);
+}
+
+// Construct an emitter bound to @target; the output buffer is assigned
+// later (code/codeSize start out empty).
+CodeEmitterNV50::CodeEmitterNV50(const TargetNV50 *target) :
+   CodeEmitter(target), targNV50(target)
+{
+   targ = target; // specialized
+   code = NULL;
+   codeSize = codeSizeLimit = 0;
+   relocInfo = NULL;
+}
+
+// Factory: create a code emitter for this target, configured for the
+// given program type. Caller owns the returned object.
+CodeEmitter *
+TargetNV50::getCodeEmitter(Program::Type type)
+{
+   CodeEmitterNV50 *emit = new CodeEmitterNV50(this);
+   emit->setProgramType(type);
+   return emit;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
new file mode 100644
index 0000000..90c409d
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -0,0 +1,2988 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+namespace nv50_ir {
+
+// Argh, all these assertions ...
+
+// Binary encoder for the Fermi/Kepler (NVC0+) instruction set. One emit*
+// method per IR operation; the emitForm_* helpers handle the shared operand
+// field layouts.
+class CodeEmitterNVC0 : public CodeEmitter
+{
+public:
+   CodeEmitterNVC0(const TargetNVC0 *);
+
+   virtual bool emitInstruction(Instruction *);
+   virtual uint32_t getMinEncodingSize(const Instruction *) const;
+   virtual void prepareEmission(Function *);
+
+   inline void setProgramType(Program::Type pType) { progType = pType; }
+
+private:
+   const TargetNVC0 *targNVC0;
+
+   Program::Type progType;
+
+   const bool writeIssueDelays;
+
+private:
+   // operand/field encoding helpers
+   void emitForm_A(const Instruction *, uint64_t);
+   void emitForm_B(const Instruction *, uint64_t);
+   void emitForm_S(const Instruction *, uint32_t, bool pred);
+
+   void emitPredicate(const Instruction *);
+
+   void setAddress16(const ValueRef&);
+   void setAddress24(const ValueRef&);
+   void setAddressByFile(const ValueRef&);
+   void setImmediate(const Instruction *, const int s); // needs op already set
+   void setImmediateS8(const ValueRef&);
+   void setSUConst16(const Instruction *, const int s);
+   void setSUPred(const Instruction *, const int s);
+
+   void emitCondCode(CondCode cc, int pos);
+   void emitInterpMode(const Instruction *);
+   void emitLoadStoreType(DataType ty);
+   void emitSUGType(DataType);
+   void emitCachingMode(CacheMode c);
+
+   void emitShortSrc2(const ValueRef&);
+
+   inline uint8_t getSRegEncoding(const ValueRef&);
+
+   void roundMode_A(const Instruction *);
+   void roundMode_C(const Instruction *);
+   void roundMode_CS(const Instruction *);
+
+   void emitNegAbs12(const Instruction *);
+
+   void emitNOP(const Instruction *);
+
+   // memory operations
+   void emitLOAD(const Instruction *);
+   void emitSTORE(const Instruction *);
+   void emitMOV(const Instruction *);
+   void emitATOM(const Instruction *);
+   void emitMEMBAR(const Instruction *);
+   void emitCCTL(const Instruction *);
+
+   // graphics pipeline I/O
+   void emitINTERP(const Instruction *);
+   void emitPFETCH(const Instruction *);
+   void emitVFETCH(const Instruction *);
+   void emitEXPORT(const Instruction *);
+   void emitOUT(const Instruction *);
+
+   // arithmetic
+   void emitUADD(const Instruction *);
+   void emitFADD(const Instruction *);
+   void emitUMUL(const Instruction *);
+   void emitFMUL(const Instruction *);
+   void emitIMAD(const Instruction *);
+   void emitISAD(const Instruction *);
+   void emitFMAD(const Instruction *);
+   void emitMADSP(const Instruction *);
+
+   // bit manipulation and logic
+   void emitNOT(Instruction *);
+   void emitLogicOp(const Instruction *, uint8_t subOp);
+   void emitPOPC(const Instruction *);
+   void emitINSBF(const Instruction *);
+   void emitEXTBF(const Instruction *);
+   void emitPERMT(const Instruction *);
+   void emitShift(const Instruction *);
+
+   void emitSFnOp(const Instruction *, uint8_t subOp);
+
+   void emitCVT(Instruction *);
+   void emitMINMAX(const Instruction *);
+   void emitPreOp(const Instruction *);
+
+   void emitSET(const CmpInstruction *);
+   void emitSLCT(const CmpInstruction *);
+   void emitSELP(const Instruction *);
+
+   // texture operations
+   void emitTEXBAR(const Instruction *);
+   void emitTEX(const TexInstruction *);
+   void emitTEXCSAA(const TexInstruction *);
+   void emitTXQ(const TexInstruction *);
+
+   void emitQUADOP(const Instruction *, uint8_t qOp, uint8_t laneMask);
+
+   void emitFlow(const Instruction *);
+   void emitBAR(const Instruction *);
+
+   // surface operations
+   void emitSUCLAMPMode(uint16_t);
+   void emitSUCalc(Instruction *);
+   void emitSULDGB(const TexInstruction *);
+   void emitSUSTGx(const TexInstruction *);
+
+   void emitVSHL(const Instruction *);
+   void emitVectorSubOp(const Instruction *);
+
+   // register-id / address field writers
+   inline void defId(const ValueDef&, const int pos);
+   inline void defId(const Instruction *, int d, const int pos);
+   inline void srcId(const ValueRef&, const int pos);
+   inline void srcId(const ValueRef *, const int pos);
+   inline void srcId(const Instruction *, int s, const int pos);
+   inline void srcAddr32(const ValueRef&, int pos, int shr);
+
+   inline bool isLIMM(const ValueRef&, DataType ty);
+};
+
+// for better visibility
+#define HEX64(h, l) 0x##h##l##ULL
+
+// register data of the representative value of a source / destination
+#define SDATA(a) ((a).rep()->reg.data)
+#define DDATA(a) ((a).rep()->reg.data)
+
+void CodeEmitterNVC0::srcId(const ValueRef& src, const int pos)
+{
+   // Write the source register id at bit 'pos'; $r63 (63) means "no source".
+   const int regId = src.get() ? SDATA(src).id : 63;
+   code[pos / 32] |= regId << (pos % 32);
+}
+
+void CodeEmitterNVC0::srcId(const ValueRef *src, const int pos)
+{
+   // Pointer variant: a null ref encodes as $r63 ("no source").
+   const int regId = src ? SDATA(*src).id : 63;
+   code[pos / 32] |= regId << (pos % 32);
+}
+
+void CodeEmitterNVC0::srcId(const Instruction *insn, int s, int pos)
+{
+   // Write source s of 'insn' at bit 'pos', or $r63 if that source is absent.
+   if (insn->srcExists(s))
+      code[pos / 32] |= SDATA(insn->src(s)).id << (pos % 32);
+   else
+      code[pos / 32] |= 63 << (pos % 32);
+}
+
+void
+CodeEmitterNVC0::srcAddr32(const ValueRef& src, int pos, int shr)
+{
+   // Write the source's byte offset (pre-shifted right by 'shr') starting at
+   // bit 'pos'; any bits that do not fit below bit 32 spill into code[1].
+   const uint32_t offset = SDATA(src).offset >> shr;
+
+   code[pos / 32] |= offset << (pos % 32);
+   if (pos && (pos < 32)) // avoid undefined shift by 32 when pos == 0
+      code[1] |= offset >> (32 - pos);
+}
+
+void CodeEmitterNVC0::defId(const ValueDef& def, const int pos)
+{
+   // Write the destination register id at bit 'pos'; $r63 if unused.
+   const int regId = def.get() ? DDATA(def).id : 63;
+   code[pos / 32] |= regId << (pos % 32);
+}
+
+void CodeEmitterNVC0::defId(const Instruction *insn, int d, int pos)
+{
+   // Write destination d of 'insn' at bit 'pos', or $r63 if it is absent.
+   if (insn->defExists(d))
+      code[pos / 32] |= DDATA(insn->def(d)).id << (pos % 32);
+   else
+      code[pos / 32] |= 63 << (pos % 32);
+}
+
+bool CodeEmitterNVC0::isLIMM(const ValueRef& ref, DataType ty)
+{
+   // True if 'ref' is an immediate whose value does not fit the short
+   // immediate field (high 20 bits for floats, low 20 bits for integers)
+   // and therefore requires the long-immediate (LIMM) encoding.
+   const ImmediateValue *imm = ref.get()->asImm();
+   if (!imm)
+      return false;
+   const uint32_t mask = (ty == TYPE_F32) ? 0xfff : 0xfff00000;
+   return (imm->reg.data.u32 & mask) != 0;
+}
+
+void
+CodeEmitterNVC0::roundMode_A(const Instruction *insn)
+{
+   // Rounding mode field for form-A ops, bits 23-24 of the high word.
+   // Round-to-nearest is the implicit 0 encoding.
+   if (insn->rnd == ROUND_M) {
+      code[1] |= 1 << 23;
+   } else if (insn->rnd == ROUND_P) {
+      code[1] |= 2 << 23;
+   } else if (insn->rnd == ROUND_Z) {
+      code[1] |= 3 << 23;
+   } else {
+      assert(insn->rnd == ROUND_N);
+   }
+}
+
+void
+CodeEmitterNVC0::emitNegAbs12(const Instruction *i)
+{
+   // Common abs/neg modifier bits for two-source ALU ops:
+   // abs(src1)=bit6, abs(src0)=bit7, neg(src1)=bit8, neg(src0)=bit9.
+   for (int s = 0; s < 2; ++s) {
+      if (i->src(s).mod.abs())
+         code[0] |= 1 << (7 - s);
+      if (i->src(s).mod.neg())
+         code[0] |= 1 << (9 - s);
+   }
+}
+
+// Encode a condition code as a 5-bit field starting at bit 'pos'.
+void CodeEmitterNVC0::emitCondCode(CondCode cc, int pos)
+{
+   uint8_t val;
+
+   switch (cc) {
+   case CC_LT:  val = 0x1; break;
+   case CC_LTU: val = 0x9; break;
+   case CC_EQ:  val = 0x2; break;
+   case CC_EQU: val = 0xa; break;
+   case CC_LE:  val = 0x3; break;
+   case CC_LEU: val = 0xb; break;
+   case CC_GT:  val = 0x4; break;
+   case CC_GTU: val = 0xc; break;
+   case CC_NE:  val = 0x5; break;
+   case CC_NEU: val = 0xd; break;
+   case CC_GE:  val = 0x6; break;
+   case CC_GEU: val = 0xe; break;
+   case CC_TR:  val = 0xf; break;
+   case CC_FL:  val = 0x0; break;
+
+   // flag-based conditions (carry/sign/overflow) use the upper half
+   case CC_A:  val = 0x14; break;
+   case CC_NA: val = 0x13; break;
+   case CC_S:  val = 0x15; break;
+   case CC_NS: val = 0x12; break;
+   case CC_C:  val = 0x16; break;
+   case CC_NC: val = 0x11; break;
+   case CC_O:  val = 0x17; break;
+   case CC_NO: val = 0x10; break;
+
+   default:
+      val = 0;
+      assert(!"invalid condition code");
+      break;
+   }
+   code[pos / 32] |= val << (pos % 32);
+}
+
+void
+CodeEmitterNVC0::emitPredicate(const Instruction *i)
+{
+   // Predicate field at bits 10-13; 0x1c00 ($p7) means "execute always".
+   if (i->predSrc < 0) {
+      code[0] |= 0x1c00;
+      return;
+   }
+   assert(i->getPredicate()->reg.file == FILE_PREDICATE);
+   srcId(i->src(i->predSrc), 10);
+   if (i->cc == CC_NOT_P)
+      code[0] |= 0x2000; // negate: execute when the predicate is false
+}
+
+void
+CodeEmitterNVC0::setAddressByFile(const ValueRef& src)
+{
+   // Select the address encoding width matching the source's memory space.
+   if (src.getFile() == FILE_MEMORY_GLOBAL) {
+      srcAddr32(src, 26, 0); // full 32-bit offset
+   } else if (src.getFile() == FILE_MEMORY_LOCAL ||
+              src.getFile() == FILE_MEMORY_SHARED) {
+      setAddress24(src);
+   } else {
+      assert(src.getFile() == FILE_MEMORY_CONST);
+      setAddress16(src);
+   }
+}
+
+void
+CodeEmitterNVC0::setAddress16(const ValueRef& src)
+{
+   // 16-bit offset, split: low 6 bits at [31:26] of word 0, upper 10 bits
+   // at the bottom of word 1.
+   Symbol *sym = src.get()->asSym();
+
+   assert(sym);
+   const uint32_t offset = sym->reg.data.offset;
+
+   code[0] |= (offset & 0x003f) << 26;
+   code[1] |= (offset & 0xffc0) >> 6;
+}
+
+void
+CodeEmitterNVC0::setAddress24(const ValueRef& src)
+{
+   // 24-bit offset, split like setAddress16 but with 18 bits in word 1.
+   Symbol *sym = src.get()->asSym();
+
+   assert(sym);
+   const uint32_t offset = sym->reg.data.offset;
+
+   code[0] |= (offset & 0x00003f) << 26;
+   code[1] |= (offset & 0xffffc0) >> 6;
+}
+
+// Encode an immediate source. The layout depends on the opcode family that
+// must already be present in code[0] (LIMM / integer / float immediate).
+void
+CodeEmitterNVC0::setImmediate(const Instruction *i, const int s)
+{
+   const ImmediateValue *imm = i->src(s).get()->asImm();
+   uint32_t u32;
+
+   assert(imm);
+   u32 = imm->reg.data.u32;
+
+   if ((code[0] & 0xf) == 0x2) {
+      // LIMM
+      code[0] |= (u32 & 0x3f) << 26;
+      code[1] |= u32 >> 6;
+   } else
+   if ((code[0] & 0xf) == 0x3 || (code[0] & 0xf) == 4) {
+      // integer immediate: 20 bits, must be sign-extendable from bit 19
+      assert((u32 & 0xfff00000) == 0 || (u32 & 0xfff00000) == 0xfff00000);
+      assert(!(code[1] & 0xc000));
+      u32 &= 0xfffff;
+      code[0] |= (u32 & 0x3f) << 26;
+      code[1] |= 0xc000 | (u32 >> 6);
+   } else {
+      // float immediate: only the high 20 bits are representable
+      assert(!(u32 & 0x00000fff));
+      assert(!(code[1] & 0xc000));
+      code[0] |= ((u32 >> 12) & 0x3f) << 26;
+      code[1] |= 0xc000 | (u32 >> 18);
+   }
+}
+
+// Short-form immediate: an 8-bit signed value split into 6 low bits at
+// [31:26] and the two (sign) high bits at [9:8].
+void CodeEmitterNVC0::setImmediateS8(const ValueRef &ref)
+{
+   const ImmediateValue *imm = ref.get()->asImm();
+
+   int8_t s8 = static_cast<int8_t>(imm->reg.data.s32);
+
+   assert(s8 == imm->reg.data.s32); // must fit in 8 bits
+
+   code[0] |= (s8 & 0x3f) << 26;
+   code[0] |= (s8 >> 6) << 8;
+}
+
+// Long (8 byte) form shared by most ALU ops: writes opcode, predicate,
+// destination and up to 3 sources, which may live in GPRs, c[] space or be
+// an immediate.
+void
+CodeEmitterNVC0::emitForm_A(const Instruction *i, uint64_t opc)
+{
+   code[0] = opc;
+   code[1] = opc >> 32;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 14);
+
+   int s1 = 26; // src1 bit position; moves up if src2 occupies c[] space
+   if (i->srcExists(2) && i->getSrc(2)->reg.file == FILE_MEMORY_CONST)
+      s1 = 49;
+
+   for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+      switch (i->getSrc(s)->reg.file) {
+      case FILE_MEMORY_CONST:
+         assert(!(code[1] & 0xc000)); // only one c[]/immediate source allowed
+         code[1] |= (s == 2) ? 0x8000 : 0x4000;
+         code[1] |= i->getSrc(s)->reg.fileIndex << 10;
+         setAddress16(i->src(s));
+         break;
+      case FILE_IMMEDIATE:
+         assert(s == 1 ||
+                i->op == OP_MOV || i->op == OP_PRESIN || i->op == OP_PREEX2);
+         assert(!(code[1] & 0xc000));
+         setImmediate(i, s);
+         break;
+      case FILE_GPR:
+         if ((s == 2) && ((code[0] & 0x7) == 2)) // LIMM: 3rd src == dst
+            break;
+         srcId(i->src(s), s ? ((s == 2) ? 49 : s1) : 20);
+         break;
+      default:
+         // ignore here, can be predicate or flags, but must not be address
+         break;
+      }
+   }
+}
+
+// Long (8 byte) form for single-source ops (conversions, pre-ops): writes
+// opcode, predicate, destination and the one source (GPR / c[] / immediate).
+void
+CodeEmitterNVC0::emitForm_B(const Instruction *i, uint64_t opc)
+{
+   code[0] = opc;
+   code[1] = opc >> 32;
+
+   emitPredicate(i);
+
+   defId(i->def(0), 14);
+
+   switch (i->src(0).getFile()) {
+   case FILE_MEMORY_CONST:
+      assert(!(code[1] & 0xc000));
+      code[1] |= 0x4000 | (i->src(0).get()->reg.fileIndex << 10);
+      setAddress16(i->src(0));
+      break;
+   case FILE_IMMEDIATE:
+      assert(!(code[1] & 0xc000));
+      setImmediate(i, 0);
+      break;
+   case FILE_GPR:
+      srcId(i->src(0), 26);
+      break;
+   default:
+      // ignore here, can be predicate or flags, but must not be address
+      break;
+   }
+}
+
+// Short (4 byte) encoding: destination, src0 in a GPR, and src1/src2 which
+// may be a GPR, c[0/1/16][] (selected by 2 bits, shifted for loads/stores),
+// or an 8-bit signed immediate.
+void
+CodeEmitterNVC0::emitForm_S(const Instruction *i, uint32_t opc, bool pred)
+{
+   code[0] = opc;
+
+   int ss2a = 0; // extra shift of the c[] space select bits
+   if (opc == 0x0d || opc == 0x0e)
+      ss2a = 2;
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+
+   assert(pred || (i->predSrc < 0));
+   if (pred)
+      emitPredicate(i);
+
+   for (int s = 1; s < 3 && i->srcExists(s); ++s) {
+      if (i->src(s).get()->reg.file == FILE_MEMORY_CONST) {
+         assert(!(code[0] & (0x300 >> ss2a)));
+         switch (i->src(s).get()->reg.fileIndex) {
+         case 0:  code[0] |= 0x100 >> ss2a; break;
+         case 1:  code[0] |= 0x200 >> ss2a; break;
+         case 16: code[0] |= 0x300 >> ss2a; break;
+         default:
+            ERROR("invalid c[] space for short form\n");
+            break;
+         }
+         if (s == 1)
+            code[0] |= i->getSrc(s)->reg.data.offset << 24;
+         else
+            code[0] |= i->getSrc(s)->reg.data.offset << 6;
+      } else
+      if (i->src(s).getFile() == FILE_IMMEDIATE) {
+         assert(s == 1); // only src1 can be an immediate in the short form
+         setImmediateS8(i->src(s));
+      } else
+      if (i->src(s).getFile() == FILE_GPR) {
+         srcId(i->src(s), (s == 1) ? 26 : 8);
+      }
+   }
+}
+
+void
+CodeEmitterNVC0::emitShortSrc2(const ValueRef &src)
+{
+   // Short-form src2: either a GPR id, or c[0/1/16][] with a word-aligned
+   // 32-bit offset.
+   if (src.getFile() != FILE_MEMORY_CONST) {
+      srcId(src, 20);
+      assert(src.getFile() == FILE_GPR);
+      return;
+   }
+   const int fileIndex = src.get()->reg.fileIndex;
+   if (fileIndex == 0)
+      code[0] |= 0x100;
+   else if (fileIndex == 1)
+      code[0] |= 0x200;
+   else if (fileIndex == 16)
+      code[0] |= 0x300;
+   else
+      assert(!"unsupported file index for short op");
+   srcAddr32(src, 20, 2);
+}
+
+void
+CodeEmitterNVC0::emitNOP(const Instruction *i)
+{
+   // Fixed long-form NOP encoding; only the predicate field varies.
+   code[1] = 0x40000000;
+   code[0] = 0x000001e4;
+   emitPredicate(i);
+}
+
+// Fused multiply-add (float). neg1 is the sign of the product: negation of
+// exactly one factor negates the product.
+void
+CodeEmitterNVC0::emitFMAD(const Instruction *i)
+{
+   bool neg1 = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_F32)) {
+         emitForm_A(i, HEX64(20000000, 00000002));
+      } else {
+         emitForm_A(i, HEX64(30000000, 00000000));
+
+         if (i->src(2).mod.neg())
+            code[0] |= 1 << 8;
+      }
+      roundMode_A(i);
+
+      if (neg1)
+         code[0] |= 1 << 9;
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+      if (i->ftz)
+         code[0] |= 1 << 6;
+   } else {
+      // short form cannot express saturation or a negated addend
+      assert(!i->saturate && !i->src(2).mod.neg());
+      emitForm_S(i, (i->src(2).getFile() == FILE_MEMORY_CONST) ? 0x2e : 0x0e,
+                 false);
+      if (neg1)
+         code[0] |= 1 << 4;
+   }
+}
+
+// Float multiply; supports a power-of-two post-factor of 2^-3 .. 2^3.
+void
+CodeEmitterNVC0::emitFMUL(const Instruction *i)
+{
+   bool neg = (i->src(0).mod ^ i->src(1).mod).neg();
+
+   assert(i->postFactor >= -3 && i->postFactor <= 3);
+
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_F32)) {
+         assert(i->postFactor == 0); // constant folded, hopefully
+         emitForm_A(i, HEX64(30000000, 00000002));
+      } else {
+         emitForm_A(i, HEX64(58000000, 00000000));
+         roundMode_A(i);
+         // positive post-factors are encoded as 7 - factor
+         code[1] |= ((i->postFactor > 0) ?
+                     (7 - i->postFactor) : (0 - i->postFactor)) << 17;
+      }
+      if (neg)
+         code[1] ^= 1 << 25; // aliases with LIMM sign bit
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+
+      if (i->dnz)
+         code[0] |= 1 << 7;
+      else
+      if (i->ftz)
+         code[0] |= 1 << 6;
+   } else {
+      assert(!neg && !i->saturate && !i->ftz && !i->postFactor);
+      emitForm_S(i, 0xa8, true);
+   }
+}
+
+// Integer multiply (32x32); sign flags come from sType/dType, MUL_HIGH
+// selects the upper half of the product.
+void
+CodeEmitterNVC0::emitUMUL(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      if (i->src(1).getFile() == FILE_IMMEDIATE) {
+         emitForm_A(i, HEX64(10000000, 00000002));
+      } else {
+         emitForm_A(i, HEX64(50000000, 00000003));
+      }
+      if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+         code[0] |= 1 << 6;
+      if (i->sType == TYPE_S32)
+         code[0] |= 1 << 5;
+      if (i->dType == TYPE_S32)
+         code[0] |= 1 << 7;
+   } else {
+      emitForm_S(i, i->src(1).getFile() == FILE_IMMEDIATE ? 0xaa : 0x2a, true);
+
+      if (i->sType == TYPE_S32)
+         code[0] |= 1 << 6;
+   }
+}
+
+// Float add/sub; OP_SUB is encoded as an add with src1 negated.
+void
+CodeEmitterNVC0::emitFADD(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_F32)) {
+         assert(!i->saturate);
+         emitForm_A(i, HEX64(28000000, 00000002));
+
+         code[0] |= i->src(0).mod.abs() << 7;
+         code[0] |= i->src(0).mod.neg() << 9;
+
+         // src1 modifiers fold into the immediate's sign bit in word 1
+         if (i->src(1).mod.abs())
+            code[1] &= 0xfdffffff;
+         if ((i->op == OP_SUB) != static_cast<bool>(i->src(1).mod.neg()))
+            code[1] ^= 0x02000000;
+      } else {
+         emitForm_A(i, HEX64(50000000, 00000000));
+
+         roundMode_A(i);
+         if (i->saturate)
+            code[1] |= 1 << 17;
+
+         emitNegAbs12(i);
+         if (i->op == OP_SUB) code[0] ^= 1 << 8; // toggle neg(src1)
+      }
+      if (i->ftz)
+         code[0] |= 1 << 5;
+   } else {
+      // short form: no saturate/sub and only neg(src0) is expressible
+      assert(!i->saturate && i->op != OP_SUB &&
+             !i->src(0).mod.abs() &&
+             !i->src(1).mod.neg() && !i->src(1).mod.abs());
+
+      emitForm_S(i, 0x49, true);
+
+      if (i->src(0).mod.neg())
+         code[0] |= 1 << 7;
+   }
+}
+
+// Integer add/sub with optional carry in/out; source negation selects the
+// subtract variants.
+void
+CodeEmitterNVC0::emitUADD(const Instruction *i)
+{
+   uint32_t addOp = 0;
+
+   assert(!i->src(0).mod.abs() && !i->src(1).mod.abs());
+   assert(!i->src(0).mod.neg() || !i->src(1).mod.neg());
+
+   if (i->src(0).mod.neg())
+      addOp |= 0x200;
+   if (i->src(1).mod.neg())
+      addOp |= 0x100;
+   if (i->op == OP_SUB) {
+      addOp ^= 0x100;
+      assert(addOp != 0x300); // would be add-plus-one
+   }
+
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_U32)) {
+         emitForm_A(i, HEX64(08000000, 00000002));
+         if (i->defExists(1))
+            code[1] |= 1 << 26; // write carry
+      } else {
+         emitForm_A(i, HEX64(48000000, 00000003));
+         if (i->defExists(1))
+            code[1] |= 1 << 16; // write carry
+      }
+      code[0] |= addOp;
+
+      if (i->saturate)
+         code[0] |= 1 << 5;
+      if (i->flagsSrc >= 0) // add carry
+         code[0] |= 1 << 6;
+   } else {
+      // short form cannot negate src1
+      assert(!(addOp & 0x100));
+      emitForm_S(i, (addOp >> 3) |
+                 ((i->src(1).getFile() == FILE_IMMEDIATE) ? 0xac : 0x2c), true);
+   }
+}
+
+// TODO: shl-add
+// Integer multiply-add; the product's sign is the XOR of the factor signs.
+void
+CodeEmitterNVC0::emitIMAD(const Instruction *i)
+{
+   assert(i->encSize == 8);
+   emitForm_A(i, HEX64(20000000, 00000003));
+
+   if (isSignedType(i->dType))
+      code[0] |= 1 << 7;
+   if (isSignedType(i->sType))
+      code[0] |= 1 << 5;
+
+   code[1] |= i->saturate << 24;
+
+   if (i->flagsDef >= 0) code[1] |= 1 << 16; // write carry
+   if (i->flagsSrc >= 0) code[1] |= 1 << 23; // add carry
+
+   if (i->src(2).mod.neg()) code[0] |= 0x10;
+   if (i->src(1).mod.neg() ^
+       i->src(0).mod.neg()) code[0] |= 0x20;
+
+   if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
+      code[0] |= 1 << 6;
+}
+
+// Multiply-add with per-operand sub-word precision selectors packed in
+// subOp (Kepler+ only).
+void
+CodeEmitterNVC0::emitMADSP(const Instruction *i)
+{
+   assert(targ->getChipset() >= NVISA_GK104_CHIPSET);
+
+   emitForm_A(i, HEX64(00000000, 00000003));
+
+   if (i->subOp == NV50_IR_SUBOP_MADSP_SD) {
+      code[1] |= 0x01800000;
+   } else {
+      // scatter the subOp bit groups into their opcode fields
+      code[0] |= (i->subOp & 0x00f) << 7;
+      code[0] |= (i->subOp & 0x0f0) << 1;
+      code[0] |= (i->subOp & 0x100) >> 3;
+      code[0] |= (i->subOp & 0x200) >> 2;
+      code[1] |= (i->subOp & 0xc00) << 13;
+   }
+
+   if (i->flagsDef >= 0)
+      code[1] |= 1 << 16;
+}
+
+void
+CodeEmitterNVC0::emitISAD(const Instruction *i)
+{
+   // Sum of absolute differences; only the 32-bit long form is supported.
+   assert(i->dType == TYPE_S32 || i->dType == TYPE_U32);
+   assert(i->encSize == 8);
+
+   uint64_t opc = HEX64(38000000, 00000003);
+   if (i->dType == TYPE_S32)
+      opc |= 0x20; // signed variant (bit 5)
+   emitForm_A(i, opc);
+}
+
+// Bitwise NOT is lowered to a logic-op encoding that reads the operand from
+// the src1 slot, so duplicate src0 there first (mutates the instruction).
+void
+CodeEmitterNVC0::emitNOT(Instruction *i)
+{
+   assert(i->encSize == 8);
+   i->setSrc(1, i->src(0));
+   emitForm_A(i, HEX64(68000000, 000001c3));
+}
+
+// AND/OR/XOR ('subOp' selects which), either on predicate registers or on
+// GPRs; all sources accept a NOT modifier.
+void
+CodeEmitterNVC0::emitLogicOp(const Instruction *i, uint8_t subOp)
+{
+   if (i->def(0).getFile() == FILE_PREDICATE) {
+      code[0] = 0x00000004 | (subOp << 30);
+      code[1] = 0x0c000000;
+
+      emitPredicate(i);
+
+      defId(i->def(0), 17);
+      srcId(i->src(0), 20);
+      if (i->src(0).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 23;
+      srcId(i->src(1), 26);
+      if (i->src(1).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 29;
+
+      if (i->defExists(1)) {
+         defId(i->def(1), 14);
+      } else {
+         code[0] |= 7 << 14; // $p7: no second destination
+      }
+      // (a OP b) OP c
+      if (i->predSrc != 2 && i->srcExists(2)) {
+         code[1] |= subOp << 21;
+         srcId(i->src(2), 17);
+         if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 20;
+      } else {
+         code[1] |= 0x000e0000; // no third operand
+      }
+   } else
+   if (i->encSize == 8) {
+      if (isLIMM(i->src(1), TYPE_U32)) {
+         emitForm_A(i, HEX64(38000000, 00000002));
+
+         if (i->flagsDef >= 0)
+            code[1] |= 1 << 26;
+      } else {
+         emitForm_A(i, HEX64(68000000, 00000003));
+
+         if (i->flagsDef >= 0)
+            code[1] |= 1 << 16;
+      }
+      code[0] |= subOp << 6;
+
+      if (i->flagsSrc >= 0) // carry
+         code[0] |= 1 << 5;
+
+      if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 9;
+      if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT)) code[0] |= 1 << 8;
+   } else {
+      emitForm_S(i, (subOp << 5) |
+                 ((i->src(1).getFile() == FILE_IMMEDIATE) ? 0x1d : 0x8d), true);
+   }
+}
+
+void
+CodeEmitterNVC0::emitPOPC(const Instruction *i)
+{
+   // Population count; either source may carry a NOT modifier.
+   emitForm_A(i, HEX64(54000000, 00000004));
+
+   if (i->src(0).mod & Modifier(NV50_IR_MOD_NOT))
+      code[0] |= 1 << 9;
+   if (i->src(1).mod & Modifier(NV50_IR_MOD_NOT))
+      code[0] |= 1 << 8;
+}
+
+void
+CodeEmitterNVC0::emitINSBF(const Instruction *i)
+{
+   // Bitfield insert: a single fixed long-form opcode, no modifier bits.
+   emitForm_A(i, 0x2800000000000003ULL);
+}
+
+void
+CodeEmitterNVC0::emitEXTBF(const Instruction *i)
+{
+   // Bitfield extract; bit 5 sign-extends the field, bit 8 selects the
+   // bit-reverse variant.
+   emitForm_A(i, HEX64(70000000, 00000003));
+
+   code[0] |= ((i->dType == TYPE_S32) ? 0x020 : 0) |
+              ((i->subOp == NV50_IR_SUBOP_EXTBF_REV) ? 0x100 : 0);
+}
+
+void
+CodeEmitterNVC0::emitPERMT(const Instruction *i)
+{
+   // Byte permute (PRMT); the mode variant lives in subOp at bit 5.
+   emitForm_A(i, HEX64(24000000, 00000004));
+   code[0] |= i->subOp << 5;
+}
+
+void
+CodeEmitterNVC0::emitShift(const Instruction *i)
+{
+   // SHL/SHR; a signed destination type makes SHR arithmetic (bit 5).
+   uint64_t opc = (i->op == OP_SHR) ? HEX64(58000000, 00000003)
+                                    : HEX64(60000000, 00000003);
+   if (i->op == OP_SHR && isSignedType(i->dType))
+      opc |= 0x20;
+   emitForm_A(i, opc);
+
+   if (i->subOp == NV50_IR_SUBOP_SHIFT_WRAP)
+      code[0] |= 1 << 9; // wrap the shift count instead of clamping
+}
+
+// Argument-reduction ops that precede the SFU (PRESIN / PREEX2).
+void
+CodeEmitterNVC0::emitPreOp(const Instruction *i)
+{
+   if (i->encSize == 8) {
+      emitForm_B(i, HEX64(60000000, 00000000));
+
+      if (i->op == OP_PREEX2)
+         code[0] |= 0x20;
+
+      if (i->src(0).mod.abs()) code[0] |= 1 << 6;
+      if (i->src(0).mod.neg()) code[0] |= 1 << 8;
+   } else {
+      emitForm_S(i, i->op == OP_PREEX2 ? 0x74000008 : 0x70000008, true);
+   }
+}
+
+// Special-function unit op; 'subOp' selects which function is computed.
+void
+CodeEmitterNVC0::emitSFnOp(const Instruction *i, uint8_t subOp)
+{
+   if (i->encSize == 8) {
+      code[0] = 0x00000000 | (subOp << 26);
+      code[1] = 0xc8000000;
+
+      emitPredicate(i);
+
+      defId(i->def(0), 14);
+      srcId(i->src(0), 20);
+
+      assert(i->src(0).getFile() == FILE_GPR);
+
+      if (i->saturate) code[0] |= 1 << 5;
+
+      if (i->src(0).mod.abs()) code[0] |= 1 << 7;
+      if (i->src(0).mod.neg()) code[0] |= 1 << 9;
+   } else {
+      emitForm_S(i, 0x80000008 | (subOp << 26), true);
+
+      assert(!i->src(0).mod.neg()); // short form has no neg bit
+      if (i->src(0).mod.abs()) code[0] |= 1 << 30;
+   }
+}
+
+// MIN/MAX for float and integer types; long form only.
+void
+CodeEmitterNVC0::emitMINMAX(const Instruction *i)
+{
+   uint64_t op;
+
+   assert(i->encSize == 8);
+
+   op = (i->op == OP_MIN) ? 0x080e000000000000ULL : 0x081e000000000000ULL;
+
+   if (i->ftz)
+      op |= 1 << 5;
+   else
+   if (!isFloatType(i->dType))
+      op |= isSignedType(i->dType) ? 0x23 : 0x03; // integer variant
+
+   emitForm_A(i, op);
+   emitNegAbs12(i);
+}
+
+// CVT rounding field: bits 17-18 of word 1 hold the direction; bit 7 of
+// word 0 selects the float-to-integer (ROUND_*I) variants.
+void
+CodeEmitterNVC0::roundMode_C(const Instruction *i)
+{
+   switch (i->rnd) {
+   case ROUND_M:  code[1] |= 1 << 17; break;
+   case ROUND_P:  code[1] |= 2 << 17; break;
+   case ROUND_Z:  code[1] |= 3 << 17; break;
+   case ROUND_NI: code[0] |= 1 << 7; break;
+   case ROUND_MI: code[0] |= 1 << 7; code[1] |= 1 << 17; break;
+   case ROUND_PI: code[0] |= 1 << 7; code[1] |= 2 << 17; break;
+   case ROUND_ZI: code[0] |= 1 << 7; code[1] |= 3 << 17; break;
+   case ROUND_N: break;
+   default:
+      assert(!"invalid round mode");
+      break;
+   }
+}
+
+void
+CodeEmitterNVC0::roundMode_CS(const Instruction *i)
+{
+   // Short-form CVT rounding field at bits 16-17; the integer (*_I)
+   // variants share the encoding of their float counterparts, and
+   // round-to-nearest stays 0.
+   if (i->rnd == ROUND_M || i->rnd == ROUND_MI)
+      code[0] |= 1 << 16;
+   else if (i->rnd == ROUND_P || i->rnd == ROUND_PI)
+      code[0] |= 2 << 16;
+   else if (i->rnd == ROUND_Z || i->rnd == ROUND_ZI)
+      code[0] |= 3 << 16;
+}
+
+// Type conversion; also encodes CEIL/FLOOR/TRUNC/SAT/ABS/NEG, which are
+// lowered to CVT with the matching rounding/modifier bits (mutates i->rnd).
+void
+CodeEmitterNVC0::emitCVT(Instruction *i)
+{
+   const bool f2f = isFloatType(i->dType) && isFloatType(i->sType);
+
+   switch (i->op) {
+   case OP_CEIL:  i->rnd = f2f ? ROUND_PI : ROUND_P; break;
+   case OP_FLOOR: i->rnd = f2f ? ROUND_MI : ROUND_M; break;
+   case OP_TRUNC: i->rnd = f2f ? ROUND_ZI : ROUND_Z; break;
+   default:
+      break;
+   }
+
+   const bool sat = (i->op == OP_SAT) || i->saturate;
+   const bool abs = (i->op == OP_ABS) || i->src(0).mod.abs();
+   const bool neg = (i->op == OP_NEG) || i->src(0).mod.neg();
+
+   if (i->encSize == 8) {
+      emitForm_B(i, HEX64(10000000, 00000004));
+
+      roundMode_C(i);
+
+      // cvt u16 f32 sets high bits to 0, so we don't have to use Value::Size()
+      code[0] |= util_logbase2(typeSizeof(i->dType)) << 20;
+      code[0] |= util_logbase2(typeSizeof(i->sType)) << 23;
+
+      if (sat)
+         code[0] |= 0x20;
+      if (abs)
+         code[0] |= 1 << 6;
+      if (neg && i->op != OP_ABS)
+         code[0] |= 1 << 8;
+
+      if (i->ftz)
+         code[1] |= 1 << 23;
+
+      if (isSignedIntType(i->dType))
+         code[0] |= 0x080;
+      if (isSignedIntType(i->sType))
+         code[0] |= 0x200;
+
+      // conversion class: F2I / I2F / I2I (F2F is the 0 encoding)
+      if (isFloatType(i->dType)) {
+         if (!isFloatType(i->sType))
+            code[1] |= 0x08000000;
+      } else {
+         if (isFloatType(i->sType))
+            code[1] |= 0x04000000;
+         else
+            code[1] |= 0x0c000000;
+      }
+   } else {
+      // short forms: distinct base opcodes per conversion class
+      if (i->op == OP_CEIL || i->op == OP_FLOOR || i->op == OP_TRUNC) {
+         code[0] = 0x298;
+      } else
+      if (isFloatType(i->dType)) {
+         if (isFloatType(i->sType))
+            code[0] = 0x098;
+         else
+            code[0] = 0x088 | (isSignedType(i->sType) ? (1 << 8) : 0);
+      } else {
+         assert(isFloatType(i->sType));
+
+         code[0] = 0x288 | (isSignedType(i->sType) ? (1 << 8) : 0);
+      }
+
+      if (neg) code[0] |= 1 << 16;
+      if (sat) code[0] |= 1 << 18;
+      if (abs) code[0] |= 1 << 19;
+
+      roundMode_CS(i);
+   }
+}
+
+// Comparison (SET and the SET_AND/OR/XOR accumulating variants); result
+// goes to a GPR or, with an adjusted encoding, to a predicate register.
+void
+CodeEmitterNVC0::emitSET(const CmpInstruction *i)
+{
+   uint32_t hi;
+   uint32_t lo = 0;
+
+   if (i->sType == TYPE_F64)
+      lo = 0x1;
+   else
+   if (!isFloatType(i->sType))
+      lo = 0x3;
+
+   if (isFloatType(i->dType) || isSignedIntType(i->sType))
+      lo |= 0x20;
+
+   switch (i->op) {
+   case OP_SET_AND: hi = 0x10000000; break;
+   case OP_SET_OR:  hi = 0x10200000; break;
+   case OP_SET_XOR: hi = 0x10400000; break;
+   default:
+      hi = 0x100e0000;
+      break;
+   }
+   emitForm_A(i, (static_cast<uint64_t>(hi) << 32) | lo);
+
+   if (i->op != OP_SET)
+      srcId(i->src(2), 32 + 17); // the accumulated predicate
+
+   if (i->def(0).getFile() == FILE_PREDICATE) {
+      // switch to the predicate-destination opcode and move the def field
+      if (i->sType == TYPE_F32)
+         code[1] += 0x10000000;
+      else
+         code[1] += 0x08000000;
+
+      code[0] &= ~0xfc000;
+      defId(i->def(0), 17);
+      if (i->defExists(1))
+         defId(i->def(1), 14);
+      else
+         code[0] |= 0x1c000; // $p7: no second destination
+   }
+
+   if (i->ftz)
+      code[1] |= 1 << 27;
+
+   emitCondCode(i->setCond, 32 + 23);
+   emitNegAbs12(i);
+}
+
+// SLCT: select src0 or src1 depending on (src2 setCond 0); a negated src2
+// is handled by reversing the condition instead.
+void
+CodeEmitterNVC0::emitSLCT(const CmpInstruction *i)
+{
+   uint64_t op;
+
+   switch (i->dType) {
+   case TYPE_S32:
+      op = HEX64(30000000, 00000023);
+      break;
+   case TYPE_U32:
+      op = HEX64(30000000, 00000003);
+      break;
+   case TYPE_F32:
+      op = HEX64(38000000, 00000000);
+      break;
+   default:
+      assert(!"invalid type for SLCT");
+      op = 0;
+      break;
+   }
+   emitForm_A(i, op);
+
+   CondCode cc = i->setCond;
+
+   if (i->src(2).mod.neg())
+      cc = reverseCondCode(cc);
+
+   emitCondCode(cc, 32 + 23);
+
+   if (i->ftz)
+      code[0] |= 1 << 5;
+}
+
+void CodeEmitterNVC0::emitSELP(const Instruction *i)
+{
+   // SELP: pick src0 or src1 by predicate src2; bit 20 of word 1 inverts
+   // the predicate test.
+   emitForm_A(i, HEX64(20000000, 00000004));
+
+   const bool invert = (i->cc == CC_NOT_P) ||
+      (i->src(2).mod & Modifier(NV50_IR_MOD_NOT));
+   if (invert)
+      code[1] |= 1 << 20;
+}
+
+void CodeEmitterNVC0::emitTEXBAR(const Instruction *i)
+{
+   // Texture barrier; subOp is the number of outstanding results allowed.
+   code[1] = 0xf0000000;
+   code[0] = 0x00000006 | (i->subOp << 26);
+   emitPredicate(i);
+   // conditional form: CC_ALWAYS unless a flags source is present
+   emitCondCode(i->flagsSrc >= 0 ? i->cc : CC_ALWAYS, 5);
+}
+
+// Coverage-sampled AA texture fetch.
+void CodeEmitterNVC0::emitTEXCSAA(const TexInstruction *i)
+{
+   code[0] = 0x00000086;
+   code[1] = 0xd0000000;
+
+   code[1] |= i->tex.r;      // texture unit
+   code[1] |= i->tex.s << 8; // sampler unit
+
+   if (i->tex.liveOnly)
+      code[0] |= 1 << 9;
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+}
+
+static inline bool
+isNextIndependentTex(const TexInstruction *i)
+{
+   // The following texture op is independent iff it exists and reads no
+   // register this op writes (allows issuing in 't' instead of 'p' mode).
+   const Instruction *nxt = i->next;
+   if (!nxt || !isTextureOp(nxt->op))
+      return false;
+   if (i->getDef(0)->interfers(nxt->getSrc(0)))
+      return false;
+   if (nxt->srcExists(1) && i->getDef(0)->interfers(nxt->getSrc(1)))
+      return false;
+   return true;
+}
+
+// Texture fetch (TEX/TXB/TXL/TXF/TXG/TXD): packs target, units, mask and
+// the various addressing-mode flags.
+void
+CodeEmitterNVC0::emitTEX(const TexInstruction *i)
+{
+   code[0] = 0x00000006;
+
+   if (isNextIndependentTex(i))
+      code[0] |= 0x080; // t mode
+   else
+      code[0] |= 0x100; // p mode
+
+   if (i->tex.liveOnly)
+      code[0] |= 1 << 9;
+
+   switch (i->op) {
+   case OP_TEX: code[1] = 0x80000000; break;
+   case OP_TXB: code[1] = 0x84000000; break;
+   case OP_TXL: code[1] = 0x86000000; break;
+   case OP_TXF: code[1] = 0x90000000; break;
+   case OP_TXG: code[1] = 0xa0000000; break;
+   case OP_TXD: code[1] = 0xe0000000; break;
+   default:
+      assert(!"invalid texture op");
+      break;
+   }
+   // TXF's level-zero flag has inverted polarity relative to the others
+   if (i->op == OP_TXF) {
+      if (!i->tex.levelZero)
+         code[1] |= 0x02000000;
+   } else
+   if (i->tex.levelZero) {
+      code[1] |= 0x02000000;
+   }
+
+   if (i->op != OP_TXD && i->tex.derivAll)
+      code[1] |= 1 << 13;
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+
+   emitPredicate(i);
+
+   if (i->op == OP_TXG) code[0] |= i->tex.gatherComp << 5;
+
+   code[1] |= i->tex.mask << 14;
+
+   code[1] |= i->tex.r;      // texture unit
+   code[1] |= i->tex.s << 8; // sampler unit
+   if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0)
+      code[1] |= 1 << 18; // in 1st source (with array index)
+
+   // texture target:
+   code[1] |= (i->tex.target.getDim() - 1) << 20;
+   if (i->tex.target.isCube())
+      code[1] += 2 << 20;
+   if (i->tex.target.isArray())
+      code[1] |= 1 << 19;
+   if (i->tex.target.isShadow())
+      code[1] |= 1 << 24;
+
+   const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2)
+
+   if (i->srcExists(src1) && i->src(src1).getFile() == FILE_IMMEDIATE) {
+      // lzero
+      if (i->op == OP_TXL)
+         code[1] &= ~(1 << 26);
+      else
+      if (i->op == OP_TXF)
+         code[1] &= ~(1 << 25);
+   }
+   if (i->tex.target == TEX_TARGET_2D_MS ||
+       i->tex.target == TEX_TARGET_2D_MS_ARRAY)
+      code[1] |= 1 << 23;
+
+   if (i->tex.useOffsets) // in vecSrc0.w
+      code[1] |= 1 << 22;
+
+   srcId(i, src1, 26);
+}
+
+// Texture query (dimensions, type, sample position, ...).
+void
+CodeEmitterNVC0::emitTXQ(const TexInstruction *i)
+{
+   code[0] = 0x00000086;
+   code[1] = 0xc0000000;
+
+   switch (i->tex.query) {
+   case TXQ_DIMS:            code[1] |= 0 << 22; break;
+   case TXQ_TYPE:            code[1] |= 1 << 22; break;
+   case TXQ_SAMPLE_POSITION: code[1] |= 2 << 22; break;
+   case TXQ_FILTER:          code[1] |= 3 << 22; break;
+   case TXQ_LOD:             code[1] |= 4 << 22; break;
+   case TXQ_BORDER_COLOUR:   code[1] |= 5 << 22; break;
+   default:
+      assert(!"invalid texture query");
+      break;
+   }
+
+   code[1] |= i->tex.mask << 14;
+
+   code[1] |= i->tex.r;      // texture unit
+   code[1] |= i->tex.s << 8; // sampler unit
+   if (i->tex.sIndirectSrc >= 0 || i->tex.rIndirectSrc >= 0)
+      code[1] |= 1 << 18;
+
+   const int src1 = (i->predSrc == 1) ? 2 : 1; // if predSrc == 1, !srcExists(2)
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+   srcId(i, src1, 26);
+
+   emitPredicate(i);
+}
+
+// Quad lane-shuffle op; 'qOp' is the per-lane operation, 'laneMask' selects
+// which lanes write their result.
+void
+CodeEmitterNVC0::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask)
+{
+   code[0] = 0x00000000 | (laneMask << 6);
+   code[1] = 0x48000000 | qOp;
+
+   defId(i->def(0), 14);
+   srcId(i->src(0), 20);
+   srcId(i->srcExists(1) ? i->src(1) : i->src(0), 26);
+
+   if (i->op == OP_QUADOP && progType != Program::TYPE_FRAGMENT)
+      code[0] |= 1 << 9; // dall
+
+   emitPredicate(i);
+}
+
+// Control-flow instructions: branches, calls, exits and the structured
+// push-ops (joinat/prebreak/...). 'mask' tracks which extra fields apply.
+void
+CodeEmitterNVC0::emitFlow(const Instruction *i)
+{
+   const FlowInstruction *f = i->asFlow();
+
+   unsigned mask; // bit 0: predicate, bit 1: target
+
+   code[0] = 0x00000007;
+
+   switch (i->op) {
+   case OP_BRA:
+      code[1] = f->absolute ? 0x00000000 : 0x40000000;
+      if (i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST)
+         code[0] |= 0x4000; // branch target taken from c[] space
+      mask = 3;
+      break;
+   case OP_CALL:
+      code[1] = f->absolute ? 0x10000000 : 0x50000000;
+      if (f->indirect)
+         code[0] |= 0x4000; // indirect calls always use c[] source
+      mask = 2;
+      break;
+
+   case OP_EXIT:    code[1] = 0x80000000; mask = 1; break;
+   case OP_RET:     code[1] = 0x90000000; mask = 1; break;
+   case OP_DISCARD: code[1] = 0x98000000; mask = 1; break;
+   case OP_BREAK:   code[1] = 0xa8000000; mask = 1; break;
+   case OP_CONT:    code[1] = 0xb0000000; mask = 1; break;
+
+   case OP_JOINAT:   code[1] = 0x60000000; mask = 2; break;
+   case OP_PREBREAK: code[1] = 0x68000000; mask = 2; break;
+   case OP_PRECONT:  code[1] = 0x70000000; mask = 2; break;
+   case OP_PRERET:   code[1] = 0x78000000; mask = 2; break;
+
+   case OP_QUADON:  code[1] = 0xc0000000; mask = 0; break;
+   case OP_QUADPOP: code[1] = 0xc8000000; mask = 0; break;
+   case OP_BRKPT:   code[1] = 0xd0000000; mask = 0; break;
+   default:
+      assert(!"invalid flow operation");
+      return;
+   }
+
+   if (mask & 1) {
+      emitPredicate(i);
+      if (i->flagsSrc < 0)
+         code[0] |= 0x1e0; // CC_ALWAYS
+   }
+
+   if (!f)
+      return;
+
+   if (f->allWarp)
+      code[0] |= 1 << 15;
+   if (f->limit)
+      code[0] |= 1 << 16;
+
+   if (f->indirect) {
+      if (code[0] & 0x4000) {
+         // target comes from c[] space, optionally indexed by a GPR
+         assert(i->srcExists(0) && i->src(0).getFile() == FILE_MEMORY_CONST);
+         setAddress16(i->src(0));
+         code[1] |= i->getSrc(0)->reg.fileIndex << 10;
+         if (f->op == OP_BRA)
+            srcId(f->src(0).getIndirect(0), 20);
+      } else {
+         srcId(f, 0, 20);
+      }
+   }
+
+   if (f->op == OP_CALL) {
+      if (f->indirect) {
+         // nothing
+      } else
+      if (f->builtin) {
+         // built-in targets are resolved at upload time via relocations
+         assert(f->absolute);
+         uint32_t pcAbs = targNVC0->getBuiltinOffset(f->target.builtin);
+         addReloc(RelocEntry::TYPE_BUILTIN, 0, pcAbs, 0xfc000000, 26);
+         addReloc(RelocEntry::TYPE_BUILTIN, 1, pcAbs, 0x03ffffff, -6);
+      } else {
+         assert(!f->absolute);
+         int32_t pcRel = f->target.fn->binPos - (codeSize + 8);
+         code[0] |= (pcRel & 0x3f) << 26;
+         code[1] |= (pcRel >> 6) & 0x3ffff;
+      }
+   } else
+   if (mask & 2) {
+      int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+      // currently we don't want absolute branches
+      assert(!f->absolute);
+      code[0] |= (pcRel & 0x3f) << 26;
+      code[1] |= (pcRel >> 6) & 0x3ffff;
+   }
+}
+
// Emit BAR: block-wide barrier, either plain sync/arrive or a predicate
// reduction (AND/OR/POPC) across the participating threads.
void
CodeEmitterNVC0::emitBAR(const Instruction *i)
{
   Value *rDef = NULL, *pDef = NULL;

   // low opcode bits select the barrier flavour
   switch (i->subOp) {
   case NV50_IR_SUBOP_BAR_ARRIVE:   code[0] = 0x84; break;
   case NV50_IR_SUBOP_BAR_RED_AND:  code[0] = 0x24; break;
   case NV50_IR_SUBOP_BAR_RED_OR:   code[0] = 0x44; break;
   case NV50_IR_SUBOP_BAR_RED_POPC: code[0] = 0x04; break;
   default:
      code[0] = 0x04;
      assert(i->subOp == NV50_IR_SUBOP_BAR_SYNC);
      break;
   }
   code[1] = 0x50000000;

   // default destination fields; cleared again below if real defs exist
   code[0] |= 63 << 14;
   code[1] |= 7 << 21;

   emitPredicate(i);

   // barrier id
   if (i->src(0).getFile() == FILE_GPR) {
      srcId(i->src(0), 20);
   } else {
      ImmediateValue *imm = i->getSrc(0)->asImm();
      assert(imm);
      code[0] |= imm->reg.data.u32 << 20;
   }

   // thread count
   if (i->src(1).getFile() == FILE_GPR) {
      srcId(i->src(1), 26);
   } else {
      ImmediateValue *imm = i->getSrc(1)->asImm();
      assert(imm);
      // immediate thread count straddles the two opcode words
      code[0] |= imm->reg.data.u32 << 26;
      code[1] |= imm->reg.data.u32 >> 6;
   }

   // optional predicate source for the reduction (7 encodes "none");
   // skip it if it doubles as the instruction predicate
   if (i->srcExists(2) && (i->predSrc != 2)) {
      srcId(i->src(2), 32 + 17);
      if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT))
         code[1] |= 1 << 20;
   } else {
      code[1] |= 7 << 17;
   }

   // sort the (up to two) defs into a GPR and a predicate destination
   if (i->defExists(0)) {
      if (i->def(0).getFile() == FILE_GPR)
         rDef = i->getDef(0);
      else
         pDef = i->getDef(0);

      if (i->defExists(1)) {
         if (i->def(1).getFile() == FILE_GPR)
            rDef = i->getDef(1);
         else
            pDef = i->getDef(1);
      }
   }
   if (rDef) {
      code[0] &= ~(63 << 14);
      defId(rDef, 14);
   }
   if (pDef) {
      code[1] &= ~(7 << 21);
      defId(pDef, 32 + 21);
   }
}
+
// Emit PFETCH: src(0) holds an immediate primitive/vertex index that is
// split across the two opcode words; src(1) is an optional register offset.
void
CodeEmitterNVC0::emitPFETCH(const Instruction *i)
{
   uint32_t prim = i->src(0).get()->reg.data.u32;

   code[0] = 0x00000006 | ((prim & 0x3f) << 26);
   code[1] = 0x00000000 | (prim >> 6);

   emitPredicate(i);

   defId(i->def(0), 14);
   srcId(i->src(1), 20);
}
+
// Emit VFETCH: read (possibly a vector of) attributes from a shader
// input/output slot, addressed by offset plus optional indirect registers.
void
CodeEmitterNVC0::emitVFETCH(const Instruction *i)
{
   code[0] = 0x00000006;
   code[1] = 0x06000000 | i->src(0).get()->reg.data.offset;

   if (i->perPatch)
      code[0] |= 0x100;
   if (i->getSrc(0)->reg.file == FILE_SHADER_OUTPUT)
      code[0] |= 0x200; // yes, TCPs can read from *outputs* of other threads

   emitPredicate(i);

   // vector size in 32-bit units, encoded as count - 1
   code[0] |= ((i->getDef(0)->reg.size / 4) - 1) << 5;

   defId(i->def(0), 14);
   srcId(i->src(0).getIndirect(0), 20);
   srcId(i->src(0).getIndirect(1), 26); // vertex address
}
+
// Emit EXPORT: write a (possibly vector) value to a shader output slot.
void
CodeEmitterNVC0::emitEXPORT(const Instruction *i)
{
   unsigned int size = typeSizeof(i->dType);

   // vector size in 32-bit units, encoded as count - 1
   code[0] = 0x00000006 | ((size / 4 - 1) << 5);
   code[1] = 0x0a000000 | i->src(0).get()->reg.data.offset;

   // the output offset must be naturally aligned (vec3 counts as vec4)
   assert(!(code[1] & ((size == 12) ? 15 : (size - 1))));

   if (i->perPatch)
      code[0] |= 0x100;

   emitPredicate(i);

   assert(i->src(1).getFile() == FILE_GPR);

   srcId(i->src(0).getIndirect(0), 20);
   srcId(i->src(0).getIndirect(1), 32 + 17); // vertex base address
   srcId(i->src(1), 26);
}
+
// Emit EMIT/RESTART (geometry shader stream output control).
void
CodeEmitterNVC0::emitOUT(const Instruction *i)
{
   code[0] = 0x00000006;
   code[1] = 0x1c000000;

   emitPredicate(i);

   defId(i->def(0), 14); // new secret address
   srcId(i->src(0), 20); // old secret address, should be 0 initially

   assert(i->src(0).getFile() == FILE_GPR);

   if (i->op == OP_EMIT)
      code[0] |= 1 << 5;
   if (i->op == OP_RESTART || i->subOp == NV50_IR_SUBOP_EMIT_RESTART)
      code[0] |= 1 << 6;

   // vertex stream
   if (i->src(1).getFile() == FILE_IMMEDIATE) {
      code[1] |= 0xc000;
      code[0] |= SDATA(i->src(1)).u32 << 26;
   } else {
      srcId(i->src(1), 26);
   }
}
+
// Encode the interpolation mode bits shared by LINTERP/PINTERP; the short
// (4 byte) form only supports the default mode plus centroid.
void
CodeEmitterNVC0::emitInterpMode(const Instruction *i)
{
   if (i->encSize == 8) {
      code[0] |= i->ipa << 6; // TODO: INTERP_SAMPLEID
   } else {
      if (i->getInterpMode() == NV50_IR_INTERP_SC)
         code[0] |= 0x80;
      assert(i->op == OP_PINTERP && i->getSampleMode() == 0);
   }
}
+
// Emit LINTERP/PINTERP: interpolate a fragment shader input attribute;
// PINTERP additionally multiplies by src(1) (usually 1/w).
void
CodeEmitterNVC0::emitINTERP(const Instruction *i)
{
   const uint32_t base = i->getSrc(0)->reg.data.offset;

   if (i->encSize == 8) {
      code[0] = 0x00000000;
      code[1] = 0xc0000000 | (base & 0xffff);

      if (i->saturate)
         code[0] |= 1 << 5;

      if (i->op == OP_PINTERP)
         srcId(i->src(1), 26);
      else
         code[0] |= 0x3f << 26; // no multiplier: $r63 in the src slot

      srcId(i->src(0).getIndirect(0), 20);
   } else {
      assert(i->op == OP_PINTERP);
      code[0] = 0x00000009 | ((base & 0xc) << 6) | ((base >> 4) << 26);
      srcId(i->src(1), 20);
   }
   emitInterpMode(i);

   emitPredicate(i);
   defId(i->def(0), 14);

   // optional per-sample offset source (0x3f marks "none")
   if (i->getSampleMode() == NV50_IR_INTERP_OFFSET)
      srcId(i->src(i->op == OP_PINTERP ? 2 : 1), 17);
   else
      code[1] |= 0x3f << 17;
}
+
+void
+CodeEmitterNVC0::emitLoadStoreType(DataType ty)
+{
+ uint8_t val;
+
+ switch (ty) {
+ case TYPE_U8:
+ val = 0x00;
+ break;
+ case TYPE_S8:
+ val = 0x20;
+ break;
+ case TYPE_F16:
+ case TYPE_U16:
+ val = 0x40;
+ break;
+ case TYPE_S16:
+ val = 0x60;
+ break;
+ case TYPE_F32:
+ case TYPE_U32:
+ case TYPE_S32:
+ val = 0x80;
+ break;
+ case TYPE_F64:
+ case TYPE_U64:
+ case TYPE_S64:
+ val = 0xa0;
+ break;
+ case TYPE_B128:
+ val = 0xc0;
+ break;
+ default:
+ val = 0x80;
+ assert(!"invalid type");
+ break;
+ }
+ code[0] |= val;
+}
+
+void
+CodeEmitterNVC0::emitCachingMode(CacheMode c)
+{
+ uint32_t val;
+
+ switch (c) {
+ case CACHE_CA:
+// case CACHE_WB:
+ val = 0x000;
+ break;
+ case CACHE_CG:
+ val = 0x100;
+ break;
+ case CACHE_CS:
+ val = 0x200;
+ break;
+ case CACHE_CV:
+// case CACHE_WT:
+ val = 0x300;
+ break;
+ default:
+ val = 0;
+ assert(!"invalid caching mode");
+ break;
+ }
+ code[0] |= val;
+}
+
+static inline bool
+uses64bitAddress(const Instruction *ldst)
+{
+ return ldst->src(0).getFile() == FILE_MEMORY_GLOBAL &&
+ ldst->src(0).isIndirect(0) &&
+ ldst->getIndirect(0, 0)->reg.size == 8;
+}
+
// Emit ST: store a register value to global, local or shared memory.
void
CodeEmitterNVC0::emitSTORE(const Instruction *i)
{
   uint32_t opc;

   // the high opcode word selects the memory space
   switch (i->src(0).getFile()) {
   case FILE_MEMORY_GLOBAL: opc = 0x90000000; break;
   case FILE_MEMORY_LOCAL: opc = 0xc8000000; break;
   case FILE_MEMORY_SHARED: opc = 0xc9000000; break;
   default:
      assert(!"invalid memory file");
      opc = 0;
      break;
   }
   code[0] = 0x00000005;
   code[1] = opc;

   setAddressByFile(i->src(0));
   srcId(i->src(1), 14);                // value to store
   srcId(i->src(0).getIndirect(0), 20); // address register, if any
   if (uses64bitAddress(i))
      code[1] |= 1 << 26;

   emitPredicate(i);

   emitLoadStoreType(i->dType);
   emitCachingMode(i->cache);
}
+
// Emit LD: load from global/local/shared/const memory. Simple 32-bit
// non-indirect constant loads are emitted as a MOV instead.
void
CodeEmitterNVC0::emitLOAD(const Instruction *i)
{
   uint32_t opc;

   code[0] = 0x00000005;

   switch (i->src(0).getFile()) {
   case FILE_MEMORY_GLOBAL: opc = 0x80000000; break;
   case FILE_MEMORY_LOCAL: opc = 0xc0000000; break;
   case FILE_MEMORY_SHARED: opc = 0xc1000000; break;
   case FILE_MEMORY_CONST:
      if (!i->src(0).isIndirect(0) && typeSizeof(i->dType) == 4) {
         emitMOV(i); // not sure if this is any better
         return;
      }
      // constant buffer index goes into the opcode
      opc = 0x14000000 | (i->src(0).get()->reg.fileIndex << 10);
      code[0] = 0x00000006 | (i->subOp << 8);
      break;
   default:
      assert(!"invalid memory file");
      opc = 0;
      break;
   }
   code[1] = opc;

   defId(i->def(0), 14);

   setAddressByFile(i->src(0));
   srcId(i->src(0).getIndirect(0), 20); // address register, if any
   if (uses64bitAddress(i))
      code[1] |= 1 << 26;

   emitPredicate(i);

   emitLoadStoreType(i->dType);
   emitCachingMode(i->cache);
}
+
// Map a system-value source reference to the hardware special register
// index used by the S2R-style encoding in emitMOV.
uint8_t
CodeEmitterNVC0::getSRegEncoding(const ValueRef& ref)
{
   switch (SDATA(ref).sv.sv) {
   case SV_LANEID: return 0x00;
   case SV_PHYSID: return 0x03;
   case SV_VERTEX_COUNT: return 0x10;
   case SV_INVOCATION_ID: return 0x11;
   case SV_YDIR: return 0x12;
   // the .index selects the x/y/z component for the vector values below
   case SV_TID: return 0x21 + SDATA(ref).sv.index;
   case SV_CTAID: return 0x25 + SDATA(ref).sv.index;
   case SV_NTID: return 0x29 + SDATA(ref).sv.index;
   case SV_GRIDID: return 0x2c;
   case SV_NCTAID: return 0x2d + SDATA(ref).sv.index;
   case SV_LBASE: return 0x34;
   case SV_SBASE: return 0x30;
   case SV_CLOCK: return 0x50 + SDATA(ref).sv.index;
   default:
      assert(!"no sreg for system value");
      return 0;
   }
}
+
// Emit MOV (and RDSV): the encoding depends on source and destination
// files — predicate moves, system-value reads, and GPR/immediate moves
// all use different opcodes; both long and short forms are handled.
void
CodeEmitterNVC0::emitMOV(const Instruction *i)
{
   if (i->def(0).getFile() == FILE_PREDICATE) {
      // move *to* a predicate register
      if (i->src(0).getFile() == FILE_GPR) {
         code[0] = 0xfc01c003;
         code[1] = 0x1a8e0000;
         srcId(i->src(0), 20);
      } else {
         code[0] = 0x0001c004;
         code[1] = 0x0c0e0000;
         if (i->src(0).getFile() == FILE_IMMEDIATE) {
            code[0] |= 7 << 20;
            // only true/false immediates exist for predicates
            if (!i->getSrc(0)->reg.data.u32)
               code[0] |= 1 << 23;
         } else {
            srcId(i->src(0), 20);
         }
      }
      defId(i->def(0), 17);
      emitPredicate(i);
   } else
   if (i->src(0).getFile() == FILE_SYSTEM_VALUE) {
      // read a special register (S2R style)
      uint8_t sr = getSRegEncoding(i->src(0));

      if (i->encSize == 8) {
         code[0] = 0x00000004 | (sr << 26);
         code[1] = 0x2c000000;
      } else {
         code[0] = 0x40000008 | (sr << 20);
      }
      defId(i->def(0), 14);

      emitPredicate(i);
   } else
   if (i->encSize == 8) {
      // long form GPR move from immediate / predicate / GPR
      uint64_t opc;

      if (i->src(0).getFile() == FILE_IMMEDIATE)
         opc = HEX64(18000000, 000001e2);
      else
      if (i->src(0).getFile() == FILE_PREDICATE)
         opc = HEX64(080e0000, 1c000004);
      else
         opc = HEX64(28000000, 00000004);

      opc |= i->lanes << 5;

      emitForm_B(i, opc);
   } else {
      // short form; small immediates fit directly in the opcode
      uint32_t imm;

      if (i->src(0).getFile() == FILE_IMMEDIATE) {
         imm = SDATA(i->src(0)).u32;
         if (imm & 0xfff00000) {
            // only a high-bits immediate can be encoded in this variant
            assert(!(imm & 0x000fffff));
            code[0] = 0x00000318 | imm;
         } else {
            // NOTE(review): this condition is almost always true for
            // unsigned imm — presumably meant as a range check; confirm
            assert(imm < 0x800 || ((int32_t)imm >= -0x800));
            code[0] = 0x00000118 | (imm << 20);
         }
      } else {
         code[0] = 0x0028;
         emitShortSrc2(i->src(0));
      }
      defId(i->def(0), 14);

      emitPredicate(i);
   }
}
+
// Emit ATOM/RED: atomic read-modify-write on memory (presumably global —
// the source file is not re-checked here; verify against lowering).
// With a destination (or for EXCH/CAS) the ATOM form is used, otherwise
// the reduction (RED) form.
void
CodeEmitterNVC0::emitATOM(const Instruction *i)
{
   const bool hasDst = i->defExists(0);
   const bool casOrExch =
      i->subOp == NV50_IR_SUBOP_ATOM_EXCH ||
      i->subOp == NV50_IR_SUBOP_ATOM_CAS;

   if (i->dType == TYPE_U64) {
      switch (i->subOp) {
      case NV50_IR_SUBOP_ATOM_ADD:
         code[0] = 0x205;
         if (hasDst)
            code[1] = 0x507e0000;
         else
            code[1] = 0x10000000;
         break;
      case NV50_IR_SUBOP_ATOM_EXCH:
         code[0] = 0x305;
         code[1] = 0x507e0000;
         break;
      case NV50_IR_SUBOP_ATOM_CAS:
         code[0] = 0x325;
         code[1] = 0x50000000;
         break;
      default:
         assert(!"invalid u64 red op");
         break;
      }
   } else
   if (i->dType == TYPE_U32) {
      switch (i->subOp) {
      case NV50_IR_SUBOP_ATOM_EXCH:
         code[0] = 0x105;
         code[1] = 0x507e0000;
         break;
      case NV50_IR_SUBOP_ATOM_CAS:
         code[0] = 0x125;
         code[1] = 0x50000000;
         break;
      default:
         // remaining u32 sub-ops encode directly into the opcode
         code[0] = 0x5 | (i->subOp << 5);
         if (hasDst)
            code[1] = 0x507e0000;
         else
            code[1] = 0x10000000;
         break;
      }
   } else
   if (i->dType == TYPE_S32) {
      assert(i->subOp <= 2);
      code[0] = 0x205 | (i->subOp << 5);
      if (hasDst)
         code[1] = 0x587e0000;
      else
         code[1] = 0x18000000;
   } else
   if (i->dType == TYPE_F32) {
      // float atomics only support ADD
      assert(i->subOp == NV50_IR_SUBOP_ATOM_ADD);
      code[0] = 0x205;
      if (hasDst)
         code[1] = 0x687e0000;
      else
         code[1] = 0x28000000;
   }

   emitPredicate(i);

   srcId(i->src(1), 14); // operand value

   if (hasDst)
      defId(i->def(0), 32 + 11);
   else
   if (casOrExch)
      code[1] |= 63 << 11; // $r63 marks a discarded result

   if (hasDst || casOrExch) {
      // ATOM form: 20-bit signed offset split across both words
      const int32_t offset = SDATA(i->src(0)).offset;
      assert(offset < 0x80000 && offset >= -0x80000);
      code[0] |= offset << 26;
      code[1] |= (offset & 0x1ffc0) >> 6;
      code[1] |= (offset & 0xe0000) << 6;
   } else {
      srcAddr32(i->src(0), 26, 0);
   }
   if (i->getIndirect(0, 0)) {
      srcId(i->getIndirect(0, 0), 20);
      if (i->getIndirect(0, 0)->reg.size == 8)
         code[1] |= 1 << 26; // 64-bit address register
   } else {
      code[0] |= 63 << 20; // no address register
   }

   // CAS takes the compare value as a third source
   if (i->subOp == NV50_IR_SUBOP_ATOM_CAS)
      srcId(i->src(2), 32 + 17);
}
+
// Emit MEMBAR with CTA, global (GL) or system (SYS) visibility scope.
void
CodeEmitterNVC0::emitMEMBAR(const Instruction *i)
{
   switch (NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp)) {
   case NV50_IR_SUBOP_MEMBAR_CTA: code[0] = 0x05; break;
   case NV50_IR_SUBOP_MEMBAR_GL: code[0] = 0x25; break;
   default:
      code[0] = 0x45;
      assert(NV50_IR_SUBOP_MEMBAR_SCOPE(i->subOp) == NV50_IR_SUBOP_MEMBAR_SYS);
      break;
   }
   code[1] = 0xe0000000;

   emitPredicate(i);
}
+
// Emit CCTL: cache control operation (the sub-op selects the action) on
// a global memory address or a local/shared one.
void
CodeEmitterNVC0::emitCCTL(const Instruction *i)
{
   code[0] = 0x00000005 | (i->subOp << 5);

   if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
      code[1] = 0x98000000;
      srcAddr32(i->src(0), 28, 2);
   } else {
      code[1] = 0xd0000000;
      setAddress24(i->src(0));
   }
   if (uses64bitAddress(i))
      code[1] |= 1 << 26;
   srcId(i->src(0).getIndirect(0), 20);

   emitPredicate(i);

   defId(i, 0, 14);
}
+
// Encode the SUCLAMP clamping mode: SD/PL/BL variant and factor index map
// to a 4-bit field; the 2D flag is a separate bit in the high word.
void
CodeEmitterNVC0::emitSUCLAMPMode(uint16_t subOp)
{
   uint8_t m;
   switch (subOp & ~NV50_IR_SUBOP_SUCLAMP_2D) {
   case NV50_IR_SUBOP_SUCLAMP_SD(0, 1): m = 0; break;
   case NV50_IR_SUBOP_SUCLAMP_SD(1, 1): m = 1; break;
   case NV50_IR_SUBOP_SUCLAMP_SD(2, 1): m = 2; break;
   case NV50_IR_SUBOP_SUCLAMP_SD(3, 1): m = 3; break;
   case NV50_IR_SUBOP_SUCLAMP_SD(4, 1): m = 4; break;
   case NV50_IR_SUBOP_SUCLAMP_PL(0, 1): m = 5; break;
   case NV50_IR_SUBOP_SUCLAMP_PL(1, 1): m = 6; break;
   case NV50_IR_SUBOP_SUCLAMP_PL(2, 1): m = 7; break;
   case NV50_IR_SUBOP_SUCLAMP_PL(3, 1): m = 8; break;
   case NV50_IR_SUBOP_SUCLAMP_PL(4, 1): m = 9; break;
   case NV50_IR_SUBOP_SUCLAMP_BL(0, 1): m = 10; break;
   case NV50_IR_SUBOP_SUCLAMP_BL(1, 1): m = 11; break;
   case NV50_IR_SUBOP_SUCLAMP_BL(2, 1): m = 12; break;
   case NV50_IR_SUBOP_SUCLAMP_BL(3, 1): m = 13; break;
   case NV50_IR_SUBOP_SUCLAMP_BL(4, 1): m = 14; break;
   default:
      // unknown mode: leave the default encoding untouched
      return;
   }
   code[0] |= m << 5;
   if (subOp & NV50_IR_SUBOP_SUCLAMP_2D)
      code[1] |= 1 << 16;
}
+
// Emit the surface address calculation ops SUCLAMP / SUBFM / SUEAU.
// Non-const: may temporarily detach an immediate src(2) so the generic
// emitForm_A path does not assert on it; the source is restored after.
void
CodeEmitterNVC0::emitSUCalc(Instruction *i)
{
   ImmediateValue *imm = NULL;
   uint64_t opc;

   if (i->srcExists(2)) {
      imm = i->getSrc(2)->asImm();
      if (imm)
         i->setSrc(2, NULL); // special case, make emitForm_A not assert
   }

   switch (i->op) {
   case OP_SUCLAMP: opc = HEX64(58000000, 00000004); break;
   case OP_SUBFM: opc = HEX64(5c000000, 00000004); break;
   case OP_SUEAU: opc = HEX64(60000000, 00000004); break;
   default:
      assert(0);
      return;
   }
   emitForm_A(i, opc);

   if (i->op == OP_SUCLAMP) {
      if (i->dType == TYPE_S32)
         code[0] |= 1 << 9;
      emitSUCLAMPMode(i->subOp);
   }

   if (i->op == OP_SUBFM && i->subOp == NV50_IR_SUBOP_SUBFM_3D)
      code[1] |= 1 << 16;

   // SUCLAMP/SUBFM can write a GPR and/or a predicate; encode whichever
   // combination of destinations is present (7 marks "no predicate")
   if (i->op != OP_SUEAU) {
      if (i->def(0).getFile() == FILE_PREDICATE) { // p, #
         code[0] |= 63 << 14;
         code[1] |= i->getDef(0)->reg.data.id << 23;
      } else
      if (i->defExists(1)) { // r, p
         assert(i->def(1).getFile() == FILE_PREDICATE);
         code[1] |= i->getDef(1)->reg.data.id << 23;
      } else { // r, #
         code[1] |= 7 << 23;
      }
   }
   if (imm) {
      assert(i->op == OP_SUCLAMP);
      i->setSrc(2, imm); // restore the detached source
      code[1] |= (imm->reg.data.u32 & 0x3f) << 17; // sint6
   }
}
+
// Encode the surface operand type; TYPE_U32 is the (zero) default.
void
CodeEmitterNVC0::emitSUGType(DataType ty)
{
   switch (ty) {
   case TYPE_S32: code[1] |= 1 << 13; break;
   case TYPE_U8: code[1] |= 2 << 13; break;
   case TYPE_S8: code[1] |= 3 << 13; break;
   default:
      assert(ty == TYPE_U32);
      break;
   }
}
+
// Encode source s as a 16-bit constant-buffer reference (offset must be
// 4-byte aligned and fit in 16 bits); used for surface format descriptors.
void
CodeEmitterNVC0::setSUConst16(const Instruction *i, const int s)
{
   const uint32_t offset = i->getSrc(s)->reg.data.offset;

   assert(i->src(s).getFile() == FILE_MEMORY_CONST);
   assert(offset == (offset & 0xfffc));

   code[1] |= 1 << 21; // select the c[] form of the operand
   code[0] |= offset << 24;
   code[1] |= offset >> 8;
   code[1] |= i->getSrc(s)->reg.fileIndex << 8;
}
+
// Encode the optional predicate source s of a surface op; 0x7 encodes
// "no predicate". A source doubling as the instruction predicate is
// not encoded again here.
void
CodeEmitterNVC0::setSUPred(const Instruction *i, const int s)
{
   if (!i->srcExists(s) || (i->predSrc == s)) {
      code[1] |= 0x7 << 17;
   } else {
      if (i->src(s).mod == Modifier(NV50_IR_MOD_NOT))
         code[1] |= 1 << 20;
      srcId(i->src(s), 32 + 17);
   }
}
+
// Emit SULDGB (gk104+ surface load): address in src(0), format either a
// GPR or a constant-buffer descriptor in src(1), predicate in src(2).
void
CodeEmitterNVC0::emitSULDGB(const TexInstruction *i)
{
   code[0] = 0x5;
   code[1] = 0xd4000000 | (i->subOp << 15);

   emitLoadStoreType(i->dType);
   emitSUGType(i->sType);
   emitCachingMode(i->cache);

   emitPredicate(i);
   defId(i->def(0), 14); // destination
   srcId(i->src(0), 20); // address
   // format
   if (i->src(1).getFile() == FILE_GPR)
      srcId(i->src(1), 26);
   else
      setSUConst16(i, 1);
   setSUPred(i, 2);
}
+
// Emit SUSTB/SUSTP (gk104+ surface store): like emitSULDGB but with the
// values to store in src(3); SUSTP encodes a component mask instead of a
// load/store type.
void
CodeEmitterNVC0::emitSUSTGx(const TexInstruction *i)
{
   code[0] = 0x5;
   code[1] = 0xdc000000 | (i->subOp << 15);

   if (i->op == OP_SUSTP)
      code[1] |= i->tex.mask << 22;
   else
      emitLoadStoreType(i->dType);
   emitSUGType(i->sType);
   emitCachingMode(i->cache);

   emitPredicate(i);
   srcId(i->src(0), 20); // address
   // format
   if (i->src(1).getFile() == FILE_GPR)
      srcId(i->src(1), 26);
   else
      setSUConst16(i, 1);
   srcId(i->src(3), 14); // values
   setSUPred(i, 2);
}
+
// Scatter the packed video/vector sub-op fields (source/destination byte
// or half selectors) into the opcode; layout depends on the Vn variant.
void
CodeEmitterNVC0::emitVectorSubOp(const Instruction *i)
{
   switch (NV50_IR_SUBOP_Vn(i->subOp)) {
   case 0:
      code[1] |= (i->subOp & 0x000f) << 12; // vsrc1
      code[1] |= (i->subOp & 0x00e0) >> 5; // vsrc2
      code[1] |= (i->subOp & 0x0100) << 7; // vsrc2
      code[1] |= (i->subOp & 0x3c00) << 13; // vdst
      break;
   case 1:
      code[1] |= (i->subOp & 0x000f) << 8; // v2src1
      code[1] |= (i->subOp & 0x0010) << 11; // v2src1
      code[1] |= (i->subOp & 0x01e0) >> 1; // v2src2
      code[1] |= (i->subOp & 0x0200) << 6; // v2src2
      code[1] |= (i->subOp & 0x3c00) << 2; // v4dst
      code[1] |= (i->mask & 0x3) << 2;
      break;
   case 2:
      code[1] |= (i->subOp & 0x000f) << 8; // v4src1
      code[1] |= (i->subOp & 0x01e0) >> 1; // v4src2
      code[1] |= (i->subOp & 0x3c00) << 2; // v4dst
      code[1] |= (i->mask & 0x3) << 2;
      code[1] |= (i->mask & 0xc) << 21;
      break;
   default:
      assert(0);
      break;
   }
}
+
// Emit VSHL (vector shift left); the Vn variant selects the base opcode
// and where the signedness bits live.
void
CodeEmitterNVC0::emitVSHL(const Instruction *i)
{
   uint64_t opc = 0x4;

   switch (NV50_IR_SUBOP_Vn(i->subOp)) {
   case 0: opc |= 0xe8ULL << 56; break;
   case 1: opc |= 0xb4ULL << 56; break;
   case 2: opc |= 0x94ULL << 56; break;
   default:
      assert(0);
      break;
   }
   if (NV50_IR_SUBOP_Vn(i->subOp) == 1) {
      if (isSignedType(i->dType)) opc |= 1ULL << 0x2a;
      if (isSignedType(i->sType)) opc |= (1 << 6) | (1 << 5);
   } else {
      if (isSignedType(i->dType)) opc |= 1ULL << 0x39;
      if (isSignedType(i->sType)) opc |= 1 << 6;
   }
   emitForm_A(i, opc);
   emitVectorSubOp(i);

   if (i->saturate)
      code[0] |= 1 << 9;
   if (i->flagsDef >= 0)
      code[1] |= 1 << 16;
}
+
+bool
+CodeEmitterNVC0::emitInstruction(Instruction *insn)
+{
+ unsigned int size = insn->encSize;
+
+ if (writeIssueDelays && !(codeSize & 0x3f))
+ size += 8;
+
+ if (!insn->encSize) {
+ ERROR("skipping unencodable instruction: "); insn->print();
+ return false;
+ } else
+ if (codeSize + size > codeSizeLimit) {
+ ERROR("code emitter output buffer too small\n");
+ return false;
+ }
+
+ if (writeIssueDelays) {
+ if (!(codeSize & 0x3f)) {
+ code[0] = 0x00000007; // cf issue delay "instruction"
+ code[1] = 0x20000000;
+ code += 2;
+ codeSize += 8;
+ }
+ const unsigned int id = (codeSize & 0x3f) / 8 - 1;
+ uint32_t *data = code - (id * 2 + 2);
+ if (id <= 2) {
+ data[0] |= insn->sched << (id * 8 + 4);
+ } else
+ if (id == 3) {
+ data[0] |= insn->sched << 28;
+ data[1] |= insn->sched >> 4;
+ } else {
+ data[1] |= insn->sched << ((id - 4) * 8 + 4);
+ }
+ }
+
+ // assert that instructions with multiple defs don't corrupt registers
+ for (int d = 0; insn->defExists(d); ++d)
+ assert(insn->asTex() || insn->def(d).rep()->reg.data.id >= 0);
+
+ switch (insn->op) {
+ case OP_MOV:
+ case OP_RDSV:
+ emitMOV(insn);
+ break;
+ case OP_NOP:
+ break;
+ case OP_LOAD:
+ emitLOAD(insn);
+ break;
+ case OP_STORE:
+ emitSTORE(insn);
+ break;
+ case OP_LINTERP:
+ case OP_PINTERP:
+ emitINTERP(insn);
+ break;
+ case OP_VFETCH:
+ emitVFETCH(insn);
+ break;
+ case OP_EXPORT:
+ emitEXPORT(insn);
+ break;
+ case OP_PFETCH:
+ emitPFETCH(insn);
+ break;
+ case OP_EMIT:
+ case OP_RESTART:
+ emitOUT(insn);
+ break;
+ case OP_ADD:
+ case OP_SUB:
+ if (isFloatType(insn->dType))
+ emitFADD(insn);
+ else
+ emitUADD(insn);
+ break;
+ case OP_MUL:
+ if (isFloatType(insn->dType))
+ emitFMUL(insn);
+ else
+ emitUMUL(insn);
+ break;
+ case OP_MAD:
+ case OP_FMA:
+ if (isFloatType(insn->dType))
+ emitFMAD(insn);
+ else
+ emitIMAD(insn);
+ break;
+ case OP_SAD:
+ emitISAD(insn);
+ break;
+ case OP_NOT:
+ emitNOT(insn);
+ break;
+ case OP_AND:
+ emitLogicOp(insn, 0);
+ break;
+ case OP_OR:
+ emitLogicOp(insn, 1);
+ break;
+ case OP_XOR:
+ emitLogicOp(insn, 2);
+ break;
+ case OP_SHL:
+ case OP_SHR:
+ emitShift(insn);
+ break;
+ case OP_SET:
+ case OP_SET_AND:
+ case OP_SET_OR:
+ case OP_SET_XOR:
+ emitSET(insn->asCmp());
+ break;
+ case OP_SELP:
+ emitSELP(insn);
+ break;
+ case OP_SLCT:
+ emitSLCT(insn->asCmp());
+ break;
+ case OP_MIN:
+ case OP_MAX:
+ emitMINMAX(insn);
+ break;
+ case OP_ABS:
+ case OP_NEG:
+ case OP_CEIL:
+ case OP_FLOOR:
+ case OP_TRUNC:
+ case OP_CVT:
+ case OP_SAT:
+ emitCVT(insn);
+ break;
+ case OP_RSQ:
+ emitSFnOp(insn, 5);
+ break;
+ case OP_RCP:
+ emitSFnOp(insn, 4);
+ break;
+ case OP_LG2:
+ emitSFnOp(insn, 3);
+ break;
+ case OP_EX2:
+ emitSFnOp(insn, 2);
+ break;
+ case OP_SIN:
+ emitSFnOp(insn, 1);
+ break;
+ case OP_COS:
+ emitSFnOp(insn, 0);
+ break;
+ case OP_PRESIN:
+ case OP_PREEX2:
+ emitPreOp(insn);
+ break;
+ case OP_TEX:
+ case OP_TXB:
+ case OP_TXL:
+ case OP_TXD:
+ case OP_TXF:
+ emitTEX(insn->asTex());
+ break;
+ case OP_TXQ:
+ emitTXQ(insn->asTex());
+ break;
+ case OP_TEXBAR:
+ emitTEXBAR(insn);
+ break;
+ case OP_SUBFM:
+ case OP_SUCLAMP:
+ case OP_SUEAU:
+ emitSUCalc(insn);
+ break;
+ case OP_MADSP:
+ emitMADSP(insn);
+ break;
+ case OP_SULDB:
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+ emitSULDGB(insn->asTex());
+ else
+ ERROR("SULDB not yet supported on < nve4\n");
+ break;
+ case OP_SUSTB:
+ case OP_SUSTP:
+ if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+ emitSUSTGx(insn->asTex());
+ else
+ ERROR("SUSTx not yet supported on < nve4\n");
+ break;
+ case OP_ATOM:
+ emitATOM(insn);
+ break;
+ case OP_BRA:
+ case OP_CALL:
+ case OP_PRERET:
+ case OP_RET:
+ case OP_DISCARD:
+ case OP_EXIT:
+ case OP_PRECONT:
+ case OP_CONT:
+ case OP_PREBREAK:
+ case OP_BREAK:
+ case OP_JOINAT:
+ case OP_BRKPT:
+ case OP_QUADON:
+ case OP_QUADPOP:
+ emitFlow(insn);
+ break;
+ case OP_QUADOP:
+ emitQUADOP(insn, insn->subOp, insn->lanes);
+ break;
+ case OP_DFDX:
+ emitQUADOP(insn, insn->src(0).mod.neg() ? 0x66 : 0x99, 0x4);
+ break;
+ case OP_DFDY:
+ emitQUADOP(insn, insn->src(0).mod.neg() ? 0x5a : 0xa5, 0x5);
+ break;
+ case OP_POPCNT:
+ emitPOPC(insn);
+ break;
+ case OP_INSBF:
+ emitINSBF(insn);
+ break;
+ case OP_EXTBF:
+ emitEXTBF(insn);
+ break;
+ case OP_PERMT:
+ emitPERMT(insn);
+ break;
+ case OP_JOIN:
+ emitNOP(insn);
+ insn->join = 1;
+ break;
+ case OP_BAR:
+ emitBAR(insn);
+ break;
+ case OP_MEMBAR:
+ emitMEMBAR(insn);
+ break;
+ case OP_CCTL:
+ emitCCTL(insn);
+ break;
+ case OP_VSHL:
+ emitVSHL(insn);
+ break;
+ case OP_PHI:
+ case OP_UNION:
+ case OP_CONSTRAINT:
+ ERROR("operation should have been eliminated");
+ return false;
+ case OP_EXP:
+ case OP_LOG:
+ case OP_SQRT:
+ case OP_POW:
+ ERROR("operation should have been lowered\n");
+ return false;
+ default:
+ ERROR("unknow op\n");
+ return false;
+ }
+
+ if (insn->join) {
+ code[0] |= 0x10;
+ assert(insn->encSize == 8);
+ }
+
+ code += insn->encSize / 4;
+ codeSize += insn->encSize;
+ return true;
+}
+
// Smallest encoding size (4 or 8 bytes) usable for this instruction.
// NOTE: the "|| 1" below currently forces 8-byte encodings for everything;
// the short-form checks are kept for when that is re-enabled.
uint32_t
CodeEmitterNVC0::getMinEncodingSize(const Instruction *i) const
{
   const Target::OpInfo &info = targ->getOpInfo(i);

   if (writeIssueDelays || info.minEncSize == 8 || 1)
      return 8;

   // modifiers/flags that only exist in the long form
   if (i->ftz || i->saturate || i->join)
      return 8;
   if (i->rnd != ROUND_N)
      return 8;
   if (i->predSrc >= 0 && i->op == OP_MAD)
      return 8;

   if (i->op == OP_PINTERP) {
      if (i->getSampleMode() || 1) // XXX: grr, short op doesn't work
         return 8;
   } else
   if (i->op == OP_MOV && i->lanes != 0xf) {
      return 8;
   }

   for (int s = 0; i->srcExists(s); ++s) {
      if (i->src(s).isIndirect(0))
         return 8;

      if (i->src(s).getFile() == FILE_MEMORY_CONST) {
         // short form: only small offsets in c0[]/c1[]/c16[]
         if (SDATA(i->src(s)).offset >= 0x100)
            return 8;
         if (i->getSrc(s)->reg.fileIndex > 1 &&
             i->getSrc(s)->reg.fileIndex != 16)
             return 8;
      } else
      if (i->src(s).getFile() == FILE_IMMEDIATE) {
         if (i->dType == TYPE_F32) {
            if (SDATA(i->src(s)).u32 >= 0x100)
               return 8;
         } else {
            if (SDATA(i->src(s)).u32 > 0xff)
               return 8;
         }
      }

      // CVT carries its own modifier bits even in the short form
      if (i->op == OP_CVT)
         continue;
      if (i->src(s).mod != Modifier(0)) {
         if (i->src(s).mod == Modifier(NV50_IR_MOD_ABS))
            if (i->op != OP_RSQ)
               return 8;
         if (i->src(s).mod == Modifier(NV50_IR_MOD_NEG))
            if (i->op != OP_ADD || s != 0)
               return 8;
      }
   }

   return 4;
}
+
// Computes the per-instruction issue-delay bytes (Instruction::sched)
// consumed by emitInstruction on nve4+ by scoreboarding register and
// execution-resource availability. Simplified, erring on safe side.
class SchedDataCalculator : public Pass
{
public:
   SchedDataCalculator(const Target *targ) : targ(targ) { }

private:
   // Scoreboard: cycle at which each register / resource becomes available.
   struct RegScores
   {
      struct Resource {
         // NOTE(review): the two comments below were swapped in the
         // original; commitInsn sets ld[] on loads and st[] on stores.
         int st[DATA_FILE_COUNT]; // ST to ST delay 3
         int ld[DATA_FILE_COUNT]; // LD to LD delay 3
         int tex; // TEX to non-TEX delay 17 (0x11)
         int sfu; // SFU to SFU delay 3 (except PRE-ops)
         int imul; // integer MUL to MUL delay 3
      } res;
      struct ScoreData {
         int r[64]; // GPRs
         int p[8];  // predicate registers
         int c;     // condition code
      } rd, wr;
      int base;

      // Shift all recorded cycles so they are relative to the new base;
      // used to hand a block's final state to its successors.
      void rebase(const int base)
      {
         const int delta = this->base - base;
         if (!delta)
            return;
         this->base = 0;

         for (int i = 0; i < 64; ++i) {
            rd.r[i] += delta;
            wr.r[i] += delta;
         }
         for (int i = 0; i < 8; ++i) {
            rd.p[i] += delta;
            wr.p[i] += delta;
         }
         rd.c += delta;
         wr.c += delta;

         for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
            res.ld[f] += delta;
            res.st[f] += delta;
         }
         res.sfu += delta;
         res.imul += delta;
         res.tex += delta;
      }
      // Reset all scores to zero.
      void wipe()
      {
         memset(&rd, 0, sizeof(rd));
         memset(&wr, 0, sizeof(wr));
         memset(&res, 0, sizeof(res));
      }
      // Latest ready-cycle recorded in one read/write score set.
      int getLatest(const ScoreData& d) const
      {
         int max = 0;
         for (int i = 0; i < 64; ++i)
            if (d.r[i] > max)
               max = d.r[i];
         for (int i = 0; i < 8; ++i)
            if (d.p[i] > max)
               max = d.p[i];
         if (d.c > max)
            max = d.c;
         return max;
      }
      inline int getLatestRd() const
      {
         return getLatest(rd);
      }
      inline int getLatestWr() const
      {
         return getLatest(wr);
      }
      // Latest cycle over all registers and resources.
      inline int getLatest() const
      {
         const int a = getLatestRd();
         const int b = getLatestWr();

         int max = MAX2(a, b);
         for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
            max = MAX2(res.ld[f], max);
            max = MAX2(res.st[f], max);
         }
         max = MAX2(res.sfu, max);
         max = MAX2(res.imul, max);
         max = MAX2(res.tex, max);
         return max;
      }
      // Element-wise maximum; merges predecessor scoreboards at joins.
      void setMax(const RegScores *that)
      {
         for (int i = 0; i < 64; ++i) {
            rd.r[i] = MAX2(rd.r[i], that->rd.r[i]);
            wr.r[i] = MAX2(wr.r[i], that->wr.r[i]);
         }
         for (int i = 0; i < 8; ++i) {
            rd.p[i] = MAX2(rd.p[i], that->rd.p[i]);
            wr.p[i] = MAX2(wr.p[i], that->wr.p[i]);
         }
         rd.c = MAX2(rd.c, that->rd.c);
         wr.c = MAX2(wr.c, that->wr.c);

         for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
            res.ld[f] = MAX2(res.ld[f], that->res.ld[f]);
            res.st[f] = MAX2(res.st[f], that->res.st[f]);
         }
         res.sfu = MAX2(res.sfu, that->res.sfu);
         res.imul = MAX2(res.imul, that->res.imul);
         res.tex = MAX2(res.tex, that->res.tex);
      }
      // Debug dump of everything not yet ready at the given cycle.
      void print(int cycle)
      {
         for (int i = 0; i < 64; ++i) {
            if (rd.r[i] > cycle)
               INFO("rd $r%i @ %i\n", i, rd.r[i]);
            if (wr.r[i] > cycle)
               INFO("wr $r%i @ %i\n", i, wr.r[i]);
         }
         for (int i = 0; i < 8; ++i) {
            if (rd.p[i] > cycle)
               INFO("rd $p%i @ %i\n", i, rd.p[i]);
            if (wr.p[i] > cycle)
               INFO("wr $p%i @ %i\n", i, wr.p[i]);
         }
         if (rd.c > cycle)
            INFO("rd $c @ %i\n", rd.c);
         if (wr.c > cycle)
            INFO("wr $c @ %i\n", wr.c);
         if (res.sfu > cycle)
            INFO("sfu @ %i\n", res.sfu);
         if (res.imul > cycle)
            INFO("imul @ %i\n", res.imul);
         if (res.tex > cycle)
            INFO("tex @ %i\n", res.tex);
      }
   };

   RegScores *score; // for current BB
   std::vector<RegScores> scoreBoards; // one per basic block
   int cycle;
   int prevData; // sched byte of the previous instruction
   operation prevOp;

   const Target *targ;

   bool visit(Function *);
   bool visit(BasicBlock *);

   void commitInsn(const Instruction *, int cycle);
   int calcDelay(const Instruction *, int cycle) const;
   void setDelay(Instruction *, int delay, Instruction *next);

   void recordRd(const Value *, const int ready);
   void recordWr(const Value *, const int ready);
   void checkRd(const Value *, int cycle, int& delay) const;
   void checkWr(const Value *, int cycle, int& delay) const;

   int getCycles(const Instruction *, int origDelay) const;
};
+
// Translate a computed delay into the instruction's sched byte, handling
// the special TEXBAR/JOIN codes and dual-issue with the next instruction.
void
SchedDataCalculator::setDelay(Instruction *insn, int delay, Instruction *next)
{
   // give outstanding loads/stores time to finish before leaving the shader
   if (insn->op == OP_EXIT || insn->op == OP_RET)
      delay = MAX2(delay, 14);

   if (insn->op == OP_TEXBAR) {
      // TODO: except if results not used before EXIT
      insn->sched = 0xc2;
   } else
   if (insn->op == OP_JOIN || insn->join) {
      insn->sched = 0x00;
   } else
   if (delay >= 0 || prevData == 0x04 ||
       !next || !targ->canDualIssue(insn, next)) {
      insn->sched = static_cast<uint8_t>(MAX2(delay, 0));
      if (prevOp == OP_EXPORT)
         insn->sched |= 0x40;
      else
         insn->sched |= 0x20;
   } else {
      insn->sched = 0x04; // dual-issue
   }

   // track the last real (non dual-issued) op for the EXPORT special case
   if (prevData != 0x04 || prevOp != OP_EXPORT)
      if (insn->sched != 0x04 || insn->op == OP_EXPORT)
         prevOp = insn->op;

   prevData = insn->sched;
}
+
// Translate an instruction's sched byte back into the number of cycles
// it occupies/stalls issue for.
int
SchedDataCalculator::getCycles(const Instruction *insn, int origDelay) const
{
   if (insn->sched & 0x80) {
      int c = (insn->sched & 0x0f) * 2 + 1;
      if (insn->op == OP_TEXBAR && origDelay > 0)
         c += origDelay;
      return c;
   }
   if (insn->sched & 0x60)
      return (insn->sched & 0x1f) + 1;
   // 0x04 is dual-issue (free); anything else is a full stall
   return (insn->sched == 0x04) ? 0 : 32;
}
+
+bool
+SchedDataCalculator::visit(Function *func)
+{
+ scoreBoards.resize(func->cfg.getSize());
+ for (size_t i = 0; i < scoreBoards.size(); ++i)
+ scoreBoards[i].wipe();
+ return true;
+}
+
// Assign sched bytes to all instructions of one basic block, seeding the
// scoreboard and previous-instruction state from forward predecessors.
bool
SchedDataCalculator::visit(BasicBlock *bb)
{
   Instruction *insn;
   Instruction *next = NULL;

   // NOTE(review): shadows the member of the same name
   int cycle = 0;

   prevData = 0x00;
   prevOp = OP_NOP;
   score = &scoreBoards.at(bb->getId());

   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      // back branches will wait until all target dependencies are satisfied
      if (ei.getType() == Graph::Edge::BACK) // sched would be uninitialized
         continue;
      BasicBlock *in = BasicBlock::get(ei.getNode());
      if (in->getExit()) {
         if (prevData != 0x04)
            prevData = in->getExit()->sched;
         prevOp = in->getExit()->op;
      }
      score->setMax(&scoreBoards.at(in->getId()));
   }
   // with multiple predecessors we can't know which op really preceded us
   if (bb->cfg.incidentCount() > 1)
      prevOp = OP_NOP;

#ifdef NVC0_DEBUG_SCHED_DATA
   INFO("=== BB:%i initial scores\n", bb->getId());
   score->print(cycle);
#endif

   // each instruction's delay is computed against its *successor*, so the
   // loop always looks one instruction ahead
   for (insn = bb->getEntry(); insn && insn->next; insn = insn->next) {
      next = insn->next;

      commitInsn(insn, cycle);
      int delay = calcDelay(next, cycle);
      setDelay(insn, delay, next);
      cycle += getCycles(insn, delay);

#ifdef NVC0_DEBUG_SCHED_DATA
      INFO("cycle %i, sched %02x\n", cycle, insn->sched);
      insn->print();
      next->print();
#endif
   }
   if (!insn)
      return true;
   commitInsn(insn, cycle);

   // the last instruction's delay depends on the successor blocks
   int bbDelay = -1;

   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
      BasicBlock *out = BasicBlock::get(ei.getNode());

      if (ei.getType() != Graph::Edge::BACK) {
         // only test the first instruction of the outgoing block
         next = out->getEntry();
         if (next)
            bbDelay = MAX2(bbDelay, calcDelay(next, cycle));
      } else {
         // wait until all dependencies are satisfied
         const int regsFree = score->getLatest();
         next = out->getFirst();
         for (int c = cycle; next && c < regsFree; next = next->next) {
            bbDelay = MAX2(bbDelay, calcDelay(next, c));
            c += getCycles(next, bbDelay);
         }
         next = NULL;
      }
   }
   if (bb->cfg.outgoingCount() != 1)
      next = NULL;
   setDelay(insn, bbDelay, next);
   cycle += getCycles(insn, bbDelay);

   score->rebase(cycle); // common base for initializing out blocks' scores
   return true;
}
+
#define NVE4_MAX_ISSUE_DELAY 0x1f
// Number of extra issue-delay slots insn needs if issued at the given
// cycle, based on source readiness and the resource scoreboards.
int
SchedDataCalculator::calcDelay(const Instruction *insn, int cycle) const
{
   int delay = 0, ready = cycle;

   for (int s = 0; insn->srcExists(s); ++s)
      checkRd(insn->getSrc(s), cycle, delay);
   // WAR & WAW don't seem to matter
   // for (int s = 0; insn->srcExists(s); ++s)
   //   recordRd(insn->getSrc(s), cycle);

   // execution-unit availability
   switch (Target::getOpClass(insn->op)) {
   case OPCLASS_SFU:
      ready = score->res.sfu;
      break;
   case OPCLASS_ARITH:
      if (insn->op == OP_MUL && !isFloatType(insn->dType))
         ready = score->res.imul;
      break;
   case OPCLASS_TEXTURE:
      ready = score->res.tex;
      break;
   case OPCLASS_LOAD:
      ready = score->res.ld[insn->src(0).getFile()];
      break;
   case OPCLASS_STORE:
      ready = score->res.st[insn->src(0).getFile()];
      break;
   default:
      break;
   }
   // non-texture ops also wait on outstanding texture fetches
   if (Target::getOpClass(insn->op) != OPCLASS_TEXTURE)
      ready = MAX2(ready, score->res.tex);

   delay = MAX2(delay, ready - cycle);

   // if can issue next cycle, delay is 0, not 1
   return MIN2(delay - 1, NVE4_MAX_ISSUE_DELAY);
}
+
+// Update the scoreboard for @insn issued at @cycle: its defs become readable
+// after the target-reported latency, and the functional unit it occupies is
+// marked busy for a fixed window.
+void
+SchedDataCalculator::commitInsn(const Instruction *insn, int cycle)
+{
+   const int ready = cycle + targ->getLatency(insn);
+
+   for (int d = 0; insn->defExists(d); ++d)
+      recordWr(insn->getDef(d), ready);
+   // WAR & WAW don't seem to matter
+   // for (int s = 0; insn->srcExists(s); ++s)
+   //    recordRd(insn->getSrc(s), cycle);
+
+   switch (Target::getOpClass(insn->op)) {
+   case OPCLASS_SFU:
+      score->res.sfu = cycle + 4;
+      break;
+   case OPCLASS_ARITH:
+      if (insn->op == OP_MUL && !isFloatType(insn->dType))
+         score->res.imul = cycle + 4;
+      break;
+   case OPCLASS_TEXTURE:
+      score->res.tex = cycle + 18;
+      break;
+   case OPCLASS_LOAD:
+      // constant-buffer loads do not occupy the tracked load slot
+      if (insn->src(0).getFile() == FILE_MEMORY_CONST)
+         break;
+      score->res.ld[insn->src(0).getFile()] = cycle + 4;
+      // a store to the same file must wait until this load has completed
+      score->res.st[insn->src(0).getFile()] = ready;
+      break;
+   case OPCLASS_STORE:
+      score->res.st[insn->src(0).getFile()] = cycle + 4;
+      // a load from the same file must wait until this store has completed
+      score->res.ld[insn->src(0).getFile()] = ready;
+      break;
+   case OPCLASS_OTHER:
+      // a texture barrier makes outstanding texture results available now
+      if (insn->op == OP_TEXBAR)
+         score->res.tex = cycle;
+      break;
+   default:
+      break;
+   }
+
+#ifdef NVC0_DEBUG_SCHED_DATA
+   score->print(cycle);
+#endif
+}
+
+// Raise @delay so that a read of @v issued at @cycle happens no earlier than
+// the read-ready cycle recorded for its register(s) in the scoreboard.
+void
+SchedDataCalculator::checkRd(const Value *v, int cycle, int& delay) const
+{
+   int ready = cycle;
+   int a, b;
+
+   switch (v->reg.file) {
+   case FILE_GPR:
+      // a value may span several 32-bit registers; check each of them
+      a = v->reg.data.id;
+      b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         ready = MAX2(ready, score->rd.r[r]);
+      break;
+   case FILE_PREDICATE:
+      ready = MAX2(ready, score->rd.p[v->reg.data.id]);
+      break;
+   case FILE_FLAGS:
+      ready = MAX2(ready, score->rd.c);
+      break;
+   case FILE_SHADER_INPUT:
+   case FILE_SHADER_OUTPUT: // yes, TCPs can read outputs
+   case FILE_MEMORY_LOCAL:
+   case FILE_MEMORY_CONST:
+   case FILE_MEMORY_SHARED:
+   case FILE_MEMORY_GLOBAL:
+   case FILE_SYSTEM_VALUE:
+      // TODO: any restrictions here ?
+      break;
+   case FILE_IMMEDIATE:
+      break;
+   default:
+      assert(0);
+      break;
+   }
+   if (cycle < ready)
+      delay = MAX2(delay, ready - cycle);
+}
+
+// Raise @delay so that a write to @v issued at @cycle happens no earlier than
+// the write-ready cycle recorded for its register(s) (WAR protection).
+void
+SchedDataCalculator::checkWr(const Value *v, int cycle, int& delay) const
+{
+   int ready = cycle;
+   int a, b;
+
+   switch (v->reg.file) {
+   case FILE_GPR:
+      // a value may span several 32-bit registers; check each of them
+      a = v->reg.data.id;
+      b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         ready = MAX2(ready, score->wr.r[r]);
+      break;
+   case FILE_PREDICATE:
+      ready = MAX2(ready, score->wr.p[v->reg.data.id]);
+      break;
+   default:
+      assert(v->reg.file == FILE_FLAGS);
+      ready = MAX2(ready, score->wr.c);
+      break;
+   }
+   if (cycle < ready)
+      delay = MAX2(delay, ready - cycle);
+}
+
+// Record that @v is written and becomes readable at @ready.  Note that this
+// deliberately updates the *read* scoreboard (score->rd): it sets the cycle
+// from which subsequent readers (checkRd) may consume the value.
+void
+SchedDataCalculator::recordWr(const Value *v, const int ready)
+{
+   int a = v->reg.data.id;
+
+   if (v->reg.file == FILE_GPR) {
+      // a value may span several 32-bit registers
+      int b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         score->rd.r[r] = ready;
+   } else
+   // $c, $pX: shorter issue-to-read delay (at least as exec pred and carry)
+   if (v->reg.file == FILE_PREDICATE) {
+      score->rd.p[a] = ready + 4;
+   } else {
+      assert(v->reg.file == FILE_FLAGS);
+      score->rd.c = ready + 4;
+   }
+}
+
+// Record a read of @v at @ready: its registers must not be overwritten before
+// then.  Counterpart of recordWr(); updates the *write* scoreboard
+// (score->wr) consulted by checkWr().
+void
+SchedDataCalculator::recordRd(const Value *v, const int ready)
+{
+   int a = v->reg.data.id;
+
+   if (v->reg.file == FILE_GPR) {
+      // a value may span several 32-bit registers
+      int b = a + v->reg.size / 4;
+      for (int r = a; r < b; ++r)
+         score->wr.r[r] = ready;
+   } else
+   if (v->reg.file == FILE_PREDICATE) {
+      score->wr.p[a] = ready;
+   } else
+   if (v->reg.file == FILE_FLAGS) {
+      score->wr.c = ready;
+   }
+}
+
+// Entry point: compute and store issue-delay (scheduling) data for every
+// instruction of @func on targets that require software scheduling.
+bool
+calculateSchedDataNVC0(const Target *targ, Function *func)
+{
+   SchedDataCalculator calc(targ);
+   const bool ok = calc.run(func, true, true);
+   return ok;
+}
+
+// Pre-emission hook: after the generic preparation, compute per-instruction
+// issue-delay data on targets where the compiler must supply it (hasSWSched).
+void
+CodeEmitterNVC0::prepareEmission(Function *func)
+{
+   CodeEmitter::prepareEmission(func);
+
+   if (targ->hasSWSched)
+      calculateSchedDataNVC0(targ, func);
+}
+
+// writeIssueDelays mirrors the target's software-scheduling requirement.
+CodeEmitterNVC0::CodeEmitterNVC0(const TargetNVC0 *target)
+   : CodeEmitter(target),
+     targNVC0(target),
+     writeIssueDelays(target->hasSWSched)
+{
+   // no output buffer or relocation info assigned yet
+   code = NULL;
+   codeSize = codeSizeLimit = 0;
+   relocInfo = NULL;
+}
+
+// Create an NVC0-encoding emitter configured for the given program type.
+CodeEmitter *
+TargetNVC0::createCodeEmitterNVC0(Program::Type type)
+{
+   CodeEmitterNVC0 *emitter = new CodeEmitterNVC0(this);
+   emitter->setProgramType(type);
+   return emitter;
+}
+
+// Pick the emitter matching the chipset's ISA encoding.
+CodeEmitter *
+TargetNVC0::getCodeEmitter(Program::Type type)
+{
+   const bool isGK110 = chipset >= NVISA_GK110_CHIPSET;
+   return isGK110 ? createCodeEmitterGK110(type)
+                  : createCodeEmitterNVC0(type);
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
new file mode 100644
index 0000000..3193ea6
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -0,0 +1,2852 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+extern "C" {
+#include "tgsi/tgsi_dump.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_util.h"
+}
+
+#include <set>
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_util.h"
+#include "codegen/nv50_ir_build_util.h"
+
+namespace tgsi {
+
+class Source;
+
+static nv50_ir::operation translateOpcode(uint opcode);
+static nv50_ir::DataFile translateFile(uint file);
+static nv50_ir::TexTarget translateTexture(uint texTarg);
+static nv50_ir::SVSemantic translateSysVal(uint sysval);
+
+// Thin wrapper around tgsi_full_instruction providing typed accessors for
+// sources, destinations, texture targets and inferred nv50 IR data types.
+class Instruction
+{
+public:
+   Instruction(const struct tgsi_full_instruction *inst) : insn(inst) { }
+
+   // Read-only view of one TGSI source operand (with optional full-register
+   // info for indirect/2D addressing).
+   class SrcRegister
+   {
+   public:
+      SrcRegister(const struct tgsi_full_src_register *src)
+         : reg(src->Register),
+           fsr(src)
+      { }
+
+      SrcRegister(const struct tgsi_src_register& src) : reg(src), fsr(NULL) { }
+
+      SrcRegister(const struct tgsi_ind_register& ind)
+         : reg(tgsi_util_get_src_from_ind(&ind)),
+           fsr(NULL)
+      { }
+
+      // Build a plain src register out of a texture offset token.
+      struct tgsi_src_register offsetToSrc(struct tgsi_texture_offset off)
+      {
+         struct tgsi_src_register reg;
+         memset(&reg, 0, sizeof(reg));
+         reg.Index = off.Index;
+         reg.File = off.File;
+         reg.SwizzleX = off.SwizzleX;
+         reg.SwizzleY = off.SwizzleY;
+         reg.SwizzleZ = off.SwizzleZ;
+         return reg;
+      }
+
+      SrcRegister(const struct tgsi_texture_offset& off) :
+         reg(offsetToSrc(off)),
+         fsr(NULL)
+      { }
+
+      uint getFile() const { return reg.File; }
+
+      bool is2D() const { return reg.Dimension; }
+
+      // dim != 0 queries the second (dimension) index of a 2D register.
+      bool isIndirect(int dim) const
+      {
+         return (dim && fsr) ? fsr->Dimension.Indirect : reg.Indirect;
+      }
+
+      int getIndex(int dim) const
+      {
+         return (dim && fsr) ? fsr->Dimension.Index : reg.Index;
+      }
+
+      int getSwizzle(int chan) const
+      {
+         return tgsi_util_get_src_register_swizzle(&reg, chan);
+      }
+
+      nv50_ir::Modifier getMod(int chan) const;
+
+      SrcRegister getIndirect(int dim) const
+      {
+         assert(fsr && isIndirect(dim));
+         if (dim)
+            return SrcRegister(fsr->DimIndirect);
+         return SrcRegister(fsr->Indirect);
+      }
+
+      // Fetch the raw 32-bit immediate value for component @c.
+      uint32_t getValueU32(int c, const struct nv50_ir_prog_info *info) const
+      {
+         assert(reg.File == TGSI_FILE_IMMEDIATE);
+         assert(!reg.Absolute);
+         assert(!reg.Negate);
+         return info->immd.data[reg.Index * 4 + getSwizzle(c)];
+      }
+
+   private:
+      const struct tgsi_src_register reg;
+      const struct tgsi_full_src_register *fsr;
+   };
+
+   // Read-only view of one TGSI destination operand.
+   class DstRegister
+   {
+   public:
+      DstRegister(const struct tgsi_full_dst_register *dst)
+         : reg(dst->Register),
+           fdr(dst)
+      { }
+
+      DstRegister(const struct tgsi_dst_register& dst) : reg(dst), fdr(NULL) { }
+
+      uint getFile() const { return reg.File; }
+
+      bool is2D() const { return reg.Dimension; }
+
+      bool isIndirect(int dim) const
+      {
+         return (dim && fdr) ? fdr->Dimension.Indirect : reg.Indirect;
+      }
+
+      int getIndex(int dim) const
+      {
+         // BUGFIX: read Dimension.Index (the dimension's register index) as
+         // SrcRegister::getIndex does; Dimension.Dimension is only the 1-bit
+         // "has another dimension" flag of the tgsi_dimension token.
+         return (dim && fdr) ? fdr->Dimension.Index : reg.Index;
+      }
+
+      unsigned int getMask() const { return reg.WriteMask; }
+
+      bool isMasked(int chan) const { return !(getMask() & (1 << chan)); }
+
+      SrcRegister getIndirect(int dim) const
+      {
+         assert(fdr && isIndirect(dim));
+         if (dim)
+            return SrcRegister(fdr->DimIndirect);
+         return SrcRegister(fdr->Indirect);
+      }
+
+   private:
+      const struct tgsi_dst_register reg;
+      const struct tgsi_full_dst_register *fdr;
+   };
+
+   inline uint getOpcode() const { return insn->Instruction.Opcode; }
+
+   unsigned int srcCount() const { return insn->Instruction.NumSrcRegs; }
+   unsigned int dstCount() const { return insn->Instruction.NumDstRegs; }
+
+   // mask of used components of source s
+   unsigned int srcMask(unsigned int s) const;
+
+   SrcRegister getSrc(unsigned int s) const
+   {
+      assert(s < srcCount());
+      return SrcRegister(&insn->Src[s]);
+   }
+
+   DstRegister getDst(unsigned int d) const
+   {
+      assert(d < dstCount());
+      return DstRegister(&insn->Dst[d]);
+   }
+
+   SrcRegister getTexOffset(unsigned int i) const
+   {
+      assert(i < TGSI_FULL_MAX_TEX_OFFSETS);
+      return SrcRegister(insn->TexOffsets[i]);
+   }
+
+   unsigned int getNumTexOffsets() const { return insn->Texture.NumOffsets; }
+
+   // true if any source register aliases the (direct) destination
+   bool checkDstSrcAliasing() const;
+
+   inline nv50_ir::operation getOP() const {
+      return translateOpcode(getOpcode()); }
+
+   nv50_ir::DataType inferSrcType() const;
+   nv50_ir::DataType inferDstType() const;
+
+   nv50_ir::CondCode getSetCond() const;
+
+   nv50_ir::TexInstruction::Target getTexture(const Source *, int s) const;
+
+   inline uint getLabel() { return insn->Label.Label; }
+
+   unsigned getSaturate() const { return insn->Instruction.Saturate; }
+
+   void print() const
+   {
+      tgsi_dump_instruction(insn, 1);
+   }
+
+private:
+   const struct tgsi_full_instruction *insn;
+};
+
+// Return the mask of components of source @s that the instruction actually
+// reads, derived from the opcode and the destination write mask.
+unsigned int Instruction::srcMask(unsigned int s) const
+{
+   unsigned int mask = insn->Dst[0].Register.WriteMask;
+
+   switch (insn->Instruction.Opcode) {
+   case TGSI_OPCODE_COS:
+   case TGSI_OPCODE_SIN:
+      return (mask & 0x8) | ((mask & 0x7) ? 0x1 : 0x0);
+   case TGSI_OPCODE_DP2:
+      return 0x3;
+   case TGSI_OPCODE_DP3:
+      return 0x7;
+   case TGSI_OPCODE_DP4:
+   case TGSI_OPCODE_DPH:
+   case TGSI_OPCODE_KILL_IF: /* WriteMask ignored */
+      return 0xf;
+   case TGSI_OPCODE_DST:
+      // DST reads y,z of src0 and y,w of src1
+      return mask & (s ? 0xa : 0x6);
+   case TGSI_OPCODE_EX2:
+   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_LG2:
+   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_POW:
+   case TGSI_OPCODE_RCP:
+   case TGSI_OPCODE_RSQ:
+   case TGSI_OPCODE_SCS:
+      return 0x1;
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_UIF:
+      return 0x1;
+   case TGSI_OPCODE_LIT:
+      return 0xb;
+   case TGSI_OPCODE_TEX2:
+   case TGSI_OPCODE_TXB2:
+   case TGSI_OPCODE_TXL2:
+      // second source only carries the extra coords / lod in x,y
+      return (s == 0) ? 0xf : 0x3;
+   case TGSI_OPCODE_TEX:
+   case TGSI_OPCODE_TXB:
+   case TGSI_OPCODE_TXD:
+   case TGSI_OPCODE_TXL:
+   case TGSI_OPCODE_TXP:
+   {
+      // narrow the coordinate mask down to what the texture target needs
+      const struct tgsi_instruction_texture *tex = &insn->Texture;
+
+      assert(insn->Instruction.Texture);
+
+      mask = 0x7;
+      if (insn->Instruction.Opcode != TGSI_OPCODE_TEX &&
+          insn->Instruction.Opcode != TGSI_OPCODE_TXD)
+         mask |= 0x8; /* bias, lod or proj */
+
+      switch (tex->Texture) {
+      case TGSI_TEXTURE_1D:
+         mask &= 0x9;
+         break;
+      case TGSI_TEXTURE_SHADOW1D:
+         mask &= 0xd;
+         break;
+      case TGSI_TEXTURE_1D_ARRAY:
+      case TGSI_TEXTURE_2D:
+      case TGSI_TEXTURE_RECT:
+         mask &= 0xb;
+         break;
+      case TGSI_TEXTURE_CUBE_ARRAY:
+      case TGSI_TEXTURE_SHADOW2D_ARRAY:
+      case TGSI_TEXTURE_SHADOWCUBE:
+      case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+         mask |= 0x8;
+         break;
+      default:
+         break;
+      }
+   }
+      return mask;
+   case TGSI_OPCODE_XPD:
+   {
+      // cross product: each written channel reads the other two channels
+      unsigned int x = 0;
+      if (mask & 1) x |= 0x6;
+      if (mask & 2) x |= 0x5;
+      if (mask & 4) x |= 0x3;
+      return x;
+   }
+   default:
+      break;
+   }
+
+   return mask;
+}
+
+// Translate the TGSI absolute/negate flags into an nv50_ir Modifier.
+// NOTE: TGSI source modifiers apply to the whole register, so @chan is
+// unused here.
+nv50_ir::Modifier Instruction::SrcRegister::getMod(int chan) const
+{
+   nv50_ir::Modifier m(0);
+
+   if (reg.Absolute)
+      m = m | nv50_ir::Modifier(NV50_IR_MOD_ABS);
+   if (reg.Negate)
+      m = m | nv50_ir::Modifier(NV50_IR_MOD_NEG);
+   return m;
+}
+
+// Map a TGSI register file onto the corresponding nv50 IR file.
+// Samplers and unknown files map to FILE_NULL (they carry no data).
+static nv50_ir::DataFile translateFile(uint file)
+{
+   switch (file) {
+   case TGSI_FILE_CONSTANT:     return nv50_ir::FILE_MEMORY_CONST;
+   case TGSI_FILE_INPUT:        return nv50_ir::FILE_SHADER_INPUT;
+   case TGSI_FILE_OUTPUT:       return nv50_ir::FILE_SHADER_OUTPUT;
+   case TGSI_FILE_TEMPORARY:    return nv50_ir::FILE_GPR;
+   case TGSI_FILE_ADDRESS:      return nv50_ir::FILE_ADDRESS;
+   case TGSI_FILE_PREDICATE:    return nv50_ir::FILE_PREDICATE;
+   case TGSI_FILE_IMMEDIATE:    return nv50_ir::FILE_IMMEDIATE;
+   case TGSI_FILE_SYSTEM_VALUE: return nv50_ir::FILE_SYSTEM_VALUE;
+   case TGSI_FILE_RESOURCE:     return nv50_ir::FILE_MEMORY_GLOBAL;
+   case TGSI_FILE_SAMPLER:
+   case TGSI_FILE_NULL:
+   default:
+      return nv50_ir::FILE_NULL;
+   }
+}
+
+// Map a TGSI system-value semantic onto the nv50 IR SV semantic.
+// Unhandled values assert; SV_CLOCK is only a release-build fallback.
+static nv50_ir::SVSemantic translateSysVal(uint sysval)
+{
+   switch (sysval) {
+   case TGSI_SEMANTIC_FACE:       return nv50_ir::SV_FACE;
+   case TGSI_SEMANTIC_PSIZE:      return nv50_ir::SV_POINT_SIZE;
+   case TGSI_SEMANTIC_PRIMID:     return nv50_ir::SV_PRIMITIVE_ID;
+   case TGSI_SEMANTIC_INSTANCEID: return nv50_ir::SV_INSTANCE_ID;
+   case TGSI_SEMANTIC_VERTEXID:   return nv50_ir::SV_VERTEX_ID;
+   case TGSI_SEMANTIC_GRID_SIZE:  return nv50_ir::SV_NCTAID;
+   case TGSI_SEMANTIC_BLOCK_ID:   return nv50_ir::SV_CTAID;
+   case TGSI_SEMANTIC_BLOCK_SIZE: return nv50_ir::SV_NTID;
+   case TGSI_SEMANTIC_THREAD_ID:  return nv50_ir::SV_TID;
+   default:
+      assert(0);
+      return nv50_ir::SV_CLOCK;
+   }
+}
+
+#define NV50_IR_TEX_TARG_CASE(a, b) \
+   case TGSI_TEXTURE_##a: return nv50_ir::TEX_TARGET_##b;
+
+// Map a TGSI texture target onto the nv50 IR texture target.
+// Invalid/unknown targets assert; TEX_TARGET_2D is only a fallback.
+static nv50_ir::TexTarget translateTexture(uint tex)
+{
+   switch (tex) {
+   NV50_IR_TEX_TARG_CASE(1D, 1D);
+   NV50_IR_TEX_TARG_CASE(2D, 2D);
+   NV50_IR_TEX_TARG_CASE(2D_MSAA, 2D_MS);
+   NV50_IR_TEX_TARG_CASE(3D, 3D);
+   NV50_IR_TEX_TARG_CASE(CUBE, CUBE);
+   NV50_IR_TEX_TARG_CASE(RECT, RECT);
+   NV50_IR_TEX_TARG_CASE(1D_ARRAY, 1D_ARRAY);
+   NV50_IR_TEX_TARG_CASE(2D_ARRAY, 2D_ARRAY);
+   NV50_IR_TEX_TARG_CASE(2D_ARRAY_MSAA, 2D_MS_ARRAY);
+   NV50_IR_TEX_TARG_CASE(CUBE_ARRAY, CUBE_ARRAY);
+   NV50_IR_TEX_TARG_CASE(SHADOW1D, 1D_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOW2D, 2D_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOWCUBE, CUBE_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOWRECT, RECT_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOW1D_ARRAY, 1D_ARRAY_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOW2D_ARRAY, 2D_ARRAY_SHADOW);
+   NV50_IR_TEX_TARG_CASE(SHADOWCUBE_ARRAY, CUBE_ARRAY_SHADOW);
+   NV50_IR_TEX_TARG_CASE(BUFFER, BUFFER);
+
+   case TGSI_TEXTURE_UNKNOWN:
+   default:
+      assert(!"invalid texture target");
+      return nv50_ir::TEX_TARGET_2D;
+   }
+}
+
+// Infer the data type of this instruction's sources from the opcode:
+// U*/ATOM* opcodes read unsigned, I*-style opcodes read signed, and
+// everything else is treated as 32-bit float.
+nv50_ir::DataType Instruction::inferSrcType() const
+{
+   switch (getOpcode()) {
+   case TGSI_OPCODE_UIF:
+   case TGSI_OPCODE_AND:
+   case TGSI_OPCODE_OR:
+   case TGSI_OPCODE_XOR:
+   case TGSI_OPCODE_NOT:
+   case TGSI_OPCODE_U2F:
+   case TGSI_OPCODE_UADD:
+   case TGSI_OPCODE_UDIV:
+   case TGSI_OPCODE_UMOD:
+   case TGSI_OPCODE_UMAD:
+   case TGSI_OPCODE_UMUL:
+   case TGSI_OPCODE_UMAX:
+   case TGSI_OPCODE_UMIN:
+   case TGSI_OPCODE_USEQ:
+   case TGSI_OPCODE_USGE:
+   case TGSI_OPCODE_USLT:
+   case TGSI_OPCODE_USNE:
+   case TGSI_OPCODE_USHR:
+   case TGSI_OPCODE_UCMP:
+   case TGSI_OPCODE_ATOMUADD:
+   case TGSI_OPCODE_ATOMXCHG:
+   case TGSI_OPCODE_ATOMCAS:
+   case TGSI_OPCODE_ATOMAND:
+   case TGSI_OPCODE_ATOMOR:
+   case TGSI_OPCODE_ATOMXOR:
+   case TGSI_OPCODE_ATOMUMIN:
+   case TGSI_OPCODE_ATOMUMAX:
+      return nv50_ir::TYPE_U32;
+   case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_IDIV:
+   case TGSI_OPCODE_IMAX:
+   case TGSI_OPCODE_IMIN:
+   case TGSI_OPCODE_IABS:
+   case TGSI_OPCODE_INEG:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_ISHR:
+   case TGSI_OPCODE_ISLT:
+   case TGSI_OPCODE_ISSG:
+   case TGSI_OPCODE_SAD: // not sure about SAD, but no one has a float version
+   case TGSI_OPCODE_MOD:
+   case TGSI_OPCODE_UARL:
+   case TGSI_OPCODE_ATOMIMIN:
+   case TGSI_OPCODE_ATOMIMAX:
+      return nv50_ir::TYPE_S32;
+   default:
+      return nv50_ir::TYPE_F32;
+   }
+}
+
+// Infer the destination type: float<->int conversions and float comparisons
+// have a dst type different from their sources; all other opcodes write the
+// same type they read.
+nv50_ir::DataType Instruction::inferDstType() const
+{
+   switch (getOpcode()) {
+   case TGSI_OPCODE_F2U: return nv50_ir::TYPE_U32;
+   case TGSI_OPCODE_F2I: return nv50_ir::TYPE_S32;
+   case TGSI_OPCODE_FSEQ:
+   case TGSI_OPCODE_FSGE:
+   case TGSI_OPCODE_FSLT:
+   case TGSI_OPCODE_FSNE:
+      return nv50_ir::TYPE_U32;
+   case TGSI_OPCODE_I2F:
+   case TGSI_OPCODE_U2F:
+      return nv50_ir::TYPE_F32;
+   default:
+      return inferSrcType();
+   }
+}
+
+// Map a TGSI comparison (SET-class) opcode onto the nv50 IR condition code.
+// Float inequality uses CC_NEU (unordered-NE); integer uses CC_NE.
+nv50_ir::CondCode Instruction::getSetCond() const
+{
+   using namespace nv50_ir;
+
+   switch (getOpcode()) {
+   case TGSI_OPCODE_SLT:
+   case TGSI_OPCODE_ISLT:
+   case TGSI_OPCODE_USLT:
+   case TGSI_OPCODE_FSLT:
+      return CC_LT;
+   case TGSI_OPCODE_SLE:
+      return CC_LE;
+   case TGSI_OPCODE_SGE:
+   case TGSI_OPCODE_ISGE:
+   case TGSI_OPCODE_USGE:
+   case TGSI_OPCODE_FSGE:
+      return CC_GE;
+   case TGSI_OPCODE_SGT:
+      return CC_GT;
+   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_USEQ:
+   case TGSI_OPCODE_FSEQ:
+      return CC_EQ;
+   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_FSNE:
+      return CC_NEU;
+   case TGSI_OPCODE_USNE:
+      return CC_NE;
+   case TGSI_OPCODE_SFL:
+      return CC_NEVER;
+   case TGSI_OPCODE_STR:
+   default:
+      return CC_ALWAYS;
+   }
+}
+
+#define NV50_IR_OPCODE_CASE(a, b) case TGSI_OPCODE_##a: return nv50_ir::OP_##b
+
+// Map a TGSI opcode onto its nv50 IR operation.  Several TGSI opcodes share
+// one IR op (e.g. all comparisons -> OP_SET) and are distinguished later via
+// inferSrcType()/getSetCond()/opcodeToSubOp().  Unknown opcodes yield OP_NOP.
+static nv50_ir::operation translateOpcode(uint opcode)
+{
+   switch (opcode) {
+   NV50_IR_OPCODE_CASE(ARL, SHL);
+   NV50_IR_OPCODE_CASE(MOV, MOV);
+
+   NV50_IR_OPCODE_CASE(RCP, RCP);
+   NV50_IR_OPCODE_CASE(RSQ, RSQ);
+
+   NV50_IR_OPCODE_CASE(MUL, MUL);
+   NV50_IR_OPCODE_CASE(ADD, ADD);
+
+   NV50_IR_OPCODE_CASE(MIN, MIN);
+   NV50_IR_OPCODE_CASE(MAX, MAX);
+   NV50_IR_OPCODE_CASE(SLT, SET);
+   NV50_IR_OPCODE_CASE(SGE, SET);
+   NV50_IR_OPCODE_CASE(MAD, MAD);
+   NV50_IR_OPCODE_CASE(SUB, SUB);
+
+   NV50_IR_OPCODE_CASE(FLR, FLOOR);
+   NV50_IR_OPCODE_CASE(ROUND, CVT);
+   NV50_IR_OPCODE_CASE(EX2, EX2);
+   NV50_IR_OPCODE_CASE(LG2, LG2);
+   NV50_IR_OPCODE_CASE(POW, POW);
+
+   NV50_IR_OPCODE_CASE(ABS, ABS);
+
+   NV50_IR_OPCODE_CASE(COS, COS);
+   NV50_IR_OPCODE_CASE(DDX, DFDX);
+   NV50_IR_OPCODE_CASE(DDY, DFDY);
+   NV50_IR_OPCODE_CASE(KILL, DISCARD);
+
+   NV50_IR_OPCODE_CASE(SEQ, SET);
+   NV50_IR_OPCODE_CASE(SFL, SET);
+   NV50_IR_OPCODE_CASE(SGT, SET);
+   NV50_IR_OPCODE_CASE(SIN, SIN);
+   NV50_IR_OPCODE_CASE(SLE, SET);
+   NV50_IR_OPCODE_CASE(SNE, SET);
+   NV50_IR_OPCODE_CASE(STR, SET);
+   NV50_IR_OPCODE_CASE(TEX, TEX);
+   NV50_IR_OPCODE_CASE(TXD, TXD);
+   NV50_IR_OPCODE_CASE(TXP, TEX);
+
+   NV50_IR_OPCODE_CASE(BRA, BRA);
+   NV50_IR_OPCODE_CASE(CAL, CALL);
+   NV50_IR_OPCODE_CASE(RET, RET);
+   NV50_IR_OPCODE_CASE(CMP, SLCT);
+
+   NV50_IR_OPCODE_CASE(TXB, TXB);
+
+   NV50_IR_OPCODE_CASE(DIV, DIV);
+
+   NV50_IR_OPCODE_CASE(TXL, TXL);
+
+   NV50_IR_OPCODE_CASE(CEIL, CEIL);
+   NV50_IR_OPCODE_CASE(I2F, CVT);
+   NV50_IR_OPCODE_CASE(NOT, NOT);
+   NV50_IR_OPCODE_CASE(TRUNC, TRUNC);
+   NV50_IR_OPCODE_CASE(SHL, SHL);
+
+   NV50_IR_OPCODE_CASE(AND, AND);
+   NV50_IR_OPCODE_CASE(OR, OR);
+   NV50_IR_OPCODE_CASE(MOD, MOD);
+   NV50_IR_OPCODE_CASE(XOR, XOR);
+   NV50_IR_OPCODE_CASE(SAD, SAD);
+   NV50_IR_OPCODE_CASE(TXF, TXF);
+   NV50_IR_OPCODE_CASE(TXQ, TXQ);
+
+   NV50_IR_OPCODE_CASE(EMIT, EMIT);
+   NV50_IR_OPCODE_CASE(ENDPRIM, RESTART);
+
+   NV50_IR_OPCODE_CASE(KILL_IF, DISCARD);
+
+   NV50_IR_OPCODE_CASE(F2I, CVT);
+   NV50_IR_OPCODE_CASE(FSEQ, SET);
+   NV50_IR_OPCODE_CASE(FSGE, SET);
+   NV50_IR_OPCODE_CASE(FSLT, SET);
+   NV50_IR_OPCODE_CASE(FSNE, SET);
+   NV50_IR_OPCODE_CASE(IDIV, DIV);
+   NV50_IR_OPCODE_CASE(IMAX, MAX);
+   NV50_IR_OPCODE_CASE(IMIN, MIN);
+   NV50_IR_OPCODE_CASE(IABS, ABS);
+   NV50_IR_OPCODE_CASE(INEG, NEG);
+   NV50_IR_OPCODE_CASE(ISGE, SET);
+   NV50_IR_OPCODE_CASE(ISHR, SHR);
+   NV50_IR_OPCODE_CASE(ISLT, SET);
+   NV50_IR_OPCODE_CASE(F2U, CVT);
+   NV50_IR_OPCODE_CASE(U2F, CVT);
+   NV50_IR_OPCODE_CASE(UADD, ADD);
+   NV50_IR_OPCODE_CASE(UDIV, DIV);
+   NV50_IR_OPCODE_CASE(UMAD, MAD);
+   NV50_IR_OPCODE_CASE(UMAX, MAX);
+   NV50_IR_OPCODE_CASE(UMIN, MIN);
+   NV50_IR_OPCODE_CASE(UMOD, MOD);
+   NV50_IR_OPCODE_CASE(UMUL, MUL);
+   NV50_IR_OPCODE_CASE(USEQ, SET);
+   NV50_IR_OPCODE_CASE(USGE, SET);
+   NV50_IR_OPCODE_CASE(USHR, SHR);
+   NV50_IR_OPCODE_CASE(USLT, SET);
+   NV50_IR_OPCODE_CASE(USNE, SET);
+
+   NV50_IR_OPCODE_CASE(SAMPLE, TEX);
+   NV50_IR_OPCODE_CASE(SAMPLE_B, TXB);
+   NV50_IR_OPCODE_CASE(SAMPLE_C, TEX);
+   NV50_IR_OPCODE_CASE(SAMPLE_C_LZ, TEX);
+   NV50_IR_OPCODE_CASE(SAMPLE_D, TXD);
+   NV50_IR_OPCODE_CASE(SAMPLE_L, TXL);
+   NV50_IR_OPCODE_CASE(SAMPLE_I, TXF);
+   NV50_IR_OPCODE_CASE(SAMPLE_I_MS, TXF);
+   NV50_IR_OPCODE_CASE(GATHER4, TXG);
+   NV50_IR_OPCODE_CASE(SVIEWINFO, TXQ);
+
+   NV50_IR_OPCODE_CASE(ATOMUADD, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMXCHG, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMCAS, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMAND, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMOR, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMXOR, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMUMIN, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMUMAX, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMIMIN, ATOM);
+   NV50_IR_OPCODE_CASE(ATOMIMAX, ATOM);
+
+   NV50_IR_OPCODE_CASE(TEX2, TEX);
+   NV50_IR_OPCODE_CASE(TXB2, TXB);
+   NV50_IR_OPCODE_CASE(TXL2, TXL);
+
+   NV50_IR_OPCODE_CASE(END, EXIT);
+
+   default:
+      return nv50_ir::OP_NOP;
+   }
+}
+
+// Map TGSI opcodes that share one IR op (ATOM, MEMBAR) onto the nv50 IR
+// sub-operation distinguishing them; 0 for opcodes that need no sub-op.
+static uint16_t opcodeToSubOp(uint opcode)
+{
+   switch (opcode) {
+   case TGSI_OPCODE_LFENCE:   return NV50_IR_SUBOP_MEMBAR(L, GL);
+   case TGSI_OPCODE_SFENCE:   return NV50_IR_SUBOP_MEMBAR(S, GL);
+   case TGSI_OPCODE_MFENCE:   return NV50_IR_SUBOP_MEMBAR(M, GL);
+   case TGSI_OPCODE_ATOMUADD: return NV50_IR_SUBOP_ATOM_ADD;
+   case TGSI_OPCODE_ATOMXCHG: return NV50_IR_SUBOP_ATOM_EXCH;
+   case TGSI_OPCODE_ATOMCAS:  return NV50_IR_SUBOP_ATOM_CAS;
+   case TGSI_OPCODE_ATOMAND:  return NV50_IR_SUBOP_ATOM_AND;
+   case TGSI_OPCODE_ATOMOR:   return NV50_IR_SUBOP_ATOM_OR;
+   case TGSI_OPCODE_ATOMXOR:  return NV50_IR_SUBOP_ATOM_XOR;
+   case TGSI_OPCODE_ATOMUMIN: return NV50_IR_SUBOP_ATOM_MIN;
+   case TGSI_OPCODE_ATOMIMIN: return NV50_IR_SUBOP_ATOM_MIN;
+   case TGSI_OPCODE_ATOMUMAX: return NV50_IR_SUBOP_ATOM_MAX;
+   case TGSI_OPCODE_ATOMIMAX: return NV50_IR_SUBOP_ATOM_MAX;
+   default:
+      return 0;
+   }
+}
+
+// Report whether any source register aliases the (direct) destination, in
+// which case the converter must buffer results before writing them back.
+bool Instruction::checkDstSrcAliasing() const
+{
+   if (insn->Dst[0].Register.Indirect) // no danger if indirect, using memory
+      return false;
+
+   const unsigned dstFile = insn->Dst[0].Register.File;
+   const int dstIndex = insn->Dst[0].Register.Index;
+
+   for (int s = 0; s < TGSI_FULL_MAX_SRC_REGISTERS; ++s) {
+      const struct tgsi_src_register& src = insn->Src[s].Register;
+      if (src.File == TGSI_FILE_NULL)
+         break; // no further sources
+      if (src.File == dstFile && src.Index == dstIndex)
+         return true;
+   }
+   return false;
+}
+
+// Scans a TGSI shader once, filling in nv50_ir_prog_info and collecting
+// per-shader metadata (resources, texture views, locals) before conversion.
+class Source
+{
+public:
+   Source(struct nv50_ir_prog_info *);
+   ~Source();
+
+public:
+   bool scanSource();
+   // highest declared index + 1 for the given TGSI register file
+   unsigned fileSize(unsigned file) const { return scan.file_max[file] + 1; }
+
+public:
+   struct tgsi_shader_info scan;
+   struct tgsi_full_instruction *insns; // copy of all instruction tokens
+   const struct tgsi_token *tokens;
+   struct nv50_ir_prog_info *info;
+
+   nv50_ir::DynArray tempArrays;
+   nv50_ir::DynArray immdArrays;
+
+   typedef nv50_ir::BuildUtil::Location Location;
+   // these registers are per-subroutine, cannot be used for parameter passing
+   std::set<Location> locals;
+
+   // set if any temporary is addressed indirectly (forces them to local mem)
+   bool mainTempsInLMem;
+
+   int clipVertexOutput;
+
+   struct TextureView {
+      uint8_t target; // TGSI_TEXTURE_*
+   };
+   std::vector<TextureView> textureViews;
+
+   struct Resource {
+      uint8_t target; // TGSI_TEXTURE_*
+      bool raw;
+      uint8_t slot; // $surface index
+   };
+   std::vector<Resource> resources;
+
+private:
+   int inferSysValDirection(unsigned sn) const;
+   bool scanDeclaration(const struct tgsi_full_declaration *);
+   bool scanInstruction(const struct tgsi_full_instruction *);
+   void scanProperty(const struct tgsi_full_property *);
+   void scanImmediate(const struct tgsi_full_immediate *);
+
+   inline bool isEdgeFlagPassthrough(const Instruction&) const;
+};
+
+// Bind the prog info and optionally dump the TGSI for debugging.
+Source::Source(struct nv50_ir_prog_info *prog) : info(prog)
+{
+   tokens = (const struct tgsi_token *)info->bin.source;
+
+   if (prog->dbgFlags & NV50_IR_DEBUG_BASIC)
+      tgsi_dump(tokens, 0);
+
+   // BUGFIX: insns is allocated by scanSource(); initialize it here so that
+   // destroying a Source whose scanSource() never ran (or failed before the
+   // allocation) does not FREE an uninitialized pointer in ~Source().
+   insns = NULL;
+
+   mainTempsInLMem = FALSE;
+}
+
+// Free the buffers allocated by scanSource().
+// NOTE(review): insns is only assigned inside scanSource(); confirm a Source
+// is never destroyed before scanSource() has run, or it is freed unset.
+Source::~Source()
+{
+   if (insns)
+      FREE(insns);
+
+   if (info->immd.data)
+      FREE(info->immd.data);
+   if (info->immd.type)
+      FREE(info->immd.type);
+}
+
+// Single pass over the TGSI token stream: copy instructions, record
+// declarations/immediates/properties into info, then finalize I/O slot
+// assignment.  Returns false on allocation or slot-assignment failure.
+bool Source::scanSource()
+{
+   unsigned insnCount = 0;
+   struct tgsi_parse_context parse;
+
+   tgsi_scan_shader(tokens, &scan);
+
+   insns = (struct tgsi_full_instruction *)MALLOC(scan.num_instructions *
+                                                  sizeof(insns[0]));
+   if (!insns)
+      return false;
+
+   clipVertexOutput = -1;
+
+   textureViews.resize(scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1);
+   resources.resize(scan.file_max[TGSI_FILE_RESOURCE] + 1);
+
+   info->immd.bufSize = 0;
+
+   info->numInputs = scan.file_max[TGSI_FILE_INPUT] + 1;
+   info->numOutputs = scan.file_max[TGSI_FILE_OUTPUT] + 1;
+   info->numSysVals = scan.file_max[TGSI_FILE_SYSTEM_VALUE] + 1;
+
+   if (info->type == PIPE_SHADER_FRAGMENT) {
+      info->prop.fp.writesDepth = scan.writes_z;
+      info->prop.fp.usesDiscard = scan.uses_kill;
+   } else
+   if (info->type == PIPE_SHADER_GEOMETRY) {
+      info->prop.gp.instanceCount = 1; // default value
+   }
+
+   // NOTE(review): these allocations are not checked for failure
+   info->immd.data = (uint32_t *)MALLOC(scan.immediate_count * 16);
+   info->immd.type = (ubyte *)MALLOC(scan.immediate_count * sizeof(ubyte));
+
+   tgsi_parse_init(&parse, tokens);
+   while (!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         scanImmediate(&parse.FullToken.FullImmediate);
+         break;
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         scanDeclaration(&parse.FullToken.FullDeclaration);
+         break;
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         insns[insnCount++] = parse.FullToken.FullInstruction;
+         scanInstruction(&parse.FullToken.FullInstruction);
+         break;
+      case TGSI_TOKEN_TYPE_PROPERTY:
+         scanProperty(&parse.FullToken.FullProperty);
+         break;
+      default:
+         INFO("unknown TGSI token type: %d\n", parse.FullToken.Token.Type);
+         break;
+      }
+   }
+   tgsi_parse_free(&parse);
+
+   // indirectly addressed temps must be spilled to thread-local storage
+   if (mainTempsInLMem)
+      info->bin.tlsSpace += (scan.file_max[TGSI_FILE_TEMPORARY] + 1) * 16;
+
+   // synthesize CLIPDIST outputs for driver-generated user clip planes
+   if (info->io.genUserClip > 0) {
+      info->io.clipDistanceMask = (1 << info->io.genUserClip) - 1;
+
+      const unsigned int nOut = (info->io.genUserClip + 3) / 4;
+
+      for (unsigned int n = 0; n < nOut; ++n) {
+         unsigned int i = info->numOutputs++;
+         info->out[i].id = i;
+         info->out[i].sn = TGSI_SEMANTIC_CLIPDIST;
+         info->out[i].si = n;
+         info->out[i].mask = info->io.clipDistanceMask >> (n * 4);
+      }
+   }
+
+   return info->assignSlots(info) == 0;
+}
+
+// Record a TGSI shader property (GS primitive info, FS behaviour flags,
+// user-clip handling) into the prog info.
+void Source::scanProperty(const struct tgsi_full_property *prop)
+{
+   switch (prop->Property.PropertyName) {
+   case TGSI_PROPERTY_GS_OUTPUT_PRIM:
+      info->prop.gp.outputPrim = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_GS_INPUT_PRIM:
+      info->prop.gp.inputPrim = prop->u[0].Data;
+      break;
+   case TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES:
+      info->prop.gp.maxVertices = prop->u[0].Data;
+      break;
+#if 0
+   case TGSI_PROPERTY_GS_INSTANCE_COUNT:
+      info->prop.gp.instanceCount = prop->u[0].Data;
+      break;
+#endif
+   case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
+      info->prop.fp.separateFragData = TRUE;
+      break;
+   case TGSI_PROPERTY_FS_COORD_ORIGIN:
+   case TGSI_PROPERTY_FS_COORD_PIXEL_CENTER:
+      // we don't care
+      break;
+   case TGSI_PROPERTY_VS_PROHIBIT_UCPS:
+      // negative genUserClip disables user-clip-plane generation
+      info->io.genUserClip = -1;
+      break;
+   default:
+      INFO("unhandled TGSI property %d\n", prop->Property.PropertyName);
+      break;
+   }
+}
+
+// Append one 4-component immediate to the prog info's immediate array
+// (raw 32-bit values plus the declared data type).
+void Source::scanImmediate(const struct tgsi_full_immediate *imm)
+{
+   const unsigned n = info->immd.count++;
+
+   assert(n < scan.immediate_count);
+
+   for (int c = 0; c < 4; ++c)
+      info->immd.data[n * 4 + c] = imm->u[c].Uint;
+
+   info->immd.type[n] = imm->Immediate.DataType;
+}
+
+// Decide whether a system value is an input (1) or an output (0) of this
+// shader stage; TGSI declares them uniformly without a direction.
+int Source::inferSysValDirection(unsigned sn) const
+{
+   if (sn == TGSI_SEMANTIC_INSTANCEID || sn == TGSI_SEMANTIC_VERTEXID)
+      return 1;
+   // PRIMID is consumed by fragment shaders but produced by earlier stages
+   if (sn == TGSI_SEMANTIC_PRIMID && info->type == PIPE_SHADER_FRAGMENT)
+      return 1;
+   // (LAYER / VIEWPORTINDEX would be outputs, i.e. 0, once supported)
+   return 0;
+}
+
+// Record one TGSI declaration: fills the in/out/sv tables of the prog info,
+// tracks subroutine-local registers, and captures resource / sampler-view
+// targets.  Returns false only for an unhandled register file.
+bool Source::scanDeclaration(const struct tgsi_full_declaration *decl)
+{
+   unsigned i, c;
+   unsigned sn = TGSI_SEMANTIC_GENERIC;
+   unsigned si = 0;
+   const unsigned first = decl->Range.First, last = decl->Range.Last;
+
+   if (decl->Declaration.Semantic) {
+      sn = decl->Semantic.Name;
+      si = decl->Semantic.Index;
+   }
+
+   if (decl->Declaration.Local) {
+      // remember every component of subroutine-local registers
+      for (i = first; i <= last; ++i) {
+         for (c = 0; c < 4; ++c) {
+            locals.insert(
+               Location(decl->Declaration.File, decl->Dim.Index2D, i, c));
+         }
+      }
+   }
+
+   switch (decl->Declaration.File) {
+   case TGSI_FILE_INPUT:
+      if (info->type == PIPE_SHADER_VERTEX) {
+         // all vertex attributes are equal
+         for (i = first; i <= last; ++i) {
+            info->in[i].sn = TGSI_SEMANTIC_GENERIC;
+            info->in[i].si = i;
+         }
+      } else {
+         for (i = first; i <= last; ++i, ++si) {
+            info->in[i].id = i;
+            info->in[i].sn = sn;
+            info->in[i].si = si;
+            if (info->type == PIPE_SHADER_FRAGMENT) {
+               // translate interpolation mode
+               switch (decl->Interp.Interpolate) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  info->in[i].flat = 1;
+                  break;
+               case TGSI_INTERPOLATE_COLOR:
+                  info->in[i].sc = 1;
+                  break;
+               case TGSI_INTERPOLATE_LINEAR:
+                  info->in[i].linear = 1;
+                  break;
+               default:
+                  break;
+               }
+               if (decl->Interp.Centroid)
+                  info->in[i].centroid = 1;
+            }
+         }
+      }
+      break;
+   case TGSI_FILE_OUTPUT:
+      for (i = first; i <= last; ++i, ++si) {
+         switch (sn) {
+         case TGSI_SEMANTIC_POSITION:
+            if (info->type == PIPE_SHADER_FRAGMENT)
+               info->io.fragDepth = i;
+            else
+            // position is the clip-vertex fallback if none is declared
+            if (clipVertexOutput < 0)
+               clipVertexOutput = i;
+            break;
+         case TGSI_SEMANTIC_COLOR:
+            if (info->type == PIPE_SHADER_FRAGMENT)
+               info->prop.fp.numColourResults++;
+            break;
+         case TGSI_SEMANTIC_EDGEFLAG:
+            info->io.edgeFlagOut = i;
+            break;
+         case TGSI_SEMANTIC_CLIPVERTEX:
+            clipVertexOutput = i;
+            break;
+         case TGSI_SEMANTIC_CLIPDIST:
+            // explicit clip distances disable generated user clipping
+            info->io.clipDistanceMask |=
+               decl->Declaration.UsageMask << (si * 4);
+            info->io.genUserClip = -1;
+            break;
+         default:
+            break;
+         }
+         info->out[i].id = i;
+         info->out[i].sn = sn;
+         info->out[i].si = si;
+      }
+      break;
+   case TGSI_FILE_SYSTEM_VALUE:
+      switch (sn) {
+      case TGSI_SEMANTIC_INSTANCEID:
+         info->io.instanceId = first;
+         break;
+      case TGSI_SEMANTIC_VERTEXID:
+         info->io.vertexId = first;
+         break;
+      default:
+         break;
+      }
+      for (i = first; i <= last; ++i, ++si) {
+         info->sv[i].sn = sn;
+         info->sv[i].si = si;
+         info->sv[i].input = inferSysValDirection(sn);
+      }
+      break;
+   case TGSI_FILE_RESOURCE:
+      for (i = first; i <= last; ++i) {
+         resources[i].target = decl->Resource.Resource;
+         resources[i].raw = decl->Resource.Raw;
+         resources[i].slot = i;
+      }
+      break;
+   case TGSI_FILE_SAMPLER_VIEW:
+      for (i = first; i <= last; ++i)
+         textureViews[i].target = decl->SamplerView.Resource;
+      break;
+   case TGSI_FILE_NULL:
+   case TGSI_FILE_TEMPORARY:
+   case TGSI_FILE_ADDRESS:
+   case TGSI_FILE_CONSTANT:
+   case TGSI_FILE_IMMEDIATE:
+   case TGSI_FILE_PREDICATE:
+   case TGSI_FILE_SAMPLER:
+      // nothing to record for these files at declaration time
+      break;
+   default:
+      ERROR("unhandled TGSI_FILE %d\n", decl->Declaration.File);
+      return false;
+   }
+   return true;
+}
+
+// True for a plain MOV that copies a shader input straight to the edge-flag
+// output (so the input can be wired through directly).
+inline bool Source::isEdgeFlagPassthrough(const Instruction& insn) const
+{
+   if (insn.getOpcode() != TGSI_OPCODE_MOV)
+      return false;
+   if (insn.getDst(0).getIndex(0) != info->io.edgeFlagOut)
+      return false;
+   return insn.getSrc(0).getFile() == TGSI_FILE_INPUT;
+}
+
+// Record per-instruction usage info: output/input component masks, indirect
+// temporary access (forces local memory), global memory access and barriers.
+bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
+{
+   Instruction insn(inst);
+
+   if (insn.getOpcode() == TGSI_OPCODE_BARRIER)
+      info->numBarriers = 1;
+
+   if (insn.dstCount()) {
+      if (insn.getDst(0).getFile() == TGSI_FILE_OUTPUT) {
+         Instruction::DstRegister dst = insn.getDst(0);
+
+         // indirect writes may touch any output, mark them all written
+         if (dst.isIndirect(0))
+            for (unsigned i = 0; i < info->numOutputs; ++i)
+               info->out[i].mask = 0xf;
+         else
+            info->out[dst.getIndex(0)].mask |= dst.getMask();
+
+         // scalar semantics only ever use the x component
+         if (info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_PSIZE ||
+             info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_PRIMID ||
+             info->out[dst.getIndex(0)].sn == TGSI_SEMANTIC_FOG)
+            info->out[dst.getIndex(0)].mask &= 1;
+
+         if (isEdgeFlagPassthrough(insn))
+            info->io.edgeFlagIn = insn.getSrc(0).getIndex(0);
+      } else
+      if (insn.getDst(0).getFile() == TGSI_FILE_TEMPORARY) {
+         if (insn.getDst(0).isIndirect(0))
+            mainTempsInLMem = TRUE;
+      }
+   }
+
+   for (unsigned s = 0; s < insn.srcCount(); ++s) {
+      Instruction::SrcRegister src = insn.getSrc(s);
+      if (src.getFile() == TGSI_FILE_TEMPORARY) {
+         if (src.isIndirect(0))
+            mainTempsInLMem = TRUE;
+      } else
+      if (src.getFile() == TGSI_FILE_RESOURCE) {
+         // track whether global memory is read (0x1) and/or written (0x2)
+         if (src.getIndex(0) == TGSI_RESOURCE_GLOBAL)
+            info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
+               0x1 : 0x2;
+      }
+      if (src.getFile() != TGSI_FILE_INPUT)
+         continue;
+      unsigned mask = insn.srcMask(s);
+
+      if (src.isIndirect(0)) {
+         // indirect reads may touch any input, mark them all read
+         for (unsigned i = 0; i < info->numInputs; ++i)
+            info->in[i].mask = 0xf;
+      } else {
+         const int i = src.getIndex(0);
+         for (unsigned c = 0; c < 4; ++c) {
+            if (!(mask & (1 << c)))
+               continue;
+            int k = src.getSwizzle(c);
+            if (k <= TGSI_SWIZZLE_W)
+               info->in[i].mask |= 1 << k;
+         }
+         // clamp masks of semantics with fewer than 4 meaningful components
+         switch (info->in[i].sn) {
+         case TGSI_SEMANTIC_PSIZE:
+         case TGSI_SEMANTIC_PRIMID:
+         case TGSI_SEMANTIC_FOG:
+            info->in[i].mask &= 0x1;
+            break;
+         case TGSI_SEMANTIC_PCOORD:
+            info->in[i].mask &= 0x3;
+            break;
+         default:
+            break;
+         }
+      }
+   }
+   return true;
+}
+
+nv50_ir::TexInstruction::Target
+Instruction::getTexture(const tgsi::Source *code, int s) const
+{
+ // XXX: indirect access
+ unsigned int r;
+
+ switch (getSrc(s).getFile()) {
+ case TGSI_FILE_RESOURCE:
+ r = getSrc(s).getIndex(0);
+ return translateTexture(code->resources.at(r).target);
+ case TGSI_FILE_SAMPLER_VIEW:
+ r = getSrc(s).getIndex(0);
+ return translateTexture(code->textureViews.at(r).target);
+ default:
+ return translateTexture(insn->Texture.Texture);
+ }
+}
+
+} // namespace tgsi
+
+namespace {
+
+using namespace nv50_ir;
+
// Translates a scanned TGSI shader (tgsi::Source) into nv50 IR, walking the
// instruction stream and emitting IR via the BuildUtil helpers.
class Converter : public BuildUtil
{
public:
   Converter(Program *, const tgsi::Source *);
   ~Converter();

   bool run();

private:
   // Per-subroutine state: the IR function and its register value map.
   struct Subroutine
   {
      Subroutine(Function *f) : f(f) { }
      Function *f;
      ValueMap values;
   };

   Value *getVertexBase(int s);
   DataArray *getArrayForFile(unsigned file, int idx);
   Value *fetchSrc(int s, int c);
   Value *acquireDst(int d, int c);
   void storeDst(int d, int c, Value *);

   Value *fetchSrc(const tgsi::Instruction::SrcRegister src, int c, Value *ptr);
   void storeDst(const tgsi::Instruction::DstRegister dst, int c,
                 Value *val, Value *ptr);

   Value *applySrcMod(Value *, int s, int c);

   Symbol *makeSym(uint file, int fileIndex, int idx, int c, uint32_t addr);
   Symbol *srcToSym(tgsi::Instruction::SrcRegister, int c);
   Symbol *dstToSym(tgsi::Instruction::DstRegister, int c);

   bool handleInstruction(const struct tgsi_full_instruction *);
   void exportOutputs();
   inline Subroutine *getSubroutine(unsigned ip);
   inline Subroutine *getSubroutine(Function *);
   inline bool isEndOfSubroutine(uint ip);

   void loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask);

   // R,S,L,C,Dx,Dy encode TGSI sources for respective values (0xSf for auto)
   void setTexRS(TexInstruction *, unsigned int& s, int R, int S);
   void handleTEX(Value *dst0[4], int R, int S, int L, int C, int Dx, int Dy);
   void handleTXF(Value *dst0[4], int R, int L_M);
   void handleTXQ(Value *dst0[4], enum TexQuery);
   void handleLIT(Value *dst0[4]);
   void handleUserClipPlanes();

   Symbol *getResourceBase(int r);
   void getResourceCoords(std::vector<Value *>&, int r, int s);

   void handleLOAD(Value *dst0[4]);
   void handleSTORE();
   void handleATOM(Value *dst0[4], DataType, uint16_t subOp);

   Value *interpolate(tgsi::Instruction::SrcRegister, int c, Value *ptr);

   void insertConvergenceOps(BasicBlock *conv, BasicBlock *fork);

   Value *buildDot(int dim);

   // Binds call arguments and return values to the in/out prototypes of the
   // called subroutines after conversion.
   class BindArgumentsPass : public Pass {
   public:
      BindArgumentsPass(Converter &conv) : conv(conv) { }

   private:
      Converter &conv;
      Subroutine *sub;

      inline const Location *getValueLocation(Subroutine *, Value *);

      template<typename T> inline void
      updateCallArgs(Instruction *i, void (Instruction::*setArg)(int, Value *),
                     T (Function::*proto));

      template<typename T> inline void
      updatePrototype(BitSet *set, void (Function::*updateSet)(),
                      T (Function::*proto));

   protected:
      bool visit(Function *);
      bool visit(BasicBlock *bb) { return false; }
   };

private:
   const struct tgsi::Source *code;
   const struct nv50_ir_prog_info *info;

   // all subroutines of the program, keyed by TGSI instruction pointer;
   // 'cur' is the one currently being converted
   struct {
      std::map<unsigned, Subroutine> map;
      Subroutine *cur;
   } sub;

   uint ip; // instruction pointer

   tgsi::Instruction tgsi; // the TGSI instruction currently being converted

   DataType dstTy;
   DataType srcTy;

   DataArray tData; // TGSI_FILE_TEMPORARY
   DataArray aData; // TGSI_FILE_ADDRESS
   DataArray pData; // TGSI_FILE_PREDICATE
   DataArray oData; // TGSI_FILE_OUTPUT (if outputs in registers)

   Value *zero;
   Value *fragCoord[4];
   Value *clipVtx[4];

   Value *vtxBase[5]; // base address of vertex in primitive (for TP/GP)
   uint8_t vtxBaseValid;

   Stack condBBs;  // fork BB, then else clause BB
   Stack joinBBs;  // fork BB, for inserting join ops on ENDIF
   Stack loopBBs;  // loop headers
   Stack breakBBs; // end of / after loop
};
+
+Symbol *
+Converter::srcToSym(tgsi::Instruction::SrcRegister src, int c)
+{
+ const int swz = src.getSwizzle(c);
+
+ return makeSym(src.getFile(),
+ src.is2D() ? src.getIndex(1) : 0,
+ src.isIndirect(0) ? -1 : src.getIndex(0), swz,
+ src.getIndex(0) * 16 + swz * 4);
+}
+
+Symbol *
+Converter::dstToSym(tgsi::Instruction::DstRegister dst, int c)
+{
+ return makeSym(dst.getFile(),
+ dst.is2D() ? dst.getIndex(1) : 0,
+ dst.isIndirect(0) ? -1 : dst.getIndex(0), c,
+ dst.getIndex(0) * 16 + c * 4);
+}
+
+Symbol *
+Converter::makeSym(uint tgsiFile, int fileIdx, int idx, int c, uint32_t address)
+{
+ Symbol *sym = new_Symbol(prog, tgsi::translateFile(tgsiFile));
+
+ sym->reg.fileIndex = fileIdx;
+
+ if (idx >= 0) {
+ if (sym->reg.file == FILE_SHADER_INPUT)
+ sym->setOffset(info->in[idx].slot[c] * 4);
+ else
+ if (sym->reg.file == FILE_SHADER_OUTPUT)
+ sym->setOffset(info->out[idx].slot[c] * 4);
+ else
+ if (sym->reg.file == FILE_SYSTEM_VALUE)
+ sym->setSV(tgsi::translateSysVal(info->sv[idx].sn), c);
+ else
+ sym->setOffset(address);
+ } else {
+ sym->setOffset(address);
+ }
+ return sym;
+}
+
+static inline uint8_t
+translateInterpMode(const struct nv50_ir_varying *var, operation& op)
+{
+ uint8_t mode = NV50_IR_INTERP_PERSPECTIVE;
+
+ if (var->flat)
+ mode = NV50_IR_INTERP_FLAT;
+ else
+ if (var->linear)
+ mode = NV50_IR_INTERP_LINEAR;
+ else
+ if (var->sc)
+ mode = NV50_IR_INTERP_SC;
+
+ op = (mode == NV50_IR_INTERP_PERSPECTIVE || mode == NV50_IR_INTERP_SC)
+ ? OP_PINTERP : OP_LINTERP;
+
+ if (var->centroid)
+ mode |= NV50_IR_INTERP_CENTROID;
+
+ return mode;
+}
+
// Emit an interpolation instruction (LINTERP/PINTERP) for fragment shader
// input 'src', channel c; ptr is an optional indirect input address.
// Returns the interpolated value.
Value *
Converter::interpolate(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
{
   operation op;

   // XXX: no way to know interpolation mode if we don't know what's accessed
   const uint8_t mode = translateInterpMode(&info->in[ptr ? 0 :
                                                      src.getIndex(0)], op);

   Instruction *insn = new_Instruction(func, op, TYPE_F32);

   insn->setDef(0, getScratch());
   insn->setSrc(0, srcToSym(src, c));
   if (op == OP_PINTERP)
      insn->setSrc(1, fragCoord[3]); // w component for perspective correction
   if (ptr)
      insn->setIndirect(0, 0, ptr);

   insn->setInterpolate(mode);

   bb->insertTail(insn);
   return insn->getDef(0);
}
+
+Value *
+Converter::applySrcMod(Value *val, int s, int c)
+{
+ Modifier m = tgsi.getSrc(s).getMod(c);
+ DataType ty = tgsi.inferSrcType();
+
+ if (m & Modifier(NV50_IR_MOD_ABS))
+ val = mkOp1v(OP_ABS, ty, getScratch(), val);
+
+ if (m & Modifier(NV50_IR_MOD_NEG))
+ val = mkOp1v(OP_NEG, ty, getScratch(), val);
+
+ return val;
+}
+
// Get (and cache in vtxBase[]) the base address of the vertex referenced by
// the second dimension of source s, fetched via PFETCH — for per-vertex
// inputs of geometry/tessellation-style stages.
Value *
Converter::getVertexBase(int s)
{
   assert(s < 5);
   if (!(vtxBaseValid & (1 << s))) {
      const int index = tgsi.getSrc(s).getIndex(1);
      Value *rel = NULL;
      // vertex index may itself be indirectly addressed
      if (tgsi.getSrc(s).isIndirect(1))
         rel = fetchSrc(tgsi.getSrc(s).getIndirect(1), 0, NULL);
      vtxBaseValid |= 1 << s;
      vtxBase[s] = mkOp2v(OP_PFETCH, TYPE_U32, getSSA(), mkImm(index), rel);
   }
   return vtxBase[s];
}
+
// Fetch channel c of source operand s of the current TGSI instruction,
// resolving indirect addressing and 2D (per-vertex / multi-constbuf)
// references, then applying source modifiers.
Value *
Converter::fetchSrc(int s, int c)
{
   Value *res;
   Value *ptr = NULL, *dimRel = NULL;

   tgsi::Instruction::SrcRegister src = tgsi.getSrc(s);

   if (src.isIndirect(0))
      ptr = fetchSrc(src.getIndirect(0), 0, NULL);

   if (src.is2D()) {
      switch (src.getFile()) {
      case TGSI_FILE_INPUT:
         // 2nd dimension selects the vertex within the primitive
         dimRel = getVertexBase(s);
         break;
      case TGSI_FILE_CONSTANT:
         // on NVC0, this is valid and c{I+J}[k] == cI[(J << 16) + k]
         if (src.isIndirect(1))
            dimRel = fetchSrc(src.getIndirect(1), 0, 0);
         break;
      default:
         break;
      }
   }

   res = fetchSrc(src, c, ptr);

   if (dimRel)
      res->getInsn()->setIndirect(0, 1, dimRel);

   return applySrcMod(res, s, c);
}
+
+Converter::DataArray *
+Converter::getArrayForFile(unsigned file, int idx)
+{
+ switch (file) {
+ case TGSI_FILE_TEMPORARY:
+ return &tData;
+ case TGSI_FILE_PREDICATE:
+ return &pData;
+ case TGSI_FILE_ADDRESS:
+ return &aData;
+ case TGSI_FILE_OUTPUT:
+ assert(prog->getType() == Program::TYPE_FRAGMENT);
+ return &oData;
+ default:
+ assert(!"invalid/unhandled TGSI source file");
+ return NULL;
+ }
+}
+
// Fetch channel c of TGSI source register 'src' (ptr = optional indirect
// address): immediates are materialized, constants/inputs loaded or
// interpolated, system values read via RDSV, and register files read from
// their DataArray.
Value *
Converter::fetchSrc(tgsi::Instruction::SrcRegister src, int c, Value *ptr)
{
   const int idx2d = src.is2D() ? src.getIndex(1) : 0;
   const int idx = src.getIndex(0);
   const int swz = src.getSwizzle(c);

   switch (src.getFile()) {
   case TGSI_FILE_IMMEDIATE:
      assert(!ptr);
      return loadImm(NULL, info->immd.data[idx * 4 + swz]);
   case TGSI_FILE_CONSTANT:
      return mkLoadv(TYPE_U32, srcToSym(src, c), ptr);
   case TGSI_FILE_INPUT:
      if (prog->getType() == Program::TYPE_FRAGMENT) {
         // don't load masked inputs, won't be assigned a slot
         if (!ptr && !(info->in[idx].mask & (1 << swz)))
            return loadImm(NULL, swz == TGSI_SWIZZLE_W ? 1.0f : 0.0f);
         // the face input is a system value, not an interpolated attribute
         if (!ptr && info->in[idx].sn == TGSI_SEMANTIC_FACE)
            return mkOp1v(OP_RDSV, TYPE_F32, getSSA(), mkSysVal(SV_FACE, 0));
         return interpolate(src, c, ptr);
      }
      return mkLoadv(TYPE_U32, srcToSym(src, c), ptr);
   case TGSI_FILE_OUTPUT:
      assert(!"load from output file");
      return NULL;
   case TGSI_FILE_SYSTEM_VALUE:
      assert(!ptr);
      return mkOp1v(OP_RDSV, TYPE_U32, getSSA(), srcToSym(src, c));
   default:
      return getArrayForFile(src.getFile(), idx2d)->load(
         sub.cur->values, idx, swz, ptr);
   }
}
+
+Value *
+Converter::acquireDst(int d, int c)
+{
+ const tgsi::Instruction::DstRegister dst = tgsi.getDst(d);
+ const unsigned f = dst.getFile();
+ const int idx = dst.getIndex(0);
+ const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
+
+ if (dst.isMasked(c) || f == TGSI_FILE_RESOURCE)
+ return NULL;
+
+ if (dst.isIndirect(0) ||
+ f == TGSI_FILE_SYSTEM_VALUE ||
+ (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT))
+ return getScratch();
+
+ return getArrayForFile(f, idx2d)-> acquire(sub.cur->values, idx, c);
+}
+
// Store val to channel c of destination d of the current TGSI instruction,
// applying the instruction's saturation mode first and mirroring writes to
// the clip vertex output into clipVtx[] for user clip plane lowering.
void
Converter::storeDst(int d, int c, Value *val)
{
   const tgsi::Instruction::DstRegister dst = tgsi.getDst(d);

   switch (tgsi.getSaturate()) {
   case TGSI_SAT_NONE:
      break;
   case TGSI_SAT_ZERO_ONE:
      mkOp1(OP_SAT, dstTy, val, val);
      break;
   case TGSI_SAT_MINUS_PLUS_ONE:
      // clamp to [-1, +1] via MAX then MIN
      mkOp2(OP_MAX, dstTy, val, val, mkImm(-1.0f));
      mkOp2(OP_MIN, dstTy, val, val, mkImm(+1.0f));
      break;
   default:
      assert(!"invalid saturation mode");
      break;
   }

   Value *ptr = dst.isIndirect(0) ?
      fetchSrc(dst.getIndirect(0), 0, NULL) : NULL;

   // keep a copy of the clip vertex position for generated user clipping
   if (info->io.genUserClip > 0 &&
       dst.getFile() == TGSI_FILE_OUTPUT &&
       !dst.isIndirect(0) && dst.getIndex(0) == code->clipVertexOutput) {
      mkMov(clipVtx[c], val);
      val = clipVtx[c];
   }

   storeDst(dst, c, val, ptr);
}
+
+void
+Converter::storeDst(const tgsi::Instruction::DstRegister dst, int c,
+ Value *val, Value *ptr)
+{
+ const unsigned f = dst.getFile();
+ const int idx = dst.getIndex(0);
+ const int idx2d = dst.is2D() ? dst.getIndex(1) : 0;
+
+ if (f == TGSI_FILE_SYSTEM_VALUE) {
+ assert(!ptr);
+ mkOp2(OP_WRSV, TYPE_U32, NULL, dstToSym(dst, c), val);
+ } else
+ if (f == TGSI_FILE_OUTPUT && prog->getType() != Program::TYPE_FRAGMENT) {
+ if (ptr || (info->out[idx].mask & (1 << c)))
+ mkStore(OP_EXPORT, TYPE_U32, dstToSym(dst, c), ptr, val);
+ } else
+ if (f == TGSI_FILE_TEMPORARY ||
+ f == TGSI_FILE_PREDICATE ||
+ f == TGSI_FILE_ADDRESS ||
+ f == TGSI_FILE_OUTPUT) {
+ getArrayForFile(f, idx2d)->store(sub.cur->values, idx, c, ptr, val);
+ } else {
+ assert(!"invalid dst file");
+ }
+}
+
// Iterate 'chan' over all four components, skipping the ones masked out in
// destination d of TGSI instruction 'inst'.
#define FOR_EACH_DST_ENABLED_CHANNEL(d, chan, inst) \
   for (chan = 0; chan < 4; ++chan) \
      if (!inst.getDst(d).isMasked(chan))
+
+Value *
+Converter::buildDot(int dim)
+{
+ assert(dim > 0);
+
+ Value *src0 = fetchSrc(0, 0), *src1 = fetchSrc(1, 0);
+ Value *dotp = getScratch();
+
+ mkOp2(OP_MUL, TYPE_F32, dotp, src0, src1);
+
+ for (int c = 1; c < dim; ++c) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ mkOp3(OP_MAD, TYPE_F32, dotp, src0, src1, dotp);
+ }
+ return dotp;
+}
+
// Insert a JOIN at the head of the convergence block and a JOINAT (pointing
// at it) just before the branch in the fork block, so diverged threads
// re-converge there.
void
Converter::insertConvergenceOps(BasicBlock *conv, BasicBlock *fork)
{
   FlowInstruction *join = new_FlowInstruction(func, OP_JOIN, NULL);
   join->fixed = 1; // must not be eliminated
   conv->insertHead(join);

   fork->joinAt = new_FlowInstruction(func, OP_JOINAT, conv);
   fork->insertBefore(fork->getExit(), fork->joinAt);
}
+
// Set the texture resource (R) and sampler (S) references on a texture
// instruction. R/S are TGSI source operand indices (negative if absent).
// Indirect resource/sampler addresses are appended as extra sources
// starting at slot s, which is advanced accordingly.
void
Converter::setTexRS(TexInstruction *tex, unsigned int& s, int R, int S)
{
   unsigned rIdx = 0, sIdx = 0;

   if (R >= 0)
      rIdx = tgsi.getSrc(R).getIndex(0);
   if (S >= 0)
      sIdx = tgsi.getSrc(S).getIndex(0);

   tex->setTexture(tgsi.getTexture(code, R), rIdx, sIdx);

   // NOTE(review): R is assumed >= 0 here (no guard like the rIdx one
   // above) — all visible callers pass a valid R; confirm for new callers
   if (tgsi.getSrc(R).isIndirect(0)) {
      tex->tex.rIndirectSrc = s;
      tex->setSrc(s++, fetchSrc(tgsi.getSrc(R).getIndirect(0), 0, NULL));
   }
   if (S >= 0 && tgsi.getSrc(S).isIndirect(0)) {
      tex->tex.sIndirectSrc = s;
      tex->setSrc(s++, fetchSrc(tgsi.getSrc(S).getIndirect(0), 0, NULL));
   }
}
+
// Emit a texture query (TXQ) instruction; dst0 holds the per-channel
// destination values (NULL for masked channels).
void
Converter::handleTXQ(Value *dst0[4], enum TexQuery query)
{
   TexInstruction *tex = new_TexInstruction(func, OP_TXQ);
   tex->tex.query = query;
   unsigned int c, d;

   // pack enabled channels into consecutive defs and build the tex mask
   for (d = 0, c = 0; c < 4; ++c) {
      if (!dst0[c])
         continue;
      tex->tex.mask |= 1 << c;
      tex->setDef(d++, dst0[c]);
   }
   tex->setSrc((c = 0), fetchSrc(0, 0)); // mip level

   setTexRS(tex, c, 1, -1);

   bb->insertTail(tex);
}
+
// Divide the texture coordinates in src[] (selected by mask) by the
// projection value (source channel 3) into dst[]. Where a coordinate was
// produced by PINTERP, the division is folded into interpolation by
// re-interpolating with 1/q as the perspective source; remaining channels
// get an explicit RCP + MUL.
void
Converter::loadProjTexCoords(Value *dst[4], Value *src[4], unsigned int mask)
{
   Value *proj = fetchSrc(0, 3);
   Instruction *insn = proj->getUniqueInsn();
   int c;

   if (insn->op == OP_PINTERP) {
      // re-interpolate q linearly so we get the raw attribute value
      bb->insertTail(insn = cloneForward(func, insn));
      insn->op = OP_LINTERP;
      insn->setInterpolate(NV50_IR_INTERP_LINEAR | insn->getSampleMode());
      insn->setSrc(1, NULL);
      proj = insn->getDef(0);
   }
   proj = mkOp1v(OP_RCP, TYPE_F32, getSSA(), proj);

   for (c = 0; c < 4; ++c) {
      if (!(mask & (1 << c)))
         continue;
      if ((insn = src[c]->getUniqueInsn())->op != OP_PINTERP)
         continue;
      mask &= ~(1 << c);

      // fold the projection into the interpolation: PINTERP with 1/q
      bb->insertTail(insn = cloneForward(func, insn));
      insn->setInterpolate(NV50_IR_INTERP_PERSPECTIVE | insn->getSampleMode());
      insn->setSrc(1, proj);
      dst[c] = insn->getDef(0);
   }
   if (!mask)
      return;

   // remaining (non-interpolated) coordinates: multiply by 1/q directly
   proj = mkOp1v(OP_RCP, TYPE_F32, getSSA(), fetchSrc(0, 3));

   for (c = 0; c < 4; ++c)
      if (mask & (1 << c))
         dst[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), src[c], proj);
}
+
// Emit a texture sampling instruction.
// order of nv50 ir sources: x y z layer lod/bias shadow
// order of TGSI TEX sources: x y z layer shadow lod/bias
// lowering will finally set the hw specific order (like array first on nvc0)
// R,S,L,C,Dx,Dy encode where each extra value comes from: a TGSI source
// index in the high nibble and a channel in the low bits (see class decl).
void
Converter::handleTEX(Value *dst[4], int R, int S, int L, int C, int Dx, int Dy)
{
   Value *val;
   Value *arg[4], *src[8];
   Value *lod = NULL, *shd = NULL;
   unsigned int s, c, d;
   TexInstruction *texi = new_TexInstruction(func, tgsi.getOP());

   TexInstruction::Target tgt = tgsi.getTexture(code, R);

   for (s = 0; s < tgt.getArgCount(); ++s)
      arg[s] = src[s] = fetchSrc(0, s);

   if (texi->op == OP_TXL || texi->op == OP_TXB)
      lod = fetchSrc(L >> 4, L & 3);

   if (C == 0x0f)
      C = 0x00 | MAX2(tgt.getArgCount(), 2); // guess DC src

   if (tgt.isShadow())
      shd = fetchSrc(C >> 4, C & 3);

   if (texi->op == OP_TXD) {
      // explicit derivatives for each coordinate dimension
      for (c = 0; c < tgt.getDim(); ++c) {
         texi->dPdx[c].set(fetchSrc(Dx >> 4, (Dx & 3) + c));
         texi->dPdy[c].set(fetchSrc(Dy >> 4, (Dy & 3) + c));
      }
   }

   // cube textures don't care about projection value, it's divided out
   if (tgsi.getOpcode() == TGSI_OPCODE_TXP && !tgt.isCube() && !tgt.isArray()) {
      unsigned int n = tgt.getDim();
      if (shd) {
         arg[n] = shd;
         ++n;
         assert(tgt.getDim() == tgt.getArgCount());
      }
      loadProjTexCoords(src, arg, (1 << n) - 1);
      if (shd)
         shd = src[n - 1];
   }

   if (tgt.isCube()) {
      // normalize cube coordinates by the largest absolute component
      for (c = 0; c < 3; ++c)
         src[c] = mkOp1v(OP_ABS, TYPE_F32, getSSA(), arg[c]);
      val = getScratch();
      mkOp2(OP_MAX, TYPE_F32, val, src[0], src[1]);
      mkOp2(OP_MAX, TYPE_F32, val, src[2], val);
      mkOp1(OP_RCP, TYPE_F32, val, val);
      for (c = 0; c < 3; ++c)
         src[c] = mkOp2v(OP_MUL, TYPE_F32, getSSA(), arg[c], val);
   }

   // pack enabled channels into consecutive defs and build the tex mask
   for (c = 0, d = 0; c < 4; ++c) {
      if (dst[c]) {
         texi->setDef(d++, dst[c]);
         texi->tex.mask |= 1 << c;
      } else {
         // NOTE: maybe hook up def too, for CSE
      }
   }
   for (s = 0; s < tgt.getArgCount(); ++s)
      texi->setSrc(s, src[s]);
   if (lod)
      texi->setSrc(s++, lod);
   if (shd)
      texi->setSrc(s++, shd);

   setTexRS(texi, s, R, S);

   if (tgsi.getOpcode() == TGSI_OPCODE_SAMPLE_C_LZ)
      texi->tex.levelZero = true;

   bb->insertTail(texi);
}
+
// Emit a texel fetch (TXF) instruction.
// 1st source: xyz = coordinates, w = lod/sample
// 2nd source: offset
// R is the TGSI source holding the resource; L_M encodes the lod/sample
// source (index in high nibble, channel in low bits).
void
Converter::handleTXF(Value *dst[4], int R, int L_M)
{
   TexInstruction *texi = new_TexInstruction(func, tgsi.getOP());
   int ms;
   unsigned int c, d, s;

   texi->tex.target = tgsi.getTexture(code, R);

   ms = texi->tex.target.isMS() ? 1 : 0;
   texi->tex.levelZero = ms; /* MS textures don't have mip-maps */

   // pack enabled channels into consecutive defs and build the tex mask
   for (c = 0, d = 0; c < 4; ++c) {
      if (dst[c]) {
         texi->setDef(d++, dst[c]);
         texi->tex.mask |= 1 << c;
      }
   }
   for (c = 0; c < (texi->tex.target.getArgCount() - ms); ++c)
      texi->setSrc(c, fetchSrc(0, c));
   texi->setSrc(c++, fetchSrc(L_M >> 4, L_M & 3)); // lod or ms

   setTexRS(texi, c, R, -1);

   // immediate texel offsets; useOffsets counts the used offset sets
   for (s = 0; s < tgsi.getNumTexOffsets(); ++s) {
      for (c = 0; c < 3; ++c) {
         texi->tex.offset[s][c] = tgsi.getTexOffset(s).getValueU32(c, info);
         if (texi->tex.offset[s][c])
            texi->tex.useOffsets = s + 1;
      }
   }

   bb->insertTail(texi);
}
+
// Emit the TGSI LIT instruction:
//   dst = (1, max(src.x, 0), src.x > 0 ? max(src.y, 0)^clamp(src.w) : 0, 1)
void
Converter::handleLIT(Value *dst0[4])
{
   Value *val0 = NULL;
   unsigned int mask = tgsi.getDst(0).getMask();

   if (mask & (1 << 0))
      loadImm(dst0[0], 1.0f);

   if (mask & (1 << 3))
      loadImm(dst0[3], 1.0f);

   if (mask & (3 << 1)) {
      // val0 = max(src.x, 0), also needed as the z-channel predicate below
      val0 = getScratch();
      mkOp2(OP_MAX, TYPE_F32, val0, fetchSrc(0, 0), zero);
      if (mask & (1 << 1))
         mkMov(dst0[1], val0);
   }

   if (mask & (1 << 2)) {
      Value *src1 = fetchSrc(0, 1), *src3 = fetchSrc(0, 3);
      Value *val1 = getScratch(), *val3 = getScratch();

      // clamp the exponent to keep pow well-defined
      Value *pos128 = loadImm(NULL, +127.999999f);
      Value *neg128 = loadImm(NULL, -127.999999f);

      mkOp2(OP_MAX, TYPE_F32, val1, src1, zero);
      mkOp2(OP_MAX, TYPE_F32, val3, src3, neg128);
      mkOp2(OP_MIN, TYPE_F32, val3, val3, pos128);
      mkOp2(OP_POW, TYPE_F32, val3, val1, val3);

      // select 0 unless src.x (val0) is greater than zero
      mkCmp(OP_SLCT, CC_GT, TYPE_F32, dst0[2], val3, zero, val0);
   }
}
+
+static inline bool
+isResourceSpecial(const int r)
+{
+ return (r == TGSI_RESOURCE_GLOBAL ||
+ r == TGSI_RESOURCE_LOCAL ||
+ r == TGSI_RESOURCE_PRIVATE ||
+ r == TGSI_RESOURCE_INPUT);
+}
+
+static inline bool
+isResourceRaw(const struct tgsi::Source *code, const int r)
+{
+ return isResourceSpecial(r) || code->resources[r].raw;
+}
+
+static inline nv50_ir::TexTarget
+getResourceTarget(const struct tgsi::Source *code, int r)
+{
+ if (isResourceSpecial(r))
+ return nv50_ir::TEX_TARGET_BUFFER;
+ return tgsi::translateTexture(code->resources.at(r).target);
+}
+
// Create the base symbol for resource r: special pseudo-resources map to
// the corresponding memory space (global/shared/local/input), everything
// else to a global memory file indexed by the resource slot.
Symbol *
Converter::getResourceBase(const int r)
{
   Symbol *sym = NULL;

   switch (r) {
   case TGSI_RESOURCE_GLOBAL:
      sym = new_Symbol(prog, nv50_ir::FILE_MEMORY_GLOBAL, 15);
      break;
   case TGSI_RESOURCE_LOCAL:
      assert(prog->getType() == Program::TYPE_COMPUTE);
      sym = mkSymbol(nv50_ir::FILE_MEMORY_SHARED, 0, TYPE_U32,
                     info->prop.cp.sharedOffset);
      break;
   case TGSI_RESOURCE_PRIVATE:
      sym = mkSymbol(nv50_ir::FILE_MEMORY_LOCAL, 0, TYPE_U32,
                     info->bin.tlsSpace);
      break;
   case TGSI_RESOURCE_INPUT:
      assert(prog->getType() == Program::TYPE_COMPUTE);
      sym = mkSymbol(nv50_ir::FILE_SHADER_INPUT, 0, TYPE_U32,
                     info->prop.cp.inputOffset);
      break;
   default:
      sym = new_Symbol(prog,
                       nv50_ir::FILE_MEMORY_GLOBAL, code->resources.at(r).slot);
      break;
   }
   return sym;
}
+
// Fetch the access coordinates for resource r from TGSI source s into
// 'coords' (one value per dimension of the resource's target).
void
Converter::getResourceCoords(std::vector<Value *> &coords, int r, int s)
{
   const int arg =
      TexInstruction::Target(getResourceTarget(code, r)).getArgCount();

   for (int c = 0; c < arg; ++c)
      coords.push_back(fetchSrc(s, c));

   // NOTE: TGSI_RESOURCE_GLOBAL needs FILE_GPR; this is an nv50 quirk
   if (r == TGSI_RESOURCE_LOCAL ||
       r == TGSI_RESOURCE_PRIVATE ||
       r == TGSI_RESOURCE_INPUT)
      coords[0] = mkOp1v(OP_MOV, TYPE_U32, getScratch(4, FILE_ADDRESS),
                         coords[0]);
}
+
// Split a 4-bit component mask into at most two contiguous spans and return
// the span count (1 or 2). comp[i] receives the first component index of
// span i, size[i] its component count; both arrays must be zero-initialized
// by the caller. A mask with a hole (e.g. xz or yw) needs two operations,
// and a 3-component span is also split (1+2 or 2+1) since there is no
// 96-bit access.
static inline int
partitionLoadStore(uint8_t comp[2], uint8_t size[2], uint8_t mask)
{
   int n = 0;

   while (mask) {
      if (mask & 1) {
         size[n]++;
      } else {
         if (size[n])
            // gap after the first span: second span starts right behind it;
            // comp[0] may be non-zero if the mask had leading zeros, so it
            // must be included in the absolute start component
            comp[n = 1] = comp[0] + size[0] + 1;
         else
            comp[n]++;
      }
      mask >>= 1;
   }
   if (size[0] == 3) {
      // split xyz-style spans: prefer a 64-bit + 32-bit access pair, with
      // the split chosen so each part stays naturally aligned
      n = 1;
      size[0] = (comp[0] == 1) ? 1 : 2;
      size[1] = 3 - size[0];
      comp[1] = comp[0] + size[0];
   }
   return n + 1;
}
+
// Emit a resource load into dst0[] (NULL entries = masked channels).
// For raw loads, granularity is 4 byte.
// Usage of the texture read mask on OP_SULDP is not allowed.
void
Converter::handleLOAD(Value *dst0[4])
{
   const int r = tgsi.getSrc(0).getIndex(0);
   int c;
   std::vector<Value *> off, src, ldv, def;

   getResourceCoords(off, r, 1);

   if (isResourceRaw(code, r)) {
      uint8_t mask = 0;
      uint8_t comp[2] = { 0, 0 };
      uint8_t size[2] = { 0, 0 };

      Symbol *base = getResourceBase(r);

      // determine the base and size of the at most 2 load ops
      for (c = 0; c < 4; ++c)
         if (!tgsi.getDst(0).isMasked(c))
            mask |= 1 << (tgsi.getSrc(0).getSwizzle(c) - TGSI_SWIZZLE_X);

      int n = partitionLoadStore(comp, size, mask);

      src = off;

      def.resize(4); // index by component, the ones we need will be non-NULL
      for (c = 0; c < 4; ++c) {
         // load straight into the destination if channel c of the result
         // lands there unswizzled, otherwise into a scratch value
         if (dst0[c] && tgsi.getSrc(0).getSwizzle(c) == (TGSI_SWIZZLE_X + c))
            def[c] = dst0[c];
         else
         if (mask & (1 << c))
            def[c] = getScratch();
      }

      // plain loads instead of surface ops for special resources and
      // nv50-style buffer surfaces
      const bool useLd = isResourceSpecial(r) ||
         (info->io.nv50styleSurfaces &&
          code->resources[r].target == TGSI_TEXTURE_BUFFER);

      for (int i = 0; i < n; ++i) {
         ldv.assign(def.begin() + comp[i], def.begin() + comp[i] + size[i]);

         if (comp[i]) // adjust x component of source address if necessary
            src[0] = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, off[0]->reg.file),
                            off[0], mkImm(comp[i] * 4));
         else
            src[0] = off[0];

         if (useLd) {
            Instruction *ld =
               mkLoad(typeOfSize(size[i] * 4), ldv[0], base, src[0]);
            for (size_t c = 1; c < ldv.size(); ++c)
               ld->setDef(c, ldv[c]);
         } else {
            mkTex(OP_SULDB, getResourceTarget(code, r), code->resources[r].slot,
                  0, ldv, src)->dType = typeOfSize(size[i] * 4);
         }
      }
   } else {
      // formatted load: SULDP always returns the full vec4
      def.resize(4);
      for (c = 0; c < 4; ++c) {
         if (!dst0[c] || tgsi.getSrc(0).getSwizzle(c) != (TGSI_SWIZZLE_X + c))
            def[c] = getScratch();
         else
            def[c] = dst0[c];
      }

      mkTex(OP_SULDP, getResourceTarget(code, r), code->resources[r].slot, 0,
            def, off);
   }
   // apply the source swizzle for channels not loaded in place
   FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
      if (dst0[c] != def[c])
         mkMov(dst0[c], def[tgsi.getSrc(0).getSwizzle(c)]);
}
+
// Emit a resource store of source 1 to the resource addressed by dst 0.
// For formatted stores, the write mask on OP_SUSTP can be used.
// Raw stores have to be split.
void
Converter::handleSTORE()
{
   const int r = tgsi.getDst(0).getIndex(0);
   int c;
   std::vector<Value *> off, src, dummy;

   getResourceCoords(off, r, 0);
   src = off;
   const int s = src.size();

   if (isResourceRaw(code, r)) {
      uint8_t comp[2] = { 0, 0 };
      uint8_t size[2] = { 0, 0 };

      // split the write mask into at most 2 contiguous store ops
      int n = partitionLoadStore(comp, size, tgsi.getDst(0).getMask());

      Symbol *base = getResourceBase(r);

      // plain stores instead of surface ops for special resources and
      // nv50-style buffer surfaces
      const bool useSt = isResourceSpecial(r) ||
         (info->io.nv50styleSurfaces &&
          code->resources[r].target == TGSI_TEXTURE_BUFFER);

      for (int i = 0; i < n; ++i) {
         if (comp[i]) // adjust x component of source address if necessary
            src[0] = mkOp2v(OP_ADD, TYPE_U32, getSSA(4, off[0]->reg.file),
                            off[0], mkImm(comp[i] * 4));
         else
            src[0] = off[0];

         const DataType stTy = typeOfSize(size[i] * 4);

         if (useSt) {
            Instruction *st =
               mkStore(OP_STORE, stTy, base, NULL, fetchSrc(1, comp[i]));
            for (c = 1; c < size[i]; ++c)
               st->setSrc(1 + c, fetchSrc(1, comp[i] + c));
            st->setIndirect(0, 0, src[0]);
         } else {
            // attach values to be stored
            src.resize(s + size[i]);
            for (c = 0; c < size[i]; ++c)
               src[s + c] = fetchSrc(1, comp[i] + c);
            mkTex(OP_SUSTB, getResourceTarget(code, r), code->resources[r].slot,
                  0, dummy, src)->setType(stTy);
         }
      }
   } else {
      // formatted store: one SUSTP with the TGSI write mask
      FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
         src.push_back(fetchSrc(1, c));

      mkTex(OP_SUSTP, getResourceTarget(code, r), code->resources[r].slot, 0,
            dummy, src)->tex.mask = tgsi.getDst(0).getMask();
   }
}
+
// Emit an atomic operation on resource source 0, with operand(s) from
// sources 2 (and 3 for compare-and-swap); the old value is replicated to
// all enabled destination channels.
// XXX: These only work on resources with the single-component u32/s32 formats.
// Therefore the result is replicated. This might not be intended by TGSI, but
// operating on more than 1 component would produce undefined results because
// they do not exist.
void
Converter::handleATOM(Value *dst0[4], DataType ty, uint16_t subOp)
{
   const int r = tgsi.getSrc(0).getIndex(0);
   std::vector<Value *> srcv;
   std::vector<Value *> defv;
   LValue *dst = getScratch();

   getResourceCoords(srcv, r, 1);

   if (isResourceSpecial(r)) {
      // special memory spaces use a plain ATOM with an indirect address
      assert(r != TGSI_RESOURCE_INPUT);
      Instruction *insn;
      insn = mkOp2(OP_ATOM, ty, dst, getResourceBase(r), fetchSrc(2, 0));
      insn->subOp = subOp;
      if (subOp == NV50_IR_SUBOP_ATOM_CAS)
         insn->setSrc(2, fetchSrc(3, 0));
      insn->setIndirect(0, 0, srcv.at(0));
   } else {
      // bound resources use surface reduction ops
      operation op = isResourceRaw(code, r) ? OP_SUREDB : OP_SUREDP;
      TexTarget targ = getResourceTarget(code, r);
      int idx = code->resources[r].slot;
      defv.push_back(dst);
      srcv.push_back(fetchSrc(2, 0));
      if (subOp == NV50_IR_SUBOP_ATOM_CAS)
         srcv.push_back(fetchSrc(3, 0));
      TexInstruction *tex = mkTex(op, targ, idx, 0, defv, srcv);
      tex->subOp = subOp;
      tex->tex.mask = 1;
      tex->setType(ty);
   }

   for (int c = 0; c < 4; ++c)
      if (dst0[c])
         dst0[c] = dst; // not equal to rDst so handleInstruction will do mkMov
}
+
+Converter::Subroutine *
+Converter::getSubroutine(unsigned ip)
+{
+ std::map<unsigned, Subroutine>::iterator it = sub.map.find(ip);
+
+ if (it == sub.map.end())
+ it = sub.map.insert(std::make_pair(
+ ip, Subroutine(new Function(prog, "SUB", ip)))).first;
+
+ return &it->second;
+}
+
+Converter::Subroutine *
+Converter::getSubroutine(Function *f)
+{
+ unsigned ip = f->getLabel();
+ std::map<unsigned, Subroutine>::iterator it = sub.map.find(ip);
+
+ if (it == sub.map.end())
+ it = sub.map.insert(std::make_pair(ip, Subroutine(f))).first;
+
+ return &it->second;
+}
+
+bool
+Converter::isEndOfSubroutine(uint ip)
+{
+ assert(ip < code->scan.num_instructions);
+ tgsi::Instruction insn(&code->insns[ip]);
+ return (insn.getOpcode() == TGSI_OPCODE_END ||
+ insn.getOpcode() == TGSI_OPCODE_ENDSUB ||
+ // does END occur at end of main or the very end ?
+ insn.getOpcode() == TGSI_OPCODE_BGNSUB);
+}
+
+bool
+Converter::handleInstruction(const struct tgsi_full_instruction *insn)
+{
+ Instruction *geni;
+
+ Value *dst0[4], *rDst0[4];
+ Value *src0, *src1, *src2;
+ Value *val0, *val1;
+ int c;
+
+ tgsi = tgsi::Instruction(insn);
+
+ bool useScratchDst = tgsi.checkDstSrcAliasing();
+
+ operation op = tgsi.getOP();
+ dstTy = tgsi.inferDstType();
+ srcTy = tgsi.inferSrcType();
+
+ unsigned int mask = tgsi.dstCount() ? tgsi.getDst(0).getMask() : 0;
+
+ if (tgsi.dstCount()) {
+ for (c = 0; c < 4; ++c) {
+ rDst0[c] = acquireDst(0, c);
+ dst0[c] = (useScratchDst && rDst0[c]) ? getScratch() : rDst0[c];
+ }
+ }
+
+ switch (tgsi.getOpcode()) {
+ case TGSI_OPCODE_ADD:
+ case TGSI_OPCODE_UADD:
+ case TGSI_OPCODE_AND:
+ case TGSI_OPCODE_DIV:
+ case TGSI_OPCODE_IDIV:
+ case TGSI_OPCODE_UDIV:
+ case TGSI_OPCODE_MAX:
+ case TGSI_OPCODE_MIN:
+ case TGSI_OPCODE_IMAX:
+ case TGSI_OPCODE_IMIN:
+ case TGSI_OPCODE_UMAX:
+ case TGSI_OPCODE_UMIN:
+ case TGSI_OPCODE_MOD:
+ case TGSI_OPCODE_UMOD:
+ case TGSI_OPCODE_MUL:
+ case TGSI_OPCODE_UMUL:
+ case TGSI_OPCODE_OR:
+ case TGSI_OPCODE_POW:
+ case TGSI_OPCODE_SHL:
+ case TGSI_OPCODE_ISHR:
+ case TGSI_OPCODE_USHR:
+ case TGSI_OPCODE_SUB:
+ case TGSI_OPCODE_XOR:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ mkOp2(op, dstTy, dst0[c], src0, src1);
+ }
+ break;
+ case TGSI_OPCODE_MAD:
+ case TGSI_OPCODE_UMAD:
+ case TGSI_OPCODE_SAD:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ src2 = fetchSrc(2, c);
+ mkOp3(op, dstTy, dst0[c], src0, src1, src2);
+ }
+ break;
+ case TGSI_OPCODE_MOV:
+ case TGSI_OPCODE_ABS:
+ case TGSI_OPCODE_CEIL:
+ case TGSI_OPCODE_FLR:
+ case TGSI_OPCODE_TRUNC:
+ case TGSI_OPCODE_RCP:
+ case TGSI_OPCODE_IABS:
+ case TGSI_OPCODE_INEG:
+ case TGSI_OPCODE_NOT:
+ case TGSI_OPCODE_DDX:
+ case TGSI_OPCODE_DDY:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkOp1(op, dstTy, dst0[c], fetchSrc(0, c));
+ break;
+ case TGSI_OPCODE_RSQ:
+ src0 = fetchSrc(0, 0);
+ val0 = getScratch();
+ mkOp1(OP_ABS, TYPE_F32, val0, src0);
+ mkOp1(OP_RSQ, TYPE_F32, val0, val0);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_ARL:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ mkCvt(OP_CVT, TYPE_S32, dst0[c], TYPE_F32, src0)->rnd = ROUND_M;
+ mkOp2(OP_SHL, TYPE_U32, dst0[c], dst0[c], mkImm(4));
+ }
+ break;
+ case TGSI_OPCODE_UARL:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkOp2(OP_SHL, TYPE_U32, dst0[c], fetchSrc(0, c), mkImm(4));
+ break;
+ case TGSI_OPCODE_EX2:
+ case TGSI_OPCODE_LG2:
+ val0 = mkOp1(op, TYPE_F32, getScratch(), fetchSrc(0, 0))->getDef(0);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkOp1(OP_MOV, TYPE_F32, dst0[c], val0);
+ break;
+ case TGSI_OPCODE_COS:
+ case TGSI_OPCODE_SIN:
+ val0 = getScratch();
+ if (mask & 7) {
+ mkOp1(OP_PRESIN, TYPE_F32, val0, fetchSrc(0, 0));
+ mkOp1(op, TYPE_F32, val0, val0);
+ for (c = 0; c < 3; ++c)
+ if (dst0[c])
+ mkMov(dst0[c], val0);
+ }
+ if (dst0[3]) {
+ mkOp1(OP_PRESIN, TYPE_F32, val0, fetchSrc(0, 3));
+ mkOp1(op, TYPE_F32, dst0[3], val0);
+ }
+ break;
+ case TGSI_OPCODE_SCS:
+ if (mask & 3) {
+ val0 = mkOp1v(OP_PRESIN, TYPE_F32, getSSA(), fetchSrc(0, 0));
+ if (dst0[0])
+ mkOp1(OP_COS, TYPE_F32, dst0[0], val0);
+ if (dst0[1])
+ mkOp1(OP_SIN, TYPE_F32, dst0[1], val0);
+ }
+ if (dst0[2])
+ loadImm(dst0[2], 0.0f);
+ if (dst0[3])
+ loadImm(dst0[3], 1.0f);
+ break;
+ case TGSI_OPCODE_EXP:
+ src0 = fetchSrc(0, 0);
+ val0 = mkOp1v(OP_FLOOR, TYPE_F32, getSSA(), src0);
+ if (dst0[1])
+ mkOp2(OP_SUB, TYPE_F32, dst0[1], src0, val0);
+ if (dst0[0])
+ mkOp1(OP_EX2, TYPE_F32, dst0[0], val0);
+ if (dst0[2])
+ mkOp1(OP_EX2, TYPE_F32, dst0[2], src0);
+ if (dst0[3])
+ loadImm(dst0[3], 1.0f);
+ break;
+ case TGSI_OPCODE_LOG:
+ src0 = mkOp1v(OP_ABS, TYPE_F32, getSSA(), fetchSrc(0, 0));
+ val0 = mkOp1v(OP_LG2, TYPE_F32, dst0[2] ? dst0[2] : getSSA(), src0);
+ if (dst0[0] || dst0[1])
+ val1 = mkOp1v(OP_FLOOR, TYPE_F32, dst0[0] ? dst0[0] : getSSA(), val0);
+ if (dst0[1]) {
+ mkOp1(OP_EX2, TYPE_F32, dst0[1], val1);
+ mkOp1(OP_RCP, TYPE_F32, dst0[1], dst0[1]);
+ mkOp2(OP_MUL, TYPE_F32, dst0[1], dst0[1], src0);
+ }
+ if (dst0[3])
+ loadImm(dst0[3], 1.0f);
+ break;
+ case TGSI_OPCODE_DP2:
+ val0 = buildDot(2);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_DP3:
+ val0 = buildDot(3);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_DP4:
+ val0 = buildDot(4);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_DPH:
+ val0 = buildDot(3);
+ src1 = fetchSrc(1, 3);
+ mkOp2(OP_ADD, TYPE_F32, val0, val0, src1);
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkMov(dst0[c], val0);
+ break;
+ case TGSI_OPCODE_DST:
+ if (dst0[0])
+ loadImm(dst0[0], 1.0f);
+ if (dst0[1]) {
+ src0 = fetchSrc(0, 1);
+ src1 = fetchSrc(1, 1);
+ mkOp2(OP_MUL, TYPE_F32, dst0[1], src0, src1);
+ }
+ if (dst0[2])
+ mkMov(dst0[2], fetchSrc(0, 2));
+ if (dst0[3])
+ mkMov(dst0[3], fetchSrc(1, 3));
+ break;
+ case TGSI_OPCODE_LRP:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ src2 = fetchSrc(2, c);
+ mkOp3(OP_MAD, TYPE_F32, dst0[c],
+ mkOp2v(OP_SUB, TYPE_F32, getSSA(), src1, src2), src0, src2);
+ }
+ break;
+ case TGSI_OPCODE_LIT:
+ handleLIT(dst0);
+ break;
+ case TGSI_OPCODE_XPD:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ if (c < 3) {
+ val0 = getSSA();
+ src0 = fetchSrc(1, (c + 1) % 3);
+ src1 = fetchSrc(0, (c + 2) % 3);
+ mkOp2(OP_MUL, TYPE_F32, val0, src0, src1);
+ mkOp1(OP_NEG, TYPE_F32, val0, val0);
+
+ src0 = fetchSrc(0, (c + 1) % 3);
+ src1 = fetchSrc(1, (c + 2) % 3);
+ mkOp3(OP_MAD, TYPE_F32, dst0[c], src0, src1, val0);
+ } else {
+ loadImm(dst0[c], 1.0f);
+ }
+ }
+ break;
+ case TGSI_OPCODE_ISSG:
+ case TGSI_OPCODE_SSG:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ val0 = getScratch();
+ val1 = getScratch();
+ mkCmp(OP_SET, CC_GT, srcTy, val0, src0, zero);
+ mkCmp(OP_SET, CC_LT, srcTy, val1, src0, zero);
+ if (srcTy == TYPE_F32)
+ mkOp2(OP_SUB, TYPE_F32, dst0[c], val0, val1);
+ else
+ mkOp2(OP_SUB, TYPE_S32, dst0[c], val1, val0);
+ }
+ break;
+ case TGSI_OPCODE_UCMP:
+ case TGSI_OPCODE_CMP:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ src2 = fetchSrc(2, c);
+ if (src1 == src2)
+ mkMov(dst0[c], src1);
+ else
+ mkCmp(OP_SLCT, (srcTy == TYPE_F32) ? CC_LT : CC_NE,
+ srcTy, dst0[c], src1, src2, src0);
+ }
+ break;
+ case TGSI_OPCODE_FRC:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ val0 = getScratch();
+ mkOp1(OP_FLOOR, TYPE_F32, val0, src0);
+ mkOp2(OP_SUB, TYPE_F32, dst0[c], src0, val0);
+ }
+ break;
+ case TGSI_OPCODE_ROUND:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkCvt(OP_CVT, TYPE_F32, dst0[c], TYPE_F32, fetchSrc(0, c))
+ ->rnd = ROUND_NI;
+ break;
+ case TGSI_OPCODE_CLAMP:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ src2 = fetchSrc(2, c);
+ val0 = getScratch();
+ mkOp2(OP_MIN, TYPE_F32, val0, src0, src1);
+ mkOp2(OP_MAX, TYPE_F32, dst0[c], val0, src2);
+ }
+ break;
+ case TGSI_OPCODE_SLT:
+ case TGSI_OPCODE_SGE:
+ case TGSI_OPCODE_SEQ:
+ case TGSI_OPCODE_SFL:
+ case TGSI_OPCODE_SGT:
+ case TGSI_OPCODE_SLE:
+ case TGSI_OPCODE_SNE:
+ case TGSI_OPCODE_STR:
+ case TGSI_OPCODE_FSEQ:
+ case TGSI_OPCODE_FSGE:
+ case TGSI_OPCODE_FSLT:
+ case TGSI_OPCODE_FSNE:
+ case TGSI_OPCODE_ISGE:
+ case TGSI_OPCODE_ISLT:
+ case TGSI_OPCODE_USEQ:
+ case TGSI_OPCODE_USGE:
+ case TGSI_OPCODE_USLT:
+ case TGSI_OPCODE_USNE:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) {
+ src0 = fetchSrc(0, c);
+ src1 = fetchSrc(1, c);
+ mkCmp(op, tgsi.getSetCond(), dstTy, dst0[c], src0, src1);
+ }
+ break;
+ case TGSI_OPCODE_KILL_IF:
+ val0 = new_LValue(func, FILE_PREDICATE);
+ for (c = 0; c < 4; ++c) {
+ mkCmp(OP_SET, CC_LT, TYPE_F32, val0, fetchSrc(0, c), zero);
+ mkOp(OP_DISCARD, TYPE_NONE, NULL)->setPredicate(CC_P, val0);
+ }
+ break;
+ case TGSI_OPCODE_KILL:
+ mkOp(OP_DISCARD, TYPE_NONE, NULL);
+ break;
+ case TGSI_OPCODE_TEX:
+ case TGSI_OPCODE_TXB:
+ case TGSI_OPCODE_TXL:
+ case TGSI_OPCODE_TXP:
+ // R S L C Dx Dy
+ handleTEX(dst0, 1, 1, 0x03, 0x0f, 0x00, 0x00);
+ break;
+ case TGSI_OPCODE_TXD:
+ handleTEX(dst0, 3, 3, 0x03, 0x0f, 0x10, 0x20);
+ break;
+ case TGSI_OPCODE_TEX2:
+ handleTEX(dst0, 2, 2, 0x03, 0x10, 0x00, 0x00);
+ break;
+ case TGSI_OPCODE_TXB2:
+ case TGSI_OPCODE_TXL2:
+ handleTEX(dst0, 2, 2, 0x10, 0x11, 0x00, 0x00);
+ break;
+ case TGSI_OPCODE_SAMPLE:
+ case TGSI_OPCODE_SAMPLE_B:
+ case TGSI_OPCODE_SAMPLE_D:
+ case TGSI_OPCODE_SAMPLE_L:
+ case TGSI_OPCODE_SAMPLE_C:
+ case TGSI_OPCODE_SAMPLE_C_LZ:
+ handleTEX(dst0, 1, 2, 0x30, 0x30, 0x30, 0x40);
+ break;
+ case TGSI_OPCODE_TXF:
+ handleTXF(dst0, 1, 0x03);
+ break;
+ case TGSI_OPCODE_SAMPLE_I:
+ handleTXF(dst0, 1, 0x03);
+ break;
+ case TGSI_OPCODE_SAMPLE_I_MS:
+ handleTXF(dst0, 1, 0x20);
+ break;
+ case TGSI_OPCODE_TXQ:
+ case TGSI_OPCODE_SVIEWINFO:
+ handleTXQ(dst0, TXQ_DIMS);
+ break;
+ case TGSI_OPCODE_F2I:
+ case TGSI_OPCODE_F2U:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c))->rnd = ROUND_Z;
+ break;
+ case TGSI_OPCODE_I2F:
+ case TGSI_OPCODE_U2F:
+ FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi)
+ mkCvt(OP_CVT, dstTy, dst0[c], srcTy, fetchSrc(0, c));
+ break;
+ case TGSI_OPCODE_EMIT:
+ case TGSI_OPCODE_ENDPRIM:
+ // get vertex stream if specified (must be immediate)
+ src0 = tgsi.srcCount() ?
+ mkImm(tgsi.getSrc(0).getValueU32(0, info)) : zero;
+ mkOp1(op, TYPE_U32, NULL, src0)->fixed = 1;
+ break;
+ case TGSI_OPCODE_IF:
+ case TGSI_OPCODE_UIF:
+ {
+ BasicBlock *ifBB = new BasicBlock(func);
+
+ bb->cfg.attach(&ifBB->cfg, Graph::Edge::TREE);
+ condBBs.push(bb);
+ joinBBs.push(bb);
+
+ mkFlow(OP_BRA, NULL, CC_NOT_P, fetchSrc(0, 0))->setType(srcTy);
+
+ setPosition(ifBB, true);
+ }
+ break;
+ case TGSI_OPCODE_ELSE:
+ {
+ BasicBlock *elseBB = new BasicBlock(func);
+ BasicBlock *forkBB = reinterpret_cast<BasicBlock *>(condBBs.pop().u.p);
+
+ forkBB->cfg.attach(&elseBB->cfg, Graph::Edge::TREE);
+ condBBs.push(bb);
+
+ forkBB->getExit()->asFlow()->target.bb = elseBB;
+ if (!bb->isTerminated())
+ mkFlow(OP_BRA, NULL, CC_ALWAYS, NULL);
+
+ setPosition(elseBB, true);
+ }
+ break;
+ case TGSI_OPCODE_ENDIF:
+ {
+ BasicBlock *convBB = new BasicBlock(func);
+ BasicBlock *prevBB = reinterpret_cast<BasicBlock *>(condBBs.pop().u.p);
+ BasicBlock *forkBB = reinterpret_cast<BasicBlock *>(joinBBs.pop().u.p);
+
+ if (!bb->isTerminated()) {
+ // we only want join if none of the clauses ended with CONT/BREAK/RET
+ if (prevBB->getExit()->op == OP_BRA && joinBBs.getSize() < 6)
+ insertConvergenceOps(convBB, forkBB);
+ mkFlow(OP_BRA, convBB, CC_ALWAYS, NULL);
+ bb->cfg.attach(&convBB->cfg, Graph::Edge::FORWARD);
+ }
+
+ if (prevBB->getExit()->op == OP_BRA) {
+ prevBB->cfg.attach(&convBB->cfg, Graph::Edge::FORWARD);
+ prevBB->getExit()->asFlow()->target.bb = convBB;
+ }
+ setPosition(convBB, true);
+ }
+ break;
+ case TGSI_OPCODE_BGNLOOP:
+ {
+ BasicBlock *lbgnBB = new BasicBlock(func);
+ BasicBlock *lbrkBB = new BasicBlock(func);
+
+ loopBBs.push(lbgnBB);
+ breakBBs.push(lbrkBB);
+ if (loopBBs.getSize() > func->loopNestingBound)
+ func->loopNestingBound++;
+
+ mkFlow(OP_PREBREAK, lbrkBB, CC_ALWAYS, NULL);
+
+ bb->cfg.attach(&lbgnBB->cfg, Graph::Edge::TREE);
+ setPosition(lbgnBB, true);
+ mkFlow(OP_PRECONT, lbgnBB, CC_ALWAYS, NULL);
+ }
+ break;
+ case TGSI_OPCODE_ENDLOOP:
+ {
+ BasicBlock *loopBB = reinterpret_cast<BasicBlock *>(loopBBs.pop().u.p);
+
+ if (!bb->isTerminated()) {
+ mkFlow(OP_CONT, loopBB, CC_ALWAYS, NULL);
+ bb->cfg.attach(&loopBB->cfg, Graph::Edge::BACK);
+ }
+ setPosition(reinterpret_cast<BasicBlock *>(breakBBs.pop().u.p), true);
+ }
+ break;
+ case TGSI_OPCODE_BRK:
+ {
+ if (bb->isTerminated())
+ break;
+ BasicBlock *brkBB = reinterpret_cast<BasicBlock *>(breakBBs.peek().u.p);
+ mkFlow(OP_BREAK, brkBB, CC_ALWAYS, NULL);
+ bb->cfg.attach(&brkBB->cfg, Graph::Edge::CROSS);
+ }
+ break;
+ case TGSI_OPCODE_CONT:
+ {
+ if (bb->isTerminated())
+ break;
+ BasicBlock *contBB = reinterpret_cast<BasicBlock *>(loopBBs.peek().u.p);
+ mkFlow(OP_CONT, contBB, CC_ALWAYS, NULL);
+ contBB->explicitCont = true;
+ bb->cfg.attach(&contBB->cfg, Graph::Edge::BACK);
+ }
+ break;
+ case TGSI_OPCODE_BGNSUB:
+ {
+ Subroutine *s = getSubroutine(ip);
+ BasicBlock *entry = new BasicBlock(s->f);
+ BasicBlock *leave = new BasicBlock(s->f);
+
+ // multiple entrypoints possible, keep the graph connected
+ if (prog->getType() == Program::TYPE_COMPUTE)
+ prog->main->call.attach(&s->f->call, Graph::Edge::TREE);
+
+ sub.cur = s;
+ s->f->setEntry(entry);
+ s->f->setExit(leave);
+ setPosition(entry, true);
+ return true;
+ }
+ case TGSI_OPCODE_ENDSUB:
+ {
+ sub.cur = getSubroutine(prog->main);
+ setPosition(BasicBlock::get(sub.cur->f->cfg.getRoot()), true);
+ return true;
+ }
+ case TGSI_OPCODE_CAL:
+ {
+ Subroutine *s = getSubroutine(tgsi.getLabel());
+ mkFlow(OP_CALL, s->f, CC_ALWAYS, NULL);
+ func->call.attach(&s->f->call, Graph::Edge::TREE);
+ return true;
+ }
+ case TGSI_OPCODE_RET:
+ {
+ if (bb->isTerminated())
+ return true;
+ BasicBlock *leave = BasicBlock::get(func->cfgExit);
+
+ if (!isEndOfSubroutine(ip + 1)) {
+ // insert a PRERET at the entry if this is an early return
+ // (only needed for sharing code in the epilogue)
+ BasicBlock *pos = getBB();
+ setPosition(BasicBlock::get(func->cfg.getRoot()), false);
+ mkFlow(OP_PRERET, leave, CC_ALWAYS, NULL)->fixed = 1;
+ setPosition(pos, true);
+ }
+ mkFlow(OP_RET, NULL, CC_ALWAYS, NULL)->fixed = 1;
+ bb->cfg.attach(&leave->cfg, Graph::Edge::CROSS);
+ }
+ break;
+ case TGSI_OPCODE_END:
+ {
+ // attach and generate epilogue code
+ BasicBlock *epilogue = BasicBlock::get(func->cfgExit);
+ bb->cfg.attach(&epilogue->cfg, Graph::Edge::TREE);
+ setPosition(epilogue, true);
+ if (prog->getType() == Program::TYPE_FRAGMENT)
+ exportOutputs();
+ if (info->io.genUserClip > 0)
+ handleUserClipPlanes();
+ mkOp(OP_EXIT, TYPE_NONE, NULL)->terminator = 1;
+ }
+ break;
+ case TGSI_OPCODE_SWITCH:
+ case TGSI_OPCODE_CASE:
+ ERROR("switch/case opcode encountered, should have been lowered\n");
+ abort();
+ break;
+ case TGSI_OPCODE_LOAD:
+ handleLOAD(dst0);
+ break;
+ case TGSI_OPCODE_STORE:
+ handleSTORE();
+ break;
+ case TGSI_OPCODE_BARRIER:
+ geni = mkOp2(OP_BAR, TYPE_U32, NULL, mkImm(0), mkImm(0));
+ geni->fixed = 1;
+ geni->subOp = NV50_IR_SUBOP_BAR_SYNC;
+ break;
+ case TGSI_OPCODE_MFENCE:
+ case TGSI_OPCODE_LFENCE:
+ case TGSI_OPCODE_SFENCE:
+ geni = mkOp(OP_MEMBAR, TYPE_NONE, NULL);
+ geni->fixed = 1;
+ geni->subOp = tgsi::opcodeToSubOp(tgsi.getOpcode());
+ break;
+ case TGSI_OPCODE_ATOMUADD:
+ case TGSI_OPCODE_ATOMXCHG:
+ case TGSI_OPCODE_ATOMCAS:
+ case TGSI_OPCODE_ATOMAND:
+ case TGSI_OPCODE_ATOMOR:
+ case TGSI_OPCODE_ATOMXOR:
+ case TGSI_OPCODE_ATOMUMIN:
+ case TGSI_OPCODE_ATOMIMIN:
+ case TGSI_OPCODE_ATOMUMAX:
+ case TGSI_OPCODE_ATOMIMAX:
+ handleATOM(dst0, dstTy, tgsi::opcodeToSubOp(tgsi.getOpcode()));
+ break;
+ default:
+ ERROR("unhandled TGSI opcode: %u\n", tgsi.getOpcode());
+ assert(0);
+ break;
+ }
+
+ if (tgsi.dstCount()) {
+ for (c = 0; c < 4; ++c) {
+ if (!dst0[c])
+ continue;
+ if (dst0[c] != rDst0[c])
+ mkMov(rDst0[c], dst0[c]);
+ storeDst(0, c, rDst0[c]);
+ }
+ }
+ vtxBaseValid = 0;
+
+ return true;
+}
+
// Emit code that computes the distance of the clip vertex to each user
// clip plane and exports the results to the clip distance outputs.
void
Converter::handleUserClipPlanes()
{
   Value *res[8]; // one accumulator per clip plane (genUserClip <= 8 assumed)
   int n, i, c;

   // Accumulate dot(clipVtx, plane[i]). Plane coefficients are loaded from
   // the constant buffer slot the driver set up (ucpCBSlot), 16 bytes per
   // plane. The c == 0 iteration initializes res[i] with a MUL; later
   // iterations accumulate via MAD, so each res[i] is written before read.
   for (c = 0; c < 4; ++c) {
      for (i = 0; i < info->io.genUserClip; ++i) {
         Symbol *sym = mkSymbol(FILE_MEMORY_CONST, info->io.ucpCBSlot,
                                TYPE_F32, info->io.ucpBase + i * 16 + c * 4);
         Value *ucp = mkLoadv(TYPE_F32, sym, NULL);
         if (c == 0)
            res[i] = mkOp2v(OP_MUL, TYPE_F32, getScratch(), clipVtx[c], ucp);
         else
            mkOp3(OP_MAD, TYPE_F32, res[i], clipVtx[c], ucp, res[i]);
      }
   }

   // The generated clip distance outputs occupy the last output slots,
   // 4 components per output register.
   const int first = info->numOutputs - (info->io.genUserClip + 3) / 4;

   for (i = 0; i < info->io.genUserClip; ++i) {
      n = i / 4 + first;
      c = i % 4;
      Symbol *sym =
         mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32, info->out[n].slot[c] * 4);
      mkStore(OP_EXPORT, TYPE_F32, sym, NULL, res[i]);
   }
}
+
+void
+Converter::exportOutputs()
+{
+ for (unsigned int i = 0; i < info->numOutputs; ++i) {
+ for (unsigned int c = 0; c < 4; ++c) {
+ if (!oData.exists(sub.cur->values, i, c))
+ continue;
+ Symbol *sym = mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32,
+ info->out[i].slot[c] * 4);
+ Value *val = oData.load(sub.cur->values, i, c, NULL);
+ if (val)
+ mkStore(OP_EXPORT, TYPE_F32, sym, NULL, val);
+ }
+ }
+}
+
// Set up converter state for translating @code into the program @ir:
// register the TGSI register files (temporaries, predicates, addresses,
// outputs) and create the shared zero immediate.
Converter::Converter(Program *ir, const tgsi::Source *code) : BuildUtil(ir),
      code(code),
      tgsi(NULL),
      tData(this), aData(this), pData(this), oData(this)
{
   info = code->info;

   // temporaries normally live in GPRs, but fall back to local memory
   // when the source requires it (e.g. indirect addressing)
   const DataFile tFile = code->mainTempsInLMem ? FILE_MEMORY_LOCAL : FILE_GPR;

   const unsigned tSize = code->fileSize(TGSI_FILE_TEMPORARY);
   const unsigned pSize = code->fileSize(TGSI_FILE_PREDICATE);
   const unsigned aSize = code->fileSize(TGSI_FILE_ADDRESS);
   const unsigned oSize = code->fileSize(TGSI_FILE_OUTPUT);

   // 4 components, vector width 4, per TGSI register file
   tData.setup(TGSI_FILE_TEMPORARY, 0, 0, tSize, 4, 4, tFile, 0);
   pData.setup(TGSI_FILE_PREDICATE, 0, 0, pSize, 4, 4, FILE_PREDICATE, 0);
   aData.setup(TGSI_FILE_ADDRESS, 0, 0, aSize, 4, 4, FILE_ADDRESS, 0);
   oData.setup(TGSI_FILE_OUTPUT, 0, 0, oSize, 4, 4, FILE_GPR, 0);

   // shared immediate used all over instruction translation
   zero = mkImm((uint32_t)0);

   vtxBaseValid = 0;
}
+
// Nothing to release explicitly: the IR objects created during
// conversion are owned by the Program.
Converter::~Converter()
{
}
+
+inline const Converter::Location *
+Converter::BindArgumentsPass::getValueLocation(Subroutine *s, Value *v)
+{
+ ValueMap::l_iterator it = s->values.l.find(v);
+ return it == s->values.l.end() ? NULL : &it->second;
+}
+
// Bind the sources (or defs) of the call instruction @i to the caller's
// values that correspond to the callee's inputs (or outputs).
// @proto selects Function::ins or Function::outs of the callee, and
// @setArg is the matching Instruction::setSrc or Instruction::setDef.
template<typename T> inline void
Converter::BindArgumentsPass::updateCallArgs(
   Instruction *i, void (Instruction::*setArg)(int, Value *),
   T (Function::*proto))
{
   Function *g = i->asFlow()->target.fn;
   Subroutine *subg = conv.getSubroutine(g);

   for (unsigned a = 0; a < (g->*proto).size(); ++a) {
      Value *v = (g->*proto)[a].get();
      // map the callee's value to its TGSI register location, then fetch
      // the caller's value for that same location
      const Converter::Location &l = *getValueLocation(subg, v);
      Converter::DataArray *array = conv.getArrayForFile(l.array, l.arrayIdx);

      (i->*setArg)(a, array->acquire(sub->values, l.i, l.c));
   }
}
+
// Build the argument list of @func (@proto = Function::ins or outs)
// from the bits of @set (entry live set resp. exit def set).
// @updateSet recomputes the BitSet before it is scanned.
template<typename T> inline void
Converter::BindArgumentsPass::updatePrototype(
   BitSet *set, void (Function::*updateSet)(), T (Function::*proto))
{
   (func->*updateSet)();

   for (unsigned i = 0; i < set->getSize(); ++i) {
      Value *v = func->getLValue(i);
      const Converter::Location *l = getValueLocation(sub, v);

      // only include values with a matching TGSI register
      if (set->test(i) && l && !conv.code->locals.count(*l))
         (func->*proto).push_back(v);
   }
}
+
// Per function: rewrite every non-builtin CALL to pass explicit
// arguments, then derive this function's own in/out prototype from its
// liveness information.
bool
Converter::BindArgumentsPass::visit(Function *f)
{
   sub = conv.getSubroutine(f);

   for (ArrayList::Iterator bi = f->allBBlocks.iterator();
        !bi.end(); bi.next()) {
      for (Instruction *i = BasicBlock::get(bi)->getFirst();
           i; i = i->next) {
         if (i->op == OP_CALL && !i->asFlow()->builtin) {
            // bind both the inputs and the outputs of the callee
            updateCallArgs(i, &Instruction::setSrc, &Function::ins);
            updateCallArgs(i, &Instruction::setDef, &Function::outs);
         }
      }
   }

   // no prototype is built for main of non-compute programs
   if (func == prog->main && prog->getType() != Program::TYPE_COMPUTE)
      return true;
   // inputs: values live at the entry block;
   // outputs: values defined at the exit block
   updatePrototype(&BasicBlock::get(f->cfg.getRoot())->liveSet,
                   &Function::buildLiveSets, &Function::ins);
   updatePrototype(&BasicBlock::get(f->cfgExit)->defSet,
                   &Function::buildDefSets, &Function::outs);

   return true;
}
+
// Translate the whole TGSI shader: create main's entry/exit blocks,
// convert every instruction, then bind subroutine call arguments.
// Returns false if any instruction failed to convert.
bool
Converter::run()
{
   BasicBlock *entry = new BasicBlock(prog->main);
   BasicBlock *leave = new BasicBlock(prog->main);

   prog->main->setEntry(entry);
   prog->main->setExit(leave);

   setPosition(entry, true);
   sub.cur = getSubroutine(prog->main);

   if (info->io.genUserClip > 0) {
      // scratch registers collecting the clip vertex position, consumed
      // by handleUserClipPlanes()
      for (int c = 0; c < 4; ++c)
         clipVtx[c] = getScratch();
   }

   if (prog->getType() == Program::TYPE_FRAGMENT) {
      // load fragment position.w and replace it with its reciprocal
      Symbol *sv = mkSysVal(SV_POSITION, 3);
      fragCoord[3] = mkOp1v(OP_RDSV, TYPE_F32, getSSA(), sv);
      mkOp1(OP_RCP, TYPE_F32, fragCoord[3], fragCoord[3]);
   }

   // convert each scanned TGSI instruction in order
   for (ip = 0; ip < code->scan.num_instructions; ++ip) {
      if (!handleInstruction(&code->insns[ip]))
         return false;
   }

   if (!BindArgumentsPass(*this).run(prog))
      return false;

   return true;
}
+
+} // unnamed namespace
+
+namespace nv50_ir {
+
+bool
+Program::makeFromTGSI(struct nv50_ir_prog_info *info)
+{
+ tgsi::Source src(info);
+ if (!src.scanSource())
+ return false;
+ tlsSize = info->bin.tlsSpace;
+
+ Converter builder(this, &src);
+ return builder.run();
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
new file mode 100644
index 0000000..3f8d00a
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
@@ -0,0 +1,436 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_graph.h"
+#include <limits>
+#include <list>
+#include <stack>
+#include "codegen/nv50_ir.h"
+
+namespace nv50_ir {
+
+Graph::Graph()
+{
+ root = NULL;
+ size = 0;
+ sequence = 0;
+}
+
// Cut every node, which deletes all edges; the nodes themselves are not
// freed here (see the class comment in the header).
Graph::~Graph()
{
   for (IteratorRef it = safeIteratorDFS(); !it->end(); it->next())
      reinterpret_cast<Node *>(it->get())->cut();
}
+
+void Graph::insert(Node *node)
+{
+ if (!root)
+ root = node;
+
+ node->graph = this;
+ size++;
+}
+
// Remove this edge from the circular doubly linked edge lists of its
// origin (side 0) and target (side 1) and adjust the edge counts.
void Graph::Edge::unlink()
{
   if (origin) {
      prev[0]->next[0] = next[0];
      next[0]->prev[0] = prev[0];
      // if this edge was the list head, advance the head, or clear it
      // if this was the only outgoing edge
      if (origin->out == this)
         origin->out = (next[0] == this) ? NULL : next[0];

      --origin->outCount;
   }
   if (target) {
      prev[1]->next[1] = next[1];
      next[1]->prev[1] = prev[1];
      if (target->in == this)
         target->in = (next[1] == this) ? NULL : next[1];

      --target->inCount;
   }
}
+
+const char *Graph::Edge::typeStr() const
+{
+ switch (type) {
+ case TREE: return "tree";
+ case FORWARD: return "forward";
+ case BACK: return "back";
+ case CROSS: return "cross";
+ case DUMMY: return "dummy";
+ case UNKNOWN:
+ default:
+ return "unk";
+ }
+}
+
+Graph::Node::Node(void *priv) : data(priv),
+ in(0), out(0), graph(0),
+ visited(0),
+ inCount(0), outCount(0)
+{
+ // nothing to do
+}
+
// Create an edge of type @kind from this node to @node, inserting it at
// the head of both nodes' circular edge lists. If one endpoint is not
// yet part of a graph, it is inserted into the other endpoint's graph.
void Graph::Node::attach(Node *node, Edge::Type kind)
{
   Edge *edge = new Edge(this, node, kind);

   // insert head
   if (this->out) {
      edge->next[0] = this->out;
      edge->prev[0] = this->out->prev[0];
      edge->prev[0]->next[0] = edge;
      this->out->prev[0] = edge;
   }
   this->out = edge;

   if (node->in) {
      edge->next[1] = node->in;
      edge->prev[1] = node->in->prev[1];
      edge->prev[1]->next[1] = edge;
      node->in->prev[1] = edge;
   }
   node->in = edge;

   ++this->outCount;
   ++node->inCount;

   // at least one endpoint must already belong to a graph
   assert(graph || node->graph);
   if (!node->graph)
      graph->insert(node);
   if (!graph)
      node->graph->insert(this);

   // if the caller doesn't know the edge type, reclassify all edges
   if (kind == Edge::UNKNOWN)
      graph->classifyEdges();
}
+
+bool Graph::Node::detach(Graph::Node *node)
+{
+ EdgeIterator ei = this->outgoing();
+ for (; !ei.end(); ei.next())
+ if (ei.getNode() == node)
+ break;
+ if (ei.end()) {
+ ERROR("no such node attached\n");
+ return false;
+ }
+ delete ei.getEdge();
+ return true;
+}
+
+// Cut a node from the graph, deleting all attached edges.
+void Graph::Node::cut()
+{
+ while (out)
+ delete out;
+ while (in)
+ delete in;
+
+ if (graph) {
+ if (graph->root == this)
+ graph->root = NULL;
+ graph = NULL;
+ }
+}
+
+Graph::Edge::Edge(Node *org, Node *tgt, Type kind)
+{
+ target = tgt;
+ origin = org;
+ type = kind;
+
+ next[0] = next[1] = this;
+ prev[0] = prev[1] = this;
+}
+
// Return true if this node can be reached from @node without passing
// through @term, following only non-BACK, non-DUMMY edges.
bool
Graph::Node::reachableBy(const Node *node, const Node *term) const
{
   std::stack<const Node *> stack;
   const Node *pos = NULL;
   const int seq = graph->nextSequence();

   stack.push(node);

   while (!stack.empty()) {
      pos = stack.top();
      stack.pop();

      if (pos == this)
         return true;
      if (pos == term)
         continue; // do not search beyond the terminating node

      for (EdgeIterator ei = pos->outgoing(); !ei.end(); ei.next()) {
         if (ei.getType() == Edge::BACK || ei.getType() == Edge::DUMMY)
            continue;
         // visit() marks the node; push each node only once per search
         if (ei.getNode()->visit(seq))
            stack.push(ei.getNode());
      }
   }
   // pos is the last node examined; equal to this only if we reached it
   return pos == this;
}
+
// Depth-first iterator over a graph's nodes, in pre- or post-order.
// The node sequence is recorded up front into an array, so the iterator
// is unaffected by later changes to the graph's edges.
class DFSIterator : public Iterator
{
public:
   DFSIterator(Graph *graph, const bool preorder)
   {
      unsigned int seq = graph->nextSequence();

      // one extra slot; the element at index getSize() is cleared
      nodes = new Graph::Node * [graph->getSize() + 1];
      count = 0;
      pos = 0;
      nodes[graph->getSize()] = 0;

      if (graph->getRoot()) {
         graph->getRoot()->visit(seq);
         search(graph->getRoot(), preorder, seq);
      }
   }

   ~DFSIterator()
   {
      if (nodes)
         delete[] nodes;
   }

   // Recursively record @node and all of its not-yet-visited successors.
   void search(Graph::Node *node, const bool preorder, const int sequence)
   {
      if (preorder)
         nodes[count++] = node;

      for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
         if (ei.getNode()->visit(sequence))
            search(ei.getNode(), preorder, sequence);

      if (!preorder)
         nodes[count++] = node;
   }

   virtual bool end() const { return pos >= count; }
   virtual void next() { if (pos < count) ++pos; }
   virtual void *get() const { return nodes[pos]; }
   virtual void reset() { pos = 0; }

protected:
   Graph::Node **nodes; // recorded DFS order
   int count;           // number of nodes recorded
   int pos;             // current iteration position
};
+
+IteratorRef Graph::iteratorDFS(bool preorder)
+{
+ return IteratorRef(new DFSIterator(this, preorder));
+}
+
+IteratorRef Graph::safeIteratorDFS(bool preorder)
+{
+ return this->iteratorDFS(preorder);
+}
+
// Iterates over the nodes of a control flow graph in an order where a
// node is emitted only after all of its non-back-edge predecessors have
// been emitted; nodes reached via CROSS edges are deferred until the
// current wave of blocks is exhausted.
class CFGIterator : public Iterator
{
public:
   CFGIterator(Graph *graph)
   {
      // one extra slot; the element at index getSize() is cleared
      nodes = new Graph::Node * [graph->getSize() + 1];
      count = 0;
      pos = 0;
      nodes[graph->getSize()] = 0;

      // TODO: argh, use graph->sequence instead of tag and just raise it by > 1
      for (IteratorRef it = graph->iteratorDFS(); !it->end(); it->next())
         reinterpret_cast<Graph::Node *>(it->get())->tag = 0;

      if (graph->getRoot())
         search(graph->getRoot(), graph->nextSequence());
   }

   ~CFGIterator()
   {
      if (nodes)
         delete[] nodes;
   }

   virtual void *get() const { return nodes[pos]; }
   virtual bool end() const { return pos >= count; }
   virtual void next() { if (pos < count) ++pos; }
   virtual void reset() { pos = 0; }

private:
   // Worklist traversal; tag counts how many predecessors of a node have
   // been emitted so far.
   void search(Graph::Node *node, const int sequence)
   {
      Stack bb, cross;

      bb.push(node);

      while (bb.getSize()) {
         node = reinterpret_cast<Graph::Node *>(bb.pop().u.p);
         assert(node);
         if (!node->visit(sequence))
            continue;
         node->tag = 0;

         for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) {
            switch (ei.getType()) {
            case Graph::Edge::TREE:
            case Graph::Edge::FORWARD:
            case Graph::Edge::DUMMY:
               // ready once all forward predecessors have been emitted
               if (++(ei.getNode()->tag) == ei.getNode()->incidentCountFwd())
                  bb.push(ei.getNode());
               break;
            case Graph::Edge::BACK:
               continue;
            case Graph::Edge::CROSS:
               // defer cross targets; queue each of them only once
               if (++(ei.getNode()->tag) == 1)
                  cross.push(ei.getNode());
               break;
            default:
               assert(!"unknown edge kind in CFG");
               break;
            }
         }
         nodes[count++] = node;

         // current wave done: continue with the deferred cross targets
         if (bb.getSize() == 0)
            cross.moveTo(bb);
      }
   }

private:
   Graph::Node **nodes; // recorded visit order
   int count;           // number of nodes recorded
   int pos;             // current iteration position
};
+
+IteratorRef Graph::iteratorCFG()
+{
+ return IteratorRef(new CFGIterator(this));
+}
+
+IteratorRef Graph::safeIteratorCFG()
+{
+ return this->iteratorCFG();
+}
+
// Classify every edge as TREE/FORWARD/BACK/CROSS based on a DFS from
// the root (DUMMY edges keep their type).
void Graph::classifyEdges()
{
   int seq;

   // clear visit sequence and the DFS-in-progress marker on every node
   for (IteratorRef it = iteratorDFS(true); !it->end(); it->next()) {
      Node *node = reinterpret_cast<Node *>(it->get());
      node->visit(0);
      node->tag = 0;
   }

   classifyDFS(root, (seq = 0));

   // classifyDFS numbered the nodes 1..seq; keep the counter in sync
   sequence = seq;
}
+
+void Graph::classifyDFS(Node *curr, int& seq)
+{
+ Graph::Edge *edge;
+ Graph::Node *node;
+
+ curr->visit(++seq);
+ curr->tag = 1;
+
+ for (edge = curr->out; edge; edge = edge->next[0]) {
+ node = edge->target;
+ if (edge->type == Edge::DUMMY)
+ continue;
+
+ if (node->getSequence() == 0) {
+ edge->type = Edge::TREE;
+ classifyDFS(node, seq);
+ } else
+ if (node->getSequence() > curr->getSequence()) {
+ edge->type = Edge::FORWARD;
+ } else {
+ edge->type = node->tag ? Edge::BACK : Edge::CROSS;
+ }
+ }
+
+ for (edge = curr->in; edge; edge = edge->next[1]) {
+ node = edge->origin;
+ if (edge->type == Edge::DUMMY)
+ continue;
+
+ if (node->getSequence() == 0) {
+ edge->type = Edge::TREE;
+ classifyDFS(node, seq);
+ } else
+ if (node->getSequence() > curr->getSequence()) {
+ edge->type = Edge::FORWARD;
+ } else {
+ edge->type = node->tag ? Edge::BACK : Edge::CROSS;
+ }
+ }
+
+ curr->tag = 0;
+}
+
+// @dist is indexed by Node::tag, returns -1 if no path found
+int
+Graph::findLightestPathWeight(Node *a, Node *b, const std::vector<int> &weight)
+{
+ std::vector<int> path(weight.size(), std::numeric_limits<int>::max());
+ std::list<Node *> nodeList;
+ const int seq = nextSequence();
+
+ path[a->tag] = 0;
+ for (Node *c = a; c && c != b;) {
+ const int p = path[c->tag] + weight[c->tag];
+ for (EdgeIterator ei = c->outgoing(); !ei.end(); ei.next()) {
+ Node *t = ei.getNode();
+ if (t->getSequence() < seq) {
+ if (path[t->tag] == std::numeric_limits<int>::max())
+ nodeList.push_front(t);
+ if (p < path[t->tag])
+ path[t->tag] = p;
+ }
+ }
+ c->visit(seq);
+ Node *next = NULL;
+ for (std::list<Node *>::iterator n = nodeList.begin();
+ n != nodeList.end(); ++n) {
+ if (!next || path[(*n)->tag] < path[next->tag])
+ next = *n;
+ if ((*n) == c) {
+ // erase visited
+ n = nodeList.erase(n);
+ --n;
+ }
+ }
+ c = next;
+ }
+ if (path[b->tag] == std::numeric_limits<int>::max())
+ return -1;
+ return path[b->tag];
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
new file mode 100644
index 0000000..b0981ff
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.h
@@ -0,0 +1,228 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_GRAPH_H__
+#define __NV50_IR_GRAPH_H__
+
+#include "codegen/nv50_ir_util.h"
+#include <vector>
+
+namespace nv50_ir {
+
+#define ITER_NODE(x) reinterpret_cast<Graph::Node *>((x).get())
+#define ITER_EDGE(x) reinterpret_cast<Graph::Edge *>((x).get())
+
+// A connected graph.
class Graph
{
public:
   class Node;

   // Directed edge between two nodes; its type records how the edge
   // appears in a DFS of the graph (see classifyEdges()).
   class Edge
   {
   public:
      enum Type
      {
         UNKNOWN,
         TREE,
         FORWARD,
         BACK,
         CROSS, // e.g. loop break
         DUMMY
      };

      // NOTE(review): the parameter names look swapped -- the definition
      // is Edge(Node *org, Node *tgt, ...), i.e. the first argument is
      // the origin and the second the target; confirm and rename.
      Edge(Node *dst, Node *src, Type kind);
      ~Edge() { unlink(); }

      inline Node *getOrigin() const { return origin; }
      inline Node *getTarget() const { return target; }

      inline Type getType() const { return type; }
      const char *typeStr() const;

   private:
      Node *origin;
      Node *target;

      Type type;
      Edge *next[2]; // next edge outgoing/incident from/to origin/target
      Edge *prev[2];

      // remove this edge from both circular edge lists
      void unlink();

      friend class Graph;
   };

   // Iterator over a node's circular edge list. @d selects the list side
   // (0 = outgoing, 1 = incident); @rev walks the list backwards.
   // Iteration ends when the walk wraps around to the starting edge.
   class EdgeIterator : public Iterator
   {
   public:
      EdgeIterator() : e(0), t(0), d(0), rev(false) { }
      EdgeIterator(Graph::Edge *first, int dir, bool reverse)
         : d(dir), rev(reverse)
      {
         t = e = ((rev && first) ? first->prev[d] : first);
      }

      virtual void next()
      {
         Graph::Edge *n = (rev ? e->prev[d] : e->next[d]);
         e = (n == t ? NULL : n); // stop when we wrap around to the start
      }
      virtual bool end() const { return !e; }
      virtual void *get() const { return e; }

      // the node at the far end of the current edge
      inline Node *getNode() const { assert(e); return d ?
                                     e->origin : e->target; }
      inline Edge *getEdge() const { return e; }
      inline Edge::Type getType() { return e ? e->getType() : Edge::UNKNOWN; }

   private:
      Graph::Edge *e; // current edge (NULL once exhausted)
      Graph::Edge *t; // starting edge, marks the wrap-around point
      int d;          // list side: 0 = outgoing, 1 = incident
      bool rev;       // iterate in reverse direction
   };

   class Node
   {
   public:
      Node(void *);
      ~Node() { cut(); }

      void attach(Node *, Edge::Type);
      bool detach(Node *);
      void cut();

      inline EdgeIterator outgoing(bool reverse = false) const;
      inline EdgeIterator incident(bool reverse = false) const;

      inline Node *parent() const; // returns NULL if count(incident edges) != 1

      bool reachableBy(const Node *node, const Node *term) const;

      // mark as visited in pass @v; true only on the first visit per pass
      inline bool visit(int);
      inline int getSequence() const;

      inline int incidentCountFwd() const; // count of incident non-back edges
      inline int incidentCount() const { return inCount; }
      inline int outgoingCount() const { return outCount; }

      Graph *getGraph() const { return graph; }

      void *data;

   private:
      Edge *in;     // head of the circular incident edge list
      Edge *out;    // head of the circular outgoing edge list
      Graph *graph; // graph this node belongs to (NULL if detached)

      int visited;  // sequence number of the last pass that visited us

      int16_t inCount;
      int16_t outCount;
   public:
      int tag; // for temporary use

      friend class Graph;
   };

public:
   Graph();
   ~Graph(); // does *not* free the nodes (make it an option ?)

   inline Node *getRoot() const { return root; }

   inline unsigned int getSize() const { return size; }

   // bump and return the shared visit sequence counter
   inline int nextSequence();

   void insert(Node *node); // attach to or set as root

   IteratorRef iteratorDFS(bool preorder = true);
   IteratorRef iteratorCFG();

   // safe iterators are unaffected by changes to the *edges* of the graph
   IteratorRef safeIteratorDFS(bool preorder = true);
   IteratorRef safeIteratorCFG();

   void classifyEdges();

   // @weights: indexed by Node::tag
   int findLightestPathWeight(Node *, Node *, const std::vector<int>& weights);

private:
   void classifyDFS(Node *, int&);

private:
   Node *root;
   unsigned int size;
   int sequence; // visit pass counter shared by all traversals
};
+
+int Graph::nextSequence()
+{
+ return ++sequence;
+}
+
+Graph::Node *Graph::Node::parent() const
+{
+ if (inCount != 1)
+ return NULL;
+ assert(in);
+ return in->origin;
+}
+
+bool Graph::Node::visit(int v)
+{
+ if (visited == v)
+ return false;
+ visited = v;
+ return true;
+}
+
+int Graph::Node::getSequence() const
+{
+ return visited;
+}
+
+Graph::EdgeIterator Graph::Node::outgoing(bool reverse) const
+{
+ return EdgeIterator(out, 0, reverse);
+}
+
+Graph::EdgeIterator Graph::Node::incident(bool reverse) const
+{
+ return EdgeIterator(in, 1, reverse);
+}
+
+int Graph::Node::incidentCountFwd() const
+{
+ int n = 0;
+ for (EdgeIterator ei = incident(); !ei.end(); ei.next())
+ if (ei.getType() != Edge::BACK)
+ ++n;
+ return n;
+}
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_GRAPH_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
new file mode 100644
index 0000000..255324f
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_inlines.h
@@ -0,0 +1,420 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_INLINES_H__
+#define __NV50_IR_INLINES_H__
+
+// Swap the operand order of a comparison: cc(a, b) == reverseCondCode(cc)(b, a).
+// The low 3 bits of the code are the (G, E, L) outcome flags; the table
+// exchanges bits 0 and 2 (LT <-> GT), upper bits are preserved unchanged.
+static inline CondCode reverseCondCode(CondCode cc)
+{
+   static const uint8_t ccRev[8] = { 0, 4, 2, 6, 1, 5, 3, 7 };
+
+   return static_cast<CondCode>(ccRev[cc & 7] | (cc & ~7));
+}
+
+// Logical negation of a condition code: XOR-ing the low three outcome
+// bits yields the complementary condition (e.g. LT -> GE).
+static inline CondCode inverseCondCode(CondCode cc)
+{
+   return static_cast<CondCode>(cc ^ 7);
+}
+
+// True for data files in the FILE_MEMORY_CONST..FILE_MEMORY_LOCAL range,
+// i.e. values that live in memory rather than in a register file.
+static inline bool isMemoryFile(DataFile f)
+{
+   return (f >= FILE_MEMORY_CONST && f <= FILE_MEMORY_LOCAL);
+}
+
+// contrary to asTex(), this will never include SULD/SUST
+static inline bool isTextureOp(operation op)
+{
+   // checks the OP_TEX..OP_TEXPREP opcode range only (no surface ops)
+   return (op >= OP_TEX && op <= OP_TEXPREP);
+}
+
+// True for surface access opcodes (the OP_SULDB..OP_SULEA range).
+static inline bool isSurfaceOp(operation op)
+{
+   return (op >= OP_SULDB && op <= OP_SULEA);
+}
+
+// Size in bytes of a value of the given type; 0 for TYPE_NONE or any
+// type not listed here.
+static inline unsigned int typeSizeof(DataType ty)
+{
+   switch (ty) {
+   case TYPE_B128:
+      return 16;
+   case TYPE_B96:
+      return 12;
+   case TYPE_F64:
+   case TYPE_U64:
+   case TYPE_S64:
+      return 8;
+   case TYPE_F32:
+   case TYPE_U32:
+   case TYPE_S32:
+      return 4;
+   case TYPE_F16:
+   case TYPE_U16:
+   case TYPE_S16:
+      return 2;
+   case TYPE_U8:
+   case TYPE_S8:
+      return 1;
+   default:
+      return 0;
+   }
+}
+
+// log2 of typeSizeof(ty); 8-bit and unknown types map to 0.
+// NOTE(review): TYPE_B96 (12 bytes, not a power of two) maps to 4 like
+// B128 — presumably a deliberate round-up; confirm against the callers.
+static inline unsigned int typeSizeofLog2(DataType ty)
+{
+   switch (ty) {
+   case TYPE_B96:
+   case TYPE_B128:
+      return 4;
+   case TYPE_F64:
+   case TYPE_U64:
+   case TYPE_S64:
+      return 3;
+   case TYPE_F32:
+   case TYPE_U32:
+   case TYPE_S32:
+      return 2;
+   case TYPE_F16:
+   case TYPE_U16:
+   case TYPE_S16:
+      return 1;
+   default:
+      return 0; // TYPE_U8 / TYPE_S8 / anything else
+   }
+}
+
+// Inverse of typeSizeof(): choose a DataType for a byte size, optionally
+// floating point ('flt') or signed ('sgn'). Unsupported sizes yield
+// TYPE_NONE; sizes 12 and 16 only have untyped blob representations.
+static inline DataType typeOfSize(unsigned int size,
+                                  bool flt = false, bool sgn = false)
+{
+   switch (size) {
+   case 1:  return sgn ? TYPE_S8 : TYPE_U8;
+   case 2:  return flt ? TYPE_F16 : (sgn ? TYPE_S16 : TYPE_U16);
+   case 4:  return flt ? TYPE_F32 : (sgn ? TYPE_S32 : TYPE_U32);
+   case 8:  return flt ? TYPE_F64 : (sgn ? TYPE_S64 : TYPE_U64);
+   case 12: return TYPE_B96;
+   case 16: return TYPE_B128;
+   default: return TYPE_NONE;
+   }
+}
+
+// True for the floating point types (the TYPE_F16..TYPE_F64 range).
+static inline bool isFloatType(DataType ty)
+{
+   return (ty >= TYPE_F16 && ty <= TYPE_F64);
+}
+
+// True for 8/16/32 bit signed integer types.
+// NOTE(review): TYPE_S64 is deliberately (?) excluded — confirm callers
+// never rely on this for 64-bit types.
+static inline bool isSignedIntType(DataType ty)
+{
+   return (ty == TYPE_S8 || ty == TYPE_S16 || ty == TYPE_S32);
+}
+
+// A type counts as signed unless it is NONE, an unsigned integer, or a
+// raw bit blob (B96/B128); note that all float types count as signed.
+// NOTE(review): TYPE_U64 is not in the unsigned list and thus reports
+// signed — possibly intentional, confirm against the 64-bit paths.
+static inline bool isSignedType(DataType ty)
+{
+   const bool unsignedLike =
+      ty == TYPE_NONE ||
+      ty == TYPE_U8 || ty == TYPE_U16 || ty == TYPE_U32 ||
+      ty == TYPE_B96 || ty == TYPE_B128;
+   return !unsignedLike;
+}
+
+// Map an unsigned integer type to its signed counterpart; every other
+// type is returned unchanged.
+static inline DataType intTypeToSigned(DataType ty)
+{
+   if (ty == TYPE_U32)
+      return TYPE_S32;
+   if (ty == TYPE_U16)
+      return TYPE_S16;
+   if (ty == TYPE_U8)
+      return TYPE_S8;
+   return ty;
+}
+
+// The source ref holding the address for dimension 'dim' of this source,
+// or NULL if the access is not indirect in that dimension.
+const ValueRef *ValueRef::getIndirect(int dim) const
+{
+   return isIndirect(dim) ? &insn->src(indirect[dim]) : NULL;
+}
+
+// Register file of the referenced value; FILE_NULL if unset.
+DataFile ValueRef::getFile() const
+{
+   return value ? value->reg.file : FILE_NULL;
+}
+
+// Size in bytes of the referenced value; 0 if unset.
+unsigned int ValueRef::getSize() const
+{
+   return value ? value->reg.size : 0;
+}
+
+// Representative of the referenced value's coalescing class ('join').
+Value *ValueRef::rep() const
+{
+   assert(value);
+   return value->join;
+}
+
+// Representative of the defined value's coalescing class ('join').
+Value *ValueDef::rep() const
+{
+   assert(value);
+   return value->join;
+}
+
+// Register file of the defined value; FILE_NULL if unset.
+DataFile ValueDef::getFile() const
+{
+   return value ? value->reg.file : FILE_NULL;
+}
+
+// Size in bytes of the defined value; 0 if unset.
+unsigned int ValueDef::getSize() const
+{
+   return value ? value->reg.size : 0;
+}
+
+// Replace this def's value with its SSA counterpart, remembering the
+// original (pre-SSA) lvalue so it can later be queried via preSSA().
+void ValueDef::setSSA(LValue *lval)
+{
+   origin = value->asLValue();
+   set(lval);
+}
+
+// The lvalue this def targeted before SSA conversion (see setSSA).
+const LValue *ValueDef::preSSA() const
+{
+   return origin;
+}
+
+// Instruction of this value's first def, or NULL if it has no defs.
+Instruction *Value::getInsn() const
+{
+   return defs.empty() ? NULL : defs.front()->getInsn();
+}
+
+// Return the single instruction that defines this value, or NULL if it
+// has no defs. After register allocation the def lists of coalesced
+// values are shared, so when this value is not its own representative we
+// search for the def that targets *this* value specifically.
+Instruction *Value::getUniqueInsn() const
+{
+   if (defs.empty())
+      return NULL;
+
+   // after regalloc, the definitions of coalesced values are linked
+   if (join != this) {
+      for (DefCIterator it = defs.begin(); it != defs.end(); ++it)
+         if ((*it)->get() == this)
+            return (*it)->getInsn();
+      // should be unreachable and trigger assertion at the end
+   }
+#ifdef DEBUG
+   // sanity check: a value without an assigned register id must not have
+   // more than one def that targets it directly
+   if (reg.data.id < 0) {
+      int n = 0;
+      for (DefCIterator it = defs.begin(); n < 2 && it != defs.end(); ++it)
+         if ((*it)->get() == this) // don't count joined values
+            ++n;
+      if (n > 1)
+         WARN("value %%%i not uniquely defined\n", id); // return NULL ?
+   }
+#endif
+   assert(defs.front()->get() == this);
+   return defs.front()->getInsn();
+}
+
+// True when the defs of this instruction have register constraints:
+// either more than one def, or the defs of an OP_UNION must coalesce.
+inline bool Instruction::constrainedDefs() const
+{
+   return defExists(1) || op == OP_UNION;
+}
+
+// Address value used by source s in dimension dim, or NULL if direct.
+Value *Instruction::getIndirect(int s, int dim) const
+{
+   return srcs[s].isIndirect(dim) ? getSrc(srcs[s].indirect[dim]) : NULL;
+}
+
+// The predicate source value, or NULL if the instruction is unpredicated.
+Value *Instruction::getPredicate() const
+{
+   return (predSrc >= 0) ? getSrc(predSrc) : NULL;
+}
+
+// Attach (val != NULL) or detach (val == NULL) a condition-flags output.
+// Once a flags def slot has been chosen it is reused on later calls, so
+// the index 'd' only takes effect the first time a def is attached.
+void Instruction::setFlagsDef(int d, Value *val)
+{
+   if (val) {
+      if (flagsDef < 0)
+         flagsDef = d;
+      setDef(flagsDef, val);
+   } else {
+      if (flagsDef >= 0) {
+         setDef(flagsDef, NULL);
+         flagsDef = -1;
+      }
+   }
+}
+
+// Attach a condition-flags input at source slot s.
+void Instruction::setFlagsSrc(int s, Value *val)
+{
+   flagsSrc = s;
+   setSrc(flagsSrc, val);
+}
+
+// Indirect texture (resource) index source, or NULL if direct.
+Value *TexInstruction::getIndirectR() const
+{
+   return tex.rIndirectSrc >= 0 ? getSrc(tex.rIndirectSrc) : NULL;
+}
+
+// Indirect sampler index source, or NULL if direct.
+// Fixed: this was a copy-paste of getIndirectR() and wrongly consulted
+// tex.rIndirectSrc (the resource indirection) instead of tex.sIndirectSrc.
+Value *TexInstruction::getIndirectS() const
+{
+   return tex.sIndirectSrc >= 0 ? getSrc(tex.sIndirectSrc) : NULL;
+}
+
+// Downcast to CmpInstruction if the opcode is a comparison
+// (OP_SET_AND..OP_SLCT, excluding OP_SELP); NULL otherwise.
+CmpInstruction *Instruction::asCmp()
+{
+   if (op >= OP_SET_AND && op <= OP_SLCT && op != OP_SELP)
+      return static_cast<CmpInstruction *>(this);
+   return NULL;
+}
+
+// const overload of asCmp(); must use the same opcode range.
+const CmpInstruction *Instruction::asCmp() const
+{
+   if (op >= OP_SET_AND && op <= OP_SLCT && op != OP_SELP)
+      return static_cast<const CmpInstruction *>(this);
+   return NULL;
+}
+
+// Downcast to FlowInstruction if the opcode is a control flow op
+// (OP_BRA..OP_JOIN range); NULL otherwise.
+FlowInstruction *Instruction::asFlow()
+{
+   if (op >= OP_BRA && op <= OP_JOIN)
+      return static_cast<FlowInstruction *>(this);
+   return NULL;
+}
+
+// const overload of asFlow().
+// Fixed: the range check ended at OP_JOINAT here while the non-const
+// overload ends at OP_JOIN; the two must classify instructions
+// identically, so use the same OP_BRA..OP_JOIN range.
+const FlowInstruction *Instruction::asFlow() const
+{
+   if (op >= OP_BRA && op <= OP_JOIN)
+      return static_cast<const FlowInstruction *>(this);
+   return NULL;
+}
+
+// Downcast to TexInstruction for texture *and* surface opcodes
+// (OP_TEX..OP_SULEA); wider than isTextureOp() by design.
+TexInstruction *Instruction::asTex()
+{
+   if (op >= OP_TEX && op <= OP_SULEA)
+      return static_cast<TexInstruction *>(this);
+   return NULL;
+}
+
+// const overload of asTex(); must use the same opcode range.
+const TexInstruction *Instruction::asTex() const
+{
+   if (op >= OP_TEX && op <= OP_SULEA)
+      return static_cast<const TexInstruction *>(this);
+   return NULL;
+}
+
+// Clone an instruction for re-insertion in the same function: each source
+// is mapped to itself, so the clone shares the original's source values
+// (defs are handled by DeepClonePolicy — presumably replaced with fresh
+// values; confirm against the policy implementation).
+static inline Instruction *cloneForward(Function *ctx, Instruction *obj)
+{
+   DeepClonePolicy<Function> pol(ctx);
+
+   for (int i = 0; obj->srcExists(i); ++i)
+      pol.set(obj->getSrc(i), obj->getSrc(i));
+
+   return obj->clone(pol);
+}
+
+// XXX: use a virtual function so we're really really safe ?
+LValue *Value::asLValue()
+{
+   // register-like files (FILE_GPR..FILE_ADDRESS) hold LValues
+   if (reg.file >= FILE_GPR && reg.file <= FILE_ADDRESS)
+      return static_cast<LValue *>(this);
+   return NULL;
+}
+
+// Downcast to Symbol: everything from FILE_MEMORY_CONST upward is
+// treated as symbolic (memory / input / output / system value).
+Symbol *Value::asSym()
+{
+   if (reg.file >= FILE_MEMORY_CONST)
+      return static_cast<Symbol *>(this);
+   return NULL;
+}
+
+// const overload of asSym(); must use the same file check.
+const Symbol *Value::asSym() const
+{
+   if (reg.file >= FILE_MEMORY_CONST)
+      return static_cast<const Symbol *>(this);
+   return NULL;
+}
+
+// Set the byte offset of this symbol within its file/base.
+void Symbol::setOffset(int32_t offset)
+{
+   reg.data.offset = offset;
+}
+
+// Make this symbol an offset relative to another (base) symbol.
+void Symbol::setAddress(Symbol *base, int32_t offset)
+{
+   baseSym = base;
+   reg.data.offset = offset;
+}
+
+// Make this symbol refer to a system value (e.g. position, face) with
+// the given component index.
+void Symbol::setSV(SVSemantic sv, uint32_t index)
+{
+   reg.data.sv.sv = sv;
+   reg.data.sv.index = index;
+}
+
+// Downcast to ImmediateValue if this value lives in FILE_IMMEDIATE.
+ImmediateValue *Value::asImm()
+{
+   if (reg.file == FILE_IMMEDIATE)
+      return static_cast<ImmediateValue *>(this);
+   return NULL;
+}
+
+// const overload of asImm(); must use the same file check.
+const ImmediateValue *Value::asImm() const
+{
+   if (reg.file == FILE_IMMEDIATE)
+      return static_cast<const ImmediateValue *>(this);
+   return NULL;
+}
+
+// Recover the Value from a generic iterator position (stored as void *).
+Value *Value::get(Iterator &it)
+{
+   return reinterpret_cast<Value *>(it.get());
+}
+
+// True if this block is reachable from 'by' without passing through
+// 'term' (delegates to the CFG node query).
+bool BasicBlock::reachableBy(const BasicBlock *by, const BasicBlock *term)
+{
+   return cfg.reachableBy(&by->cfg, &term->cfg);
+}
+
+// Recover the BasicBlock from a generic iterator position.
+BasicBlock *BasicBlock::get(Iterator &iter)
+{
+   return reinterpret_cast<BasicBlock *>(iter.get());
+}
+
+// Recover the BasicBlock owning a CFG node (stored in node->data).
+BasicBlock *BasicBlock::get(Graph::Node *node)
+{
+   assert(node);
+   return reinterpret_cast<BasicBlock *>(node->data);
+}
+
+// Recover the Function owning a call-graph node (stored in node->data).
+Function *Function::get(Graph::Node *node)
+{
+   assert(node);
+   return reinterpret_cast<Function *>(node->data);
+}
+
+// Look up an LValue of this function by its id (bounds-checked).
+LValue *Function::getLValue(int id)
+{
+   assert((unsigned int)id < (unsigned int)allLValues.getSize());
+   return reinterpret_cast<LValue *>(allLValues.get(id));
+}
+
+#endif // __NV50_IR_INLINES_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
new file mode 100644
index 0000000..56eaad3
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -0,0 +1,1101 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+#include "codegen/nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+// nv50 doesn't support 32 bit integer multiplication
+//
+// ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
+// -------------------
+// al*bh 00 HI32: (al * bh + ah * bl) >> 16 + (ah * bh) +
+// ah*bh 00 00 ( carry1) << 16 + ( carry2)
+// al*bl
+// ah*bl 00
+//
+// fffe0001 + fffe0001
+static bool
+expandIntegerMUL(BuildUtil *bld, Instruction *mul)
+{
+   // Expand a full-width integer multiply into half-width MUL/MAD plus
+   // shifts, per the schoolbook decomposition in the comment above.
+   // Returns false (leaving 'mul' untouched) for unsupported types.
+   const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
+
+   DataType fTy = mul->sType; // full type
+   DataType hTy;              // half-width type of the split operands
+   switch (fTy) {
+   case TYPE_S32: hTy = TYPE_S16; break;
+   case TYPE_U32: hTy = TYPE_U16; break;
+   case TYPE_U64: hTy = TYPE_U32; break;
+   case TYPE_S64: hTy = TYPE_S32; break;
+   default:
+      return false;
+   }
+   unsigned int fullSize = typeSizeof(fTy);
+   unsigned int halfSize = typeSizeof(hTy);
+
+   Instruction *i[9];
+
+   bld->setPosition(mul, true);
+
+   Value *a[2], *b[2];
+   Value *c[2];
+   Value *t[4];
+   for (int j = 0; j < 4; ++j)
+      t[j] = bld->getSSA(fullSize);
+
+   // split sources into halves
+   i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
+   i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
+
+   // cross terms, shifted into place, plus the low product
+   i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
+   i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
+   i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
+   i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
+
+   if (highResult) {
+      Value *r[3];
+      // NOTE(review): for 64-bit types halfSize == 4, so this computes
+      // 1 << 32 in int arithmetic — confirm the intended operand width.
+      Value *imm = bld->loadImm(NULL, 1 << (halfSize * 8));
+      c[0] = bld->getSSA(1, FILE_FLAGS);
+      c[1] = bld->getSSA(1, FILE_FLAGS);
+      for (int j = 0; j < 3; ++j)
+         r[j] = bld->getSSA(fullSize);
+
+      i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
+      i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
+      bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
+      i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
+
+      // set carry defs / sources
+      i[3]->setFlagsDef(1, c[0]);
+      i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
+      i[6]->setPredicate(CC_C, c[0]);
+      i[5]->setFlagsSrc(3, c[1]);
+   } else {
+      bld->mkMov(mul->getDef(0), t[3]);
+   }
+   delete_Instruction(bld->getProgram(), mul);
+
+   // the MUL/MAD partial products (i[2]..i[4], and i[5] in the high case)
+   // operate on half-width sources; shifts and the correction ADD keep
+   // the full source type
+   for (int j = 2; j <= (highResult ? 5 : 4); ++j)
+      if (i[j])
+         i[j]->sType = hTy;
+
+   return true;
+}
+
+#define QOP_ADD 0
+#define QOP_SUBR 1
+#define QOP_SUB 2
+#define QOP_MOV2 3
+
+// UL UR LL LR
+#define QUADOP(q, r, s, t) \
+ ((QOP_##q << 6) | (QOP_##r << 4) | \
+ (QOP_##s << 2) | (QOP_##t << 0))
+
+// Post-register-allocation legalization for nv50: removes no-ops,
+// emulates PRERET on chipsets below 0xa0, splits 64-bit operations and
+// substitutes $r63 for immediate-zero sources.
+class NV50LegalizePostRA : public Pass
+{
+private:
+   virtual bool visit(Function *);
+   virtual bool visit(BasicBlock *);
+
+   // emulate OP_PRERET with branch/call pairs (pre-nva0 only)
+   void handlePRERET(FlowInstruction *);
+   // swap immediate-0 sources for r63
+   void replaceZero(Instruction *);
+
+   LValue *r63; // fixed register used in place of an immediate 0 source
+};
+
+bool
+NV50LegalizePostRA::visit(Function *fn)
+{
+   Program *prog = fn->getProgram();
+
+   // fixed register id 63, used by replaceZero() for immediate-0 sources
+   r63 = new_LValue(fn, FILE_GPR);
+   r63->reg.data.id = 63;
+
+   // this is actually per-program, but we can do it all on visiting main()
+   std::list<Instruction *> *outWrites =
+      reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+
+   if (outWrites) {
+      // Replay the export stores saved by NV50LegalizeSSA::
+      // propagateWriteToOutput: redirect the instruction that defined the
+      // stored value to write the output register directly.
+      for (std::list<Instruction *>::iterator it = outWrites->begin();
+           it != outWrites->end(); ++it)
+         (*it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (*it)->getSrc(0));
+      // instructions will be deleted on exit
+      outWrites->clear();
+   }
+
+   return true;
+}
+
+// Substitute the zero register ($r63) for every immediate source whose
+// value is 0, so the instruction no longer references an immediate.
+void
+NV50LegalizePostRA::replaceZero(Instruction *i)
+{
+   int s = 0;
+   while (i->srcExists(s)) {
+      ImmediateValue *imm = i->getSrc(s)->asImm();
+      const bool isZeroImm = imm && imm->reg.data.u64 == 0;
+      if (isZeroImm)
+         i->setSrc(s, r63);
+      ++s;
+   }
+}
+
+// Emulate PRERET: jump to the target and call to the origin from there
+//
+// WARNING: atm only works if BBs are affected by at most a single PRERET
+//
+// BB:0
+// preret BB:3
+// (...)
+// BB:3
+// (...)
+// --->
+// BB:0
+// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
+// (...)
+// BB:3
+// bra BB:3 + n1 (skip the call)
+// call BB:0 + n2 (skip bra at beginning of BB:0)
+// (...)
+void
+NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
+{
+   BasicBlock *bbE = pre->bb;        // BB containing the PRERET
+   BasicBlock *bbT = pre->target.bb; // BB the PRERET points at
+
+   // turn the original PRERET into the fixed branch at the head of bbE
+   pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
+   bbE->remove(pre);
+   bbE->insertHead(pre);
+
+   Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
+   Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
+
+   // inserted in reverse so the final order at bbT's head is: skip, call
+   bbT->insertHead(call);
+   bbT->insertHead(skip);
+
+   // NOTE: maybe split blocks to prevent the instructions from moving ?
+
+   skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1; // bra past the call
+   call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2; // call back to origin
+}
+
+bool
+NV50LegalizePostRA::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+
+   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
+   for (i = bb->getFirst(); i; i = next) {
+      next = i->next;
+      if (i->isNop()) {
+         bb->remove(i);
+      } else
+      if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
+         // chips below nva0 lack PRERET; emulate it (see handlePRERET)
+         handlePRERET(i->asFlow());
+      } else {
+         // TODO: We will want to do this before register allocation,
+         // since have to use a $c register for the carry flag.
+         if (typeSizeof(i->dType) == 8) {
+            Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
+            if (hi)
+               next = hi; // also process the newly created high-word half
+         }
+
+         // replace immediate-0 sources by $r63, except for ops that need
+         // a literal/address operand
+         if (i->op != OP_MOV && i->op != OP_PFETCH &&
+             i->op != OP_BAR &&
+             (!i->defExists(0) || i->def(0).getFile() != FILE_ADDRESS))
+            replaceZero(i);
+      }
+   }
+   // (removed a dead trailing "if (!bb->getEntry()) return true;" —
+   // both branches returned true, so the check had no effect)
+   return true;
+}
+
+// SSA-level legalization for nv50: expands integer MUL/MAD/DIV/MOD (no
+// native 32-bit integer multiply/divide), fixes up address-register defs
+// and optionally queues output writes for propagation after RA.
+class NV50LegalizeSSA : public Pass
+{
+public:
+   NV50LegalizeSSA(Program *);
+
+   virtual bool visit(BasicBlock *bb);
+
+private:
+   // queue an EXPORT's store so post-RA can write the output directly
+   void propagateWriteToOutput(Instruction *);
+   void handleDIV(Instruction *);
+   void handleMOD(Instruction *);
+   void handleMUL(Instruction *);
+   void handleAddrDef(Instruction *);
+
+   // is this instruction the canonical address-register load (SHL by 0) ?
+   inline bool isARL(const Instruction *) const;
+
+   BuildUtil bld;
+
+   // shared with NV50LegalizePostRA via Program::targetPriv; NULL if
+   // the optimization is disabled for this program type / opt level
+   std::list<Instruction *> *outWrites;
+};
+
+NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
+{
+   bld.setProgram(prog);
+
+   // export propagation only pays off for VP/GP and at higher opt levels;
+   // the list itself lives in prog->targetPriv (shared with the post-RA pass)
+   if (prog->optLevel >= 2 &&
+       (prog->getType() == Program::TYPE_GEOMETRY ||
+        prog->getType() == Program::TYPE_VERTEX))
+      outWrites =
+         reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+   else
+      outWrites = NULL;
+}
+
+void
+NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
+{
+   // only handle direct stores of a single-use value
+   if (st->src(0).isIndirect(0) || st->getSrc(1)->refCount() != 1)
+      return;
+
+   // check def instruction can store
+   Instruction *di = st->getSrc(1)->defs.front()->getInsn();
+
+   // TODO: move exports (if beneficial) in common opt pass
+   if (di->isPseudo() || isTextureOp(di->op) || di->defCount(0xff, true) > 1)
+      return;
+   for (int s = 0; di->srcExists(s); ++s)
+      if (di->src(s).getFile() == FILE_IMMEDIATE)
+         return;
+
+   // We cannot set defs to non-lvalues before register allocation, so
+   // save & remove (to save registers) the exports and replace later.
+   outWrites->push_back(st);
+   st->bb->remove(st);
+}
+
+// Recognize the canonical address-register-load pattern: a left shift
+// of a GPR by the immediate 0.
+bool
+NV50LegalizeSSA::isARL(const Instruction *i) const
+{
+   if (i->op != OP_SHL)
+      return false;
+   if (i->src(0).getFile() != FILE_GPR)
+      return false;
+
+   ImmediateValue shiftAmount;
+   if (!i->src(1).getImmediate(shiftAmount))
+      return false;
+   return shiftAmount.isInteger(0);
+}
+
+// Legalize an instruction defining an address register ($aX): rewrite
+// its $a sources into GPRs and, if the op itself is not a valid address
+// computation, route the result through a fresh ARL (SHL by 0).
+void
+NV50LegalizeSSA::handleAddrDef(Instruction *i)
+{
+   Instruction *arl;
+
+   i->getDef(0)->reg.size = 2; // $aX are only 16 bit
+
+   // only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
+   if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
+      if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
+         return;
+      if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
+         return;
+   }
+
+   // turn $a sources into $r sources (can't operate on $a)
+   for (int s = 0; i->srcExists(s); ++s) {
+      Value *a = i->getSrc(s);
+      Value *r;
+      if (a->reg.file == FILE_ADDRESS) {
+         if (a->getInsn() && isARL(a->getInsn())) {
+            // reuse the GPR the address register was loaded from
+            i->setSrc(s, a->getInsn()->getSrc(0));
+         } else {
+            bld.setPosition(i, false);
+            r = bld.getSSA();
+            bld.mkMov(r, a);
+            i->setSrc(s, r);
+         }
+      }
+   }
+   if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
+      return;
+
+   // turn result back into $a
+   bld.setPosition(i, true);
+   arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
+   i->setDef(0, arl->getSrc(0));
+}
+
+// Expand wide integer MUL/MAD via expandIntegerMUL(). A MAD is first
+// split into MUL + ADD; predication is moved off the instruction and
+// re-applied to the final defining instruction afterwards.
+void
+NV50LegalizeSSA::handleMUL(Instruction *mul)
+{
+   // float and <= 16-bit integer multiplies are natively supported
+   if (isFloatType(mul->sType) || typeSizeof(mul->sType) <= 2)
+      return;
+   Value *def = mul->getDef(0);
+   Value *pred = mul->getPredicate();
+   CondCode cc = mul->cc;
+   if (pred)
+      mul->setPredicate(CC_ALWAYS, NULL);
+
+   if (mul->op == OP_MAD) {
+      // rewrite MAD d, a, b, c  ->  MUL t, a, b; ADD d, t, c
+      Instruction *add = mul;
+      bld.setPosition(add, false);
+      Value *res = cloneShallow(func, mul->getDef(0));
+      mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
+      add->op = OP_ADD;
+      add->setSrc(0, mul->getDef(0));
+      add->setSrc(1, add->getSrc(2));
+      for (int s = 2; add->srcExists(s); ++s)
+         add->setSrc(s, NULL);
+      mul->subOp = add->subOp;
+      add->subOp = 0;
+   }
+   expandIntegerMUL(&bld, mul);
+   if (pred)
+      def->getInsn()->setPredicate(cc, pred);
+}
+
+// Use f32 division: first compute an approximate result, use it to reduce
+// the dividend, which should then be representable as f32, divide the reduced
+// dividend, and add the quotients.
+void
+NV50LegalizeSSA::handleDIV(Instruction *div)
+{
+   const DataType ty = div->sType;
+
+   // only 32-bit integer division needs this expansion
+   if (ty != TYPE_U32 && ty != TYPE_S32)
+      return;
+
+   Value *q, *q0, *qf, *aR, *aRf, *qRf, *qR, *t, *s, *m, *cond;
+
+   bld.setPosition(div, false);
+
+   Value *a, *af = bld.getSSA();
+   Value *b, *bf = bld.getSSA();
+
+   // float copies of the operands for the approximate division
+   bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
+   bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
+
+   if (isSignedType(ty)) {
+      // work on magnitudes; the sign is fixed up at the end
+      af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+      bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
+      a = bld.getSSA();
+      b = bld.getSSA();
+      bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
+      bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
+   } else {
+      a = div->getSrc(0);
+      b = div->getSrc(1);
+   }
+
+   // reciprocal, biased down by 2 ulps so the estimate never overshoots
+   bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
+   bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
+
+   bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
+   bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
+
+   // get error of 1st result
+   expandIntegerMUL(&bld,
+      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
+   bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
+
+   bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
+
+   // divide the reduced dividend and sum the partial quotients
+   bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
+   bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
+      ->rnd = ROUND_Z;
+   bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
+
+   // correction: if modulus >= divisor, add 1
+   expandIntegerMUL(&bld,
+      bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
+   bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
+   bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
+   if (!isSignedType(ty)) {
+      // SET yields 0 / -1, so subtracting it adds the correction
+      div->op = OP_SUB;
+      div->setSrc(0, q);
+      div->setSrc(1, s);
+   } else {
+      t = q;
+      bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
+      s = bld.getSSA();
+      t = bld.getSSA();
+      // fix the sign: negate iff the source signs differ (XOR sign bit)
+      bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
+         ->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
+      bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
+      bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
+
+      div->op = OP_UNION;
+      div->setSrc(0, s);
+      div->setSrc(1, t);
+   }
+}
+
+// Lower 32-bit integer MOD as a - (a / b) * b, reusing handleDIV for
+// the division and expandIntegerMUL for the multiply.
+void
+NV50LegalizeSSA::handleMOD(Instruction *mod)
+{
+   if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
+      return;
+   bld.setPosition(mod, false);
+
+   Value *q = bld.getSSA();
+   Value *m = bld.getSSA();
+
+   bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
+   handleDIV(q->getInsn());
+
+   bld.setPosition(mod, false);
+   expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
+
+   // the original instruction becomes the final subtraction
+   mod->op = OP_SUB;
+   mod->setSrc(1, m);
+}
+
+// Dispatch each instruction of the block to the matching legalization
+// handler; address-register defs are fixed up for all opcodes.
+bool
+NV50LegalizeSSA::visit(BasicBlock *bb)
+{
+   Instruction *insn, *next;
+   // skipping PHIs (don't pass them to handleAddrDef) !
+   for (insn = bb->getEntry(); insn; insn = next) {
+      next = insn->next;
+
+      switch (insn->op) {
+      case OP_EXPORT:
+         if (outWrites)
+            propagateWriteToOutput(insn);
+         break;
+      case OP_DIV:
+         handleDIV(insn);
+         break;
+      case OP_MOD:
+         handleMOD(insn);
+         break;
+      case OP_MAD:
+      case OP_MUL:
+         handleMUL(insn);
+         break;
+      default:
+         break;
+      }
+
+      if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
+         handleAddrDef(insn);
+   }
+   return true;
+}
+
+// Pre-SSA lowering for nv50: translates generic IR operations (texturing
+// variants, system value reads/writes, selects, flow pseudo-ops) into
+// sequences the nv50 target can encode.
+class NV50LoweringPreSSA : public Pass
+{
+public:
+   NV50LoweringPreSSA(Program *);
+
+private:
+   virtual bool visit(Instruction *);
+   virtual bool visit(Function *);
+
+   bool handleRDSV(Instruction *);
+   bool handleWRSV(Instruction *);
+
+   bool handleEXPORT(Instruction *);
+
+   bool handleDIV(Instruction *);
+   bool handleSQRT(Instruction *);
+   bool handlePOW(Instruction *);
+
+   bool handleSET(Instruction *);
+   bool handleSLCT(CmpInstruction *);
+   bool handleSELP(Instruction *);
+
+   bool handleTEX(TexInstruction *);
+   bool handleTXB(TexInstruction *); // I really
+   bool handleTXL(TexInstruction *); // hate
+   bool handleTXD(TexInstruction *); // these 3
+
+   bool handleCALL(Instruction *);
+   bool handlePRECONT(Instruction *);
+   bool handleCONT(Instruction *);
+
+   void checkPredicate(Instruction *);
+
+private:
+   const Target *const targ;
+
+   BuildUtil bld;
+
+   Value *tid; // thread id value, set up in visit(Function) for compute
+};
+
+NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
+   targ(prog->getTarget()), tid(NULL)
+{
+   bld.setProgram(prog);
+}
+
+bool
+NV50LoweringPreSSA::visit(Function *f)
+{
+   BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
+
+   if (prog->getType() == Program::TYPE_COMPUTE) {
+      // Add implicit "thread id" argument in $r0 to the function
+      Value *arg = new_LValue(func, FILE_GPR);
+      arg->reg.data.id = 0;
+      f->ins.push_back(arg);
+
+      // copy it into a scratch value at function entry; handleCALL()
+      // forwards this to callees
+      bld.setPosition(root, false);
+      tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
+   }
+
+   return true;
+}
+
+// Common TEX source fixups: source ordering for shadow comparators,
+// integer conversion + clamping of array layers, and lowering of cube
+// array targets via OP_TEXPREP to a 2D array lookup.
+bool
+NV50LoweringPreSSA::handleTEX(TexInstruction *i)
+{
+   const int arg = i->tex.target.getArgCount();
+   const int dref = arg;
+   const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
+
+   // dref comes before bias/lod
+   if (i->tex.target.isShadow())
+      if (i->op == OP_TXB || i->op == OP_TXL)
+         i->swapSources(dref, lod);
+
+   // array index must be converted to u32
+   if (i->tex.target.isArray()) {
+      Value *layer = i->getSrc(arg - 1);
+      LValue *src = new_LValue(func, FILE_GPR);
+      bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
+      // clamp to the maximum layer index
+      bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
+      i->setSrc(arg - 1, src);
+
+      if (i->tex.target.isCube()) {
+         // lower cube array access: OP_TEXPREP computes 2d-array
+         // coordinates from the cube coords + layer
+         std::vector<Value *> acube, a2d;
+         int c;
+
+         acube.resize(4);
+         for (c = 0; c < 4; ++c)
+            acube[c] = i->getSrc(c);
+         a2d.resize(4);
+         for (c = 0; c < 3; ++c)
+            a2d[c] = new_LValue(func, FILE_GPR);
+         a2d[3] = NULL;
+
+         bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
+                   a2d, acube)->asTex()->tex.mask = 0x7;
+
+         // replace the coords and close the gap left by the dropped source
+         for (c = 0; c < 3; ++c)
+            i->setSrc(c, a2d[c]);
+         i->setSrc(c, NULL);
+         for (; i->srcExists(c + 1); ++c)
+            i->setSrc(c, i->getSrc(c + 1));
+
+         i->tex.target = i->tex.target.isShadow() ?
+            TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
+      }
+   }
+
+   // texel offsets are 3 immediate fields in the instruction,
+   // nv50 cannot do textureGatherOffsets
+   assert(i->tex.useOffsets <= 1);
+
+   return true;
+}
+
+// Bias must be equal for all threads of a quad or lod calculation will fail.
+//
+// The lanes of a quad are grouped by the bit in the condition register they
+// have set, which is selected by differing bias values.
+// Move the input values for TEX into a new register set for each group and
+// execute TEX only for a specific group.
+// We always need to use 4 new registers for the inputs/outputs because the
+// implicitly calculated derivatives must be correct.
+//
+// TODO: move to SSA phase so we can easily determine whether bias is constant
+bool
+NV50LoweringPreSSA::handleTXB(TexInstruction *i)
+{
+   const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
+   int l, d;
+
+   handleTEX(i);
+   Value *bias = i->getSrc(i->tex.target.getArgCount());
+   if (bias->isUniform())
+      return true; // all lanes agree, nothing to do
+
+   // build a bitmask of which lanes share lane 0's bias (bit l set if
+   // lane l differs); the OP_UNION combines the per-lane bit values
+   Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
+                                 bld.loadImm(NULL, 1));
+   bld.setPosition(cond, false);
+
+   for (l = 1; l < 4; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *bit = bld.getSSA();
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      Value *imm = bld.loadImm(NULL, (1 << l));
+      // compare lane l's bias against the others via a quad subtract
+      bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
+      bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
+      cond->setSrc(l, bit);
+   }
+   Value *flags = bld.getScratch(1, FILE_FLAGS);
+   bld.setPosition(cond, true);
+   bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
+
+   // one predicated TEX per lane group; see the comment block above
+   Instruction *tex[4];
+   for (l = 0; l < 4; ++l) {
+      (tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
+      bld.insert(tex[l]);
+   }
+
+   // merge the per-group results back into the original defs
+   Value *res[4][4];
+   for (d = 0; i->defExists(d); ++d)
+      res[0][d] = tex[0]->getDef(d);
+   for (l = 1; l < 4; ++l) {
+      for (d = 0; tex[l]->defExists(d); ++d) {
+         res[l][d] = cloneShallow(func, res[0][d]);
+         bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
+      }
+   }
+
+   for (d = 0; i->defExists(d); ++d) {
+      Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
+      for (l = 0; l < 4; ++l)
+         dst->setSrc(l, res[l][d]);
+   }
+   delete_Instruction(prog, i);
+   return true;
+}
+
+// LOD must be equal for all threads of a quad.
+// Unlike with TXB, here we can just diverge since there's no LOD calculation
+// that would require all 4 threads' sources to be set up properly.
+bool
+NV50LoweringPreSSA::handleTXL(TexInstruction *i)
+{
+   handleTEX(i);
+   Value *lod = i->getSrc(i->tex.target.getArgCount());
+   if (lod->isUniform())
+      return true; // all lanes agree, nothing to do
+
+   // isolate the TXL in its own block so each lane group can branch to
+   // it independently, rejoining afterwards
+   BasicBlock *currBB = i->bb;
+   BasicBlock *texiBB = i->bb->splitBefore(i, false);
+   BasicBlock *joinBB = i->bb->splitAfter(i);
+
+   bld.setPosition(currBB, true);
+   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+
+   // chain of blocks: lanes whose LOD matches lane l branch to the TXL,
+   // the rest fall through to the next lane's test
+   for (int l = 0; l <= 3; ++l) {
+      const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
+      Value *pred = bld.getScratch(1, FILE_FLAGS);
+      bld.setPosition(currBB, true);
+      bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
+      bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
+      currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
+      if (l <= 2) {
+         BasicBlock *laneBB = new BasicBlock(func);
+         currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
+         currBB = laneBB;
+      }
+   }
+   bld.setPosition(joinBB, false);
+   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
+   return true;
+}
+
+// Lower TXD (texturing with explicit derivatives): run four TEX ops, one
+// per quad lane, each with coordinates perturbed by that lane's dPdx/dPdy
+// via quadops so the implicit derivatives come out right; keep each
+// result only in its own lane and recombine with OP_UNION.
+bool
+NV50LoweringPreSSA::handleTXD(TexInstruction *i)
+{
+   static const uint8_t qOps[4][2] =
+   {
+      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
+      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
+      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
+   };
+   Value *def[4][4];
+   Value *crd[3];
+   Instruction *tex;
+   Value *zero = bld.loadImm(bld.getSSA(), 0);
+   int l, c;
+   const int dim = i->tex.target.getDim();
+
+   handleTEX(i);
+   i->op = OP_TEX; // no need to clone dPdx/dPdy later
+
+   for (c = 0; c < dim; ++c)
+      crd[c] = bld.getScratch();
+
+   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
+   for (l = 0; l < 4; ++l) {
+      // mov coordinates from lane l to all lanes
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
+      // add dPdx from lane l to lanes dx
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
+      // add dPdy from lane l to lanes dy
+      for (c = 0; c < dim; ++c)
+         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
+      // texture
+      bld.insert(tex = cloneForward(func, i));
+      for (c = 0; c < dim; ++c)
+         tex->setSrc(c, crd[c]);
+      // save results (fixed, lane-masked movs keep only lane l's value)
+      for (c = 0; i->defExists(c); ++c) {
+         Instruction *mov;
+         def[c][l] = bld.getSSA();
+         mov = bld.mkMov(def[c][l], tex->getDef(c));
+         mov->fixed = 1;
+         mov->lanes = 1 << l;
+      }
+   }
+   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
+
+   for (c = 0; i->defExists(c); ++c) {
+      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
+      for (l = 0; l < 4; ++l)
+         u->setSrc(l, def[c][l]);
+   }
+
+   i->bb->remove(i);
+   return true;
+}
+
+// A float SET must produce 0.0 / 1.0. The integer SET yields 0 or ~0
+// (-1 as signed); ABS turns that into 0/1, and the CVT converts the
+// integer result to float.
+bool
+NV50LoweringPreSSA::handleSET(Instruction *i)
+{
+   if (i->dType == TYPE_F32) {
+      bld.setPosition(i, true);
+      i->dType = TYPE_U32;
+      bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
+      bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
+   }
+   return true;
+}
+
+// Lower SLCT: turn the original instruction into an OP_SET on src2
+// (against 0) that defines a flags value, then select between src0/src1
+// with two predicated movs merged by OP_UNION.
+bool
+NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
+{
+   Value *src0 = bld.getSSA();
+   Value *src1 = bld.getSSA();
+   Value *pred = bld.getScratch(1, FILE_FLAGS);
+
+   Value *v0 = i->getSrc(0);
+   Value *v1 = i->getSrc(1);
+   // XXX: these probably shouldn't be immediates in the first place ...
+   if (v0->asImm())
+      v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+   if (v1->asImm())
+      v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+   bld.setPosition(i, true);
+   bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
+   bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
+   bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+
+   // repurpose the SLCT itself as the comparison producing 'pred'
+   bld.setPosition(i, false);
+   i->op = OP_SET;
+   i->setFlagsDef(0, pred);
+   i->dType = TYPE_U8;
+   i->setSrc(0, i->getSrc(2));
+   i->setSrc(2, NULL);
+   i->setSrc(1, bld.loadImm(NULL, 0));
+
+   return true;
+}
+
+bool
+NV50LoweringPreSSA::handleSELP(Instruction *i)
+{
+ Value *src0 = bld.getSSA();
+ Value *src1 = bld.getSSA();
+
+ Value *v0 = i->getSrc(0);
+ Value *v1 = i->getSrc(1);
+ if (v0->asImm())
+ v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
+ if (v1->asImm())
+ v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
+
+ bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
+ bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
+ bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
+ delete_Instruction(prog, i);
+ return true;
+}
+
+bool
+NV50LoweringPreSSA::handleWRSV(Instruction *i)
+{
+ Symbol *sym = i->getSrc(0)->asSym();
+
+ // these are all shader outputs, $sreg are not writeable
+ uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
+ if (addr >= 0x400)
+ return false;
+ sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
+
+ bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
+
+ bld.getBB()->remove(i);
+ return true;
+}
+
+bool
+NV50LoweringPreSSA::handleCALL(Instruction *i)
+{
+ if (prog->getType() == Program::TYPE_COMPUTE) {
+ // Add implicit "thread id" argument in $r0 to the function
+ i->setSrc(i->srcCount(), tid);
+ }
+ return true;
+}
+
// PRECONT markers are not needed on nv50 — continue is handled via a
// plain branch (see handleCONT) — so simply drop them.
bool
NV50LoweringPreSSA::handlePRECONT(Instruction *i)
{
   delete_Instruction(prog, i);
   return true;
}
+
// Lower CONT to an unconditional branch; nv50 implements continue via
// normal control flow (the branch target is the flow instruction's).
bool
NV50LoweringPreSSA::handleCONT(Instruction *i)
{
   i->op = OP_BRA;
   return true;
}
+
// Lower RDSV (read system value) to what nv50 actually supports:
// attribute interpolation for fragment inputs, shared-memory loads for
// compute grid info, and bit extraction from the packed thread id.
bool
NV50LoweringPreSSA::handleRDSV(Instruction *i)
{
   Symbol *sym = i->getSrc(0)->asSym();
   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
   Value *def = i->getDef(0);
   SVSemantic sv = sym->reg.data.sv.sv;
   int idx = sym->reg.data.sv.index;

   // addresses >= 0x400 denote special registers; keep the RDSV and let
   // the emitter turn it into a mov from $sreg
   if (addr >= 0x400) // mov $sreg
      return true;

   switch (sv) {
   case SV_POSITION:
      assert(prog->getType() == Program::TYPE_FRAGMENT);
      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
      break;
   case SV_FACE:
      bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
      if (i->dType == TYPE_F32) {
         // map the facing bit (bit 31) to a float:
         // bit set   -> 0x80000000 ^ 0xbf800000 = 0x3f800000 (+1.0f)
         // bit clear -> 0x00000000 ^ 0xbf800000 = 0xbf800000 (-1.0f)
         bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
         bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
      }
      break;
   case SV_NCTAID:
   case SV_CTAID:
   case SV_NTID:
      // grid/block dimensions beyond what exists are constants
      if ((sv == SV_NCTAID && idx >= 2) ||
          (sv == SV_NTID && idx >= 3)) {
         bld.mkMov(def, bld.mkImm(1));
      } else if (sv == SV_CTAID && idx >= 2) {
         bld.mkMov(def, bld.mkImm(0));
      } else {
         // read as 16 bit values from shared memory (presumably placed
         // there by the driver at launch — TODO confirm)
         Value *x = bld.getSSA(2);
         bld.mkOp1(OP_LOAD, TYPE_U16, x,
                   bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
         bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
      }
      break;
   case SV_TID:
      // the thread id components are packed into a single register:
      // x in bits 0..15, y in bits 16..25, z in bits 26 and up
      if (idx == 0) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
      } else if (idx == 1) {
         bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
         bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
      } else if (idx == 2) {
         bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
      } else {
         bld.mkMov(def, bld.mkImm(0));
      }
      break;
   default:
      // everything else is fetched like a regular shader input
      bld.mkFetch(i->getDef(0), i->dType,
                  FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
      break;
   }
   bld.getBB()->remove(i);
   return true;
}
+
+bool
+NV50LoweringPreSSA::handleDIV(Instruction *i)
+{
+ if (!isFloatType(i->dType))
+ return true;
+ bld.setPosition(i, false);
+ Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
+ i->op = OP_MUL;
+ i->setSrc(1, rcp->getDef(0));
+ return true;
+}
+
+bool
+NV50LoweringPreSSA::handleSQRT(Instruction *i)
+{
+ Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
+ bld.getSSA(), i->getSrc(0));
+ i->op = OP_MUL;
+ i->setSrc(1, rsq->getDef(0));
+
+ return true;
+}
+
+bool
+NV50LoweringPreSSA::handlePOW(Instruction *i)
+{
+ LValue *val = bld.getScratch();
+
+ bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
+ bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
+ bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
+
+ i->op = OP_EX2;
+ i->setSrc(0, val);
+ i->setSrc(1, NULL);
+
+ return true;
+}
+
// Fragment program outputs have to end up in specific GPRs at program
// exit; rewrite the EXPORT into a mov targeting a fixed register id.
bool
NV50LoweringPreSSA::handleEXPORT(Instruction *i)
{
   if (prog->getType() == Program::TYPE_FRAGMENT) {
      if (i->getIndirect(0, 0)) {
         // TODO: redirect to l[] here, load to GPRs at exit
         return false;
      } else {
         int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units

         // turn the export into a mov into a GPR with a fixed register id;
         // MOV_FINAL presumably marks it as a final output move for later
         // passes — TODO confirm semantics in the emitter
         i->op = OP_MOV;
         i->subOp = NV50_IR_SUBOP_MOV_FINAL;
         i->src(0).set(i->src(1));
         i->setSrc(1, NULL);
         i->setDef(0, new_LValue(func, FILE_GPR));
         i->getDef(0)->reg.data.id = id;

         // record the highest GPR used for outputs
         prog->maxGPR = MAX2(prog->maxGPR, id);
      }
   }
   return true;
}
+
// Set flags according to predicate and make the instruction read $cX.
// A boolean predicate living in a GPR is first compared against zero so
// that the instruction can be predicated on the flags register.
void
NV50LoweringPreSSA::checkPredicate(Instruction *insn)
{
   Value *pred = insn->getPredicate();
   Value *cdst;

   // nothing to do if there is no predicate or it is already in flags
   if (!pred || pred->reg.file == FILE_FLAGS)
      return;
   cdst = bld.getSSA(1, FILE_FLAGS);

   // flags = (pred != 0)
   bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, bld.loadImm(NULL, 0), pred);

   insn->setPredicate(insn->cc, cdst);
}
+
//
// - add quadop dance for texturing
// - put FP outputs in GPRs
// - convert instruction sequences
//
bool
NV50LoweringPreSSA::visit(Instruction *i)
{
   // handlers that insert code expect the builder just before i
   bld.setPosition(i, false);

   // move GPR predicates into the flags register first
   if (i->cc != CC_ALWAYS)
      checkPredicate(i);

   switch (i->op) {
   case OP_TEX:
   case OP_TXF:
   case OP_TXG:
      return handleTEX(i->asTex());
   case OP_TXB:
      return handleTXB(i->asTex());
   case OP_TXL:
      return handleTXL(i->asTex());
   case OP_TXD:
      return handleTXD(i->asTex());
   case OP_EX2:
      // EX2 requires its argument to be pre-processed by PREEX2
      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
      i->setSrc(0, i->getDef(0));
      break;
   case OP_SET:
      return handleSET(i);
   case OP_SLCT:
      return handleSLCT(i->asCmp());
   case OP_SELP:
      return handleSELP(i);
   case OP_POW:
      return handlePOW(i);
   case OP_DIV:
      return handleDIV(i);
   case OP_SQRT:
      return handleSQRT(i);
   case OP_EXPORT:
      return handleEXPORT(i);
   case OP_RDSV:
      return handleRDSV(i);
   case OP_WRSV:
      return handleWRSV(i);
   case OP_CALL:
      return handleCALL(i);
   case OP_PRECONT:
      return handlePRECONT(i);
   case OP_CONT:
      return handleCONT(i);
   default:
      break;
   }
   return true;
}
+
+bool
+TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
+{
+ bool ret = false;
+
+ if (stage == CG_STAGE_PRE_SSA) {
+ NV50LoweringPreSSA pass(prog);
+ ret = pass.run(prog, false, true);
+ } else
+ if (stage == CG_STAGE_SSA) {
+ if (!prog->targetPriv)
+ prog->targetPriv = new std::list<Instruction *>();
+ NV50LegalizeSSA pass(prog);
+ ret = pass.run(prog, false, true);
+ } else
+ if (stage == CG_STAGE_POST_RA) {
+ NV50LegalizePostRA pass;
+ ret = pass.run(prog, false, true);
+ if (prog->targetPriv)
+ delete reinterpret_cast<std::list<Instruction *> *>(prog->targetPriv);
+ }
+ return ret;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
new file mode 100644
index 0000000..8d94dd1
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -0,0 +1,1597 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_build_util.h"
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+#include <limits>
+
+namespace nv50_ir {
+
// per-lane operation codes used to build quadop masks (2 bits per lane)
#define QOP_ADD 0
#define QOP_SUBR 1
#define QOP_SUB 2
#define QOP_MOV2 3

// UL UR LL LR
// Pack one QOP_* code per quad lane (upper-left, upper-right,
// lower-left, lower-right) into a single 8 bit subOp mask.
#define QUADOP(q, r, s, t) \
   ((QOP_##q << 6) | (QOP_##r << 4) | \
    (QOP_##s << 2) | (QOP_##t << 0))
+
// Legalization run on the SSA form: replaces operations that need the
// builtin library (integer division/modulus, double precision
// reciprocal/rsqrt) by calls or emulation sequences.
class NVC0LegalizeSSA : public Pass
{
private:
   virtual bool visit(BasicBlock *);
   virtual bool visit(Function *);

   // we want to insert calls to the builtin library only after optimization
   void handleDIV(Instruction *); // integer division, modulus
   void handleRCPRSQ(Instruction *); // double precision float recip/rsqrt

private:
   BuildUtil bld; // instruction builder, positioned per handled insn
};
+
// Replace an integer DIV/MOD by a call to the builtin division routine.
// Operands are moved into fixed registers $r0/$r1 before the call; the
// DIV result is taken from the $r0 value, the MOD result from $r1.
void
NVC0LegalizeSSA::handleDIV(Instruction *i)
{
   FlowInstruction *call;
   int builtin;
   Value *def[2];

   bld.setPosition(i, false);
   def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
   def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
   switch (i->dType) {
   case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
   case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
   default:
      // NOTE(review): the two movs emitted above are left behind in this
      // case — confirm only U32/S32 typed DIV/MOD can reach here
      return;
   }
   call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
   bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
   // declare the registers the builtin routine clobbers so RA steers
   // live values away from them
   bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
   bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);

   call->fixed = 1;
   call->absolute = call->builtin = 1;
   call->target.builtin = builtin;
   delete_Instruction(prog, i);
}
+
// Double precision RCP/RSQ would need a builtin/emulation sequence as
// well; not implemented yet.
void
NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
{
   // TODO
}
+
// Bind the builder to the function's program before the per-BB pass.
bool
NVC0LegalizeSSA::visit(Function *fn)
{
   bld.setProgram(fn->getProgram());
   return true;
}
+
+bool
+NVC0LegalizeSSA::visit(BasicBlock *bb)
+{
+ Instruction *next;
+ for (Instruction *i = bb->getEntry(); i; i = next) {
+ next = i->next;
+ if (i->dType == TYPE_F32)
+ continue;
+ switch (i->op) {
+ case OP_DIV:
+ case OP_MOD:
+ handleDIV(i);
+ break;
+ case OP_RCP:
+ case OP_RSQ:
+ if (i->dType == TYPE_F64)
+ handleRCPRSQ(i);
+ break;
+ default:
+ break;
+ }
+ }
+ return true;
+}
+
// Legalization run after register allocation: replaces immediate zeroes
// by the zero register, removes pseudo ops and no-ops, splits 64 bit
// operations, fixes up CONT/JOIN control flow and inserts texture
// barriers where the target requires them.
class NVC0LegalizePostRA : public Pass
{
public:
   NVC0LegalizePostRA(const Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);

   void replaceZero(Instruction *);
   bool tryReplaceContWithBra(BasicBlock *);
   void propagateJoin(BasicBlock *);

   // a first use of a texture result together with the barrier level
   // (outstanding TEX count) required at that point
   struct TexUse
   {
      TexUse(Instruction *use, const Instruction *tex)
         : insn(use), tex(tex), level(-1) { }
      Instruction *insn;
      const Instruction *tex; // or split / mov
      int level;
   };
   // min/max number of outstanding texture loads, used to cull barriers
   struct Limits
   {
      Limits() { }
      Limits(int min, int max) : min(min), max(max) { }
      int min, max;
   };
   bool insertTextureBarriers(Function *);
   inline bool insnDominatedBy(const Instruction *, const Instruction *) const;
   void findFirstUses(const Instruction *tex, const Instruction *def,
                      std::list<TexUse>&);
   void findOverwritingDefs(const Instruction *tex, Instruction *insn,
                            const BasicBlock *term,
                            std::list<TexUse>&);
   void addTexUse(std::list<TexUse>&, Instruction *, const Instruction *);
   const Instruction *recurseDef(const Instruction *);

private:
   LValue *rZero; // register that reads as constant zero
   LValue *carry; // carry flag, used when splitting 64 bit ops
   const bool needTexBar; // texture barriers required (chipset >= 0xe0)
};
+
// rZero/carry are created per function in visit(Function); texture
// barriers are only needed on Kepler (chipset 0xe0 and up).
NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
   : rZero(NULL),
     carry(NULL),
     needTexBar(prog->getTarget()->getChipset() >= 0xe0)
{
}
+
+bool
+NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
+ const Instruction *early) const
+{
+ if (early->bb == later->bb)
+ return early->serial < later->serial;
+ return later->bb->dominatedBy(early->bb);
+}
+
+void
+NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
+ Instruction *usei, const Instruction *insn)
+{
+ bool add = true;
+ for (std::list<TexUse>::iterator it = uses.begin();
+ it != uses.end();) {
+ if (insnDominatedBy(usei, it->insn)) {
+ add = false;
+ break;
+ }
+ if (insnDominatedBy(it->insn, usei))
+ it = uses.erase(it);
+ else
+ ++it;
+ }
+ if (add)
+ uses.push_back(TexUse(usei, insn));
+}
+
// Starting from @insn, walk back through value-forwarding pseudo ops and
// record (via addTexUse) any real def that could overwrite one of
// @texi's outputs, restricted to blocks reachable from the TEX without
// passing through @term.
void
NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
                                        Instruction *insn,
                                        const BasicBlock *term,
                                        std::list<TexUse> &uses)
{
   // skip no-op movs (dst and src allocated to the same register)
   // NOTE(review): if getUniqueInsn() ever returned NULL here the loop
   // condition would dereference it — confirm that cannot happen
   while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
      insn = insn->getSrc(0)->getUniqueInsn();

   if (!insn || !insn->bb->reachableBy(texi->bb, term))
      return;

   switch (insn->op) {
   /* Values not connected to the tex's definition through any of these should
    * not be conflicting.
    */
   case OP_SPLIT:
   case OP_MERGE:
   case OP_PHI:
   case OP_UNION:
      /* recurse again */
      for (int s = 0; insn->srcExists(s); ++s)
         findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
                             uses);
      break;
   default:
      // if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
      addTexUse(uses, insn, texi);
      break;
   }
}
+
// Collect the first uses that manifest in machine code of any of
// @insn's defs, looking through pseudo ops (SPLIT/MERGE/PHI/UNION) and
// no-op movs; @texi is the originating texture instruction.
void
NVC0LegalizePostRA::findFirstUses(const Instruction *texi,
                                  const Instruction *insn,
                                  std::list<TexUse> &uses)
{
   for (int d = 0; insn->defExists(d); ++d) {
      Value *v = insn->getDef(d);
      for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
         Instruction *usei = (*u)->getInsn();

         if (usei->op == OP_PHI || usei->op == OP_UNION) {
            // need a barrier before WAW cases
            for (int s = 0; usei->srcExists(s); ++s) {
               Instruction *defi = usei->getSrc(s)->getUniqueInsn();
               if (defi && &usei->src(s) != *u)
                  findOverwritingDefs(texi, defi, usei->bb, uses);
            }
         }

         if (usei->op == OP_SPLIT ||
             usei->op == OP_MERGE ||
             usei->op == OP_PHI ||
             usei->op == OP_UNION) {
            // these uses don't manifest in the machine code
            findFirstUses(texi, usei, uses);
         } else
         if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
             usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
            // no-op mov: the real use is whatever consumes its def
            findFirstUses(texi, usei, uses);
         } else {
            addTexUse(uses, usei, insn);
         }
      }
   }
}
+
// Texture barriers:
// This pass is a bit long and ugly and can probably be optimized.
//
// 1. obtain a list of TEXes and their outputs' first use(s)
// 2. calculate the barrier level of each first use (minimal number of TEXes,
//    over all paths, between the TEX and the use in question)
// 3. for each barrier, if all paths from the source TEX to that barrier
//    contain a barrier of lesser level, it can be culled
bool
NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
{
   std::list<TexUse> *uses;
   std::vector<Instruction *> texes;
   std::vector<int> bbFirstTex;
   std::vector<int> bbFirstUse;
   std::vector<int> texCounts;
   std::vector<TexUse> useVec;
   ArrayList insns;

   fn->orderInstructions(insns);

   texCounts.resize(fn->allBBlocks.getSize(), 0);
   bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
   bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());

   // tag BB CFG nodes by their id for later
   for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
      BasicBlock *bb = reinterpret_cast<BasicBlock *>(i.get());
      if (bb)
         bb->cfg.tag = bb->getId();
   }

   // gather the first uses for each TEX
   for (int i = 0; i < insns.getSize(); ++i) {
      Instruction *tex = reinterpret_cast<Instruction *>(insns.get(i));
      if (isTextureOp(tex->op)) {
         texes.push_back(tex);
         if (!texCounts.at(tex->bb->getId()))
            bbFirstTex[tex->bb->getId()] = texes.size() - 1;
         texCounts[tex->bb->getId()]++;
      }
   }
   insns.clear();
   if (texes.empty())
      return false;
   // NOTE(review): operator new throws rather than returning NULL, so
   // this check is effectively dead
   uses = new std::list<TexUse>[texes.size()];
   if (!uses)
      return false;
   for (size_t i = 0; i < texes.size(); ++i)
      findFirstUses(texes[i], texes[i], uses[i]);

   // determine the barrier level at each use
   for (size_t i = 0; i < texes.size(); ++i) {
      for (std::list<TexUse>::iterator u = uses[i].begin(); u != uses[i].end();
           ++u) {
         BasicBlock *tb = texes[i]->bb;
         BasicBlock *ub = u->insn->bb;
         if (tb == ub) {
            // same block: count the TEXes scheduled between tex and use
            u->level = 0;
            for (size_t j = i + 1; j < texes.size() &&
                    texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         } else {
            // different blocks: lightest path weight = minimal number of
            // TEXes on any path from the TEX's block to the use's block
            u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
                                                      &ub->cfg, texCounts);
            if (u->level < 0) {
               WARN("Failed to find path TEX -> TEXBAR\n");
               u->level = 0;
               continue;
            }
            // this counted all TEXes in the origin block, correct that
            u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
            // and did not count the TEXes in the destination block, add those
            for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
                    texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
                 ++j)
               u->level++;
         }
         assert(u->level >= 0);
         useVec.push_back(*u);
      }
   }
   delete[] uses;
   uses = NULL;

   // insert the barriers (uses with negative level are skipped)
   for (size_t i = 0; i < useVec.size(); ++i) {
      Instruction *prev = useVec[i].insn->prev;
      if (useVec[i].level < 0)
         continue;
      if (prev && prev->op == OP_TEXBAR) {
         // merge with an adjacent barrier, keeping the lower level
         if (prev->subOp > useVec[i].level)
            prev->subOp = useVec[i].level;
         prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
      } else {
         Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
         bar->fixed = 1;
         bar->subOp = useVec[i].level;
         // make use explicit to ease latency calculation
         bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
         useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
      }
   }

   // below optLevel 3 keep the conservative set of barriers
   if (fn->getProgram()->optLevel < 3) {
      if (uses)
         delete[] uses;
      return true;
   }

   std::vector<Limits> limitT, limitB, limitS; // entry, exit, single

   limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
   limitS.resize(fn->allBBlocks.getSize());

   // cull unneeded barriers (should do that earlier, but for simplicity)
   IteratorRef bi = fn->cfg.iteratorCFG();
   // first calculate min/max outstanding TEXes for each BB
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      int min = 0;
      int max = std::numeric_limits<int>::max();
      for (Instruction *i = bb->getFirst(); i; i = i->next) {
         if (isTextureOp(i->op)) {
            min++;
            if (max < std::numeric_limits<int>::max())
               max++;
         } else
         if (i->op == OP_TEXBAR) {
            min = MIN2(min, i->subOp);
            max = MIN2(max, i->subOp);
         }
      }
      // limits when looking at an isolated block
      limitS[bb->getId()].min = min;
      limitS[bb->getId()].max = max;
   }
   // propagate the min/max values
   for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
      for (bi->reset(); !bi->end(); bi->next()) {
         Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
         BasicBlock *bb = BasicBlock::get(n);
         const int bbId = bb->getId();
         for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
            BasicBlock *in = BasicBlock::get(ei.getNode());
            const int inId = in->getId();
            limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
            limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
         }
         // I just hope this is correct ...
         if (limitS[bbId].max == std::numeric_limits<int>::max()) {
            // no barrier
            limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
            limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
         } else {
            // block contained a barrier
            limitB[bbId].min = MIN2(limitS[bbId].max,
                                    limitT[bbId].min + limitS[bbId].min);
            limitB[bbId].max = MIN2(limitS[bbId].max,
                                    limitT[bbId].max + limitS[bbId].min);
         }
      }
   }
   // finally delete unnecessary barriers
   for (bi->reset(); !bi->end(); bi->next()) {
      Graph::Node *n = reinterpret_cast<Graph::Node *>(bi->get());
      BasicBlock *bb = BasicBlock::get(n);
      Instruction *prev = NULL;
      Instruction *next;
      int max = limitT[bb->getId()].max;
      for (Instruction *i = bb->getFirst(); i; i = next) {
         next = i->next;
         if (i->op == OP_TEXBAR) {
            // a barrier at or above the outstanding-TEX bound is useless
            if (i->subOp >= max) {
               delete_Instruction(prog, i);
               i = NULL;
            } else {
               max = i->subOp;
               if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
                  delete_Instruction(prog, prev);
                  prev = NULL;
               }
            }
         } else
         if (isTextureOp(i->op)) {
            max++;
         }
         if (i && !i->isNop())
            prev = i;
      }
   }
   if (uses)
      delete[] uses;
   return true;
}
+
// Per-function setup: insert texture barriers on targets that need them
// and create the shared zero register / carry flag used per basic block.
bool
NVC0LegalizePostRA::visit(Function *fn)
{
   if (needTexBar)
      insertTextureBarriers(fn);

   rZero = new_LValue(fn, FILE_GPR);
   carry = new_LValue(fn, FILE_FLAGS);

   // id == GPR file size: presumably the hardware register that always
   // reads as zero (one past the allocatable range) — TODO confirm
   rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
   carry->reg.data.id = 0;

   return true;
}
+
+void
+NVC0LegalizePostRA::replaceZero(Instruction *i)
+{
+ for (int s = 0; i->srcExists(s); ++s) {
+ if (s == 2 && i->op == OP_SUCLAMP)
+ continue;
+ ImmediateValue *imm = i->getSrc(s)->asImm();
+ if (imm && imm->reg.data.u64 == 0)
+ i->setSrc(s, rZero);
+ }
+}
+
// replace CONT with BRA for single unconditional continue
//
// Applies when the loop header (a block starting with PRECONT) has
// exactly two incoming edges of which one is the back edge from the
// single continue block; the PRECONT marker is removed as well.
bool
NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
{
   if (bb->cfg.incidentCount() != 2 || bb->getEntry()->op != OP_PRECONT)
      return false;
   // locate the back edge among the two incident edges
   Graph::EdgeIterator ei = bb->cfg.incident();
   if (ei.getType() != Graph::Edge::BACK)
      ei.next();
   if (ei.getType() != Graph::Edge::BACK)
      return false;
   BasicBlock *contBB = BasicBlock::get(ei.getNode());

   // the continue must be unconditional for a plain branch to be valid
   if (!contBB->getExit() || contBB->getExit()->op != OP_CONT ||
       contBB->getExit()->getPredicate())
      return false;
   contBB->getExit()->op = OP_BRA;
   bb->remove(bb->getEntry()); // delete PRECONT

   // there must not be a second back edge
   ei.next();
   assert(ei.end() || ei.getType() != Graph::Edge::BACK);
   return true;
}
+
// replace branches to join blocks with join ops
void
NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
{
   // skip blocks not starting with a JOIN, or where the JOIN has already
   // been marked must-not-propagate
   if (bb->getEntry()->op != OP_JOIN || bb->getEntry()->asFlow()->limit)
      return;
   // turn every branch into this block into a JOIN
   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
      BasicBlock *in = BasicBlock::get(ei.getNode());
      Instruction *exit = in->getExit();
      if (!exit) {
         in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
         // there should always be a terminator instruction
         WARN("inserted missing terminator in BB:%i\n", in->getId());
      } else
      if (exit->op == OP_BRA) {
         exit->op = OP_JOIN;
         exit->asFlow()->limit = 1; // must-not-propagate marker
      }
   }
   // the JOIN at the block entry is no longer needed
   bb->remove(bb->getEntry());
}
+
bool
NVC0LegalizePostRA::visit(BasicBlock *bb)
{
   Instruction *i, *next;

   // remove pseudo operations and non-fixed no-ops, split 64 bit operations
   for (i = bb->getFirst(); i; i = next) {
      next = i->next;
      if (i->op == OP_EMIT || i->op == OP_RESTART) {
         // drop defs of EMIT/RESTART that nothing reads
         if (!i->getDef(0)->refCount())
            i->setDef(0, NULL);
         if (i->src(0).getFile() == FILE_IMMEDIATE)
            i->setSrc(0, rZero); // initial value must be 0
      } else
      if (i->isNop()) {
         bb->remove(i);
      } else {
         // TODO: Move this to before register allocation for operations that
         // need the $c register !
         if (typeSizeof(i->dType) == 8) {
            Instruction *hi;
            hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
            if (hi)
               next = hi; // continue with the newly inserted high part
         }

         // keep immediate zeroes on MOV/PFETCH — presumably they can
         // encode them directly (TODO confirm)
         if (i->op != OP_MOV && i->op != OP_PFETCH)
            replaceZero(i);
      }
   }
   if (!bb->getEntry())
      return true;

   // control flow fixups based on the (possibly changed) block entry
   if (!tryReplaceContWithBra(bb))
      propagateJoin(bb);

   return true;
}
+
// General nvc0/nve4 lowering pass: turns generic IR operations (texture
// ops, atomics, system value reads/writes, exports, ...) into sequences
// the target hardware can encode.
class NVC0LoweringPass : public Pass
{
public:
   NVC0LoweringPass(Program *);

private:
   virtual bool visit(Function *);
   virtual bool visit(BasicBlock *);
   virtual bool visit(Instruction *);

   bool handleRDSV(Instruction *);
   bool handleWRSV(Instruction *);
   bool handleEXPORT(Instruction *);
   bool handleOUT(Instruction *);
   bool handleDIV(Instruction *);
   bool handleMOD(Instruction *);
   bool handleSQRT(Instruction *);
   bool handlePOW(Instruction *);
   bool handleTEX(TexInstruction *);
   bool handleTXD(TexInstruction *);
   bool handleTXQ(TexInstruction *);
   bool handleManualTXD(TexInstruction *);
   bool handleATOM(Instruction *);
   bool handleCasExch(Instruction *, bool needCctl);
   void handleSurfaceOpNVE4(TexInstruction *);

   void checkPredicate(Instruction *);

   void readTessCoord(LValue *dst, int c);

   // helpers for loading driver-provided info words from c[]
   Value *loadResInfo32(Value *ptr, uint32_t off);
   Value *loadMsInfo32(Value *ptr, uint32_t off);
   Value *loadTexHandle(Value *ptr, unsigned int slot);

   void adjustCoordinatesMS(TexInstruction *);
   void processSurfaceCoordsNVE4(TexInstruction *);

private:
   const Target *const targ;

   BuildUtil bld;

   Symbol *gMemBase;
   LValue *gpEmitAddress; // vertex output offset in geometry programs
};
+
+NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
+{
+ bld.setProgram(prog);
+ gMemBase = NULL;
+}
+
// For geometry programs, set up the vertex emit address: initialized to
// 0 at function entry and moved into $r0 before the final exit.
bool
NVC0LoweringPass::visit(Function *fn)
{
   if (prog->getType() == Program::TYPE_GEOMETRY) {
      assert(!strncmp(fn->getName(), "MAIN", 4));
      // TODO: when we generate actual functions pass this value along somehow
      bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
      gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
      if (fn->cfgExit) {
         bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
         bld.mkMovToReg(0, gpEmitAddress);
      }
   }
   return true;
}
+
// Nothing to do per basic block; lowering happens per instruction.
bool
NVC0LoweringPass::visit(BasicBlock *bb)
{
   return true;
}
+
+inline Value *
+NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
+{
+ uint8_t b = prog->driver->io.resInfoCBSlot;
+ uint32_t off = prog->driver->io.texBindBase + slot * 4;
+ return bld.
+ mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
+}
+
// move array source to first slot, convert to u16, add indirections
bool
NVC0LoweringPass::handleTEX(TexInstruction *i)
{
   const int dim = i->tex.target.getDim() + i->tex.target.isCube();
   const int arg = i->tex.target.getArgCount();
   // source index of the array layer (for MS targets the sample index
   // follows as the last argument)
   const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);

   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
      if (i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
         WARN("indirect TEX not implemented\n");
      }
      if (i->tex.r == i->tex.s) {
         i->tex.r += prog->driver->io.texBindBase / 4;
         i->tex.s = 0; // only a single cX[] value possible here
      } else {
         // combine separate texture and sampler handles into one register
         Value *hnd = bld.getScratch();
         Value *rHnd = loadTexHandle(NULL, i->tex.r);
         Value *sHnd = loadTexHandle(NULL, i->tex.s);

         bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);

         i->tex.r = 0; // not used for indirect tex
         i->tex.s = 0;
         i->setIndirectR(hnd);
      }
      if (i->tex.target.isArray()) {
         // the layer index must come first, converted to u16
         LValue *layer = new_LValue(func, FILE_GPR);
         Value *src = i->getSrc(lyr);
         const int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
         for (int s = dim; s >= 1; --s)
            i->setSrc(s, i->getSrc(s - 1));
         i->setSrc(0, layer);
      }
   } else
   // (nvc0) generate and move the tsc/tic/array source to the front
   if (dim != arg || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
      LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa

      Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
      // shift the coordinates up to make room for the combined source
      for (int s = dim; s >= 1; --s)
         i->setSrc(s, i->getSrc(s - 1));
      i->setSrc(0, arrayIndex);

      Value *ticRel = i->getIndirectR();
      Value *tscRel = i->getIndirectS();

      if (arrayIndex) {
         int sat = (i->op == OP_TXF) ? 1 : 0;
         DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
         bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
      } else {
         bld.loadImm(src, 0);
      }

      // insert the tic/tsc indices into their bitfields of 0xttxsaaaa
      if (ticRel) {
         i->setSrc(i->tex.rIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
      }
      if (tscRel) {
         i->setSrc(i->tex.sIndirectSrc, NULL);
         bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
      }

      i->setSrc(0, src);
   }

   // offset is last source (lod 1st, dc 2nd)
   if (i->tex.useOffsets) {
      // pack up to 3 offsets of 3 components at 4 bits each
      uint32_t value = 0;
      int n, c;
      int s = i->srcCount(0xff, true);
      if (i->srcExists(s)) // move potential predicate out of the way
         i->moveSources(s, 1);
      for (n = 0; n < i->tex.useOffsets; ++n)
         for (c = 0; c < 3; ++c)
            value |= (i->tex.offset[n][c] & 0xf) << (n * 12 + c * 4);
      i->setSrc(s, bld.loadImm(NULL, value));
   }

   if (prog->getTarget()->getChipset() >= NVISA_GK104_CHIPSET) {
      //
      // If TEX requires more than 4 sources, the 2nd register tuple must be
      // aligned to 4, even if it consists of just a single 4-byte register.
      //
      // XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
      //
      int s = i->srcCount(0xff, true);
      if (s > 4 && s < 7) {
         if (i->srcExists(s)) // move potential predicate out of the way
            i->moveSources(s, 7 - s);
         while (s < 7)
            i->setSrc(s++, bld.loadImm(NULL, 0));
      }
   }

   return true;
}
+
// Emulate TXD by running the texture op once per quad lane inside a
// QUADON/QUADPOP region: for each lane, broadcast its coordinates, add
// the per-lane derivative contributions via quadops, run the TEX, and
// keep only that lane's results; finally merge all lanes with UNIONs.
bool
NVC0LoweringPass::handleManualTXD(TexInstruction *i)
{
   // per-lane quadop masks: [l][0] applies dPdx, [l][1] applies dPdy
   static const uint8_t qOps[4][2] =
   {
      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
      { QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
      { QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
   };
   Value *def[4][4]; // [component][lane]
   Value *crd[3];
   Instruction *tex;
   Value *zero = bld.loadImm(bld.getSSA(), 0);
   int l, c;
   const int dim = i->tex.target.getDim();

   i->op = OP_TEX; // no need to clone dPdx/dPdy later

   for (c = 0; c < dim; ++c)
      crd[c] = bld.getScratch();

   bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
   for (l = 0; l < 4; ++l) {
      // mov coordinates from lane l to all lanes
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
      // add dPdx from lane l to lanes dx
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
      // add dPdy from lane l to lanes dy
      for (c = 0; c < dim; ++c)
         bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
      // texture
      bld.insert(tex = cloneForward(func, i));
      for (c = 0; c < dim; ++c)
         tex->setSrc(c, crd[c]);
      // save results
      for (c = 0; i->defExists(c); ++c) {
         Instruction *mov;
         def[c][l] = bld.getSSA();
         mov = bld.mkMov(def[c][l], tex->getDef(c));
         mov->fixed = 1;
         mov->lanes = 1 << l; // only lane l keeps this result
      }
   }
   bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);

   // merge the per-lane results into the original defs
   for (c = 0; i->defExists(c); ++c) {
      Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
      for (l = 0; l < 4; ++l)
         u->setSrc(l, def[c][l]);
   }

   i->bb->remove(i);
   return true;
}
+
// Lower TXD: append the derivatives as extra sources when the TXD
// encoding can take them, otherwise fall back to the manual quadop loop.
bool
NVC0LoweringPass::handleTXD(TexInstruction *txd)
{
   int dim = txd->tex.target.getDim();
   int arg = txd->tex.target.getArgCount();

   handleTEX(txd);
   // advance to the end of the current source list
   while (txd->srcExists(arg))
      ++arg;

   txd->tex.derivAll = true;
   // encoding limits: at most 2 dims, no cube/shadow, at most 4 args
   if (dim > 2 ||
       txd->tex.target.isCube() ||
       arg > 4 ||
       txd->tex.target.isShadow())
      return handleManualTXD(txd);

   // interleave dPdx/dPdy pairs after the regular arguments
   for (int c = 0; c < dim; ++c) {
      txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
      txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
      txd->dPdx[c].set(NULL);
      txd->dPdy[c].set(NULL);
   }
   return true;
}
+
// TXQ needs no lowering yet.
bool
NVC0LoweringPass::handleTXQ(TexInstruction *txq)
{
   // TODO: indirect resource/sampler index
   return true;
}
+
+bool
+NVC0LoweringPass::handleATOM(Instruction *atom)
+{
+ SVSemantic sv;
+
+ switch (atom->src(0).getFile()) {
+ case FILE_MEMORY_LOCAL:
+ sv = SV_LBASE;
+ break;
+ case FILE_MEMORY_SHARED:
+ sv = SV_SBASE;
+ break;
+ default:
+ assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
+ return true;
+ }
+ Value *base =
+ bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
+ Value *ptr = atom->getIndirect(0, 0);
+
+ atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
+ atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+ if (ptr)
+ base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
+ atom->setIndirect(0, 0, base);
+
+ return true;
+}
+
// Fix up CAS/EXCH atomics: optionally invalidate the cache line first
// (needCctl), and for CAS merge compare and swap values into the 64 bit
// register pair the instruction expects.
bool
NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
{
   if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
       cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
      return false;
   bld.setPosition(cas, true);

   if (needCctl) {
      // invalidate the affected cache line before the atomic
      Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
      cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
      cctl->fixed = 1;
      cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
      if (cas->isPredicated())
         cctl->setPredicate(cas->cc, cas->getPredicate());
   }

   if (cas->defExists(0) && cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
      // CAS is crazy. It's 2nd source is a double reg, and the 3rd source
      // should be set to the high part of the double reg or bad things will
      // happen elsewhere in the universe.
      // Also, it sometimes returns the new value instead of the old one
      // under mysterious circumstances.
      Value *dreg = bld.getSSA(8);
      bld.setPosition(cas, false);
      bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
      cas->setSrc(1, dreg);
   }

   return true;
}
+
+// Load a 32-bit word of surface info from the driver-reserved constant
+// buffer, at byte offset @off (plus optional indirect @ptr).
+inline Value *
+NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
+{
+   const uint8_t slot = prog->driver->io.resInfoCBSlot;
+   const uint32_t addr = off + prog->driver->io.suInfoBase;
+   Symbol *sym = bld.mkSymbol(FILE_MEMORY_CONST, slot, TYPE_U32, addr);
+   return bld.mkLoadv(TYPE_U32, sym, ptr);
+}
+
+// Load a 32-bit word of multisample info from the driver-reserved constant
+// buffer, at byte offset @off (plus optional indirect @ptr).
+inline Value *
+NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
+{
+   const uint8_t slot = prog->driver->io.msInfoCBSlot;
+   const uint32_t addr = off + prog->driver->io.msInfoBase;
+   Symbol *sym = bld.mkSymbol(FILE_MEMORY_CONST, slot, TYPE_U32, addr);
+   return bld.mkLoadv(TYPE_U32, sym, ptr);
+}
+
+/* On nvc0, surface info is obtained via the surface binding points passed
+ * to the SULD/SUST instructions.
+ * On nve4, surface info is stored in c[] and is used by various special
+ * instructions, e.g. for clamping coordinates or generating an address.
+ * They couldn't just have added an equivalent to TIC now, could they ?
+ */
+#define NVE4_SU_INFO_ADDR 0x00
+#define NVE4_SU_INFO_FMT 0x04
+#define NVE4_SU_INFO_DIM_X 0x08
+#define NVE4_SU_INFO_PITCH 0x0c
+#define NVE4_SU_INFO_DIM_Y 0x10
+#define NVE4_SU_INFO_ARRAY 0x14
+#define NVE4_SU_INFO_DIM_Z 0x18
+#define NVE4_SU_INFO_UNK1C 0x1c
+#define NVE4_SU_INFO_WIDTH 0x20
+#define NVE4_SU_INFO_HEIGHT 0x24
+#define NVE4_SU_INFO_DEPTH 0x28
+#define NVE4_SU_INFO_TARGET 0x2c
+#define NVE4_SU_INFO_CALL 0x30
+#define NVE4_SU_INFO_RAW_X 0x34
+#define NVE4_SU_INFO_MS_X 0x38
+#define NVE4_SU_INFO_MS_Y 0x3c
+
+#define NVE4_SU_INFO__STRIDE 0x40
+
+#define NVE4_SU_INFO_DIM(i) (0x08 + (i) * 8)
+#define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
+#define NVE4_SU_INFO_MS(i) (0x38 + (i) * 4)
+
+// Select the SUCLAMP subOp for coordinate @c of the given surface target:
+// buffers clamp with PL over 1 bit, array-layer coordinates use PL, 2D
+// (and 2D MS) use BL, and everything else uses SD, all over 2 bits.
+static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
+{
+   switch (su->tex.target.getEnum()) {
+   case TEX_TARGET_BUFFER:
+      return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
+   case TEX_TARGET_1D_ARRAY:
+      if (c == 1)
+         return NV50_IR_SUBOP_SUCLAMP_PL(0, 2);
+      return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
+   case TEX_TARGET_2D:
+   case TEX_TARGET_2D_MS:
+      return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
+   case TEX_TARGET_RECT:
+   case TEX_TARGET_1D:
+   case TEX_TARGET_2D_ARRAY:
+   case TEX_TARGET_2D_MS_ARRAY:
+   case TEX_TARGET_3D:
+   case TEX_TARGET_CUBE:
+   case TEX_TARGET_CUBE_ARRAY:
+      return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+// Convert multisampled surface coordinates (x, y, sample) into single-sample
+// coordinates: scale x/y by the per-surface MS dimensions and add the
+// sample's offset, fetched from the MS info table.  Non-MS targets are left
+// untouched.
+void
+NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
+{
+   const uint16_t base = tex->tex.r * NVE4_SU_INFO__STRIDE;
+   const int arg = tex->tex.target.getArgCount();
+
+   // rewrite the target to its non-MS equivalent, or bail out
+   if (tex->tex.target == TEX_TARGET_2D_MS)
+      tex->tex.target = TEX_TARGET_2D;
+   else
+   if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
+      tex->tex.target = TEX_TARGET_2D_ARRAY;
+   else
+      return;
+
+   Value *x = tex->getSrc(0);
+   Value *y = tex->getSrc(1);
+   Value *s = tex->getSrc(arg - 1); // sample index is the last argument
+
+   Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
+
+   // log2 sample counts per dimension, from the surface info table
+   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
+   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));
+
+   bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
+   bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
+
+   // index into the MS info table: 8 bytes (x/y pair) per sample, max 8
+   s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
+   s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
+
+   Value *dx = loadMsInfo32(ts, 0x0);
+   Value *dy = loadMsInfo32(ts, 0x4);
+
+   bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
+   bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
+
+   tex->setSrc(0, tx);
+   tex->setSrc(1, ty);
+   // drop the now-consumed sample index source
+   tex->moveSources(arg, -1);
+}
+
+// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
+// They're computed from the coordinates using the surface info in c[] space.
+// After this runs, the surface op's sources are:
+//   src(0) = 64-bit address, src(1) = format info, src(2) = out-of-bounds
+//   predicate, followed by the data sources (for stores/atomics).
+void
+NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
+{
+   Instruction *insn;
+   const bool atom = su->op == OP_SUREDB || su->op == OP_SUREDP;
+   const bool raw =
+      su->op == OP_SULDB || su->op == OP_SUSTB || su->op == OP_SUREDB;
+   const int idx = su->tex.r;
+   const int dim = su->tex.target.getDim();
+   const int arg = dim + (su->tex.target.isArray() ? 1 : 0);
+   const uint16_t base = idx * NVE4_SU_INFO__STRIDE;
+   int c;
+   Value *zero = bld.mkImm(0);
+   Value *p1 = NULL;
+   Value *v;
+   Value *src[3];
+   Value *bf, *eau, *off;
+   Value *addr, *pred;
+
+   off = bld.getScratch(4);
+   bf = bld.getScratch(4);
+   addr = bld.getSSA(8);
+   pred = bld.getScratch(1, FILE_PREDICATE);
+
+   bld.setPosition(su, false);
+
+   adjustCoordinatesMS(su);
+
+   // calculate clamped coordinates
+   // (raw accesses use the byte-scaled x limit instead of the element one)
+   for (c = 0; c < arg; ++c) {
+      src[c] = bld.getScratch();
+      if (c == 0 && raw)
+         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
+      else
+         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
+      bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
+         ->subOp = getSuClampSubOp(su, c);
+   }
+   for (; c < 3; ++c)
+      src[c] = zero;
+
+   // set predicate output
+   // (the OOB flag of the last/layer clamp; for arrays it is combined with
+   //  the coordinate predicate further below)
+   if (su->tex.target == TEX_TARGET_BUFFER) {
+      src[0]->getInsn()->setFlagsDef(1, pred);
+   } else
+   if (su->tex.target.isArray()) {
+      p1 = bld.getSSA(1, FILE_PREDICATE);
+      src[dim]->getInsn()->setFlagsDef(1, p1);
+   }
+
+   // calculate pixel offset
+   if (dim == 1) {
+      if (su->tex.target != TEX_TARGET_BUFFER)
+         bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
+   } else
+   if (dim == 3) {
+      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
+         ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
+
+      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
+         ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
+   } else {
+      assert(dim == 2);
+      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
+         ->subOp = su->tex.target.isArray() ?
+         NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
+   }
+
+   // calculate effective address part 1
+   // (SUBFM computes the byte/format part; buffers shift x by the texel
+   //  size from the format info instead)
+   if (su->tex.target == TEX_TARGET_BUFFER) {
+      if (raw) {
+         bf = src[0];
+      } else {
+         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+         bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
+            ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
+      }
+   } else {
+      Value *y = src[1];
+      Value *z = src[2];
+      uint16_t subOp = 0;
+
+      switch (dim) {
+      case 1:
+         y = zero;
+         z = zero;
+         break;
+      case 2:
+         z = off;
+         if (!su->tex.target.isArray()) {
+            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+            subOp = NV50_IR_SUBOP_SUBFM_3D;
+         }
+         break;
+      default:
+         subOp = NV50_IR_SUBOP_SUBFM_3D;
+         assert(dim == 3);
+         break;
+      }
+      insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
+      insn->subOp = subOp;
+      insn->setFlagsDef(1, pred);
+   }
+
+   // part 2: combine with the surface's base address
+   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);
+
+   if (su->tex.target == TEX_TARGET_BUFFER) {
+      eau = v;
+   } else {
+      eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
+   }
+   // add array layer offset
+   if (su->tex.target.isArray()) {
+      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
+      if (dim == 1)
+         bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
+            ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
+      else
+         bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
+            ->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
+      // combine predicates
+      assert(p1);
+      bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
+   }
+
+   if (atom) {
+      Value *lo = bf;
+      if (su->tex.target == TEX_TARGET_BUFFER) {
+         lo = zero;
+         bld.mkMov(off, bf);
+      }
+      // bf == g[] address & 0xff
+      // eau == g[] address >> 8
+      bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);
+      bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
+   } else
+   if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
+      // Convert from u32 to u8 address format, which is what the library code
+      // doing SULDP currently uses.
+      // XXX: can SUEAU do this ?
+      // XXX: does it matter that we don't mask high bytes in bf ?
+      // Grrr.
+      bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
+      bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
+   }
+
+   bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
+
+   if (atom && su->tex.target == TEX_TARGET_BUFFER)
+      bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
+
+   // let's just set it 0 for raw access and hope it works
+   v = raw ?
+      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+
+   // get rid of old coordinate sources, make space for fmt info and predicate
+   su->moveSources(arg, 3 - arg);
+   // set 64 bit address and 32-bit format sources
+   su->setSrc(0, addr);
+   su->setSrc(1, v);
+   su->setSrc(2, pred);
+}
+
+// Lower a surface access on nve4: compute the generic address / format /
+// predicate sources, then apply the per-op fixup.  SULDP becomes an indirect
+// call into the driver's format-conversion library, SURED* becomes a
+// predicated global OP_ATOM, and the remaining raw/typed ops just get their
+// source type adjusted.
+void
+NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
+{
+   processSurfaceCoordsNVE4(su);
+
+   // Who do we hate more ? The person who decided that nvc0's SULD doesn't
+   // have to support conversion or the person who decided that, in OpenCL,
+   // you don't have to specify the format here like you do in OpenGL ?
+
+   if (su->op == OP_SULDP) {
+      // We don't patch shaders. Ever.
+      // You get an indirect call to our library blob here.
+      // But at least it's uniform.
+      FlowInstruction *call;
+      LValue *p[3];
+      LValue *r[5];
+      uint16_t base = su->tex.r * NVE4_SU_INFO__STRIDE + NVE4_SU_INFO_CALL;
+
+      // fixed ABI of the library call: $r0-$r3 results, $r4d address,
+      // $r2 format, $p0-$p2 predicate/cache-mode flags
+      for (int i = 0; i < 4; ++i)
+         (r[i] = bld.getScratch(4, FILE_GPR))->reg.data.id = i;
+      for (int i = 0; i < 3; ++i)
+         (p[i] = bld.getScratch(1, FILE_PREDICATE))->reg.data.id = i;
+      (r[4] = bld.getScratch(8, FILE_GPR))->reg.data.id = 4;
+
+      bld.mkMov(p[1], bld.mkImm((su->cache == CACHE_CA) ? 1 : 0), TYPE_U8);
+      bld.mkMov(p[2], bld.mkImm((su->cache == CACHE_CG) ? 1 : 0), TYPE_U8);
+      bld.mkMov(p[0], su->getSrc(2), TYPE_U8);
+      bld.mkMov(r[4], su->getSrc(0), TYPE_U64);
+      bld.mkMov(r[2], su->getSrc(1), TYPE_U32);
+
+      call = bld.mkFlow(OP_CALL, NULL, su->cc, su->getPredicate());
+
+      // call target address is read from the surface info table
+      call->indirect = 1;
+      call->absolute = 1;
+      call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
+                                   prog->driver->io.resInfoCBSlot, TYPE_U32,
+                                   prog->driver->io.suInfoBase + base));
+      call->setSrc(1, r[2]);
+      call->setSrc(2, r[4]);
+      for (int i = 0; i < 3; ++i)
+         call->setSrc(3 + i, p[i]);
+      for (int i = 0; i < 4; ++i) {
+         call->setDef(i, r[i]);
+         bld.mkMov(su->getDef(i), r[i]);
+      }
+      call->setDef(4, p[1]);
+      delete_Instruction(bld.getProgram(), su);
+      // su was just deleted: return before the checks below dereference it
+      return;
+   }
+
+   if (su->op == OP_SUREDB || su->op == OP_SUREDP) {
+      // FIXME: for out of bounds access, destination value will be undefined !
+      Value *pred = su->getSrc(2);
+      CondCode cc = CC_NOT_P;
+      if (su->getPredicate()) {
+         // combine the bounds predicate with the instruction's own one
+         pred = bld.getScratch(1, FILE_PREDICATE);
+         cc = su->cc;
+         if (cc == CC_NOT_P) {
+            bld.mkOp2(OP_OR, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
+         } else {
+            bld.mkOp2(OP_AND, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
+            pred->getInsn()->src(1).mod = Modifier(NV50_IR_MOD_NOT);
+         }
+      }
+      Instruction *red = bld.mkOp(OP_ATOM, su->dType, su->getDef(0));
+      red->subOp = su->subOp;
+      if (!gMemBase)
+         gMemBase = bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0);
+      red->setSrc(0, gMemBase);
+      red->setSrc(1, su->getSrc(3));
+      if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
+         red->setSrc(2, su->getSrc(4));
+      red->setIndirect(0, 0, su->getSrc(0));
+      red->setPredicate(cc, pred);
+      delete_Instruction(bld.getProgram(), su);
+      handleCasExch(red, true);
+   } else {
+      su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
+   }
+}
+
+// Lower a write to a system value: $sregs are read-only, so replace the WRSV
+// with an EXPORT to the corresponding shader output slot.  Returns false if
+// the system value has no output address (>= 0x400).
+bool
+NVC0LoweringPass::handleWRSV(Instruction *i)
+{
+   Instruction *st;
+   Symbol *sym;
+   uint32_t addr;
+
+   // must replace, $sreg are not writeable
+   addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
+   if (addr >= 0x400)
+      return false;
+   sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
+
+   st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
+                    i->getSrc(1));
+   st->perPatch = i->perPatch;
+
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// Fetch tess coordinate component @c into @dst.  The u/v components are
+// fetched from fixed output slots (indexed by lane id); the w component is
+// derived as 1.0 - u - v.
+void
+NVC0LoweringPass::readTessCoord(LValue *dst, int c)
+{
+   Value *laneid = bld.getSSA();
+   Value *x, *y;
+
+   bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
+
+   if (c == 0) {
+      x = dst;
+      y = NULL;
+   } else
+   if (c == 1) {
+      x = NULL;
+      y = dst;
+   } else {
+      // c == 2: need both u and v in temporaries to compute w
+      assert(c == 2);
+      x = bld.getSSA();
+      y = bld.getSSA();
+   }
+   if (x)
+      bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
+   if (y)
+      bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
+
+   if (c == 2) {
+      // w = 1.0 - (u + v)
+      bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
+      bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
+   }
+}
+
+// Lower a system value read.  Depending on the semantic this becomes an
+// $sreg mov (kept as is), an interpolation, a tess coord fetch, a constant
+// buffer load, or a plain shader input fetch.
+bool
+NVC0LoweringPass::handleRDSV(Instruction *i)
+{
+   Symbol *sym = i->getSrc(0)->asSym();
+   const SVSemantic sv = sym->reg.data.sv.sv;
+   Value *vtx = NULL;
+   Instruction *ld;
+   uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
+
+   if (addr >= 0x400) {
+      // mov $sreg
+      if (sym->reg.data.sv.index == 3) {
+         // TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
+         i->op = OP_MOV;
+         i->setSrc(0, bld.mkImm((sv == SV_NTID || sv == SV_NCTAID) ? 1 : 0));
+      }
+      return true;
+   }
+
+   switch (sv) {
+   case SV_POSITION:
+      assert(prog->getType() == Program::TYPE_FRAGMENT);
+      bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
+      break;
+   case SV_FACE:
+   {
+      Value *face = i->getDef(0);
+      bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
+      if (i->dType == TYPE_F32) {
+         // map the integer facing bit to +1.0 / -1.0
+         bld.mkOp2(OP_AND, TYPE_U32, face, face, bld.mkImm(0x80000000));
+         bld.mkOp2(OP_XOR, TYPE_U32, face, face, bld.mkImm(0xbf800000));
+      }
+   }
+      break;
+   case SV_TESS_COORD:
+      assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
+      readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
+      break;
+   case SV_NTID:
+   case SV_NCTAID:
+   case SV_GRIDID:
+      assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
+      if (sym->reg.data.sv.index == 3) {
+         i->op = OP_MOV;
+         i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
+         return true;
+      }
+      // grid info is uploaded by the driver into the driver constbuf
+      addr += prog->driver->prop.cp.gridInfoBase;
+      bld.mkLoad(TYPE_U32, i->getDef(0),
+                 bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
+      break;
+   default:
+      if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
+         vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
+      ld = bld.mkFetch(i->getDef(0), i->dType,
+                       FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
+      ld->perPatch = i->perPatch;
+      break;
+   }
+   bld.getBB()->remove(i);
+   return true;
+}
+
+// Expand floating point division into reciprocal and multiply:
+// a / b -> a * (1 / b).  Integer division is lowered elsewhere.
+bool
+NVC0LoweringPass::handleDIV(Instruction *i)
+{
+   if (!isFloatType(i->dType))
+      return true;
+   bld.setPosition(i, false);
+   Value *recip = bld.getSSA();
+   bld.mkOp1(OP_RCP, i->dType, recip, i->getSrc(1));
+   i->op = OP_MUL;
+   i->setSrc(1, recip);
+   return true;
+}
+
+// Expand f32 modulo:  a mod b = a - b * trunc(a * (1 / b)).
+bool
+NVC0LoweringPass::handleMOD(Instruction *i)
+{
+   if (i->dType != TYPE_F32)
+      return true;
+   LValue *q = bld.getScratch();
+   // q = trunc(a / b)
+   bld.mkOp1(OP_RCP, TYPE_F32, q, i->getSrc(1));
+   bld.mkOp2(OP_MUL, TYPE_F32, q, i->getSrc(0), q);
+   bld.mkOp1(OP_TRUNC, TYPE_F32, q, q);
+   // q = b * q; result = a - q
+   bld.mkOp2(OP_MUL, TYPE_F32, q, i->getSrc(1), q);
+   i->op = OP_SUB;
+   i->setSrc(1, q);
+   return true;
+}
+
+// Expand square root via the reciprocal square root: sqrt(x) = x * rsq(x).
+bool
+NVC0LoweringPass::handleSQRT(Instruction *i)
+{
+   Value *tmp = bld.getSSA();
+   bld.mkOp1(OP_RSQ, TYPE_F32, tmp, i->getSrc(0));
+   i->op = OP_MUL;
+   i->setSrc(1, tmp);
+
+   return true;
+}
+
+// Expand pow(x, y) as exp2(y * log2(x)), including the PREEX2 fixup that
+// must precede EX2 on this hardware.
+bool
+NVC0LoweringPass::handlePOW(Instruction *i)
+{
+   LValue *tmp = bld.getScratch();
+
+   // tmp = preex2(y * lg2(x)), with denormals flushed on the multiply
+   bld.mkOp1(OP_LG2, TYPE_F32, tmp, i->getSrc(0));
+   bld.mkOp2(OP_MUL, TYPE_F32, tmp, i->getSrc(1), tmp)->dnz = 1;
+   bld.mkOp1(OP_PREEX2, TYPE_F32, tmp, tmp);
+
+   i->op = OP_EX2;
+   i->setSrc(0, tmp);
+   i->setSrc(1, NULL);
+
+   return true;
+}
+
+// Lower an output export.  In fragment shaders outputs live in fixed GPRs,
+// so the export becomes a final MOV into the matching register; in geometry
+// shaders the export is made relative to the current emit address.
+bool
+NVC0LoweringPass::handleEXPORT(Instruction *i)
+{
+   if (prog->getType() == Program::TYPE_FRAGMENT) {
+      // output offset determines the destination register id
+      int id = i->getSrc(0)->reg.data.offset / 4;
+
+      if (i->src(0).isIndirect(0)) // TODO, ugly
+         return false;
+      i->op = OP_MOV;
+      i->subOp = NV50_IR_SUBOP_MOV_FINAL;
+      i->src(0).set(i->src(1));
+      i->setSrc(1, NULL);
+      i->setDef(0, new_LValue(func, FILE_GPR));
+      i->getDef(0)->reg.data.id = id;
+
+      prog->maxGPR = MAX2(prog->maxGPR, id);
+   } else
+   if (prog->getType() == Program::TYPE_GEOMETRY) {
+      i->setIndirect(0, 1, gpEmitAddress);
+   }
+   return true;
+}
+
+// Lower geometry shader EMIT/RESTART.  A RESTART directly following an EMIT
+// is folded into it as EMIT_RESTART; otherwise the instruction updates the
+// emit address (def and src 0).
+bool
+NVC0LoweringPass::handleOUT(Instruction *i)
+{
+   if (i->op == OP_RESTART && i->prev && i->prev->op == OP_EMIT) {
+      i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
+      delete_Instruction(prog, i);
+   } else {
+      assert(gpEmitAddress);
+      i->setDef(0, gpEmitAddress);
+      // preserve an existing stream/operand source in slot 1
+      if (i->srcExists(0))
+         i->setSrc(1, i->getSrc(0));
+      i->setSrc(0, gpEmitAddress);
+   }
+   return true;
+}
+
+// Generate a binary predicate if an instruction is predicated by
+// e.g. an f32 value: emit SET $p = (pred != 0) and repredicate the
+// instruction on the new predicate register.
+void
+NVC0LoweringPass::checkPredicate(Instruction *insn)
+{
+   Value *pred = insn->getPredicate();
+   Value *pdst;
+
+   // nothing to do if already a predicate register (or unpredicated)
+   if (!pred || pred->reg.file == FILE_PREDICATE)
+      return;
+   pdst = new_LValue(func, FILE_PREDICATE);
+
+   // CAUTION: don't use pdst->getInsn, the definition might not be unique,
+   // delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
+
+   bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, bld.mkImm(0), pred);
+
+   insn->setPredicate(insn->cc, pdst);
+}
+
+//
+// - add quadop dance for texturing
+// - put FP outputs in GPRs
+// - convert instruction sequences
+//
+// Main per-instruction dispatcher of the lowering pass: routes each opcode
+// to its handler, or performs the small in-place rewrites directly.
+bool
+NVC0LoweringPass::visit(Instruction *i)
+{
+   bld.setPosition(i, false);
+
+   // non-boolean predicates must be converted first
+   if (i->cc != CC_ALWAYS)
+      checkPredicate(i);
+
+   switch (i->op) {
+   case OP_TEX:
+   case OP_TXB:
+   case OP_TXL:
+   case OP_TXF:
+   case OP_TXG:
+      return handleTEX(i->asTex());
+   case OP_TXD:
+      return handleTXD(i->asTex());
+   case OP_TXQ:
+      return handleTXQ(i->asTex());
+   case OP_EX2:
+      // EX2 needs its source preprocessed by PREEX2
+      bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
+      i->setSrc(0, i->getDef(0));
+      break;
+   case OP_POW:
+      return handlePOW(i);
+   case OP_DIV:
+      return handleDIV(i);
+   case OP_MOD:
+      return handleMOD(i);
+   case OP_SQRT:
+      return handleSQRT(i);
+   case OP_EXPORT:
+      return handleEXPORT(i);
+   case OP_EMIT:
+   case OP_RESTART:
+      return handleOUT(i);
+   case OP_RDSV:
+      return handleRDSV(i);
+   case OP_WRSV:
+      return handleWRSV(i);
+   case OP_LOAD:
+      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
+         // compute shader inputs live in c[]; other stages use VFETCH
+         if (prog->getType() == Program::TYPE_COMPUTE) {
+            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
+            i->getSrc(0)->reg.fileIndex = 0;
+         } else {
+            i->op = OP_VFETCH;
+            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
+         }
+      }
+      break;
+   case OP_ATOM:
+   {
+      // global atomics need a cache invalidate around CAS/EXCH
+      const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
+      handleATOM(i);
+      handleCasExch(i, cctl);
+   }
+      break;
+   case OP_SULDB:
+   case OP_SULDP:
+   case OP_SUSTB:
+   case OP_SUSTP:
+   case OP_SUREDB:
+   case OP_SUREDP:
+      if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+         handleSurfaceOpNVE4(i->asTex());
+      break;
+   default:
+      break;
+   }
+   return true;
+}
+
+// Run the legalization pass appropriate for the given codegen stage.
+bool
+TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
+{
+   switch (stage) {
+   case CG_STAGE_PRE_SSA: {
+      NVC0LoweringPass pass(prog);
+      return pass.run(prog, false, true);
+   }
+   case CG_STAGE_SSA: {
+      NVC0LegalizeSSA pass;
+      return pass.run(prog, false, true);
+   }
+   case CG_STAGE_POST_RA: {
+      NVC0LegalizePostRA pass(prog);
+      return pass.run(prog, false, true);
+   }
+   default:
+      return false;
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
new file mode 100644
index 0000000..99bd2bf
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -0,0 +1,2464 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+#include "codegen/nv50_ir_build_util.h"
+
+extern "C" {
+#include "util/u_math.h"
+}
+
+namespace nv50_ir {
+
+// Whether the instruction has no effect on program semantics and can be
+// dropped by DCE: pseudo ops, unfixed NOPs, results with no assigned
+// register, and MOV/UNION whose defs already equal their sources.
+bool
+Instruction::isNop() const
+{
+   if (op == OP_PHI || op == OP_SPLIT || op == OP_MERGE || op == OP_CONSTRAINT)
+      return true;
+   if (terminator || join) // XXX: should terminator imply flow ?
+      return false;
+   if (op == OP_ATOM)
+      return false;
+   if (!fixed && op == OP_NOP)
+      return true;
+
+   // a def without a register assignment means the result is unused
+   if (defExists(0) && def(0).rep()->reg.data.id < 0) {
+      for (int d = 1; defExists(d); ++d)
+         if (def(d).rep()->reg.data.id >= 0)
+            WARN("part of vector result is unused !\n");
+      return true;
+   }
+
+   // moves/unions where destination and source(s) coincide are no-ops
+   if (op == OP_MOV || op == OP_UNION) {
+      if (!getDef(0)->equals(getSrc(0)))
+         return false;
+      if (op == OP_UNION)
+         if (!def(0).rep()->equals(getSrc(1)))
+            return false;
+      return true;
+   }
+
+   return false;
+}
+
+// Whether the instruction can be removed because its results are never
+// used: no side effects (stores/exports/atomics/etc. always count as live),
+// no referenced or register-pinned defs, no control flow, not fixed.
+bool Instruction::isDead() const
+{
+   if (op == OP_STORE ||
+       op == OP_EXPORT ||
+       op == OP_ATOM ||
+       op == OP_SUSTB || op == OP_SUSTP || op == OP_SUREDP || op == OP_SUREDB ||
+       op == OP_WRSV)
+      return false;
+
+   // any def that is still referenced or already has a register keeps it
+   for (int d = 0; defExists(d); ++d)
+      if (getDef(d)->refCount() || getDef(d)->reg.data.id >= 0)
+         return false;
+
+   if (terminator || asFlow())
+      return false;
+   if (fixed)
+      return false;
+
+   return true;
+}
+
+// =============================================================================
+
+// Pass that forwards MOV sources to the instructions using the MOV's result.
+class CopyPropagation : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+};
+
+// Propagate all MOVs forward to make subsequent optimization easier, except if
+// the sources stem from a phi, in which case we don't want to mess up potential
+// swaps $rX <-> $rY, i.e. do not create live range overlaps of phi src and def.
+bool
+CopyPropagation::visit(BasicBlock *bb)
+{
+   Instruction *mov, *si, *next;
+
+   for (mov = bb->getEntry(); mov; mov = next) {
+      next = mov->next;
+      // only plain, unpredicated, same-file register-to-register moves
+      if (mov->op != OP_MOV || mov->fixed || !mov->getSrc(0)->asLValue())
+         continue;
+      if (mov->getPredicate())
+         continue;
+      if (mov->def(0).getFile() != mov->src(0).getFile())
+         continue;
+      si = mov->getSrc(0)->getInsn();
+      // skip defs pinned to a register and sources defined by a phi
+      if (mov->getDef(0)->reg.data.id < 0 && si && si->op != OP_PHI) {
+         // propagate
+         mov->def(0).replace(mov->getSrc(0), false);
+         delete_Instruction(prog, mov);
+      }
+   }
+   return true;
+}
+
+// =============================================================================
+
+// Pass that folds loads (c[] space, immediates, attributes/shared memory)
+// directly into the instructions consuming their values, swapping
+// commutative sources where needed to enable the folding.
+class LoadPropagation : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+
+   void checkSwapSrc01(Instruction *);
+
+   bool isCSpaceLoad(Instruction *);
+   bool isImmd32Load(Instruction *);
+   bool isAttribOrSharedLoad(Instruction *);
+};
+
+// Is this a load from constant (c[]) space ?
+bool
+LoadPropagation::isCSpaceLoad(Instruction *ld)
+{
+   if (!ld || ld->op != OP_LOAD)
+      return false;
+   return ld->src(0).getFile() == FILE_MEMORY_CONST;
+}
+
+// Is this a MOV of a 32-bit immediate ?
+bool
+LoadPropagation::isImmd32Load(Instruction *ld)
+{
+   if (!ld)
+      return false;
+   return ld->op == OP_MOV &&
+      typeSizeof(ld->dType) == 4 &&
+      ld->src(0).getFile() == FILE_IMMEDIATE;
+}
+
+// Is this a vertex attribute fetch, or a load from shader input or
+// shared memory ?
+bool
+LoadPropagation::isAttribOrSharedLoad(Instruction *ld)
+{
+   if (!ld)
+      return false;
+   if (ld->op == OP_VFETCH)
+      return true;
+   if (ld->op != OP_LOAD)
+      return false;
+   return ld->src(0).getFile() == FILE_SHADER_INPUT ||
+      ld->src(0).getFile() == FILE_MEMORY_SHARED;
+}
+
+// For commutative ops (and SET/SLCT, with condition-code adjustment), move
+// the source that is more profitable to fold (c[] load > immediate >
+// attribute/shared load) into source slot 1, where insnCanLoad can use it.
+void
+LoadPropagation::checkSwapSrc01(Instruction *insn)
+{
+   if (!prog->getTarget()->getOpInfo(insn).commutative)
+      if (insn->op != OP_SET && insn->op != OP_SLCT)
+         return;
+   // only makes sense if src1 is currently a plain register
+   if (insn->src(1).getFile() != FILE_GPR)
+      return;
+
+   Instruction *i0 = insn->getSrc(0)->getInsn();
+   Instruction *i1 = insn->getSrc(1)->getInsn();
+
+   // swap only when src0's load kind beats src1's in the priority order
+   if (isCSpaceLoad(i0)) {
+      if (!isCSpaceLoad(i1))
+         insn->swapSources(0, 1);
+      else
+         return;
+   } else
+   if (isImmd32Load(i0)) {
+      if (!isCSpaceLoad(i1) && !isImmd32Load(i1))
+         insn->swapSources(0, 1);
+      else
+         return;
+   } else
+   if (isAttribOrSharedLoad(i1)) {
+      if (!isAttribOrSharedLoad(i0))
+         insn->swapSources(0, 1);
+      else
+         return;
+   } else {
+      return;
+   }
+
+   // SET compares the operands, so the condition must be mirrored;
+   // SLCT selects on the condition, so it must be inverted
+   if (insn->op == OP_SET)
+      insn->asCmp()->setCond = reverseCondCode(insn->asCmp()->setCond);
+   else
+   if (insn->op == OP_SLCT)
+      insn->asCmp()->setCond = inverseCondCode(insn->asCmp()->setCond);
+}
+
+// Fold loads/immediates into the instructions that consume them, where the
+// target architecture supports the memory/immediate operand directly.
+bool
+LoadPropagation::visit(BasicBlock *bb)
+{
+   const Target *targ = prog->getTarget();
+   Instruction *next;
+
+   for (Instruction *i = bb->getEntry(); i; i = next) {
+      next = i->next;
+
+      if (i->op == OP_CALL) // calls have args as sources, they must be in regs
+         continue;
+
+      if (i->srcExists(1))
+         checkSwapSrc01(i);
+
+      for (int s = 0; i->srcExists(s); ++s) {
+         Instruction *ld = i->getSrc(s)->getInsn();
+
+         if (!ld || ld->fixed || (ld->op != OP_LOAD && ld->op != OP_MOV))
+            continue;
+         // the target decides which operand kinds each source slot accepts
+         if (!targ->insnCanLoad(i, s, ld))
+            continue;
+
+         // propagate !
+         i->setSrc(s, ld->getSrc(0));
+         if (ld->src(0).isIndirect(0))
+            i->setIndirect(s, 0, ld->getIndirect(0, 0));
+
+         // remove the load if this was its last use
+         if (ld->getDef(0)->refCount() == 0)
+            delete_Instruction(prog, ld);
+      }
+   }
+   return true;
+}
+
+// =============================================================================
+
+// Evaluate constant expressions.
+// Folds instructions whose sources are immediates: two-immediate ops are
+// computed outright (expr), single-immediate ops are simplified or
+// evaluated where possible (opnd/unary).
+class ConstantFolding : public Pass
+{
+public:
+   bool foldAll(Program *);
+
+private:
+   virtual bool visit(BasicBlock *);
+
+   void expr(Instruction *, ImmediateValue&, ImmediateValue&);
+   void opnd(Instruction *, ImmediateValue&, int s);
+
+   void unary(Instruction *, const ImmediateValue&);
+
+   void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
+
+   // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
+   CmpInstruction *findOriginForTestWithZero(Value *);
+
+   // number of folds performed in the current run (drives re-iteration)
+   unsigned int foldCount;
+
+   BuildUtil bld;
+};
+
+// TODO: remember generated immediates and only revisit these
+// Run constant folding over the whole program; if anything folded, run one
+// more pass to catch newly exposed constants (bounded at two iterations).
+bool
+ConstantFolding::foldAll(Program *prog)
+{
+   for (unsigned int pass = 0; pass < 2; ++pass) {
+      foldCount = 0;
+      if (!run(prog))
+         return false;
+      if (!foldCount)
+         break;
+   }
+   return true;
+}
+
+// Dispatch each instruction to the two-immediate (expr) or one-immediate
+// (opnd) folding routine, depending on which sources are constant.
+bool
+ConstantFolding::visit(BasicBlock *bb)
+{
+   Instruction *i, *next;
+
+   for (i = bb->getEntry(); i; i = next) {
+      next = i->next;
+      // MOVs of immediates are already minimal; calls can't be folded
+      if (i->op == OP_MOV || i->op == OP_CALL)
+         continue;
+
+      ImmediateValue src0, src1;
+
+      if (i->srcExists(1) &&
+          i->src(0).getImmediate(src0) && i->src(1).getImmediate(src1))
+         expr(i, src0, src1);
+      else
+      if (i->srcExists(0) && i->src(0).getImmediate(src0))
+         opnd(i, src0, 0);
+      else
+      if (i->srcExists(1) && i->src(1).getImmediate(src1))
+         opnd(i, src1, 1);
+   }
+   return true;
+}
+
+// TGSI 'true' is converted to -1 by F2I(NEG(SET)); walk back through
+// value-preserving wrappers (NEG/ABS/CVT of matching type, plain MOV) to the
+// originating OP_SET.  Returns NULL if the chain doesn't end in a compare.
+CmpInstruction *
+ConstantFolding::findOriginForTestWithZero(Value *value)
+{
+   if (!value)
+      return NULL;
+   Instruction *insn = value->getInsn();
+
+   while (insn && insn->op != OP_SET) {
+      Instruction *next = NULL;
+      switch (insn->op) {
+      case OP_NEG:
+      case OP_ABS:
+      case OP_CVT:
+         next = insn->getSrc(0)->getInsn();
+         // the source may have no defining instruction; check before
+         // dereferencing to avoid a NULL pointer access
+         if (!next || insn->sType != next->dType)
+            return NULL;
+         break;
+      case OP_MOV:
+         next = insn->getSrc(0)->getInsn();
+         break;
+      default:
+         return NULL;
+      }
+      insn = next;
+   }
+   return insn ? insn->asCmp() : NULL;
+}
+
+// Apply this modifier's ABS/NEG/SAT/NOT bits to an immediate, in that
+// order, according to the immediate's register type.
+void
+Modifier::applyTo(ImmediateValue& imm) const
+{
+   if (!bits) // avoid failure if imm.reg.type is unhandled (e.g. b128)
+      return;
+   switch (imm.reg.type) {
+   case TYPE_F32:
+      if (bits & NV50_IR_MOD_ABS)
+         imm.reg.data.f32 = fabsf(imm.reg.data.f32);
+      if (bits & NV50_IR_MOD_NEG)
+         imm.reg.data.f32 = -imm.reg.data.f32;
+      if (bits & NV50_IR_MOD_SAT) {
+         // clamp to [0.0, 1.0]
+         if (imm.reg.data.f32 < 0.0f)
+            imm.reg.data.f32 = 0.0f;
+         else
+         if (imm.reg.data.f32 > 1.0f)
+            imm.reg.data.f32 = 1.0f;
+      }
+      assert(!(bits & NV50_IR_MOD_NOT));
+      break;
+
+   case TYPE_S8: // NOTE: will be extended
+   case TYPE_S16:
+   case TYPE_S32:
+   case TYPE_U8: // NOTE: treated as signed
+   case TYPE_U16:
+   case TYPE_U32:
+      if (bits & NV50_IR_MOD_ABS)
+         imm.reg.data.s32 = (imm.reg.data.s32 >= 0) ?
+            imm.reg.data.s32 : -imm.reg.data.s32;
+      if (bits & NV50_IR_MOD_NEG)
+         imm.reg.data.s32 = -imm.reg.data.s32;
+      if (bits & NV50_IR_MOD_NOT)
+         imm.reg.data.s32 = ~imm.reg.data.s32;
+      break;
+
+   case TYPE_F64:
+      if (bits & NV50_IR_MOD_ABS)
+         imm.reg.data.f64 = fabs(imm.reg.data.f64);
+      if (bits & NV50_IR_MOD_NEG)
+         imm.reg.data.f64 = -imm.reg.data.f64;
+      if (bits & NV50_IR_MOD_SAT) {
+         // clamp to [0.0, 1.0]
+         if (imm.reg.data.f64 < 0.0)
+            imm.reg.data.f64 = 0.0;
+         else
+         if (imm.reg.data.f64 > 1.0)
+            imm.reg.data.f64 = 1.0;
+      }
+      assert(!(bits & NV50_IR_MOD_NOT));
+      break;
+
+   default:
+      assert(!"invalid/unhandled type");
+      imm.reg.data.u64 = 0;
+      break;
+   }
+}
+
+// Map this modifier to the operation that would apply it: a single modifier
+// bit maps to its dedicated op, no modifier is a plain MOV, and any
+// combination of bits requires a full CVT.
+operation
+Modifier::getOp() const
+{
+   if (bits == 0)
+      return OP_MOV;
+   if (bits == NV50_IR_MOD_ABS)
+      return OP_ABS;
+   if (bits == NV50_IR_MOD_NEG)
+      return OP_NEG;
+   if (bits == NV50_IR_MOD_SAT)
+      return OP_SAT;
+   if (bits == NV50_IR_MOD_NOT)
+      return OP_NOT;
+   return OP_CVT;
+}
+
+// Fold an instruction whose first two sources are both immediates: compute
+// the result on the host and rewrite the instruction to a MOV of the result
+// (or, for MAD/FMA, to an ADD of the remaining source and the product).
+// Unsupported op/type combinations are left untouched.
+void
+ConstantFolding::expr(Instruction *i,
+                      ImmediateValue &imm0, ImmediateValue &imm1)
+{
+   struct Storage *const a = &imm0.reg, *const b = &imm1.reg;
+   struct Storage res;
+
+   memset(&res.data, 0, sizeof(res.data));
+
+   switch (i->op) {
+   case OP_MAD:
+   case OP_FMA:
+   case OP_MUL:
+      // dnz multiplies flush non-finite inputs to zero
+      if (i->dnz && i->dType == TYPE_F32) {
+         if (!isfinite(a->data.f32))
+            a->data.f32 = 0.0f;
+         if (!isfinite(b->data.f32))
+            b->data.f32 = 0.0f;
+      }
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = a->data.f32 * b->data.f32; break;
+      case TYPE_F64: res.data.f64 = a->data.f64 * b->data.f64; break;
+      case TYPE_S32:
+      case TYPE_U32: res.data.u32 = a->data.u32 * b->data.u32; break;
+      default:
+         return;
+      }
+      break;
+   case OP_DIV:
+      // NOTE: a zero divisor (u32 pattern 0, so also f32/f64 +0.0) breaks
+      // out of the switch and folds the result to 0
+      if (b->data.u32 == 0)
+         break;
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = a->data.f32 / b->data.f32; break;
+      case TYPE_F64: res.data.f64 = a->data.f64 / b->data.f64; break;
+      case TYPE_S32: res.data.s32 = a->data.s32 / b->data.s32; break;
+      case TYPE_U32: res.data.u32 = a->data.u32 / b->data.u32; break;
+      default:
+         return;
+      }
+      break;
+   case OP_ADD:
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = a->data.f32 + b->data.f32; break;
+      case TYPE_F64: res.data.f64 = a->data.f64 + b->data.f64; break;
+      case TYPE_S32:
+      case TYPE_U32: res.data.u32 = a->data.u32 + b->data.u32; break;
+      default:
+         return;
+      }
+      break;
+   case OP_POW:
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = pow(a->data.f32, b->data.f32); break;
+      case TYPE_F64: res.data.f64 = pow(a->data.f64, b->data.f64); break;
+      default:
+         return;
+      }
+      break;
+   case OP_MAX:
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = MAX2(a->data.f32, b->data.f32); break;
+      case TYPE_F64: res.data.f64 = MAX2(a->data.f64, b->data.f64); break;
+      case TYPE_S32: res.data.s32 = MAX2(a->data.s32, b->data.s32); break;
+      case TYPE_U32: res.data.u32 = MAX2(a->data.u32, b->data.u32); break;
+      default:
+         return;
+      }
+      break;
+   case OP_MIN:
+      switch (i->dType) {
+      case TYPE_F32: res.data.f32 = MIN2(a->data.f32, b->data.f32); break;
+      case TYPE_F64: res.data.f64 = MIN2(a->data.f64, b->data.f64); break;
+      case TYPE_S32: res.data.s32 = MIN2(a->data.s32, b->data.s32); break;
+      case TYPE_U32: res.data.u32 = MIN2(a->data.u32, b->data.u32); break;
+      default:
+         return;
+      }
+      break;
+   case OP_AND:
+      res.data.u64 = a->data.u64 & b->data.u64;
+      break;
+   case OP_OR:
+      res.data.u64 = a->data.u64 | b->data.u64;
+      break;
+   case OP_XOR:
+      res.data.u64 = a->data.u64 ^ b->data.u64;
+      break;
+   case OP_SHL:
+      res.data.u32 = a->data.u32 << b->data.u32;
+      break;
+   case OP_SHR:
+      switch (i->dType) {
+      case TYPE_S32: res.data.s32 = a->data.s32 >> b->data.u32; break;
+      case TYPE_U32: res.data.u32 = a->data.u32 >> b->data.u32; break;
+      default:
+         return;
+      }
+      break;
+   case OP_SLCT:
+      // only foldable when both alternatives are the same value
+      if (a->data.u32 != b->data.u32)
+         return;
+      res.data.u32 = a->data.u32;
+      break;
+   default:
+      return;
+   }
+   ++foldCount;
+
+   i->src(0).mod = Modifier(0);
+   i->src(1).mod = Modifier(0);
+
+   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.u32));
+   i->setSrc(1, NULL);
+
+   // overwrite with the full result (the ctor above only took the low u32)
+   i->getSrc(0)->reg.data = res.data;
+
+   if (i->op == OP_MAD || i->op == OP_FMA) {
+      // product folded; turn MAD into src2 + product, then try to fold
+      // the ADD as well if src2 is also an immediate
+      i->op = OP_ADD;
+
+      i->setSrc(1, i->getSrc(0));
+      i->src(1).mod = i->src(2).mod;
+      i->setSrc(0, i->getSrc(2));
+      i->setSrc(2, NULL);
+
+      ImmediateValue src0;
+      if (i->src(0).getImmediate(src0))
+         expr(i, src0, *i->getSrc(1)->asImm());
+   } else {
+      i->op = OP_MOV;
+   }
+}
+
+// Fold a single-source f32 instruction whose operand is the immediate @imm
+// into an OP_MOV of the computed constant.  Non-f32 types and unhandled
+// opcodes are left untouched.
+void
+ConstantFolding::unary(Instruction *i, const ImmediateValue &imm)
+{
+   Storage res;
+
+   if (i->dType != TYPE_F32)
+      return;
+
+   const float x = imm.reg.data.f32;
+
+   switch (i->op) {
+   case OP_NEG:  res.data.f32 = -x; break;
+   case OP_ABS:  res.data.f32 = fabsf(x); break;
+   case OP_RCP:  res.data.f32 = 1.0f / x; break;
+   case OP_RSQ:  res.data.f32 = 1.0f / sqrtf(x); break;
+   case OP_LG2:  res.data.f32 = log2f(x); break;
+   case OP_EX2:  res.data.f32 = exp2f(x); break;
+   case OP_SIN:  res.data.f32 = sinf(x); break;
+   case OP_COS:  res.data.f32 = cosf(x); break;
+   case OP_SQRT: res.data.f32 = sqrtf(x); break;
+   case OP_PRESIN:
+   case OP_PREEX2:
+      // these should be handled in subsequent OP_SIN/COS/EX2
+      res.data.f32 = x;
+      break;
+   default:
+      // not a foldable unary op
+      return;
+   }
+   i->op = OP_MOV;
+   i->setSrc(0, new_ImmediateValue(i->bb->getProgram(), res.data.f32));
+   i->src(0).mod = Modifier(0);
+}
+
+// Fold chains of f32 multiplies involving an immediate:
+//   mul(mul(a, imm1), imm2) -> mul(a, imm1 * imm2)
+//   mul(mul(a, b), +/-2^n)  -> mul_x_imm(a, b) where the target supports it
+// @mul2 is "mul x, imm2" with the immediate @imm2 in source slot @s.
+void
+ConstantFolding::tryCollapseChainedMULs(Instruction *mul2,
+                                        const int s, ImmediateValue& imm2)
+{
+   const int t = s ? 0 : 1;
+   Instruction *insn;
+   Instruction *mul1 = NULL; // mul1 before mul2
+   int e = 0;
+   float f = imm2.reg.data.f32;
+   ImmediateValue imm1;
+
+   assert(mul2->op == OP_MUL && mul2->dType == TYPE_F32);
+
+   // case 1: the non-immediate operand of mul2 is itself a single-use MUL
+   if (mul2->getSrc(t)->refCount() == 1) {
+      insn = mul2->getSrc(t)->getInsn();
+      // fix: guard against a NULL defining instruction before dereferencing
+      if (insn && !mul2->src(t).mod &&
+          insn->op == OP_MUL && insn->dType == TYPE_F32)
+         mul1 = insn;
+      if (mul1 && !mul1->saturate) {
+         int s1;
+
+         if (mul1->src(s1 = 0).getImmediate(imm1) ||
+             mul1->src(s1 = 1).getImmediate(imm1)) {
+            bld.setPosition(mul1, false);
+            // a = mul r, imm1
+            // d = mul a, imm2 -> d = mul r, (imm1 * imm2)
+            mul1->setSrc(s1, bld.loadImm(NULL, f * imm1.reg.data.f32));
+            mul1->src(s1).mod = Modifier(0);
+            mul2->def(0).replace(mul1->getDef(0), false);
+            // mul1 now produces mul2's result, so it takes its saturate flag
+            mul1->saturate = mul2->saturate;
+         } else
+         if (prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
+            // c = mul a, b
+            // d = mul c, imm -> d = mul_x_imm a, b
+            mul1->postFactor = e;
+            mul2->def(0).replace(mul1->getDef(0), false);
+            if (f < 0)
+               mul1->src(0).mod *= Modifier(NV50_IR_MOD_NEG);
+            mul1->saturate = mul2->saturate;
+         }
+         // fix: previously mul2->saturate was copied onto mul1 even when no
+         // fold was performed, which would saturate an intermediate value
+         // that mul2 still consumes
+         return;
+      }
+   }
+   // case 2: mul2 itself has the immediate; look at its single user
+   if (mul2->getDef(0)->refCount() == 1 && !mul2->saturate) {
+      // b = mul a, imm
+      // d = mul b, c -> d = mul_x_imm a, c
+      int s2, t2;
+      insn = mul2->getDef(0)->uses.front()->getInsn();
+      if (!insn)
+         return;
+      mul1 = mul2;
+      mul2 = NULL;
+      s2 = insn->getSrc(0) == mul1->getDef(0) ? 0 : 1;
+      t2 = s2 ? 0 : 1;
+      if (insn->op == OP_MUL && insn->dType == TYPE_F32)
+         if (!insn->src(s2).mod && !insn->src(t2).getImmediate(imm1))
+            mul2 = insn;
+      if (mul2 && prog->getTarget()->isPostMultiplySupported(OP_MUL, f, e)) {
+         mul2->postFactor = e;
+         mul2->setSrc(s2, mul1->src(t));
+         if (f < 0)
+            mul2->src(s2).mod *= Modifier(NV50_IR_MOD_NEG);
+      }
+   }
+}
+
+// Simplify instruction @i given that its source @s is the immediate @imm0;
+// the other source, if any, is src(t).  On success the instruction is
+// rewritten in place, or replaced by an expansion for integer division.
+void
+ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
+{
+   const int t = !s;
+   const operation op = i->op; // remember original op to detect a fold
+
+   switch (i->op) {
+   case OP_MUL:
+      if (i->dType == TYPE_F32)
+         tryCollapseChainedMULs(i, s, imm0);
+
+      if (imm0.isInteger(0)) {
+         // x * 0 -> 0
+         i->op = OP_MOV;
+         i->setSrc(0, new_ImmediateValue(prog, 0u));
+         i->src(0).mod = Modifier(0);
+         i->setSrc(1, NULL);
+      } else
+      if (imm0.isInteger(1) || imm0.isInteger(-1)) {
+         // x * +/-1 -> (-)x, possibly just a modifier application
+         if (imm0.isNegative())
+            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
+         i->op = i->src(t).mod.getOp();
+         if (s == 0) {
+            i->setSrc(0, i->getSrc(1));
+            i->src(0).mod = i->src(1).mod;
+            i->src(1).mod = 0;
+         }
+         if (i->op != OP_CVT)
+            i->src(0).mod = 0;
+         i->setSrc(1, NULL);
+      } else
+      if (imm0.isInteger(2) || imm0.isInteger(-2)) {
+         // x * +/-2 -> (-)(x + x)
+         if (imm0.isNegative())
+            i->src(t).mod = i->src(t).mod ^ Modifier(NV50_IR_MOD_NEG);
+         i->op = OP_ADD;
+         i->setSrc(s, i->getSrc(t));
+         i->src(s).mod = i->src(t).mod;
+      } else
+      if (!isFloatType(i->sType) && !imm0.isNegative() && imm0.isPow2()) {
+         // integer x * 2^n -> x << n
+         i->op = OP_SHL;
+         imm0.applyLog2();
+         i->setSrc(0, i->getSrc(t));
+         i->src(0).mod = i->src(t).mod;
+         i->setSrc(1, new_ImmediateValue(prog, imm0.reg.data.u32));
+         i->src(1).mod = 0;
+      }
+      break;
+   case OP_ADD:
+      if (i->usesFlags())
+         break;
+      if (imm0.isInteger(0)) {
+         // x + 0 -> x (or the op required to apply x's source modifier)
+         if (s == 0) {
+            i->setSrc(0, i->getSrc(1));
+            i->src(0).mod = i->src(1).mod;
+         }
+         i->setSrc(1, NULL);
+         i->op = i->src(0).mod.getOp();
+         if (i->op != OP_CVT)
+            i->src(0).mod = Modifier(0);
+      }
+      break;
+
+   case OP_DIV:
+      if (s != 1 || (i->dType != TYPE_S32 && i->dType != TYPE_U32))
+         break;
+      bld.setPosition(i, false);
+      if (imm0.reg.data.u32 == 0) {
+         break;
+      } else
+      if (imm0.reg.data.u32 == 1) {
+         i->op = OP_MOV;
+         i->setSrc(1, NULL);
+      } else
+      if (i->dType == TYPE_U32 && imm0.isPow2()) {
+         i->op = OP_SHR;
+         i->setSrc(1, bld.mkImm(util_logbase2(imm0.reg.data.u32)));
+      } else
+      if (i->dType == TYPE_U32) {
+         // expand to multiply-high by a magic number (Granlund-Montgomery,
+         // "Division by Invariant Integers using Multiplication")
+         Instruction *mul;
+         Value *tA, *tB;
+         const uint32_t d = imm0.reg.data.u32;
+         uint32_t m;
+         int rsh, ssh; // post-shift amounts (renamed to avoid shadowing @s)
+         uint32_t l = util_logbase2(d);
+         if (((uint32_t)1 << l) < d)
+            ++l;
+         m = (((uint64_t)1 << 32) * (((uint64_t)1 << l) - d)) / d + 1;
+         rsh = l ? 1 : 0;
+         ssh = l ? (l - 1) : 0;
+
+         tA = bld.getSSA();
+         tB = bld.getSSA();
+         mul = bld.mkOp2(OP_MUL, TYPE_U32, tA, i->getSrc(0),
+                         bld.loadImm(NULL, m));
+         mul->subOp = NV50_IR_SUBOP_MUL_HIGH;
+         bld.mkOp2(OP_SUB, TYPE_U32, tB, i->getSrc(0), tA);
+         tA = bld.getSSA();
+         if (rsh)
+            bld.mkOp2(OP_SHR, TYPE_U32, tA, tB, bld.mkImm(rsh));
+         else
+            tA = tB;
+         tB = ssh ? bld.getSSA() : i->getDef(0);
+         bld.mkOp2(OP_ADD, TYPE_U32, tB, mul->getDef(0), tA);
+         if (ssh)
+            bld.mkOp2(OP_SHR, TYPE_U32, i->getDef(0), tB, bld.mkImm(ssh));
+
+         delete_Instruction(prog, i);
+      } else
+      if (imm0.reg.data.s32 == -1) {
+         i->op = OP_NEG;
+         i->setSrc(1, NULL);
+      } else {
+         // signed division by constant: multiply-high, shift, then correct
+         // for the sign of the dividend (and of the divisor)
+         LValue *tA, *tB;
+         LValue *tD;
+         const int32_t d = imm0.reg.data.s32;
+         int32_t m;
+         int32_t l = util_logbase2(static_cast<unsigned>(abs(d)));
+         if ((1 << l) < abs(d))
+            ++l;
+         if (!l)
+            l = 1;
+         m = ((uint64_t)1 << (32 + l - 1)) / abs(d) + 1 - ((uint64_t)1 << 32);
+
+         tA = bld.getSSA();
+         tB = bld.getSSA();
+         bld.mkOp3(OP_MAD, TYPE_S32, tA, i->getSrc(0), bld.loadImm(NULL, m),
+                   i->getSrc(0))->subOp = NV50_IR_SUBOP_MUL_HIGH;
+         if (l > 1)
+            bld.mkOp2(OP_SHR, TYPE_S32, tB, tA, bld.mkImm(l - 1));
+         else
+            tB = tA;
+         tA = bld.getSSA();
+         bld.mkCmp(OP_SET, CC_LT, TYPE_S32, tA, i->getSrc(0), bld.mkImm(0));
+         tD = (d < 0) ? bld.getSSA() : i->getDef(0)->asLValue();
+         bld.mkOp2(OP_SUB, TYPE_U32, tD, tB, tA);
+         if (d < 0)
+            // fix: negate the sign-corrected quotient tD, not tB -- using tB
+            // dropped the correction and gave wrong results for d < 0
+            bld.mkOp1(OP_NEG, TYPE_S32, i->getDef(0), tD);
+
+         delete_Instruction(prog, i);
+      }
+      break;
+
+   case OP_MOD:
+      // unsigned x % 2^n -> x & (2^n - 1)
+      if (i->sType == TYPE_U32 && imm0.isPow2()) {
+         bld.setPosition(i, false);
+         i->op = OP_AND;
+         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 - 1));
+      }
+      break;
+
+   case OP_SET: // TODO: SET_AND,OR,XOR
+   {
+      // fold a comparison of a SET result against zero into a single SET
+      // with an adjusted condition code
+      CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
+      CondCode cc, ccZ;
+      if (i->src(t).mod != Modifier(0))
+         return;
+      if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
+         return;
+      cc = si->setCond;
+      ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
+      if (s == 0)
+         ccZ = reverseCondCode(ccZ);
+      switch (ccZ) {
+      case CC_LT: cc = CC_FL; break;               // bool < 0 -> never
+      case CC_GE: cc = CC_TR; break;               // bool >= 0 -> always
+      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -> !bool
+      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -> !bool
+      case CC_GT: break;                           // bool > 0 -> bool
+      case CC_NE: break;                           // bool != 0 -> bool
+      default:
+         return;
+      }
+      i->asCmp()->setCond = cc;
+      i->setSrc(0, si->src(0));
+      i->setSrc(1, si->src(1));
+      i->sType = si->sType;
+   }
+      break;
+
+   case OP_SHL:
+   {
+      if (s != 1 || i->src(0).mod != Modifier(0))
+         break;
+      // try to concatenate shifts
+      Instruction *si = i->getSrc(0)->getInsn();
+      if (!si || si->op != OP_SHL)
+         break;
+      ImmediateValue imm1;
+      if (si->src(1).getImmediate(imm1)) {
+         bld.setPosition(i, false);
+         i->setSrc(0, si->getSrc(0));
+         i->setSrc(1, bld.loadImm(NULL, imm0.reg.data.u32 + imm1.reg.data.u32));
+      }
+   }
+      break;
+
+   case OP_ABS:
+   case OP_NEG:
+   case OP_LG2:
+   case OP_RCP:
+   case OP_SQRT:
+   case OP_RSQ:
+   case OP_PRESIN:
+   case OP_SIN:
+   case OP_COS:
+   case OP_PREEX2:
+   case OP_EX2:
+      unary(i, imm0);
+      break;
+   default:
+      return;
+   }
+   if (i->op != op)
+      foldCount++;
+}
+
+// =============================================================================
+
+// Merge modifier operations (ABS, NEG, NOT) into ValueRefs where allowed.
+class ModifierFolding : public Pass
+{
+private:
+   // Per-basic-block driver; returns true to continue the pass.
+   virtual bool visit(BasicBlock *);
+};
+
+bool
+ModifierFolding::visit(BasicBlock *bb)
+{
+   const Target *target = prog->getTarget();
+
+   Instruction *i, *next, *mi;
+   Modifier mod;
+
+   for (i = bb->getEntry(); i; i = next) {
+      next = i->next;
+
+      if (0 && i->op == OP_SUB) {
+         // turn "sub" into "add neg" (do we really want this ?)
+         i->op = OP_ADD;
+         i->src(0).mod = i->src(0).mod ^ Modifier(NV50_IR_MOD_NEG);
+      }
+
+      // try to absorb each source's defining ABS/NEG/NOT as a modifier
+      for (int s = 0; s < 3 && i->srcExists(s); ++s) {
+         mi = i->getSrc(s)->getInsn();
+         if (!mi ||
+             mi->predSrc >= 0 || mi->getDef(0)->refCount() > 8)
+            continue;
+         // type mismatch between producer and consumer: only allow the
+         // special u32-consumes-s32 case for ADD/MUL of ABS/NEG results
+         if (i->sType == TYPE_U32 && mi->dType == TYPE_S32) {
+            if ((i->op != OP_ADD &&
+                 i->op != OP_MUL) ||
+                (mi->op != OP_ABS &&
+                 mi->op != OP_NEG))
+               continue;
+         } else
+         if (i->sType != mi->dType) {
+            continue;
+         }
+         // Modifier(op) is 0 for ops that have no modifier equivalent
+         if ((mod = Modifier(mi->op)) == Modifier(0))
+            continue;
+         // combine with any modifier already present on mi's own source
+         mod *= mi->src(0).mod;
+
+         if ((i->op == OP_ABS) || i->src(s).mod.abs()) {
+            // abs neg [abs] = abs
+            mod = mod & Modifier(~(NV50_IR_MOD_NEG | NV50_IR_MOD_ABS));
+         } else
+         if ((i->op == OP_NEG) && mod.neg()) {
+            assert(s == 0);
+            // neg as both opcode and modifier on same insn is prohibited
+            // neg neg abs = abs, neg neg = identity
+            mod = mod & Modifier(~NV50_IR_MOD_NEG);
+            i->op = mod.getOp();
+            mod = mod & Modifier(~NV50_IR_MOD_ABS);
+            if (mod == Modifier(0))
+               i->op = OP_MOV;
+         }
+
+         if (target->isModSupported(i, s, mod)) {
+            i->setSrc(s, mi->getSrc(0));
+            i->src(s).mod *= mod;
+         }
+      }
+
+      // fold a standalone SAT into its single-use producer if supported
+      if (i->op == OP_SAT) {
+         mi = i->getSrc(0)->getInsn();
+         if (mi &&
+             mi->getDef(0)->refCount() <= 1 && target->isSatSupported(mi)) {
+            mi->saturate = 1;
+            mi->setDef(0, i->getDef(0));
+            delete_Instruction(prog, i);
+         }
+      }
+   }
+
+   return true;
+}
+
+// =============================================================================
+
+// MUL + ADD -> MAD/FMA
+// MIN/MAX(a, a) -> a, etc.
+// SLCT(a, b, const) -> cc(const) ? a : b
+// RCP(RCP(a)) -> a
+// MUL(MUL(a, b), const) -> MUL_Xconst(a, b)
+class AlgebraicOpt : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+
+   // one handler per opcode; each rewrites the instruction in place when
+   // its pattern matches, otherwise leaves it alone
+   void handleABS(Instruction *);
+   bool handleADD(Instruction *);
+   bool tryADDToMADOrSAD(Instruction *, operation toOp);
+   void handleMINMAX(Instruction *);
+   void handleRCP(Instruction *);
+   void handleSLCT(Instruction *);
+   void handleLOGOP(Instruction *);
+   void handleCVT(Instruction *);
+   void handleSUCLAMP(Instruction *);
+
+   BuildUtil bld;
+};
+
+// ABS(ADD/SUB(a, b)) -> SAD(a, b, 0) on targets that support SAD.
+void
+AlgebraicOpt::handleABS(Instruction *abs)
+{
+   Instruction *sub = abs->getSrc(0)->getInsn();
+   DataType ty;
+   if (!sub ||
+       !prog->getTarget()->isOpSupported(OP_SAD, abs->dType))
+      return;
+   // expect not to have mods yet, if we do, bail
+   if (sub->src(0).mod || sub->src(1).mod)
+      return;
+   // hidden conversion ?
+   ty = intTypeToSigned(sub->dType);
+   if (abs->dType != abs->sType || ty != abs->sType)
+      return;
+
+   // NOTE(review): the mod tests below repeat the ones above; the new
+   // conditions here are the opcode and register-file checks.
+   if ((sub->op != OP_ADD && sub->op != OP_SUB) ||
+       sub->src(0).getFile() != FILE_GPR || sub->src(0).mod ||
+       sub->src(1).getFile() != FILE_GPR || sub->src(1).mod)
+      return;
+
+   Value *src0 = sub->getSrc(0);
+   Value *src1 = sub->getSrc(1);
+
+   // for ADD, one operand must come from a NEG of matching type; use its
+   // un-negated source so the whole thing is a difference
+   if (sub->op == OP_ADD) {
+      Instruction *neg = sub->getSrc(1)->getInsn();
+      if (neg && neg->op != OP_NEG) {
+         neg = sub->getSrc(0)->getInsn();
+         src0 = sub->getSrc(1);
+      }
+      if (!neg || neg->op != OP_NEG ||
+          neg->dType != neg->sType || neg->sType != ty)
+         return;
+      src1 = neg->getSrc(0);
+   }
+
+   // found ABS(SUB))
+   abs->moveSources(1, 2); // move sources >=1 up by 2
+   abs->op = OP_SAD;
+   abs->setType(sub->dType);
+   abs->setSrc(0, src0);
+   abs->setSrc(1, src1);
+   bld.setPosition(abs, false);
+   abs->setSrc(2, bld.loadImm(bld.getSSA(typeSizeof(ty)), 0));
+}
+
+// Try to absorb an ADD into a preceding MUL (-> MAD) or SAD.
+// Returns true if the instruction was rewritten.
+bool
+AlgebraicOpt::handleADD(Instruction *add)
+{
+   Value *lhs = add->getSrc(0);
+   Value *rhs = add->getSrc(1);
+
+   // both addends must live in GPRs for MAD/SAD formation
+   if (lhs->reg.file != FILE_GPR || rhs->reg.file != FILE_GPR)
+      return false;
+
+   if (prog->getTarget()->isOpSupported(OP_MAD, add->dType) &&
+       tryADDToMADOrSAD(add, OP_MAD))
+      return true;
+   if (prog->getTarget()->isOpSupported(OP_SAD, add->dType) &&
+       tryADDToMADOrSAD(add, OP_SAD))
+      return true;
+   return false;
+}
+
+// ADD(SAD(a,b,0), c) -> SAD(a,b,c)
+// ADD(MUL(a,b), c) -> MAD(a,b,c)
+bool
+AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, operation toOp)
+{
+   Value *src0 = add->getSrc(0);
+   Value *src1 = add->getSrc(1);
+   Value *src;
+   int s;
+   const operation srcOp = toOp == OP_SAD ? OP_SAD : OP_MUL;
+   // MAD tolerates NEG modifiers on its operands; SAD tolerates none
+   const Modifier modBad = Modifier(~((toOp == OP_MAD) ? NV50_IR_MOD_NEG : 0));
+   Modifier mod[4];
+
+   // find the single-use MUL/SAD operand that will be absorbed
+   if (src0->refCount() == 1 &&
+       src0->getUniqueInsn() && src0->getUniqueInsn()->op == srcOp)
+      s = 0;
+   else
+   if (src1->refCount() == 1 &&
+       src1->getUniqueInsn() && src1->getUniqueInsn()->op == srcOp)
+      s = 1;
+   else
+      return false;
+
+   // both sources must be defined in the same block as the ADD
+   if ((src0->getUniqueInsn() && src0->getUniqueInsn()->bb != add->bb) ||
+       (src1->getUniqueInsn() && src1->getUniqueInsn()->bb != add->bb))
+      return false;
+
+   src = add->getSrc(s);
+
+   if (src->getInsn()->postFactor)
+      return false;
+   // only a SAD with a zero addend can take on another addend
+   if (toOp == OP_SAD) {
+      ImmediateValue imm;
+      if (!src->getInsn()->src(2).getImmediate(imm))
+         return false;
+      if (!imm.isInteger(0))
+         return false;
+   }
+
+   mod[0] = add->src(0).mod;
+   mod[1] = add->src(1).mod;
+   mod[2] = src->getUniqueInsn()->src(0).mod;
+   mod[3] = src->getUniqueInsn()->src(1).mod;
+
+   if (((mod[0] | mod[1]) | (mod[2] | mod[3])) & modBad)
+      return false;
+
+   add->op = toOp;
+   add->subOp = src->getInsn()->subOp; // potentially mul-high
+
+   add->setSrc(2, add->src(s ? 0 : 1));
+
+   add->setSrc(0, src->getInsn()->getSrc(0));
+   // combine the absorbed instruction's modifier with the ADD operand's
+   add->src(0).mod = mod[2] ^ mod[s];
+   add->setSrc(1, src->getInsn()->getSrc(1));
+   add->src(1).mod = mod[3];
+
+   return true;
+}
+
+// MIN/MAX(a, a) -> a (or a CVT that applies the shared source modifier).
+void
+AlgebraicOpt::handleMINMAX(Instruction *minmax)
+{
+   Value *src0 = minmax->getSrc(0);
+   Value *src1 = minmax->getSrc(1);
+
+   if (src0 != src1 || src0->reg.file != FILE_GPR)
+      return;
+   if (minmax->src(0).mod == minmax->src(1).mod) {
+      if (minmax->def(0).mayReplace(minmax->src(0))) {
+         minmax->def(0).replace(minmax->src(0), false);
+         minmax->bb->remove(minmax);
+      } else {
+         // cannot forward directly, keep an instruction that applies the
+         // modifier/conversion
+         minmax->op = OP_CVT;
+         minmax->setSrc(1, NULL);
+      }
+   } else {
+      // TODO:
+      // min(x, -x) = -abs(x)
+      // min(x, -abs(x)) = -abs(x)
+      // min(x, abs(x)) = x
+      // max(x, -abs(x)) = x
+      // max(x, abs(x)) = abs(x)
+      // max(x, -x) = abs(x)
+   }
+}
+
+// RCP(RCP(a)) -> a, applied through whatever op the combined source
+// modifiers require (MOV/NEG/ABS).
+void
+AlgebraicOpt::handleRCP(Instruction *rcp)
+{
+   Instruction *inner = rcp->getSrc(0)->getUniqueInsn();
+
+   if (!inner || inner->op != OP_RCP)
+      return;
+
+   const Modifier mod = rcp->src(0).mod * inner->src(0).mod;
+   rcp->op = mod.getOp();
+   rcp->setSrc(0, inner->getSrc(0));
+}
+
+// SLCT with a constant predicate -> MOV of the statically selected source;
+// SLCT(a, a, x) -> MOV a.
+void
+AlgebraicOpt::handleSLCT(Instruction *slct)
+{
+   if (slct->getSrc(2)->reg.file == FILE_IMMEDIATE) {
+      // constant predicate: if the comparison against 0 holds, src(1) is
+      // the selected value -- move it into slot 0 before the MOV rewrite
+      if (slct->getSrc(2)->asImm()->compare(slct->asCmp()->setCond, 0.0f))
+         slct->setSrc(0, slct->getSrc(1));
+   } else
+   if (slct->getSrc(0) != slct->getSrc(1)) {
+      return;
+   }
+   slct->op = OP_MOV;
+   slct->setSrc(1, NULL);
+   slct->setSrc(2, NULL);
+}
+
+// AND/OR(a, a) -> a; AND/OR/XOR(SET, SET) -> SET_AND/OR/XOR chain.
+void
+AlgebraicOpt::handleLOGOP(Instruction *logop)
+{
+   Value *src0 = logop->getSrc(0);
+   Value *src1 = logop->getSrc(1);
+
+   if (src0->reg.file != FILE_GPR || src1->reg.file != FILE_GPR)
+      return;
+
+   if (src0 == src1) {
+      // AND(a, a) / OR(a, a) -> a (XOR(a, a) would be 0, not handled here)
+      if ((logop->op == OP_AND || logop->op == OP_OR) &&
+          logop->def(0).mayReplace(logop->src(0))) {
+         logop->def(0).replace(logop->src(0), false);
+         delete_Instruction(prog, logop);
+      }
+   } else {
+      // try AND(SET, SET) -> SET_AND(SET)
+      Instruction *set0 = src0->getInsn();
+      Instruction *set1 = src1->getInsn();
+
+      if (!set0 || set0->fixed || !set1 || set1->fixed)
+         return;
+      // ensure set1 is a plain OP_SET (set0 may be a SET_<logop> already)
+      if (set1->op != OP_SET) {
+         Instruction *xchg = set0;
+         set0 = set1;
+         set1 = xchg;
+         if (set1->op != OP_SET)
+            return;
+      }
+      operation redOp = (logop->op == OP_AND ? OP_SET_AND :
+                         logop->op == OP_XOR ? OP_SET_XOR : OP_SET_OR);
+      if (!prog->getTarget()->isOpSupported(redOp, set1->sType))
+         return;
+      if (set0->op != OP_SET &&
+          set0->op != OP_SET_AND &&
+          set0->op != OP_SET_OR &&
+          set0->op != OP_SET_XOR)
+         return;
+      // don't duplicate work when both SET results are used elsewhere too
+      if (set0->getDef(0)->refCount() > 1 &&
+          set1->getDef(0)->refCount() > 1)
+         return;
+      if (set0->getPredicate() || set1->getPredicate())
+         return;
+      // check that they don't source each other
+      for (int s = 0; s < 2; ++s)
+         if (set0->getSrc(s) == set1->getDef(0) ||
+             set1->getSrc(s) == set0->getDef(0))
+            return;
+
+      // clone both SETs at the logop's position: set0 produces a predicate
+      // that the accumulating variant of set1 consumes as third source
+      set0 = cloneForward(func, set0);
+      set1 = cloneShallow(func, set1);
+      logop->bb->insertAfter(logop, set1);
+      logop->bb->insertAfter(logop, set0);
+
+      set0->dType = TYPE_U8;
+      set0->getDef(0)->reg.file = FILE_PREDICATE;
+      set0->getDef(0)->reg.size = 1;
+      set1->setSrc(2, set0->getDef(0));
+      set1->op = redOp;
+      set1->setDef(0, logop->getDef(0));
+      delete_Instruction(prog, logop);
+   }
+}
+
+// F2I(NEG(SET with result 1.0f/0.0f)) -> SET with result -1/0
+// nv50:
+// F2I(NEG(I2F(ABS(SET))))
+void
+AlgebraicOpt::handleCVT(Instruction *cvt)
+{
+   // only F32 -> S32 conversions without source modifiers are handled
+   if (cvt->sType != TYPE_F32 ||
+       cvt->dType != TYPE_S32 || cvt->src(0).mod != Modifier(0))
+      return;
+   Instruction *insn = cvt->getSrc(0)->getInsn();
+   if (!insn || insn->op != OP_NEG || insn->dType != TYPE_F32)
+      return;
+   if (insn->src(0).mod != Modifier(0))
+      return;
+   insn = insn->getSrc(0)->getInsn();
+
+   // check for nv50 SET(-1,0) -> SET(1.0f/0.0f) chain and nvc0's f32 SET
+   if (insn && insn->op == OP_CVT &&
+       insn->dType == TYPE_F32 &&
+       insn->sType == TYPE_S32) {
+      insn = insn->getSrc(0)->getInsn();
+      if (!insn || insn->op != OP_ABS || insn->sType != TYPE_S32 ||
+          insn->src(0).mod)
+         return;
+      insn = insn->getSrc(0)->getInsn();
+      if (!insn || insn->op != OP_SET || insn->dType != TYPE_U32)
+         return;
+   } else
+   if (!insn || insn->op != OP_SET || insn->dType != TYPE_F32) {
+      return;
+   }
+
+   // replace the whole chain with a single SET producing 0 / -1 directly
+   Instruction *bset = cloneShallow(func, insn);
+   bset->dType = TYPE_U32;
+   bset->setDef(0, cvt->getDef(0));
+   cvt->bb->insertAfter(cvt, bset);
+   delete_Instruction(prog, cvt);
+}
+
+// SUCLAMP dst, (ADD b imm), k, 0 -> SUCLAMP dst, b, k, imm (if imm fits s6)
+void
+AlgebraicOpt::handleSUCLAMP(Instruction *insn)
+{
+   ImmediateValue imm;
+   int32_t val = insn->getSrc(2)->asImm()->reg.data.s32;
+   int s;
+   Instruction *add;
+
+   assert(insn->srcExists(0) && insn->src(0).getFile() == FILE_GPR);
+
+   // look for ADD (TODO: only count references by non-SUCLAMP)
+   if (insn->getSrc(0)->refCount() > 1)
+      return;
+   add = insn->getSrc(0)->getInsn();
+   if (!add || add->op != OP_ADD ||
+       (add->dType != TYPE_U32 &&
+        add->dType != TYPE_S32))
+      return;
+
+   // look for immediate
+   for (s = 0; s < 2; ++s)
+      if (add->src(s).getImmediate(imm))
+         break;
+   if (s >= 2)
+      return;
+   s = s ? 0 : 1; // s now indexes the non-immediate addend
+   // determine if immediate fits
+   val += imm.reg.data.s32;
+   if (val > 31 || val < -32)
+      return;
+   // determine if other addend fits
+   if (add->src(s).getFile() != FILE_GPR || add->src(s).mod != Modifier(0))
+      return;
+
+   bld.setPosition(insn, false); // make sure bld is init'ed
+   // replace sources
+   insn->setSrc(2, bld.mkImm(val));
+   insn->setSrc(0, add->getSrc(s));
+}
+
+// Walk the block once, dispatching each instruction to its handler.
+// Handlers may delete or replace instructions, so the successor is
+// remembered before dispatching.
+bool
+AlgebraicOpt::visit(BasicBlock *bb)
+{
+   Instruction *nextInsn;
+
+   for (Instruction *insn = bb->getEntry(); insn; insn = nextInsn) {
+      nextInsn = insn->next;
+
+      switch (insn->op) {
+      case OP_ABS:
+         handleABS(insn);
+         break;
+      case OP_ADD:
+         handleADD(insn);
+         break;
+      case OP_MIN:
+      case OP_MAX:
+         handleMINMAX(insn);
+         break;
+      case OP_RCP:
+         handleRCP(insn);
+         break;
+      case OP_SLCT:
+         handleSLCT(insn);
+         break;
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+         handleLOGOP(insn);
+         break;
+      case OP_CVT:
+         handleCVT(insn);
+         break;
+      case OP_SUCLAMP:
+         handleSUCLAMP(insn);
+         break;
+      default:
+         break;
+      }
+   }
+
+   return true;
+}
+
+// =============================================================================
+
+// Change the memory offset of a load/store, cloning the address symbol
+// first when it is shared with other instructions.
+static inline void
+updateLdStOffset(Instruction *ldst, int32_t offset, Function *fn)
+{
+   Value *addr = ldst->getSrc(0);
+
+   if (addr->reg.data.offset == offset)
+      return;
+   if (addr->refCount() > 1) {
+      ldst->setSrc(0, cloneShallow(fn, addr));
+      addr = ldst->getSrc(0);
+   }
+   addr->reg.data.offset = offset;
+}
+
+// Combine loads and stores, forward stores to loads where possible.
+class MemoryOpt : public Pass
+{
+private:
+   // Bookkeeping entry for one tracked load or store instruction.
+   class Record
+   {
+   public:
+      Record *next;
+      Instruction *insn;
+      const Value *rel[2];   // indirect address sources (NULL if direct)
+      const Value *base;
+      int32_t offset;
+      int8_t fileIndex;
+      uint8_t size;          // access size in bytes
+      bool locked;           // store must stay to satisfy a later load
+      Record *prev;
+
+      // true if this record's byte range intersects @ldst's
+      bool overlaps(const Instruction *ldst) const;
+
+      inline void link(Record **);
+      inline void unlink(Record **);
+      inline void set(const Instruction *ldst);
+   };
+
+public:
+   MemoryOpt();
+
+   // active records, one list per data file
+   Record *loads[DATA_FILE_COUNT];
+   Record *stores[DATA_FILE_COUNT];
+
+   MemoryPool recordPool;
+
+private:
+   virtual bool visit(BasicBlock *);
+   bool runOpt(BasicBlock *);
+
+   Record **getList(const Instruction *);
+
+   Record *findRecord(const Instruction *, bool load, bool& isAdjacent) const;
+
+   // merge @insn into load/store instruction from @rec
+   bool combineLd(Record *rec, Instruction *ld);
+   bool combineSt(Record *rec, Instruction *st);
+
+   bool replaceLdFromLd(Instruction *ld, Record *ldRec);
+   bool replaceLdFromSt(Instruction *ld, Record *stRec);
+   bool replaceStFromSt(Instruction *restrict st, Record *stRec);
+
+   void addRecord(Instruction *ldst);
+   void purgeRecords(Instruction *const st, DataFile);
+   void lockStores(Instruction *const ld);
+   void reset();
+
+private:
+   Record *prevRecord;
+};
+
+// Initialize empty per-file record lists; records come from a small pool.
+MemoryOpt::MemoryOpt() : recordPool(sizeof(MemoryOpt::Record), 6)
+{
+   prevRecord = NULL;
+   for (int f = 0; f < DATA_FILE_COUNT; ++f)
+      loads[f] = stores[f] = NULL;
+}
+
+// Drop all tracked load/store records, returning them to the pool.
+void
+MemoryOpt::reset()
+{
+   for (unsigned int f = 0; f < DATA_FILE_COUNT; ++f) {
+      Record *rec = loads[f];
+      while (rec) {
+         Record *follow = rec->next;
+         recordPool.release(rec);
+         rec = follow;
+      }
+      loads[f] = NULL;
+
+      rec = stores[f];
+      while (rec) {
+         Record *follow = rec->next;
+         recordPool.release(rec);
+         rec = follow;
+      }
+      stores[f] = NULL;
+   }
+}
+
+// Merge adjacent load @ld into the recorded load @rec, producing a single
+// wider load with the union of both definition lists.
+bool
+MemoryOpt::combineLd(Record *rec, Instruction *ld)
+{
+   int32_t offRc = rec->offset;
+   int32_t offLd = ld->getSrc(0)->reg.data.offset;
+   int sizeRc = rec->size;
+   int sizeLd = typeSizeof(ld->dType);
+   int size = sizeRc + sizeLd;
+   int d, j;
+
+   if (!prog->getTarget()->
+       isAccessSupported(ld->getSrc(0)->reg.file, typeOfSize(size)))
+      return false;
+   // no unaligned loads
+   if (((size == 0x8) && (MIN2(offLd, offRc) & 0x7)) ||
+       ((size == 0xc) && (MIN2(offLd, offRc) & 0xf)))
+      return false;
+
+   assert(sizeRc + sizeLd <= 16 && offRc != offLd);
+
+   // count the definitions of the recorded load (j)
+   for (j = 0; sizeRc; sizeRc -= rec->insn->getDef(j)->reg.size, ++j);
+
+   if (offLd < offRc) {
+      // @ld covers the lower bytes: shift rec's defs up to make room
+      int sz;
+      for (sz = 0, d = 0; sz < sizeLd; sz += ld->getDef(d)->reg.size, ++d);
+      // d: nr of definitions in ld
+      // j: nr of definitions in rec->insn, move:
+      for (d = d + j - 1; j > 0; --j, --d)
+         rec->insn->setDef(d, rec->insn->getDef(j - 1));
+
+      if (rec->insn->getSrc(0)->refCount() > 1)
+         rec->insn->setSrc(0, cloneShallow(func, rec->insn->getSrc(0)));
+      rec->offset = rec->insn->getSrc(0)->reg.data.offset = offLd;
+
+      d = 0;
+   } else {
+      d = j;
+   }
+   // move definitions of @ld to @rec->insn
+   for (j = 0; sizeLd; ++j, ++d) {
+      sizeLd -= ld->getDef(j)->reg.size;
+      rec->insn->setDef(d, ld->getDef(j));
+   }
+
+   rec->size = size;
+   rec->insn->getSrc(0)->reg.size = size;
+   rec->insn->setType(typeOfSize(size));
+
+   delete_Instruction(prog, ld);
+
+   return true;
+}
+
+// Merge adjacent store @st into the recorded store @rec, producing one
+// wider store that carries both source lists in ascending offset order.
+bool
+MemoryOpt::combineSt(Record *rec, Instruction *st)
+{
+   int32_t offRc = rec->offset;
+   int32_t offSt = st->getSrc(0)->reg.data.offset;
+   int sizeRc = rec->size;
+   int sizeSt = typeSizeof(st->dType);
+   int s = sizeSt / 4;
+   int size = sizeRc + sizeSt;
+   int j, k;
+   Value *src[4]; // no modifiers in ValueRef allowed for st
+   Value *extra[3];
+
+   if (!prog->getTarget()->
+       isAccessSupported(st->getSrc(0)->reg.file, typeOfSize(size)))
+      return false;
+   // NOTE(review): unlike combineLd, only the 8-byte case is checked for
+   // alignment here; 12-byte combinations are presumably rejected by
+   // isAccessSupported -- verify for targets allowing 96-bit stores
+   if (size == 8 && MIN2(offRc, offSt) & 0x7)
+      return false;
+
+   st->takeExtraSources(0, extra); // save predicate and indirect address
+
+   if (offRc < offSt) {
+      // save values from @st
+      for (s = 0; sizeSt; ++s) {
+         sizeSt -= st->getSrc(s + 1)->reg.size;
+         src[s] = st->getSrc(s + 1);
+      }
+      // set record's values as low sources of @st
+      for (j = 1; sizeRc; ++j) {
+         sizeRc -= rec->insn->getSrc(j)->reg.size;
+         st->setSrc(j, rec->insn->getSrc(j));
+      }
+      // set saved values as high sources of @st
+      for (k = j, j = 0; j < s; ++j)
+         st->setSrc(k++, src[j]);
+
+      updateLdStOffset(st, offRc, func);
+   } else {
+      // @st already covers the lower bytes: append rec's sources
+      for (j = 1; sizeSt; ++j)
+         sizeSt -= st->getSrc(j)->reg.size;
+      for (s = 1; sizeRc; ++j, ++s) {
+         sizeRc -= rec->insn->getSrc(s)->reg.size;
+         st->setSrc(j, rec->insn->getSrc(s));
+      }
+      rec->offset = offSt;
+   }
+   st->putExtraSources(0, extra); // restore pointer and predicate
+
+   delete_Instruction(prog, rec->insn);
+   rec->insn = st;
+   rec->size = size;
+   rec->insn->getSrc(0)->reg.size = size;
+   rec->insn->setType(typeOfSize(size));
+   return true;
+}
+
+// Capture the address (symbol, indirect sources) and extent of @ldst.
+void
+MemoryOpt::Record::set(const Instruction *ldst)
+{
+   const Symbol *sym = ldst->getSrc(0)->asSym();
+
+   base = sym->getBase();
+   offset = sym->reg.data.offset;
+   fileIndex = sym->reg.fileIndex;
+   size = typeSizeof(ldst->sType);
+   rel[0] = ldst->getIndirect(0, 0);
+   rel[1] = ldst->getIndirect(0, 1);
+}
+
+// Push this record at the head of the doubly-linked list @list.
+void
+MemoryOpt::Record::link(Record **list)
+{
+   prev = NULL;
+   next = *list;
+   *list = this;
+   if (next)
+      next->prev = this;
+}
+
+// Detach this record from @list, fixing the head pointer when this was
+// the first element.  next/prev of the record itself are left intact.
+void
+MemoryOpt::Record::unlink(Record **list)
+{
+   if (prev)
+      prev->next = next;
+   else
+      *list = next;
+   if (next)
+      next->prev = prev;
+}
+
+// Loads and stores are tracked in separate lists, indexed by data file.
+MemoryOpt::Record **
+MemoryOpt::getList(const Instruction *insn)
+{
+   const bool isLoad = insn->op == OP_LOAD || insn->op == OP_VFETCH;
+
+   if (isLoad)
+      return &loads[insn->src(0).getFile()];
+   return &stores[insn->src(0).getFile()];
+}
+
+// Start tracking load/store @i in the appropriate list.
+void
+MemoryOpt::addRecord(Instruction *i)
+{
+   Record *rec = reinterpret_cast<Record *>(recordPool.allocate());
+
+   rec->link(getList(i));
+   rec->set(i);
+   rec->locked = false;
+   rec->insn = i;
+}
+
+// Find a previous load (@load = true) or store record that accesses the
+// same 16-byte region as @insn with identical indirect sources.  On return
+// @isAdj says whether the found access is merely adjacent (a merge
+// candidate) rather than overlapping (a forwarding/replace candidate).
+MemoryOpt::Record *
+MemoryOpt::findRecord(const Instruction *insn, bool load, bool& isAdj) const
+{
+   const Symbol *sym = insn->getSrc(0)->asSym();
+   const int size = typeSizeof(insn->sType);
+   Record *rec = NULL;
+   Record *it = load ? loads[sym->reg.file] : stores[sym->reg.file];
+
+   for (; it; it = it->next) {
+      // locked stores may still serve loads, but nothing else
+      if (it->locked && insn->op != OP_LOAD)
+         continue;
+      // must be in the same 16-byte window with matching address mode
+      if ((it->offset >> 4) != (sym->reg.data.offset >> 4) ||
+          it->rel[0] != insn->getIndirect(0, 0) ||
+          it->fileIndex != sym->reg.fileIndex ||
+          it->rel[1] != insn->getIndirect(0, 1))
+         continue;
+
+      if (it->offset < sym->reg.data.offset) {
+         if (it->offset + it->size >= sym->reg.data.offset) {
+            isAdj = (it->offset + it->size == sym->reg.data.offset);
+            if (!isAdj)
+               return it;
+            // adjacent merges must keep the result 8-byte aligned
+            if (!(it->offset & 0x7))
+               rec = it;
+         }
+      } else {
+         isAdj = it->offset != sym->reg.data.offset;
+         if (size <= it->size && !isAdj)
+            return it;
+         else
+            if (!(sym->reg.data.offset & 0x7))
+               if (it->offset - size <= sym->reg.data.offset)
+                  rec = it;
+      }
+   }
+   return rec;
+}
+
+// Store forwarding: replace the definitions of @ld with the values the
+// recorded store wrote.  Fails unless offsets and per-component sizes line
+// up exactly and the stored values are plain GPRs.
+bool
+MemoryOpt::replaceLdFromSt(Instruction *ld, Record *rec)
+{
+   Instruction *st = rec->insn;
+   int32_t offSt = rec->offset;
+   int32_t offLd = ld->getSrc(0)->reg.data.offset;
+   int d, s;
+
+   // locate the store source corresponding to the load's first byte
+   for (s = 1; offSt != offLd && st->srcExists(s); ++s)
+      offSt += st->getSrc(s)->reg.size;
+   if (offSt != offLd)
+      return false;
+
+   for (d = 0; ld->defExists(d) && st->srcExists(s); ++d, ++s) {
+      if (ld->getDef(d)->reg.size != st->getSrc(s)->reg.size)
+         return false;
+      if (st->getSrc(s)->reg.file != FILE_GPR)
+         return false;
+      ld->def(d).replace(st->src(s), false);
+   }
+   ld->bb->remove(ld);
+   return true;
+}
+
+// Redundant-load elimination: replace the definitions of the later load
+// @ldE with those of the earlier, overlapping load in @rec.
+bool
+MemoryOpt::replaceLdFromLd(Instruction *ldE, Record *rec)
+{
+   Instruction *ldR = rec->insn;
+   int32_t offR = rec->offset;
+   int32_t offE = ldE->getSrc(0)->reg.data.offset;
+   int dR, dE;
+
+   assert(offR <= offE);
+   // locate the recorded definition covering ldE's first byte
+   for (dR = 0; offR < offE && ldR->defExists(dR); ++dR)
+      offR += ldR->getDef(dR)->reg.size;
+   if (offR != offE)
+      return false;
+
+   for (dE = 0; ldE->defExists(dE) && ldR->defExists(dR); ++dE, ++dR) {
+      if (ldE->getDef(dE)->reg.size != ldR->getDef(dR)->reg.size)
+         return false;
+      ldE->def(dE).replace(ldR->getDef(dR), false);
+   }
+
+   delete_Instruction(prog, ldE);
+   return true;
+}
+
+// Dead-store elimination / merging: the later store @st supersedes the
+// overlapping recorded store; combine their source lists into @st and
+// delete the recorded one.
+bool
+MemoryOpt::replaceStFromSt(Instruction *restrict st, Record *rec)
+{
+   const Instruction *const ri = rec->insn;
+   Value *extra[3];
+
+   int32_t offS = st->getSrc(0)->reg.data.offset;
+   int32_t offR = rec->offset;
+   int32_t endS = offS + typeSizeof(st->dType);
+   int32_t endR = offR + typeSizeof(ri->dType);
+
+   // combined range covers the union of both stores
+   rec->size = MAX2(endS, endR) - MIN2(offS, offR);
+
+   st->takeExtraSources(0, extra);
+
+   if (offR < offS) {
+      // recorded store starts lower: keep its leading values, then @st's
+      // values, then any recorded values past @st's end
+      Value *vals[10];
+      int s, n;
+      int k = 0;
+      // get non-replaced sources of ri
+      for (s = 1; offR < offS; offR += ri->getSrc(s)->reg.size, ++s)
+         vals[k++] = ri->getSrc(s);
+      n = s;
+      // get replaced sources of st
+      for (s = 1; st->srcExists(s); offS += st->getSrc(s)->reg.size, ++s)
+         vals[k++] = st->getSrc(s);
+      // skip replaced sources of ri
+      for (s = n; offR < endS; offR += ri->getSrc(s)->reg.size, ++s);
+      // get non-replaced sources after values covered by st
+      for (; offR < endR; offR += ri->getSrc(s)->reg.size, ++s)
+         vals[k++] = ri->getSrc(s);
+      assert((unsigned int)k <= Elements(vals));
+      for (s = 0; s < k; ++s)
+         st->setSrc(s + 1, vals[s]);
+      st->setSrc(0, ri->getSrc(0));
+   } else
+   if (endR > endS) {
+      // recorded store extends past @st: append its trailing values
+      int j, s;
+      for (j = 1; offR < endS; offR += ri->getSrc(j++)->reg.size);
+      for (s = 1; offS < endS; offS += st->getSrc(s++)->reg.size);
+      for (; offR < endR; offR += ri->getSrc(j++)->reg.size)
+         st->setSrc(s++, ri->getSrc(j));
+   }
+   st->putExtraSources(0, extra);
+
+   delete_Instruction(prog, rec->insn);
+
+   rec->insn = st;
+   rec->offset = st->getSrc(0)->reg.data.offset;
+
+   st->setType(typeOfSize(rec->size));
+
+   return true;
+}
+
+// Test whether this record's access intersects the bytes touched by @ldst.
+bool
+MemoryOpt::Record::overlaps(const Instruction *ldst) const
+{
+   Record other;
+   other.set(ldst);
+
+   if (fileIndex != other.fileIndex)
+      return false;
+
+   // with indirect addressing only base identity can be compared
+   if (rel[0] || other.rel[0])
+      return base == other.base;
+
+   // byte-range intersection test
+   return offset < other.offset + other.size &&
+      other.offset < offset + size;
+}
+
+// We must not eliminate stores that affect the result of @ld if
+// we find later stores to the same location, and we may no longer
+// merge them with later stores.
+// The stored value can, however, still be used to determine the value
+// returned by future loads.
+void
+MemoryOpt::lockStores(Instruction *const ld)
+{
+   // any store overlapping the bytes read by @ld must stay in place
+   for (Record *r = stores[ld->src(0).getFile()]; r; r = r->next)
+      if (!r->locked && r->overlaps(ld))
+         r->locked = true;
+}
+
+// Prior loads from the location of @st are no longer valid.
+// Stores to the location of @st may no longer be used to derive
+// the value at it nor be coalesced into later stores.
+void
+MemoryOpt::purgeRecords(Instruction *const st, DataFile f)
+{
+   // with a NULL @st, purge everything in file @f (used at barriers)
+   if (st)
+      f = st->src(0).getFile();
+
+   // unlink() leaves r->next intact, so removing while iterating is safe
+   for (Record *r = loads[f]; r; r = r->next)
+      if (!st || r->overlaps(st))
+         r->unlink(&loads[f]);
+
+   for (Record *r = stores[f]; r; r = r->next)
+      if (!st || r->overlaps(st))
+         r->unlink(&stores[f]);
+}
+
+// One optimization pass cannot combine 4 x 32-bit accesses into a single
+// 128-bit access on targets where 96-bit memory operations are forbidden,
+// so sweep the block a second time whenever the first sweep made progress.
+bool
+MemoryOpt::visit(BasicBlock *bb)
+{
+   bool progress = runOpt(bb);
+   if (progress)
+      progress = runOpt(bb);
+   return progress;
+}
+
+// One forward sweep over @bb:
+//  - loads: try to reuse an earlier store's value, an earlier load's result,
+//    or combine with an adjacent earlier load;
+//  - stores: try to replace or combine with an earlier store;
+//  - barrier-like ops (calls, bars, atomics, emit/restart) invalidate the
+//    affected record lists.
+// Instructions that survive all of the above are added as new records.
+bool
+MemoryOpt::runOpt(BasicBlock *bb)
+{
+   Instruction *ldst, *next;
+   Record *rec;
+   bool isAdjacent = true;
+
+   for (ldst = bb->getEntry(); ldst; ldst = next) {
+      bool keep = true;
+      bool isLoad = true;
+      next = ldst->next; // fetch now, ldst may be deleted below
+
+      if (ldst->op == OP_LOAD || ldst->op == OP_VFETCH) {
+         if (ldst->isDead()) {
+            // might have been produced by earlier optimization
+            delete_Instruction(prog, ldst);
+            continue;
+         }
+      } else
+      if (ldst->op == OP_STORE || ldst->op == OP_EXPORT) {
+         isLoad = false;
+      } else {
+         // not a load or store: check if it invalidates our records
+         // TODO: maybe have all fixed ops act as barrier ?
+         if (ldst->op == OP_CALL ||
+             ldst->op == OP_BAR ||
+             ldst->op == OP_MEMBAR) {
+            // full barrier for all memory files we track
+            purgeRecords(NULL, FILE_MEMORY_LOCAL);
+            purgeRecords(NULL, FILE_MEMORY_GLOBAL);
+            purgeRecords(NULL, FILE_MEMORY_SHARED);
+            purgeRecords(NULL, FILE_SHADER_OUTPUT);
+         } else
+         if (ldst->op == OP_ATOM || ldst->op == OP_CCTL) {
+            if (ldst->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+               purgeRecords(NULL, FILE_MEMORY_LOCAL);
+               purgeRecords(NULL, FILE_MEMORY_GLOBAL);
+               purgeRecords(NULL, FILE_MEMORY_SHARED);
+            } else {
+               purgeRecords(NULL, ldst->src(0).getFile());
+            }
+         } else
+         if (ldst->op == OP_EMIT || ldst->op == OP_RESTART) {
+            purgeRecords(NULL, FILE_SHADER_OUTPUT);
+         }
+         continue;
+      }
+      if (ldst->getPredicate()) // TODO: handle predicated ld/st
+         continue;
+
+      if (isLoad) {
+         DataFile file = ldst->src(0).getFile();
+
+         // if ld l[]/g[] look for previous store to eliminate the reload
+         if (file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL) {
+            // TODO: shared memory ?
+            rec = findRecord(ldst, false, isAdjacent);
+            if (rec && !isAdjacent)
+               keep = !replaceLdFromSt(ldst, rec);
+         }
+
+         // or look for ld from the same location and replace this one
+         rec = keep ? findRecord(ldst, true, isAdjacent) : NULL;
+         if (rec) {
+            if (!isAdjacent)
+               keep = !replaceLdFromLd(ldst, rec);
+            else
+               // or combine a previous load with this one
+               keep = !combineLd(rec, ldst);
+         }
+         if (keep)
+            lockStores(ldst); // overlapping stores may no longer be removed
+      } else {
+         rec = findRecord(ldst, false, isAdjacent);
+         if (rec) {
+            if (!isAdjacent)
+               keep = !replaceStFromSt(ldst, rec);
+            else
+               keep = !combineSt(rec, ldst);
+         }
+         if (keep)
+            purgeRecords(ldst, DATA_FILE_COUNT); // this store shadows old data
+      }
+      if (keep)
+         addRecord(ldst);
+   }
+   reset();
+
+   return true;
+}
+
+// =============================================================================
+
+// Turn control flow into predicated instructions (after register allocation !).
+// TODO:
+// Could move this to before register allocation on NVC0 and also handle nested
+// constructs.
+class FlatteningPass : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+
+   // Predicate the bodies of a simple if/else construct and delete the branch.
+   bool tryPredicateConditional(BasicBlock *);
+   // Apply (cc, pred) to all instructions of @bb and remove its exit flow op.
+   void predicateInstructions(BasicBlock *, Value *pred, CondCode cc);
+   // Replace a jump-to-jump with the final BRA/JOIN/EXIT directly.
+   void tryPropagateBranch(BasicBlock *);
+   // True if @pred is computed purely from immediates/constant memory.
+   inline bool isConstantCondition(Value *pred);
+   // True if @insn can safely be predicated on @pred.
+   inline bool mayPredicate(const Instruction *, const Value *pred) const;
+   // Delete a removable BRA/JOIN and its now-unused predicate, if any.
+   inline void removeFlow(Instruction *);
+};
+
+// A condition is considered "constant" when it is a single OP_SET whose
+// operands all come from immediates or constant-buffer memory (directly or
+// through a non-indirect MOV/LOAD). Such conditions are likely uniform
+// across the warp, so a shorter predication limit is used for them.
+bool
+FlatteningPass::isConstantCondition(Value *pred)
+{
+   Instruction *insn = pred->getUniqueInsn();
+   assert(insn);
+   if (insn->op != OP_SET || insn->srcExists(2))
+      return false;
+
+   for (int s = 0; s < 2 && insn->srcExists(s); ++s) {
+      Instruction *ld = insn->getSrc(s)->getUniqueInsn();
+      DataFile file;
+      if (ld) {
+         if (ld->op != OP_MOV && ld->op != OP_LOAD)
+            return false;
+         if (ld->src(0).isIndirect(0))
+            return false;
+         file = ld->src(0).getFile();
+      } else {
+         file = insn->src(s).getFile();
+         // catch $r63 on NVC0
+         if (file == FILE_GPR && insn->getSrc(s)->reg.data.id > prog->maxGPR)
+            file = FILE_IMMEDIATE;
+      }
+      if (file != FILE_IMMEDIATE && file != FILE_MEMORY_CONST)
+         return false;
+   }
+   return true;
+}
+
+// Delete the flow instruction @insn if it is a removable BRA or a JOIN.
+// Backward/cross branches are kept (they carry real control flow). If the
+// branch's predicate becomes unused, its register is deallocated and the
+// defining instruction deleted when dead.
+void
+FlatteningPass::removeFlow(Instruction *insn)
+{
+   FlowInstruction *term = insn ? insn->asFlow() : NULL;
+   if (!term)
+      return;
+   Graph::Edge::Type ty = term->bb->cfg.outgoing().getType();
+
+   if (term->op == OP_BRA) {
+      // TODO: this might get more difficult when we get arbitrary BRAs
+      if (ty == Graph::Edge::CROSS || ty == Graph::Edge::BACK)
+         return;
+   } else
+   if (term->op != OP_JOIN)
+      return;
+
+   Value *pred = term->getPredicate();
+
+   delete_Instruction(prog, term);
+
+   if (pred && pred->refCount() == 0) {
+      Instruction *pSet = pred->getUniqueInsn();
+      pred->join->reg.data.id = -1; // deallocate
+      if (pSet->isDead())
+         delete_Instruction(prog, pSet);
+   }
+}
+
+// Predicate every non-nop instruction in @bb with (cc, pred), then remove
+// the block's trailing flow instruction. Callers must have verified via
+// mayPredicate() that no instruction is already predicated.
+void
+FlatteningPass::predicateInstructions(BasicBlock *bb, Value *pred, CondCode cc)
+{
+   for (Instruction *i = bb->getEntry(); i; i = i->next) {
+      if (i->isNop())
+         continue;
+      assert(!i->getPredicate());
+      i->setPredicate(cc, pred);
+   }
+   removeFlow(bb->getExit());
+}
+
+// Can @insn be executed under predicate @pred ? Pseudo ops are always fine;
+// otherwise the target must allow it, and the instruction must not define
+// the predicate itself (the write would corrupt the condition).
+bool
+FlatteningPass::mayPredicate(const Instruction *insn, const Value *pred) const
+{
+   if (insn->isPseudo())
+      return true;
+   // TODO: calls where we don't know which registers are modified
+
+   if (!prog->getTarget()->mayPredicate(insn, pred))
+      return false;
+   for (int d = 0; insn->defExists(d); ++d)
+      if (insn->getDef(d)->equals(pred))
+         return false;
+   return true;
+}
+
+// If we jump to BRA/RET/EXIT, replace the jump with it.
+// NOTE: We do not update the CFG anymore here !
+//
+// TODO: Handle cases where we skip over a branch (maybe do that elsewhere ?):
+// BB:0
+// @p0 bra BB:2 -> @!p0 bra BB:3 iff (!) BB:2 immediately adjoins BB:1
+// BB1:
+// bra BB:3
+// BB2:
+// ...
+// BB3:
+// ...
+void
+FlatteningPass::tryPropagateBranch(BasicBlock *bb)
+{
+   // Walk trailing unconditional branches of @bb; if the target block
+   // consists of a single unpredicated BRA/JOIN/EXIT, copy that flow op into
+   // our branch so we jump to the final destination directly.
+   for (Instruction *i = bb->getExit(); i && i->op == OP_BRA; i = i->prev) {
+      BasicBlock *bf = i->asFlow()->target.bb;
+
+      if (bf->getInsnCount() != 1)
+         continue;
+
+      FlowInstruction *bra = i->asFlow();
+      FlowInstruction *rep = bf->getExit()->asFlow();
+
+      if (!rep || rep->getPredicate())
+         continue;
+      if (rep->op != OP_BRA &&
+          rep->op != OP_JOIN &&
+          rep->op != OP_EXIT)
+         continue;
+
+      // TODO: If there are multiple branches to @rep, only the first would
+      // be replaced, so only remove them after this pass is done ?
+      // Also, need to check all incident blocks for fall-through exits and
+      // add the branch there.
+      bra->op = rep->op;
+      bra->target.bb = rep->target.bb;
+      if (bf->cfg.incidentCount() == 1)
+         bf->remove(rep);
+   }
+}
+
+bool
+FlatteningPass::visit(BasicBlock *bb)
+{
+   // First choice: flatten a simple conditional into predicated code.
+   if (tryPredicateConditional(bb))
+      return true;
+
+   // try to attach join to previous instruction
+   // NOTE(review): the op exclusions below look hardware-specific (see the
+   // inline remarks); the join flag is fused onto the preceding instruction
+   // only when that instruction can safely carry it.
+   Instruction *insn = bb->getExit();
+   if (insn && insn->op == OP_JOIN && !insn->getPredicate()) {
+      insn = insn->prev;
+      if (insn && !insn->getPredicate() &&
+          !insn->asFlow() &&
+          insn->op != OP_TEXBAR &&
+          !isTextureOp(insn->op) && // probably just nve4
+          !isSurfaceOp(insn->op) && // not confirmed
+          insn->op != OP_LINTERP && // probably just nve4
+          insn->op != OP_PINTERP && // probably just nve4
+          ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
+           typeSizeof(insn->dType) <= 4) &&
+          !insn->isNop()) {
+         insn->join = 1;
+         bb->remove(bb->getExit());
+         return true;
+      }
+   }
+
+   // Finally, shorten jump chains.
+   tryPropagateBranch(bb);
+
+   return true;
+}
+
+// If @bb ends a simple if/else construct (as reported by
+// initiatesSimpleConditional()), and both arms are short enough and fully
+// predicatable, convert the arms to predicated instructions and delete the
+// branch and join ops. The instruction-count limit is tightened from 12 to 4
+// when the condition is constant (likely uniform, so a branch would be cheap).
+bool
+FlatteningPass::tryPredicateConditional(BasicBlock *bb)
+{
+   BasicBlock *bL = NULL, *bR = NULL;
+   unsigned int nL = 0, nR = 0, limit = 12;
+   Instruction *insn;
+   unsigned int mask; // bit 0: predicate left arm, bit 1: right arm
+
+   mask = bb->initiatesSimpleConditional();
+   if (!mask)
+      return false;
+
+   assert(bb->getExit());
+   Value *pred = bb->getExit()->getPredicate();
+   assert(pred);
+
+   if (isConstantCondition(pred))
+      limit = 4;
+
+   Graph::EdgeIterator ei = bb->cfg.outgoing();
+
+   if (mask & 1) {
+      bL = BasicBlock::get(ei.getNode());
+      for (insn = bL->getEntry(); insn; insn = insn->next, ++nL)
+         if (!mayPredicate(insn, pred))
+            return false;
+      if (nL > limit)
+         return false; // too long, do a real branch
+   }
+   ei.next();
+
+   if (mask & 2) {
+      bR = BasicBlock::get(ei.getNode());
+      for (insn = bR->getEntry(); insn; insn = insn->next, ++nR)
+         if (!mayPredicate(insn, pred))
+            return false;
+      if (nR > limit)
+         return false; // too long, do a real branch
+   }
+
+   // commit: the taken arm uses the branch condition, the other its inverse
+   if (bL)
+      predicateInstructions(bL, pred, bb->getExit()->cc);
+   if (bR)
+      predicateInstructions(bR, pred, inverseCondCode(bb->getExit()->cc));
+
+   if (bb->joinAt) {
+      bb->remove(bb->joinAt);
+      bb->joinAt = NULL;
+   }
+   removeFlow(bb->getExit()); // delete the branch/join at the fork point
+
+   // remove potential join operations at the end of the conditional
+   if (prog->getTarget()->joinAnterior) {
+      bb = BasicBlock::get((bL ? bL : bR)->cfg.outgoing().getNode());
+      if (bb->getEntry() && bb->getEntry()->op == OP_JOIN)
+         removeFlow(bb->getEntry());
+   }
+
+   return true;
+}
+
+// =============================================================================
+
+// Common subexpression elimination. Stupid O^2 implementation.
+class LocalCSE : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+
+   // Redirect uses of *ptr's defs to @i and delete *ptr if equivalent;
+   // on success *ptr is set to NULL.
+   inline bool tryReplace(Instruction **, Instruction *);
+
+   // per-opcode buckets of candidate instructions without LValue sources
+   DLList ops[OP_LAST + 1];
+};
+
+// Cross-block CSE: hoists identical computations feeding a phi node out of
+// the predecessor blocks (see GlobalCSE::visit below).
+class GlobalCSE : public Pass
+{
+private:
+   virtual bool visit(BasicBlock *);
+};
+
+// Compare the operation performed by two instructions, ignoring their
+// operands: opcode, types, condition code, all modifier bits, and the
+// subclass-specific state (texture arguments, compare condition). Flow
+// instructions never compare equal.
+bool
+Instruction::isActionEqual(const Instruction *that) const
+{
+   if (this->op != that->op ||
+       this->dType != that->dType ||
+       this->sType != that->sType)
+      return false;
+   if (this->cc != that->cc)
+      return false;
+
+   if (this->asTex()) {
+      // texture ops must match in all sampler/target parameters
+      if (memcmp(&this->asTex()->tex,
+                 &that->asTex()->tex,
+                 sizeof(this->asTex()->tex)))
+         return false;
+   } else
+   if (this->asCmp()) {
+      if (this->asCmp()->setCond != that->asCmp()->setCond)
+         return false;
+   } else
+   if (this->asFlow()) {
+      return false;
+   } else {
+      if (this->ipa != that->ipa ||
+          this->lanes != that->lanes ||
+          this->perPatch != that->perPatch)
+         return false;
+      if (this->postFactor != that->postFactor)
+         return false;
+   }
+
+   if (this->subOp != that->subOp ||
+       this->saturate != that->saturate ||
+       this->rnd != that->rnd ||
+       this->ftz != that->ftz ||
+       this->dnz != that->dnz ||
+       this->cache != that->cache ||
+       this->mask != that->mask)
+      return false;
+
+   return true;
+}
+
+// Returns true if executing @that would produce the same values as this
+// instruction: same action, same predicate source, and pairwise-equal defs
+// and sources (with modifiers). Loads are only considered equal when reading
+// from read-only files (constants / shader inputs), since other memory may
+// have been written in between.
+bool
+Instruction::isResultEqual(const Instruction *that) const
+{
+   unsigned int d, s;
+
+   // NOTE: location of discard only affects tex with liveOnly and quadops
+   if (!this->defExists(0) && this->op != OP_DISCARD)
+      return false;
+
+   if (!isActionEqual(that))
+      return false;
+
+   if (this->predSrc != that->predSrc)
+      return false;
+
+   for (d = 0; this->defExists(d); ++d) {
+      if (!that->defExists(d) ||
+          !this->getDef(d)->equals(that->getDef(d), false))
+         return false;
+   }
+   if (that->defExists(d)) // @that must not have extra defs
+      return false;
+
+   for (s = 0; this->srcExists(s); ++s) {
+      if (!that->srcExists(s))
+         return false;
+      if (this->src(s).mod != that->src(s).mod)
+         return false;
+      if (!this->getSrc(s)->equals(that->getSrc(s), true))
+         return false;
+   }
+   if (that->srcExists(s)) // @that must not have extra sources
+      return false;
+
+   if (op == OP_LOAD || op == OP_VFETCH) {
+      switch (src(0).getFile()) {
+      case FILE_MEMORY_CONST:
+      case FILE_SHADER_INPUT:
+         return true;
+      default:
+         return false;
+      }
+   }
+
+   return true;
+}
+
+// pull through common expressions from different in-blocks
+bool
+GlobalCSE::visit(BasicBlock *bb)
+{
+   Instruction *phi, *next, *ik;
+   int s;
+
+   // TODO: maybe do this with OP_UNION, too
+
+   // For each phi whose sources are all single-use results of equivalent
+   // instructions, move one of those instructions into this block (after a
+   // leading JOIN, if present), make it define the phi's result directly,
+   // and delete the phi. The other copies become dead and are cleaned up by
+   // subsequent DCE.
+   for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = next) {
+      next = phi->next;
+      if (phi->getSrc(0)->refCount() > 1)
+         continue;
+      ik = phi->getSrc(0)->getInsn();
+      if (!ik)
+         continue; // probably a function input
+      for (s = 1; phi->srcExists(s); ++s) {
+         if (phi->getSrc(s)->refCount() > 1)
+            break;
+         if (!phi->getSrc(s)->getInsn() ||
+             !phi->getSrc(s)->getInsn()->isResultEqual(ik))
+            break;
+      }
+      if (!phi->srcExists(s)) { // all sources checked out
+         Instruction *entry = bb->getEntry();
+         ik->bb->remove(ik);
+         if (!entry || entry->op != OP_JOIN)
+            bb->insertHead(ik);
+         else
+            bb->insertAfter(entry, ik);
+         ik->setDef(0, phi->getDef(0));
+         delete_Instruction(prog, phi);
+      }
+   }
+
+   return true;
+}
+
+// If *ptr computes the same result as the earlier instruction @i, redirect
+// all uses of *ptr's defs to @i's defs, delete *ptr, and clear the pointer.
+// Returns true on replacement.
+bool
+LocalCSE::tryReplace(Instruction **ptr, Instruction *i)
+{
+   Instruction *old = *ptr;
+
+   // TODO: maybe relax this later (causes trouble with OP_UNION)
+   if (i->isPredicated())
+      return false;
+
+   if (!old->isResultEqual(i))
+      return false;
+
+   for (int d = 0; old->defExists(d); ++d)
+      old->def(d).replace(i->getDef(d), false);
+   delete_Instruction(prog, old);
+   *ptr = NULL;
+   return true;
+}
+
+// Basic-block-local CSE; repeats until a fixed point is reached. For an
+// instruction with LValue sources, candidates are found by scanning the
+// other users of its least-used source (cheaper than comparing against
+// everything); otherwise candidates come from the per-opcode ops[] lists.
+bool
+LocalCSE::visit(BasicBlock *bb)
+{
+   unsigned int replaced;
+
+   do {
+      Instruction *ir, *next;
+
+      replaced = 0;
+
+      // will need to know the order of instructions
+      int serial = 0;
+      for (ir = bb->getFirst(); ir; ir = ir->next)
+         ir->serial = serial++;
+
+      for (ir = bb->getEntry(); ir; ir = next) {
+         int s;
+         Value *src = NULL;
+
+         next = ir->next;
+
+         if (ir->fixed) {
+            // must not be removed, but may serve as a candidate for others
+            ops[ir->op].insert(ir);
+            continue;
+         }
+
+         // pick the source LValue with the fewest uses to minimize scanning
+         for (s = 0; ir->srcExists(s); ++s)
+            if (ir->getSrc(s)->asLValue())
+               if (!src || ir->getSrc(s)->refCount() < src->refCount())
+                  src = ir->getSrc(s);
+
+         if (src) {
+            // compare against earlier (lower serial) users in the same BB
+            for (Value::UseIterator it = src->uses.begin();
+                 it != src->uses.end(); ++it) {
+               Instruction *ik = (*it)->getInsn();
+               if (ik && ik->bb == ir->bb && ik->serial < ir->serial)
+                  if (tryReplace(&ir, ik))
+                     break;
+            }
+         } else {
+            DLLIST_FOR_EACH(&ops[ir->op], iter)
+            {
+               Instruction *ik = reinterpret_cast<Instruction *>(iter.get());
+               if (tryReplace(&ir, ik))
+                  break;
+            }
+         }
+
+         if (ir) // ir is NULL iff it was replaced above
+            ops[ir->op].insert(ir);
+         else
+            ++replaced;
+      }
+      for (unsigned int i = 0; i <= OP_LAST; ++i)
+         ops[i].clear();
+
+   } while (replaced);
+
+   return true;
+}
+
+// =============================================================================
+
+// Remove computations of unused values.
+class DeadCodeElim : public Pass
+{
+public:
+   // Run the pass repeatedly until no more dead instructions are found.
+   bool buryAll(Program *);
+
+private:
+   virtual bool visit(BasicBlock *);
+
+   void checkSplitLoad(Instruction *ld); // for partially dead loads
+
+   // number of instructions removed in the current iteration
+   unsigned int deadCount;
+};
+
+// Iterate the elimination pass until a pass removes nothing; removing one
+// instruction can make its operands' producers dead in the next round.
+bool
+DeadCodeElim::buryAll(Program *prog)
+{
+   do {
+      deadCount = 0;
+      if (!this->run(prog, false, false))
+         return false;
+   } while (deadCount);
+
+   return true;
+}
+
+bool
+DeadCodeElim::visit(BasicBlock *bb)
+{
+   Instruction *next;
+
+   for (Instruction *i = bb->getFirst(); i; i = next) {
+      next = i->next;
+      if (i->isDead()) {
+         ++deadCount;
+         delete_Instruction(prog, i);
+      } else
+      if (i->defExists(1) && (i->op == OP_VFETCH || i->op == OP_LOAD)) {
+         // multi-def load: parts of it may be dead, try to shrink/split it
+         checkSplitLoad(i);
+      } else
+      if (i->defExists(0) && !i->getDef(0)->refCount()) {
+         // atomics/reductions with unused result: keep the side effect but
+         // drop the destination
+         if (i->op == OP_ATOM ||
+             i->op == OP_SUREDP ||
+             i->op == OP_SUREDB)
+            i->setDef(0, NULL);
+      }
+   }
+   return true;
+}
+
+// Shrink a vector load whose leading/trailing/middle components are unused.
+// The live components are regrouped into at most two contiguous loads:
+// @ld1 is rewritten in place for the first group, and a clone @ld2 is
+// inserted for the second group (if any). Unused defs (refCount 0 and no
+// assigned register) are dropped via the bitmask.
+void
+DeadCodeElim::checkSplitLoad(Instruction *ld1)
+{
+   Instruction *ld2 = NULL; // can get at most 2 loads
+   Value *def1[4];
+   Value *def2[4];
+   int32_t addr1, addr2;
+   int32_t size1, size2;
+   int d, n1, n2;
+   uint32_t mask = 0xffffffff; // bit d clear = def d is dead
+
+   for (d = 0; ld1->defExists(d); ++d)
+      if (!ld1->getDef(d)->refCount() && ld1->getDef(d)->reg.data.id < 0)
+         mask &= ~(1 << d);
+   if (mask == 0xffffffff) // nothing dead, nothing to do
+      return;
+
+   addr1 = ld1->getSrc(0)->reg.data.offset;
+   n1 = n2 = 0;
+   size1 = size2 = 0;
+   // collect the first contiguous group of live defs, skipping dead leading
+   // components by advancing the base address
+   for (d = 0; ld1->defExists(d); ++d) {
+      if (mask & (1 << d)) {
+         // NOTE(review): the addr1 & 0x7 check presumably avoids creating a
+         // misaligned multi-word access -- confirm against target rules
+         if (size1 && (addr1 & 0x7))
+            break;
+         def1[n1] = ld1->getDef(d);
+         size1 += def1[n1++]->reg.size;
+      } else
+      if (!n1) {
+         addr1 += ld1->getDef(d)->reg.size;
+      } else {
+         break;
+      }
+   }
+   // remaining live defs form the second group
+   for (addr2 = addr1 + size1; ld1->defExists(d); ++d) {
+      if (mask & (1 << d)) {
+         def2[n2] = ld1->getDef(d);
+         size2 += def2[n2++]->reg.size;
+      } else {
+         assert(!n2); // only one gap is expected here
+         addr2 += ld1->getDef(d)->reg.size;
+      }
+   }
+
+   // rewrite ld1 for the first group
+   updateLdStOffset(ld1, addr1, func);
+   ld1->setType(typeOfSize(size1));
+   for (d = 0; d < 4; ++d)
+      ld1->setDef(d, (d < n1) ? def1[d] : NULL);
+
+   if (!n2)
+      return;
+
+   // emit a second load for the remaining group
+   ld2 = cloneShallow(func, ld1);
+   updateLdStOffset(ld2, addr2, func);
+   ld2->setType(typeOfSize(size2));
+   for (d = 0; d < 4; ++d)
+      ld2->setDef(d, (d < n2) ? def2[d] : NULL);
+
+   ld1->bb->insertAfter(ld1, ld2);
+}
+
+// =============================================================================
+
+#define RUN_PASS(l, n, f) \
+ if (level >= (l)) { \
+ if (dbgFlags & NV50_IR_DEBUG_VERBOSE) \
+ INFO("PEEPHOLE: %s\n", #n); \
+ n pass; \
+ if (!pass.f(this)) \
+ return false; \
+ }
+
+// Pre-RA optimization pipeline, run on SSA form. The number in each RUN_PASS
+// invocation is the minimum optimization level at which the pass runs.
+bool
+Program::optimizeSSA(int level)
+{
+   RUN_PASS(1, DeadCodeElim, buryAll);
+   RUN_PASS(1, CopyPropagation, run);
+   RUN_PASS(2, GlobalCSE, run);
+   RUN_PASS(1, LocalCSE, run);
+   RUN_PASS(2, AlgebraicOpt, run);
+   RUN_PASS(2, ModifierFolding, run); // before load propagation -> less checks
+   RUN_PASS(1, ConstantFolding, foldAll);
+   RUN_PASS(1, LoadPropagation, run);
+   RUN_PASS(2, MemoryOpt, run);
+   RUN_PASS(2, LocalCSE, run);
+   RUN_PASS(0, DeadCodeElim, buryAll);
+
+   return true;
+}
+
+// Post-register-allocation optimizations; currently only control-flow
+// flattening (predication), which requires final register assignment.
+bool
+Program::optimizePostRA(int level)
+{
+   RUN_PASS(2, FlatteningPass, run);
+   return true;
+}
+
+}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
new file mode 100644
index 0000000..ee39b3c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_print.cpp
@@ -0,0 +1,698 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+
+#define __STDC_FORMAT_MACROS
+#include <inttypes.h>
+
+namespace nv50_ir {
+
+enum TextStyle
+{
+ TXT_DEFAULT,
+ TXT_GPR,
+ TXT_REGISTER,
+ TXT_FLAGS,
+ TXT_MEM,
+ TXT_IMMD,
+ TXT_BRA,
+ TXT_INSN
+};
+
+static const char *_colour[8] =
+{
+ "\x1b[00m",
+ "\x1b[34m",
+ "\x1b[35m",
+ "\x1b[35m",
+ "\x1b[36m",
+ "\x1b[33m",
+ "\x1b[37m",
+ "\x1b[32m"
+};
+
+static const char *_nocolour[8] =
+{
+ "", "", "", "", "", "", "", ""
+};
+
+static const char **colour;
+
+// Select the active colour table; NV50_PROG_DEBUG_NO_COLORS in the
+// environment disables ANSI escape sequences in the debug output.
+static void init_colours()
+{
+   if (getenv("NV50_PROG_DEBUG_NO_COLORS") != NULL)
+      colour = _nocolour;
+   else
+      colour = _colour;
+}
+
+const char *operationStr[OP_LAST + 1] =
+{
+ "nop",
+ "phi",
+ "union",
+ "split",
+ "merge",
+ "consec",
+ "mov",
+ "ld",
+ "st",
+ "add",
+ "sub",
+ "mul",
+ "div",
+ "mod",
+ "mad",
+ "fma",
+ "sad",
+ "abs",
+ "neg",
+ "not",
+ "and",
+ "or",
+ "xor",
+ "shl",
+ "shr",
+ "max",
+ "min",
+ "sat",
+ "ceil",
+ "floor",
+ "trunc",
+ "cvt",
+ "set and",
+ "set or",
+ "set xor",
+ "set",
+ "selp",
+ "slct",
+ "rcp",
+ "rsq",
+ "lg2",
+ "sin",
+ "cos",
+ "ex2",
+ "exp",
+ "log",
+ "presin",
+ "preex2",
+ "sqrt",
+ "pow",
+ "bra",
+ "call",
+ "ret",
+ "cont",
+ "break",
+ "preret",
+ "precont",
+ "prebreak",
+ "brkpt",
+ "joinat",
+ "join",
+ "discard",
+ "exit",
+ "membar",
+ "vfetch",
+ "pfetch",
+ "export",
+ "linterp",
+ "pinterp",
+ "emit",
+ "restart",
+ "tex",
+ "texbias",
+ "texlod",
+ "texfetch",
+ "texquery",
+ "texgrad",
+ "texgather",
+ "texcsaa",
+ "texprep",
+ "suldb",
+ "suldp",
+ "sustb",
+ "sustp",
+ "suredb",
+ "suredp",
+ "sulea",
+ "subfm",
+ "suclamp",
+ "sueau",
+ "madsp",
+ "texbar",
+ "dfdx",
+ "dfdy",
+ "rdsv",
+ "wrsv",
+ "quadop",
+ "quadon",
+ "quadpop",
+ "popcnt",
+ "insbf",
+ "extbf",
+ "permt",
+ "atom",
+ "bar",
+ "vadd",
+ "vavg",
+ "vmin",
+ "vmax",
+ "vsad",
+ "vset",
+ "vshr",
+ "vshl",
+ "vsel",
+ "cctl",
+ "(invalid)"
+};
+
+static const char *atomSubOpStr[] =
+{
+ "add", "min", "max", "inc", "dec", "and", "or", "xor", "cas", "exch"
+};
+
+static const char *DataTypeStr[] =
+{
+ "-",
+ "u8", "s8",
+ "u16", "s16",
+ "u32", "s32",
+ "u64", "s64",
+ "f16", "f32", "f64",
+ "b96", "b128"
+};
+
+static const char *RoundModeStr[] =
+{
+ "", "rm", "rz", "rp", "rni", "rmi", "rzi", "rpi"
+};
+
+static const char *CondCodeStr[] =
+{
+ "never",
+ "lt",
+ "eq",
+ "le",
+ "gt",
+ "ne",
+ "ge",
+ "",
+ "(invalid)",
+ "ltu",
+ "equ",
+ "leu",
+ "gtu",
+ "neu",
+ "geu",
+ "",
+ "no",
+ "nc",
+ "ns",
+ "na",
+ "a",
+ "s",
+ "c",
+ "o"
+};
+
+static const char *SemanticStr[SV_LAST + 1] =
+{
+ "POSITION",
+ "VERTEX_ID",
+ "INSTANCE_ID",
+ "INVOCATION_ID",
+ "PRIMITIVE_ID",
+ "VERTEX_COUNT",
+ "LAYER",
+ "VIEWPORT_INDEX",
+ "Y_DIR",
+ "FACE",
+ "POINT_SIZE",
+ "POINT_COORD",
+ "CLIP_DISTANCE",
+ "SAMPLE_INDEX",
+ "TESS_FACTOR",
+ "TESS_COORD",
+ "TID",
+ "CTAID",
+ "NTID",
+ "GRIDID",
+ "NCTAID",
+ "LANEID",
+ "PHYSID",
+ "NPHYSID",
+ "CLOCK",
+ "LBASE",
+ "SBASE",
+ "?",
+ "(INVALID)"
+};
+
+static const char *interpStr[16] =
+{
+ "pass",
+ "mul",
+ "flat",
+ "sc",
+ "cent pass",
+ "cent mul",
+ "cent flat",
+ "cent sc",
+ "off pass",
+ "off mul",
+ "off flat",
+ "off sc",
+ "samp pass",
+ "samp mul",
+ "samp flat",
+ "samp sc"
+};
+
+#define PRINT(args...) \
+ do { \
+ pos += snprintf(&buf[pos], size - pos, args); \
+ } while(0)
+
+#define SPACE_PRINT(cond, args...) \
+ do { \
+ if (cond) \
+ buf[pos++] = ' '; \
+ pos += snprintf(&buf[pos], size - pos, args); \
+ } while(0)
+
+#define SPACE() \
+ do { \
+ if (pos < size) \
+ buf[pos++] = ' '; \
+ } while(0)
+
+// Append the textual names of the set modifier bits ("not sat neg abs") to
+// @buf; returns the number of characters written (snprintf-style).
+int Modifier::print(char *buf, size_t size) const
+{
+   size_t pos = 0;
+
+   if (bits)
+      PRINT("%s", colour[TXT_INSN]);
+
+   size_t base = pos; // remember start so we only add spaces between names
+
+   if (bits & NV50_IR_MOD_NOT)
+      PRINT("not");
+   if (bits & NV50_IR_MOD_SAT)
+      SPACE_PRINT(pos > base && pos < size, "sat");
+   if (bits & NV50_IR_MOD_NEG)
+      SPACE_PRINT(pos > base && pos < size, "neg");
+   if (bits & NV50_IR_MOD_ABS)
+      SPACE_PRINT(pos > base && pos < size, "abs");
+
+   return pos;
+}
+
+// Print an LValue as e.g. "$r3d" or "%p1": '$' with the allocated register
+// id once RA has run (join->reg.data.id >= 0), '%' with the SSA value id
+// before that. The size suffix encodes sub-/multi-register widths.
+int LValue::print(char *buf, size_t size, DataType ty) const
+{
+   const char *postFix = "";
+   size_t pos = 0;
+   int idx = join->reg.data.id >= 0 ? join->reg.data.id : id;
+   char p = join->reg.data.id >= 0 ? '$' : '%';
+   char r;
+   int col = TXT_DEFAULT;
+
+   switch (reg.file) {
+   case FILE_GPR:
+      r = 'r'; col = TXT_GPR;
+      if (reg.size == 2) {
+         // 16 bit value: after RA it occupies a register half (h/l)
+         if (p == '$') {
+            postFix = (idx & 1) ? "h" : "l";
+            idx /= 2;
+         } else {
+            postFix = "s";
+         }
+      } else
+      if (reg.size == 8) {
+         postFix = "d"; // double width (register pair)
+      } else
+      if (reg.size == 16) {
+         postFix = "q"; // quad width
+      } else
+      if (reg.size == 12) {
+         postFix = "t"; // triple width
+      }
+      break;
+   case FILE_PREDICATE:
+      r = 'p'; col = TXT_REGISTER;
+      if (reg.size == 2)
+         postFix = "d";
+      else
+      if (reg.size == 4)
+         postFix = "q";
+      break;
+   case FILE_FLAGS:
+      r = 'c'; col = TXT_FLAGS;
+      break;
+   case FILE_ADDRESS:
+      r = 'a'; col = TXT_REGISTER;
+      break;
+   default:
+      assert(!"invalid file for lvalue");
+      r = '?';
+      break;
+   }
+
+   PRINT("%s%c%c%i%s", colour[col], p, r, idx, postFix);
+
+   return pos;
+}
+
+// Print an immediate according to the requested type: floats as decimals,
+// signed integers as %i, unsigned ones as zero-padded hex. Unknown/64-bit
+// types fall back to a 64-bit hex dump of the raw bits.
+int ImmediateValue::print(char *buf, size_t size, DataType ty) const
+{
+   size_t pos = 0;
+
+   PRINT("%s", colour[TXT_IMMD]);
+
+   switch (ty) {
+   case TYPE_F32: PRINT("%f", reg.data.f32); break;
+   case TYPE_F64: PRINT("%f", reg.data.f64); break;
+   case TYPE_U8:  PRINT("0x%02x", reg.data.u8); break;
+   case TYPE_S8:  PRINT("%i", reg.data.s8); break;
+   case TYPE_U16: PRINT("0x%04x", reg.data.u16); break;
+   case TYPE_S16: PRINT("%i", reg.data.s16); break;
+   case TYPE_U32: PRINT("0x%08x", reg.data.u32); break;
+   case TYPE_S32: PRINT("%i", reg.data.s32); break;
+   case TYPE_U64:
+   case TYPE_S64:
+   default:
+      PRINT("0x%016"PRIx64, reg.data.u64);
+      break;
+   }
+   return pos;
+}
+
+// Convenience overload: print without relocation (indirect address) values.
+int Symbol::print(char *buf, size_t size, DataType ty) const
+{
+   return print(buf, size, NULL, NULL, ty);
+}
+
+// Print a memory/system-value symbol, e.g. "c0[$a0+0x10]" or
+// "sv[POSITION:0]". @rel is an optional indirect address value, @dimRel an
+// optional indirect file-index (printed in its own bracket pair).
+int Symbol::print(char *buf, size_t size,
+                  Value *rel, Value *dimRel, DataType ty) const
+{
+   size_t pos = 0;
+   char c;
+
+   if (ty == TYPE_NONE)
+      ty = typeOfSize(reg.size);
+
+   // system values have their own "sv[NAME:index]" format
+   if (reg.file == FILE_SYSTEM_VALUE) {
+      PRINT("%ssv[%s%s:%i%s", colour[TXT_MEM],
+            colour[TXT_REGISTER],
+            SemanticStr[reg.data.sv.sv], reg.data.sv.index, colour[TXT_MEM]);
+      if (rel) {
+         PRINT("%s+", colour[TXT_DEFAULT]);
+         pos += rel->print(&buf[pos], size - pos);
+      }
+      PRINT("%s]", colour[TXT_MEM]);
+      return pos;
+   }
+
+   // one-letter prefix identifying the memory file
+   switch (reg.file) {
+   case FILE_MEMORY_CONST:  c = 'c'; break;
+   case FILE_SHADER_INPUT:  c = 'a'; break;
+   case FILE_SHADER_OUTPUT: c = 'o'; break;
+   case FILE_MEMORY_GLOBAL: c = 'g'; break;
+   case FILE_MEMORY_SHARED: c = 's'; break;
+   case FILE_MEMORY_LOCAL:  c = 'l'; break;
+   default:
+      assert(!"invalid file");
+      c = '?';
+      break;
+   }
+
+   // only constant buffers carry a printed file index (c0[], c1[], ...)
+   if (c == 'c')
+      PRINT("%s%c%i[", colour[TXT_MEM], c, reg.fileIndex);
+   else
+      PRINT("%s%c[", colour[TXT_MEM], c);
+
+   if (dimRel) {
+      pos += dimRel->print(&buf[pos], size - pos, TYPE_S32);
+      PRINT("%s][", colour[TXT_MEM]);
+   }
+
+   if (rel) {
+      pos += rel->print(&buf[pos], size - pos);
+      PRINT("%s%c", colour[TXT_DEFAULT], (reg.data.offset < 0) ? '-' : '+');
+   } else {
+      assert(reg.data.offset >= 0);
+   }
+   PRINT("%s0x%x%s]", colour[TXT_IMMD], abs(reg.data.offset), colour[TXT_MEM]);
+
+   return pos;
+}
+
+// Format one instruction into a local buffer and emit it via INFO(), in the
+// order: join flag, predicate, saturate, opcode (+flow target or op-specific
+// decorations), rounding mode, defs, source type, sources, exit flag. The
+// encoded size is appended in parentheses.
+void
+Instruction::print() const
+{
+   #define BUFSZ 512
+
+   const size_t size = BUFSZ;
+
+   char buf[BUFSZ];
+   int s, d;
+   size_t pos = 0;
+
+   PRINT("%s", colour[TXT_INSN]);
+
+   if (join)
+      PRINT("join ");
+
+   // predicate: "not $p0" for predicate regs, condition-code name otherwise
+   if (predSrc >= 0) {
+      const size_t pre = pos;
+      if (getSrc(predSrc)->reg.file == FILE_PREDICATE) {
+         if (cc == CC_NOT_P)
+            PRINT("not");
+      } else {
+         PRINT("%s", CondCodeStr[cc]);
+      }
+      if (pos > pre)
+         SPACE();
+      pos += getSrc(predSrc)->print(&buf[pos], BUFSZ - pos);
+      PRINT(" %s", colour[TXT_INSN]);
+   }
+
+   if (saturate)
+      PRINT("sat ");
+
+   if (asFlow()) {
+      // flow instructions: opcode plus branch/call target
+      PRINT("%s", operationStr[op]);
+      if (asFlow()->indirect)
+         PRINT(" ind");
+      if (asFlow()->absolute)
+         PRINT(" abs");
+      if (op == OP_CALL && asFlow()->builtin) {
+         PRINT(" %sBUILTIN:%i", colour[TXT_BRA], asFlow()->target.builtin);
+      } else
+      if (op == OP_CALL && asFlow()->target.fn) {
+         PRINT(" %s%s:%i", colour[TXT_BRA],
+               asFlow()->target.fn->getName(),
+               asFlow()->target.fn->getLabel());
+      } else
+      if (asFlow()->target.bb)
+         PRINT(" %sBB:%i", colour[TXT_BRA], asFlow()->target.bb->getId());
+   } else {
+      // regular instructions: opcode, decorations, destination type
+      PRINT("%s ", operationStr[op]);
+      if (op == OP_LINTERP || op == OP_PINTERP)
+         PRINT("%s ", interpStr[ipa]);
+      switch (op) {
+      case OP_SUREDP:
+      case OP_ATOM:
+         if (subOp < Elements(atomSubOpStr))
+            PRINT("%s ", atomSubOpStr[subOp]);
+         break;
+      default:
+         if (subOp)
+            PRINT("(SUBOP:%u) ", subOp);
+         break;
+      }
+      if (perPatch)
+         PRINT("patch ");
+      if (asTex())
+         PRINT("%s %s$r%u $s%u %s", asTex()->tex.target.getName(),
+               colour[TXT_MEM], asTex()->tex.r, asTex()->tex.s,
+               colour[TXT_INSN]);
+      if (postFactor)
+         PRINT("x2^%i ", postFactor);
+      PRINT("%s%s", dnz ? "dnz " : (ftz ? "ftz " : ""), DataTypeStr[dType]);
+   }
+
+   if (rnd != ROUND_N)
+      PRINT(" %s", RoundModeStr[rnd]);
+
+   // defs: multiple destinations are wrapped in { }
+   if (defExists(1))
+      PRINT(" {");
+   for (d = 0; defExists(d); ++d) {
+      SPACE();
+      pos += getDef(d)->print(&buf[pos], size - pos);
+   }
+   if (d > 1)
+      PRINT(" %s}", colour[TXT_INSN]);
+   else
+   if (!d && !asFlow())
+      PRINT(" %s#", colour[TXT_INSN]); // '#' marks a missing destination
+
+   if (asCmp())
+      PRINT(" %s%s", colour[TXT_INSN], CondCodeStr[asCmp()->setCond]);
+
+   if (sType != dType)
+      PRINT(" %s%s", colour[TXT_INSN], DataTypeStr[sType]);
+
+   // sources: skip the predicate source and address sources printed inline
+   for (s = 0; srcExists(s); ++s) {
+      if (s == predSrc || src(s).usedAsPtr)
+         continue;
+      const size_t pre = pos;
+      SPACE();
+      pos += src(s).mod.print(&buf[pos], BUFSZ - pos);
+      if (pos > pre + 1)
+         SPACE();
+      if (src(s).isIndirect(0) || src(s).isIndirect(1))
+         pos += getSrc(s)->asSym()->print(&buf[pos], BUFSZ - pos,
+                                          getIndirect(s, 0),
+                                          getIndirect(s, 1));
+      else
+         pos += getSrc(s)->print(&buf[pos], BUFSZ - pos, sType);
+   }
+   if (exit)
+      PRINT("%s exit", colour[TXT_INSN]);
+
+   PRINT("%s", colour[TXT_DEFAULT]);
+
+   buf[MIN2(pos, BUFSZ - 1)] = 0; // snprintf may have truncated; cap index
+
+   INFO("%s (%u)\n", buf, encSize);
+}
+
+// Pass that dumps a whole function/program: function signature, per-BB CFG
+// info, and each instruction prefixed with a running serial number.
+class PrintPass : public Pass
+{
+public:
+   PrintPass() : serial(0) { }
+
+   virtual bool visit(Function *);
+   virtual bool visit(BasicBlock *);
+   virtual bool visit(Instruction *);
+
+private:
+   int serial; // instruction counter across the whole run
+};
+
+// Print the function header: "name:label (out <values>, in <values>)".
+bool
+PrintPass::visit(Function *fn)
+{
+   char str[16];
+
+   INFO("\n%s:%i (", fn->getName(), fn->getLabel());
+
+   if (!fn->outs.empty())
+      INFO("out");
+   for (std::deque<ValueRef>::iterator it = fn->outs.begin();
+        it != fn->outs.end();
+        ++it) {
+      it->get()->print(str, sizeof(str), typeOfSize(it->get()->reg.size));
+      INFO(" %s", str);
+   }
+
+   if (!fn->ins.empty())
+      INFO("%s%sin", colour[TXT_DEFAULT], fn->outs.empty() ? "" : ", ");
+   for (std::deque<ValueDef>::iterator it = fn->ins.begin();
+        it != fn->ins.end();
+        ++it) {
+      it->get()->print(str, sizeof(str), typeOfSize(it->get()->reg.size));
+      INFO(" %s", str);
+   }
+   INFO("%s)\n", colour[TXT_DEFAULT]);
+
+   return true;
+}
+
+// Print per-BB info: id, instruction count, immediate dominator, dominance
+// frontier, and outgoing CFG edges with their types. The #if 0 section can
+// be enabled to also dump incoming edges.
+bool
+PrintPass::visit(BasicBlock *bb)
+{
+#if 0
+   INFO("---\n");
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next())
+      INFO(" <- BB:%i (%s)\n",
+           BasicBlock::get(ei.getNode())->getId(),
+           ei.getEdge()->typeStr());
+#endif
+   INFO("BB:%i (%u instructions) - ", bb->getId(), bb->getInsnCount());
+
+   if (bb->idom())
+      INFO("idom = BB:%i, ", bb->idom()->getId());
+
+   INFO("df = { ");
+   for (DLList::Iterator df = bb->getDF().iterator(); !df.end(); df.next())
+      INFO("BB:%i ", BasicBlock::get(df)->getId());
+
+   INFO("}\n");
+
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next())
+      INFO(" -> BB:%i (%s)\n",
+           BasicBlock::get(ei.getNode())->getId(),
+           ei.getEdge()->typeStr());
+
+   return true;
+}
+
+// Print one instruction with its running serial number.
+bool
+PrintPass::visit(Instruction *insn)
+{
+   INFO("%3i: ", serial++);
+   insn->print();
+   return true;
+}
+
+// Dump this function (ordered traversal, without calling functions).
+void
+Function::print()
+{
+   PrintPass pass;
+   pass.run(this, true, false);
+}
+
+// Dump the whole program; (re)initializes the colour table first so the
+// NV50_PROG_DEBUG_NO_COLORS environment setting takes effect.
+void
+Program::print()
+{
+   PrintPass pass;
+   init_colours();
+   pass.run(this, true, false);
+}
+
+// Debug dump of the live interval of every LValue that has one.
+void
+Function::printLiveIntervals() const
+{
+   INFO("printing live intervals ...\n");
+
+   for (ArrayList::Iterator it = allLValues.iterator(); !it.end(); it.next()) {
+      const Value *lval = Value::get(it)->asLValue();
+      if (lval && !lval->livei.isEmpty()) {
+         INFO("livei(%%%i): ", lval->id);
+         lval->livei.print();
+      }
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
new file mode 100644
index 0000000..d65003c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -0,0 +1,2050 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+
+#include <stack>
+#include <limits>
+
+namespace nv50_ir {
+
+#define MAX_REGISTER_FILE_SIZE 256
+
+// Tracks which physical registers of each register file are occupied.
+// Allocation granularity is (1 << unit[f]) bytes per file; sizes and
+// offsets below are in allocation units unless stated otherwise.
+class RegisterSet
+{
+public:
+   RegisterSet(const Target *);
+
+   void init(const Target *);
+   // free all registers of file f; if resetMax, also forget the high-water mark
+   void reset(DataFile, bool resetMax = false);
+
+   void periodicMask(DataFile f, uint32_t lock, uint32_t unlock);
+   void intersect(DataFile f, const RegisterSet *);
+
+   // find and occupy a free range of 'size' units; false if none available
+   bool assign(int32_t& reg, DataFile f, unsigned int size);
+   void release(DataFile f, int32_t reg, unsigned int size);
+   void occupy(DataFile f, int32_t reg, unsigned int size);
+   void occupy(const Value *);
+   void occupyMask(DataFile f, int32_t reg, uint8_t mask);
+   bool isOccupied(DataFile f, int32_t reg, unsigned int size) const;
+   bool testOccupy(const Value *);
+   bool testOccupy(DataFile f, int32_t reg, unsigned int size);
+
+   // highest unit index ever occupied in f (-1 if none)
+   inline int getMaxAssigned(DataFile f) const { return fill[f]; }
+
+   inline unsigned int getFileSize(DataFile f, uint8_t regSize) const
+   {
+      // chips before nvc0 can only reach half the GPR file with 16-bit regs
+      if (restrictedGPR16Range && f == FILE_GPR && regSize == 2)
+         return (last[f] + 1) / 2;
+      return last[f] + 1;
+   }
+
+   inline unsigned int units(DataFile f, unsigned int size) const
+   {
+      return size >> unit[f];
+   }
+   // for regs of size >= 4, id is counted in 4-byte words (like nv50/c0 binary)
+   inline unsigned int idToBytes(const Value *v) const
+   {
+      return v->reg.data.id * MIN2(v->reg.size, 4);
+   }
+   inline unsigned int idToUnits(const Value *v) const
+   {
+      return units(v->reg.file, idToBytes(v));
+   }
+   inline int bytesToId(Value *v, unsigned int bytes) const
+   {
+      if (v->reg.size < 4)
+         return units(v->reg.file, bytes);
+      return bytes / 4;
+   }
+   inline int unitsToId(DataFile f, int u, uint8_t size) const
+   {
+      if (u < 0)
+         return -1;
+      return (size < 4) ? u : ((u << unit[f]) / 4);
+   }
+
+   void print() const;
+
+private:
+   BitSet bits[LAST_REGISTER_FILE + 1]; // occupancy bitmap, one bit per unit
+
+   int unit[LAST_REGISTER_FILE + 1]; // log2 of allocation granularity
+
+   int last[LAST_REGISTER_FILE + 1]; // index of the last valid unit per file
+   int fill[LAST_REGISTER_FILE + 1]; // high-water mark of occupied units
+
+   const bool restrictedGPR16Range; // true on chipsets < 0xc0
+};
+
+// Mark every register of file f as free; optionally also reset the
+// high-water mark used for getMaxAssigned().
+void
+RegisterSet::reset(DataFile f, bool resetMax)
+{
+   bits[f].fill(0);
+   if (resetMax)
+      fill[f] = -1;
+}
+
+// Query per-file size and allocation granularity from the target and
+// allocate the occupancy bitmaps.
+// NOTE(review): loop bound is FILE_ADDRESS while the arrays are sized by
+// LAST_REGISTER_FILE — assumes LAST_REGISTER_FILE == FILE_ADDRESS; confirm
+// against the DataFile enum in nv50_ir.h.
+void
+RegisterSet::init(const Target *targ)
+{
+   for (unsigned int rf = 0; rf <= FILE_ADDRESS; ++rf) {
+      DataFile f = static_cast<DataFile>(rf);
+      last[rf] = targ->getFileSize(f) - 1;
+      unit[rf] = targ->getFileUnit(f);
+      fill[rf] = -1;
+      assert(last[rf] < MAX_REGISTER_FILE_SIZE);
+      bits[rf].allocate(last[rf] + 1, true);
+   }
+}
+
+// Initialize from the target and start with all register files free.
+RegisterSet::RegisterSet(const Target *targ)
+  : restrictedGPR16Range(targ->getChipset() < 0xc0)
+{
+   init(targ);
+   for (unsigned int i = 0; i <= LAST_REGISTER_FILE; ++i)
+      reset(static_cast<DataFile>(i));
+}
+
+// Apply a repeating 32-unit lock/unlock pattern to the occupancy bitmap.
+void
+RegisterSet::periodicMask(DataFile f, uint32_t lock, uint32_t unlock)
+{
+   bits[f].periodicMask32(lock, unlock);
+}
+
+// Merge the occupancy of another set into this one; the set of *free*
+// registers becomes the intersection (hence the name, despite the OR).
+void
+RegisterSet::intersect(DataFile f, const RegisterSet *set)
+{
+   bits[f] |= set->bits[f];
+}
+
+// Debug dump of the GPR occupancy bitmap.
+void
+RegisterSet::print() const
+{
+   INFO("GPR:");
+   bits[FILE_GPR].print();
+   INFO("\n");
+}
+
+// Find a free contiguous range of 'size' units in file f, store its start
+// in 'reg' and update the high-water mark. Returns false if no range fits.
+bool
+RegisterSet::assign(int32_t& reg, DataFile f, unsigned int size)
+{
+   reg = bits[f].findFreeRange(size);
+   if (reg < 0)
+      return false;
+   fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1));
+   return true;
+}
+
+// True if any unit in [reg, reg + size) of file f is occupied.
+bool
+RegisterSet::isOccupied(DataFile f, int32_t reg, unsigned int size) const
+{
+   return bits[f].testRange(reg, size);
+}
+
+// Occupy the fixed register assigned to v (file/offset/size derived from
+// v->reg, converted to allocation units).
+void
+RegisterSet::occupy(const Value *v)
+{
+   occupy(v->reg.file, idToUnits(v), v->reg.size >> unit[v->reg.file]);
+}
+
+// Occupy the units selected by an 8-bit mask, placed relative to reg's
+// 32-unit aligned group (used for compound/partial register tuples).
+void
+RegisterSet::occupyMask(DataFile f, int32_t reg, uint8_t mask)
+{
+   bits[f].setMask(reg & ~31, static_cast<uint32_t>(mask) << (reg % 32));
+}
+
+// Mark [reg, reg + size) of file f occupied and update the high-water mark.
+void
+RegisterSet::occupy(DataFile f, int32_t reg, unsigned int size)
+{
+   bits[f].setRange(reg, size);
+
+   INFO_DBG(0, REG_ALLOC, "reg occupy: %u[%i] %u\n", f, reg, size);
+
+   fill[f] = MAX2(fill[f], (int32_t)(reg + size - 1));
+}
+
+// Try to occupy the fixed register of v; false if (partially) taken.
+bool
+RegisterSet::testOccupy(const Value *v)
+{
+   return testOccupy(v->reg.file,
+                     idToUnits(v), v->reg.size >> unit[v->reg.file]);
+}
+
+// Occupy [reg, reg + size) only if the whole range is currently free.
+bool
+RegisterSet::testOccupy(DataFile f, int32_t reg, unsigned int size)
+{
+   if (isOccupied(f, reg, size))
+      return false;
+   occupy(f, reg, size);
+   return true;
+}
+
+// Free the units in [reg, reg + size) of file f (high-water mark is kept).
+void
+RegisterSet::release(DataFile f, int32_t reg, unsigned int size)
+{
+   bits[f].clrRange(reg, size);
+
+   INFO_DBG(0, REG_ALLOC, "reg release: %u[%i] %u\n", f, reg, size);
+}
+
+// Driver for register allocation: runs the preparatory passes below over
+// each function and then hands the instruction list to the colouring
+// allocator (GCRA).
+class RegAlloc
+{
+public:
+   RegAlloc(Program *program) : prog(program), sequence(0) { }
+
+   bool exec();
+   bool execFunc();
+
+private:
+   // Replaces phi operands with results of MOVs inserted in the
+   // predecessor blocks (splitting critical edges where needed).
+   class PhiMovesPass : public Pass {
+   private:
+      virtual bool visit(BasicBlock *);
+      inline bool needNewElseBlock(BasicBlock *b, BasicBlock *p);
+   };
+
+   // Binds inputs/outputs of OP_CALLs to the registers the callee uses.
+   class ArgumentMovesPass : public Pass {
+   private:
+      virtual bool visit(BasicBlock *);
+   };
+
+   // Computes live intervals for all values from the per-BB live sets.
+   class BuildIntervalsPass : public Pass {
+   private:
+      virtual bool visit(BasicBlock *);
+      void collectLiveValues(BasicBlock *);
+      void addLiveRange(Value *, const BasicBlock *, int end);
+   };
+
+   // Inserts moves/merges so operands with placement constraints
+   // (contiguous tuples, texture arguments, ...) can be coloured.
+   class InsertConstraintsPass : public Pass {
+   public:
+      bool exec(Function *func);
+   private:
+      virtual bool visit(BasicBlock *);
+
+      bool insertConstraintMoves();
+
+      void condenseDefs(Instruction *);
+      void condenseSrcs(Instruction *, const int first, const int last);
+
+      void addHazard(Instruction *i, const ValueRef *src);
+      void textureMask(TexInstruction *);
+      void addConstraint(Instruction *, int s, int n);
+      bool detectConflict(Instruction *, int s);
+
+      // target specific functions, TODO: put in subclass or Target
+      void texConstraintNV50(TexInstruction *);
+      void texConstraintNVC0(TexInstruction *);
+      void texConstraintNVE0(TexInstruction *);
+
+      std::list<Instruction *> constrList;
+
+      const Target *targ;
+   };
+
+   bool buildLiveSets(BasicBlock *);
+
+private:
+   Program *prog;
+   Function *func;
+
+   // instructions in control flow / chronological order
+   ArrayList insns;
+
+   int sequence; // for manual passes through CFG
+};
+
+// (value to spill, spill slot) pair as produced by GCRA::selectRegisters();
+// the slot may be NULL for non-GPR files.
+typedef std::pair<Value *, Value *> ValuePair;
+
+// Assigns stack slots to spilled values and inserts the corresponding
+// spill (store after def) / unspill (load before use) instructions.
+class SpillCodeInserter
+{
+public:
+   SpillCodeInserter(Function *fn) : func(fn), stackSize(0), stackBase(0) { }
+
+   bool run(const std::list<ValuePair>&);
+
+   // pick (or create) a slot whose occupancy doesn't overlap the interval
+   Symbol *assignSlot(const Interval&, const unsigned int size);
+   inline int32_t getStackSize() const { return stackSize; }
+
+private:
+   Function *func;
+
+   struct SpillSlot
+   {
+      Interval occup;                // union of intervals of all residents
+      std::list<Value *> residents;  // needed to recalculate occup
+      Symbol *sym;
+      int32_t offset;
+      inline uint8_t size() const { return sym->reg.size; }
+   };
+   std::list<SpillSlot> slots;
+   int32_t stackSize;
+   int32_t stackBase;
+
+   LValue *unspill(Instruction *usei, LValue *, Value *slot);
+   void spill(Instruction *defi, Value *slot, LValue *);
+};
+
+// Extend val's live interval by [begin, end), where begin is the serial of
+// its defining instruction, clamped to bb's boundaries (a def outside bb
+// means the value is live-in, so the range starts at the block entry).
+void
+RegAlloc::BuildIntervalsPass::addLiveRange(Value *val,
+                                           const BasicBlock *bb,
+                                           int end)
+{
+   Instruction *insn = val->getUniqueInsn();
+
+   if (!insn)
+      insn = bb->getFirst();
+
+   assert(bb->getFirst()->serial <= bb->getExit()->serial);
+   assert(bb->getExit()->serial + 1 >= end);
+
+   int begin = insn->serial;
+   if (begin < bb->getEntry()->serial || begin > bb->getExit()->serial)
+      begin = bb->getEntry()->serial;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "%%%i <- live range [%i(%i), %i)\n",
+            val->id, begin, insn->serial, end);
+
+   if (begin != end) // empty ranges are only added as hazards for fixed regs
+      val->livei.extend(begin, end);
+}
+
+// True if the edge p -> b is critical (b has multiple predecessors and p
+// has two forward/tree successors), i.e. an intermediate block must be
+// inserted before we can place phi moves at the end of p.
+bool
+RegAlloc::PhiMovesPass::needNewElseBlock(BasicBlock *b, BasicBlock *p)
+{
+   if (b->cfg.incidentCount() <= 1)
+      return false;
+
+   int n = 0;
+   for (Graph::EdgeIterator ei = p->cfg.outgoing(); !ei.end(); ei.next())
+      if (ei.getType() == Graph::Edge::TREE ||
+          ei.getType() == Graph::Edge::FORWARD)
+         ++n;
+   return (n == 2);
+}
+
+// For each operand of each PHI in b, generate a new value by inserting a MOV
+// at the end of the block it is coming from and replace the operand with its
+// result. This eliminates liveness conflicts and enables us to let values be
+// copied to the right register if such a conflict exists nonetheless.
+//
+// These MOVs are also crucial in making sure the live intervals of phi
+// sources are extended until the end of the loop, since they are not
+// included in the live-in sets.
+bool
+RegAlloc::PhiMovesPass::visit(BasicBlock *bb)
+{
+   Instruction *phi, *mov;
+   BasicBlock *pb, *pn;
+
+   std::stack<BasicBlock *> stack;
+
+   // collect predecessors whose edge to bb is critical and must be split
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      pb = BasicBlock::get(ei.getNode());
+      assert(pb);
+      if (needNewElseBlock(bb, pb))
+         stack.push(pb);
+   }
+   // split each critical edge by routing pb -> pn -> bb through a new
+   // empty block pn, and retarget pb's branch accordingly
+   while (!stack.empty()) {
+      pb = stack.top();
+      pn = new BasicBlock(func);
+      stack.pop();
+
+      pb->cfg.detach(&bb->cfg);
+      pb->cfg.attach(&pn->cfg, Graph::Edge::TREE);
+      pn->cfg.attach(&bb->cfg, Graph::Edge::FORWARD);
+
+      assert(pb->getExit()->op != OP_CALL);
+      if (pb->getExit()->asFlow()->target.bb == bb)
+         pb->getExit()->asFlow()->target.bb = pn;
+   }
+
+   // insert MOVs (phi->src(j) should stem from j-th in-BB)
+   int j = 0;
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      pb = BasicBlock::get(ei.getNode());
+      if (!pb->isTerminated())
+         pb->insertTail(new_FlowInstruction(func, OP_BRA, bb));
+
+      for (phi = bb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
+         mov = new_Instruction(func, OP_MOV, TYPE_U32);
+
+         mov->setSrc(0, phi->getSrc(j));
+         mov->setDef(0, new_LValue(func, phi->getDef(0)->asLValue()));
+         phi->setSrc(j, mov->getDef(0));
+
+         pb->insertBefore(pb->getExit(), mov);
+      }
+      ++j;
+   }
+
+   return true;
+}
+
+// For every direct OP_CALL in bb, pin the call's inputs and outputs to the
+// physical registers the callee expects by inserting MOVs to/from fixed-reg
+// temporaries, and add dummy defs for registers the callee clobbers.
+bool
+RegAlloc::ArgumentMovesPass::visit(BasicBlock *bb)
+{
+   // Bind function call inputs/outputs to the same physical register
+   // the callee uses, inserting moves as appropriate for the case a
+   // conflict arises.
+   for (Instruction *i = bb->getEntry(); i; i = i->next) {
+      FlowInstruction *cal = i->asFlow();
+      // TODO: Handle indirect calls.
+      // Right now they should only be generated for builtins.
+      if (!cal || cal->op != OP_CALL || cal->builtin || cal->indirect)
+         continue;
+      RegisterSet clobberSet(prog->getTarget());
+
+      // Bind input values.
+      for (int s = cal->indirect ? 1 : 0; cal->srcExists(s); ++s) {
+         const int t = cal->indirect ? (s - 1) : s;
+         LValue *tmp = new_LValue(func, cal->getSrc(s)->asLValue());
+         tmp->reg.data.id = cal->target.fn->ins[t].rep()->reg.data.id;
+
+         Instruction *mov =
+            new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size));
+         mov->setDef(0, tmp);
+         mov->setSrc(0, cal->getSrc(s));
+         cal->setSrc(s, tmp);
+
+         bb->insertBefore(cal, mov);
+      }
+
+      // Bind output values.
+      for (int d = 0; cal->defExists(d); ++d) {
+         LValue *tmp = new_LValue(func, cal->getDef(d)->asLValue());
+         tmp->reg.data.id = cal->target.fn->outs[d].rep()->reg.data.id;
+
+         Instruction *mov =
+            new_Instruction(func, OP_MOV, typeOfSize(tmp->reg.size));
+         mov->setSrc(0, tmp);
+         mov->setDef(0, cal->getDef(d));
+         cal->setDef(d, tmp);
+
+         bb->insertAfter(cal, mov);
+         clobberSet.occupy(tmp);
+      }
+
+      // Bind clobbered values (skipping regs already taken by outputs).
+      for (std::deque<Value *>::iterator it = cal->target.fn->clobbers.begin();
+           it != cal->target.fn->clobbers.end();
+           ++it) {
+         if (clobberSet.testOccupy(*it)) {
+            Value *tmp = new_LValue(func, (*it)->asLValue());
+            tmp->reg.data.id = (*it)->reg.data.id;
+            cal->setDef(cal->defCount(), tmp);
+         }
+      }
+   }
+
+   // Update the clobber set of the function.
+   if (BasicBlock::get(func->cfgExit) == bb) {
+      func->buildDefSets();
+      for (unsigned int i = 0; i < bb->defSet.getSize(); ++i)
+         if (bb->defSet.test(i))
+            func->clobbers.push_back(func->getLValue(i));
+   }
+
+   return true;
+}
+
+// Build the set of live-in variables of bb by recursively visiting its
+// successors (depth-first, guarded by the visit sequence number), OR'ing
+// their live-in sets and then walking bb's instructions backwards, clearing
+// defs and setting sources. liveSet.marker flags an already-computed set.
+bool
+RegAlloc::buildLiveSets(BasicBlock *bb)
+{
+   Function *f = bb->getFunction();
+   BasicBlock *bn;
+   Instruction *i;
+   unsigned int s, d;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "buildLiveSets(BB:%i)\n", bb->getId());
+
+   bb->liveSet.allocate(func->allLValues.getSize(), false);
+
+   int n = 0;
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      bn = BasicBlock::get(ei.getNode());
+      if (bn == bb)
+         continue;
+      if (bn->cfg.visit(sequence))
+         if (!buildLiveSets(bn))
+            return false;
+      // first successor is copied, the rest are OR'ed in
+      if (n++ || bb->liveSet.marker)
+         bb->liveSet |= bn->liveSet;
+      else
+         bb->liveSet = bn->liveSet;
+   }
+   if (!n && !bb->liveSet.marker)
+      bb->liveSet.fill(0);
+   bb->liveSet.marker = true;
+
+   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
+      INFO("BB:%i live set of out blocks:\n", bb->getId());
+      bb->liveSet.print();
+   }
+
+   // if (!bb->getEntry())
+   //   return true;
+
+   // function outputs are live at the exit block
+   if (bb == BasicBlock::get(f->cfgExit)) {
+      for (std::deque<ValueRef>::iterator it = f->outs.begin();
+           it != f->outs.end(); ++it) {
+         assert(it->get()->asLValue());
+         bb->liveSet.set(it->get()->id);
+      }
+   }
+
+   // backwards scan: kill defs, gen sources
+   for (i = bb->getExit(); i && i != bb->getEntry()->prev; i = i->prev) {
+      for (d = 0; i->defExists(d); ++d)
+         bb->liveSet.clr(i->getDef(d)->id);
+      for (s = 0; i->srcExists(s); ++s)
+         if (i->getSrc(s)->asLValue())
+            bb->liveSet.set(i->getSrc(s)->id);
+   }
+   for (i = bb->getPhi(); i && i->op == OP_PHI; i = i->next)
+      bb->liveSet.clr(i->getDef(0)->id);
+
+   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
+      INFO("BB:%i live set after propagation:\n", bb->getId());
+      bb->liveSet.print();
+   }
+
+   return true;
+}
+
+// Recompute bb's liveSet as the union of the live-in sets of its (non-dummy)
+// successors; a block with no successors but with predecessors starts empty.
+void
+RegAlloc::BuildIntervalsPass::collectLiveValues(BasicBlock *bb)
+{
+   BasicBlock *bbA = NULL, *bbB = NULL;
+
+   if (bb->cfg.outgoingCount()) {
+      // trickery to save a loop of OR'ing liveSets
+      // aliasing works fine with BitSet::setOr
+      for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+         if (ei.getType() == Graph::Edge::DUMMY)
+            continue;
+         if (bbA) {
+            bb->liveSet.setOr(&bbA->liveSet, &bbB->liveSet);
+            bbA = bb;
+         } else {
+            bbA = bbB;
+         }
+         bbB = BasicBlock::get(ei.getNode());
+      }
+      bb->liveSet.setOr(&bbB->liveSet, bbA ? &bbA->liveSet : NULL);
+   } else
+   if (bb->cfg.incidentCount()) {
+      bb->liveSet.fill(0);
+   }
+}
+
+// Turn the per-block live sets into live intervals: walk bb backwards,
+// closing a range whenever a live value's last use is seen and recording
+// single-point hazard ranges for values bound to fixed registers.
+bool
+RegAlloc::BuildIntervalsPass::visit(BasicBlock *bb)
+{
+   collectLiveValues(bb);
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "BuildIntervals(BB:%i)\n", bb->getId());
+
+   // go through out blocks and delete phi sources that do not originate from
+   // the current block from the live set
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      BasicBlock *out = BasicBlock::get(ei.getNode());
+
+      for (Instruction *i = out->getPhi(); i && i->op == OP_PHI; i = i->next) {
+         bb->liveSet.clr(i->getDef(0)->id);
+
+         for (int s = 0; i->srcExists(s); ++s) {
+            assert(i->src(s).getInsn());
+            if (i->getSrc(s)->getUniqueInsn()->bb == bb) // XXX: reachableBy ?
+               bb->liveSet.set(i->getSrc(s)->id);
+            else
+               bb->liveSet.clr(i->getSrc(s)->id);
+         }
+      }
+   }
+
+   // remaining live-outs are live until end
+   if (bb->getExit()) {
+      for (unsigned int j = 0; j < bb->liveSet.getSize(); ++j)
+         if (bb->liveSet.test(j))
+            addLiveRange(func->getLValue(j), bb, bb->getExit()->serial + 1);
+   }
+
+   for (Instruction *i = bb->getExit(); i && i->op != OP_PHI; i = i->prev) {
+      for (int d = 0; i->defExists(d); ++d) {
+         bb->liveSet.clr(i->getDef(d)->id);
+         if (i->getDef(d)->reg.data.id >= 0) // add hazard for fixed regs
+            i->getDef(d)->livei.extend(i->serial, i->serial);
+      }
+
+      // a source not yet in the live set marks the value's last use
+      for (int s = 0; i->srcExists(s); ++s) {
+         if (!i->getSrc(s)->asLValue())
+            continue;
+         if (!bb->liveSet.test(i->getSrc(s)->id)) {
+            bb->liveSet.set(i->getSrc(s)->id);
+            addLiveRange(i->getSrc(s), bb, i->serial);
+         }
+      }
+   }
+
+   if (bb == BasicBlock::get(func->cfg.getRoot())) {
+      for (std::deque<ValueDef>::iterator it = func->ins.begin();
+           it != func->ins.end(); ++it) {
+         if (it->get()->reg.data.id >= 0) // add hazard for fixed regs
+            it->get()->livei.extend(0, 1);
+      }
+   }
+
+   return true;
+}
+
+
+// Which kinds of instructions doCoalesce() should try to join values for:
+#define JOIN_MASK_PHI (1 << 0)
+#define JOIN_MASK_UNION (1 << 1)
+#define JOIN_MASK_MOV (1 << 2)
+#define JOIN_MASK_TEX (1 << 3)
+
+// Graph-colouring register allocator: builds a register interference graph
+// (RIG) over coalesced live ranges, simplifies it, and selects registers;
+// values that cannot be coloured are handed to the SpillCodeInserter.
+class GCRA
+{
+public:
+   GCRA(Function *, SpillCodeInserter&);
+   ~GCRA();
+
+   bool allocateRegisters(ArrayList& insns);
+
+   void printNodeInfo() const;
+
+private:
+   // One RIG node per coalesced value; edges mark interference.
+   class RIG_Node : public Graph::Node
+   {
+   public:
+      RIG_Node();
+
+      void init(const RegisterSet&, LValue *);
+
+      void addInterference(RIG_Node *);
+      void addRegPreference(RIG_Node *);
+
+      inline LValue *getValue() const
+      {
+         return reinterpret_cast<LValue *>(data);
+      }
+      inline void setValue(LValue *lval) { data = lval; }
+
+      // 8-bit sub-register occupancy mask for the assigned colour
+      inline uint8_t getCompMask() const
+      {
+         return ((1 << colors) - 1) << (reg & 7);
+      }
+
+      static inline RIG_Node *get(const Graph::EdgeIterator& ei)
+      {
+         return static_cast<RIG_Node *>(ei.getNode());
+      }
+
+   public:
+      uint32_t degree;
+      uint16_t degreeLimit; // if deg < degLimit, node is trivially colourable
+      uint16_t colors;      // number of allocation units this value needs
+
+      DataFile f;
+      int32_t reg;          // assigned register (in units), -1 if none
+
+      float weight;         // spill cost estimate; lower = better candidate
+
+      // list pointers for simplify() phase
+      RIG_Node *next;
+      RIG_Node *prev;
+
+      // union of the live intervals of all coalesced values (we want to retain
+      // the separate intervals for testing interference of compound values)
+      Interval livei;
+
+      std::list<RIG_Node *> prefRegs;
+   };
+
+private:
+   inline RIG_Node *getNode(const LValue *v) const { return &nodes[v->id]; }
+
+   void buildRIG(ArrayList&);
+   bool coalesce(ArrayList&);
+   bool doCoalesce(ArrayList&, unsigned int mask);
+   void calculateSpillWeights();
+   void simplify();
+   bool selectRegisters();
+   void cleanup(const bool success);
+
+   void simplifyEdge(RIG_Node *, RIG_Node *);
+   void simplifyNode(RIG_Node *);
+
+   bool coalesceValues(Value *, Value *, bool force);
+   void resolveSplitsAndMerges();
+   void makeCompound(Instruction *, bool isSplit);
+
+   inline void checkInterference(const RIG_Node *, Graph::EdgeIterator&);
+
+   inline void insertOrderedTail(std::list<RIG_Node *>&, RIG_Node *);
+   void checkList(std::list<RIG_Node *>&);
+
+private:
+   std::stack<uint32_t> stack; // node ids in simplification order
+
+   // list headers for simplify() phase
+   RIG_Node lo[2];   // trivially colourable ([1]: values > 4 bytes)
+   RIG_Node hi;      // potential spill candidates
+
+   Graph RIG;
+   RIG_Node *nodes;
+   unsigned int nodeCount;
+
+   Function *func;
+   Program *prog;
+
+   static uint8_t relDegree[17][17];
+
+   RegisterSet regs;
+
+   // need to fixup register id for participants of OP_MERGE/SPLIT
+   std::list<Instruction *> merges;
+   std::list<Instruction *> splits;
+
+   SpillCodeInserter& spill;
+   std::list<ValuePair> mustSpill;
+};
+
+// relDegree[x][y]: degree a node needing x units contributes to a neighbour
+// needing y units (filled in by the GCRA constructor).
+uint8_t GCRA::relDegree[17][17];
+
+// Default-constructed nodes double as empty circular list heads
+// (next/prev pointing at themselves); colors == 0 marks "no value".
+GCRA::RIG_Node::RIG_Node() : Node(NULL), next(this), prev(this)
+{
+   colors = 0;
+}
+
+// Debug dump of every live RIG node and the values it interferes with.
+void
+GCRA::printNodeInfo() const
+{
+   for (unsigned int i = 0; i < nodeCount; ++i) {
+      if (!nodes[i].colors)
+         continue;
+      INFO("RIG_Node[%%%i]($[%u]%i): %u colors, weight %f, deg %u/%u\n X",
+           i,
+           nodes[i].f,nodes[i].reg,nodes[i].colors,
+           nodes[i].weight,
+           nodes[i].degree, nodes[i].degreeLimit);
+
+      for (Graph::EdgeIterator ei = nodes[i].outgoing(); !ei.end(); ei.next())
+         INFO(" %%%i", RIG_Node::get(ei)->getValue()->id);
+      for (Graph::EdgeIterator ei = nodes[i].incident(); !ei.end(); ei.next())
+         INFO(" %%%i", RIG_Node::get(ei)->getValue()->id);
+      INFO("\n");
+   }
+}
+
+// Initialize a RIG node from an LValue: size in allocation units (colors),
+// register file, pre-assigned register (values with a fixed reg id must not
+// be spilled), an infinite initial spill weight, and the live interval.
+void
+GCRA::RIG_Node::init(const RegisterSet& regs, LValue *lval)
+{
+   setValue(lval);
+   if (lval->reg.data.id >= 0)
+      lval->noSpill = lval->fixedReg = 1;
+
+   colors = regs.units(lval->reg.file, lval->reg.size);
+   f = lval->reg.file;
+   reg = -1;
+   if (lval->reg.data.id >= 0)
+      reg = regs.idToUnits(lval);
+
+   weight = std::numeric_limits<float>::infinity();
+   degree = 0;
+   degreeLimit = regs.getFileSize(f, lval->reg.size);
+
+   livei.insert(lval->livei);
+}
+
+// Try to join src into dst's equivalence class so both get the same
+// register. Without 'force', refuses joins that would be unsafe (different
+// files or sizes, overlapping live intervals, conflicting fixed registers);
+// with 'force' it only warns. Returns true if the values were joined.
+bool
+GCRA::coalesceValues(Value *dst, Value *src, bool force)
+{
+   LValue *rep = dst->join->asLValue();
+   LValue *val = src->join->asLValue();
+
+   // prefer the value with a fixed register as the representative
+   if (!force && val->reg.data.id >= 0) {
+      rep = src->join->asLValue();
+      val = dst->join->asLValue();
+   }
+   RIG_Node *nRep = &nodes[rep->id];
+   RIG_Node *nVal = &nodes[val->id];
+
+   if (src->reg.file != dst->reg.file) {
+      if (!force)
+         return false;
+      WARN("forced coalescing of values in different files !\n");
+   }
+   if (!force && dst->reg.size != src->reg.size)
+      return false;
+
+   if ((rep->reg.data.id >= 0) && (rep->reg.data.id != val->reg.data.id)) {
+      if (force) {
+         if (val->reg.data.id >= 0)
+            WARN("forced coalescing of values in different fixed regs !\n");
+      } else {
+         if (val->reg.data.id >= 0)
+            return false;
+         // make sure that there is no overlap with the fixed register of rep
+         for (ArrayList::Iterator it = func->allLValues.iterator();
+              !it.end(); it.next()) {
+            Value *reg = reinterpret_cast<Value *>(it.get())->asLValue();
+            assert(reg);
+            if (reg->interfers(rep) && reg->livei.overlaps(nVal->livei))
+               return false;
+         }
+      }
+   }
+
+   if (!force && nRep->livei.overlaps(nVal->livei))
+      return false;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "joining %%%i($%i) <- %%%i\n",
+            rep->id, rep->reg.data.id, val->id);
+
+   // set join pointer of all values joined with val
+   for (Value::DefIterator def = val->defs.begin(); def != val->defs.end();
+        ++def)
+      (*def)->get()->join = rep;
+   assert(rep->join == rep && val->join == rep);
+
+   // add val's definitions to rep and extend the live interval of its RIG node
+   rep->defs.insert(rep->defs.end(), val->defs.begin(), val->defs.end());
+   nRep->livei.unify(nVal->livei);
+   return true;
+}
+
+// Run the coalescing rounds in order: phi operands first (must succeed),
+// then merge/split unions (with tex operands on pre-nvc0 chips, which need
+// sources and defs in the same registers), and finally plain moves.
+bool
+GCRA::coalesce(ArrayList& insns)
+{
+   bool ret = doCoalesce(insns, JOIN_MASK_PHI);
+   if (!ret)
+      return false;
+   switch (func->getProgram()->getTarget()->getChipset() & ~0xf) {
+   case 0x50:
+   case 0x80:
+   case 0x90:
+   case 0xa0:
+      ret = doCoalesce(insns, JOIN_MASK_UNION | JOIN_MASK_TEX);
+      break;
+   case 0xc0:
+   case 0xd0:
+   case 0xe0:
+      ret = doCoalesce(insns, JOIN_MASK_UNION);
+      break;
+   default:
+      break;
+   }
+   if (!ret)
+      return false;
+   return doCoalesce(insns, JOIN_MASK_MOV);
+}
+
+// Build the 8-bit sub-register mask for a member of a compound value:
+// 'size' units at offset 'base' inside a compound of 'compSize' units,
+// replicated so the mask is valid at any aligned position of the compound.
+static inline uint8_t makeCompMask(int compSize, int base, int size)
+{
+   uint8_t m = ((1 << size) - 1) << base;
+
+   switch (compSize) {
+   case 1:
+      return 0xff;
+   case 2:
+      m |= (m << 2);
+      return (m << 4) | m;
+   case 3:
+   case 4:
+      return (m << 4) | m;
+   default:
+      assert(compSize <= 8);
+      return m;
+   }
+}
+
+// Used when coalescing moves. The non-compound value will become one, e.g.:
+// mov b32 $r0 $r2 / merge b64 $r0d { $r0 $r1 }
+// split b64 { $r0 $r1 } $r0d / mov b64 $r0d f64 $r2d
+static inline void copyCompound(Value *dst, Value *src)
+{
+   LValue *ldst = dst->asLValue();
+   LValue *lsrc = src->asLValue();
+
+   // copy from whichever side already is compound
+   if (ldst->compound && !lsrc->compound) {
+      LValue *swap = lsrc;
+      lsrc = ldst;
+      ldst = swap;
+   }
+
+   ldst->compound = lsrc->compound;
+   ldst->compMask = lsrc->compMask;
+}
+
+// Mark the participants of an OP_MERGE (or, with split = true, OP_SPLIT) as
+// a compound value: the representative covers all units, and each member
+// gets a sub-register mask for its position within the compound.
+void
+GCRA::makeCompound(Instruction *insn, bool split)
+{
+   LValue *rep = (split ? insn->getSrc(0) : insn->getDef(0))->asLValue();
+
+   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC) {
+      INFO("makeCompound(split = %i): ", split);
+      insn->print();
+   }
+
+   const unsigned int size = getNode(rep)->colors;
+   unsigned int base = 0;
+
+   if (!rep->compound)
+      rep->compMask = 0xff;
+   rep->compound = 1;
+
+   for (int c = 0; split ? insn->defExists(c) : insn->srcExists(c); ++c) {
+      LValue *val = (split ? insn->getDef(c) : insn->getSrc(c))->asLValue();
+
+      val->compound = 1;
+      if (!val->compMask)
+         val->compMask = 0xff;
+      val->compMask &= makeCompMask(size, base, getNode(val)->colors);
+      assert(val->compMask);
+
+      INFO_DBG(prog->dbgFlags, REG_ALLOC, "compound: %%%i:%02x <- %%%i:%02x\n",
+           rep->id, rep->compMask, val->id, val->compMask);
+
+      base += getNode(val)->colors;
+   }
+   assert(base == size);
+}
+
+// Single coalescing round: walk all instructions and join values at phis,
+// merges/splits, moves and (on request) texture ops, according to 'mask'.
+// Only a failed phi join is fatal.
+bool
+GCRA::doCoalesce(ArrayList& insns, unsigned int mask)
+{
+   int c, n;
+
+   for (n = 0; n < insns.getSize(); ++n) {
+      Instruction *i;
+      Instruction *insn = reinterpret_cast<Instruction *>(insns.get(n));
+
+      switch (insn->op) {
+      case OP_PHI:
+         if (!(mask & JOIN_MASK_PHI))
+            break;
+         for (c = 0; insn->srcExists(c); ++c)
+            if (!coalesceValues(insn->getDef(0), insn->getSrc(c), false)) {
+               // this is bad
+               ERROR("failed to coalesce phi operands\n");
+               return false;
+            }
+         break;
+      case OP_UNION:
+      case OP_MERGE:
+         if (!(mask & JOIN_MASK_UNION))
+            break;
+         for (c = 0; insn->srcExists(c); ++c)
+            coalesceValues(insn->getDef(0), insn->getSrc(c), true);
+         if (insn->op == OP_MERGE) {
+            merges.push_back(insn);
+            if (insn->srcExists(1))
+               makeCompound(insn, false);
+         }
+         break;
+      case OP_SPLIT:
+         if (!(mask & JOIN_MASK_UNION))
+            break;
+         splits.push_back(insn);
+         for (c = 0; insn->defExists(c); ++c)
+            coalesceValues(insn->getSrc(0), insn->getDef(c), true);
+         makeCompound(insn, true);
+         break;
+      case OP_MOV:
+         if (!(mask & JOIN_MASK_MOV))
+            break;
+         i = NULL;
+         if (!insn->getDef(0)->uses.empty())
+            i = insn->getDef(0)->uses.front()->getInsn();
+         // if this is a constraint-move there will only be a single use
+         if (i && i->op == OP_MERGE) // do we really still need this ?
+            break;
+         i = insn->getSrc(0)->getUniqueInsn();
+         if (i && !i->constrainedDefs()) {
+            if (coalesceValues(insn->getDef(0), insn->getSrc(0), false))
+               copyCompound(insn->getSrc(0), insn->getDef(0));
+         }
+         break;
+      case OP_TEX:
+      case OP_TXB:
+      case OP_TXL:
+      case OP_TXF:
+      case OP_TXQ:
+      case OP_TXD:
+      case OP_TXG:
+      case OP_TEXCSAA:
+         if (!(mask & JOIN_MASK_TEX))
+            break;
+         // join tex sources and defs pairwise (skipping the predicate)
+         for (c = 0; insn->srcExists(c) && c != insn->predSrc; ++c)
+            coalesceValues(insn->getDef(c), insn->getSrc(c), true);
+         break;
+      default:
+         break;
+      }
+   }
+   return true;
+}
+
+// Record interference with 'node': bump both degrees by the size-dependent
+// amount from relDegree and link the nodes in the RIG.
+void
+GCRA::RIG_Node::addInterference(RIG_Node *node)
+{
+   this->degree += relDegree[node->colors][colors];
+   node->degree += relDegree[colors][node->colors];
+
+   this->attach(node, Graph::Edge::CROSS);
+}
+
+// Remember a node whose register we would like to reuse during selection.
+void
+GCRA::RIG_Node::addRegPreference(RIG_Node *node)
+{
+   prefRegs.push_back(node);
+}
+
+// Bind to the function and fill in the static relDegree table:
+// a node of size i occupies ceil(i / j) * j colours of a size-j neighbour.
+GCRA::GCRA(Function *fn, SpillCodeInserter& spill) :
+   func(fn),
+   regs(fn->getProgram()->getTarget()),
+   spill(spill)
+{
+   prog = func->getProgram();
+
+   // initialize relative degrees array - i takes away from j
+   for (int i = 1; i <= 16; ++i)
+      for (int j = 1; j <= 16; ++j)
+         relDegree[i][j] = j * ((i + j - 1) / j);
+}
+
+// NOTE(review): 'nodes' is not visibly initialized in the constructor above,
+// so this guard may read an indeterminate pointer if allocateRegisters()
+// never ran — verify nodes is set (or NULL'ed) before destruction.
+GCRA::~GCRA()
+{
+   if (nodes)
+      delete[] nodes;
+}
+
+// Debug check: every node in the list is its own representative and the
+// list is sorted by ascending live interval start.
+void
+GCRA::checkList(std::list<RIG_Node *>& lst)
+{
+   GCRA::RIG_Node *prev = NULL;
+
+   for (std::list<RIG_Node *>::iterator it = lst.begin();
+        it != lst.end();
+        ++it) {
+      assert((*it)->getValue()->join == (*it)->getValue());
+      if (prev)
+         assert(prev->livei.begin() <= (*it)->livei.begin());
+      prev = *it;
+   }
+}
+
+// Insert node into the list, keeping it sorted by live interval start;
+// scans backwards from the tail since nodes arrive nearly in order.
+void
+GCRA::insertOrderedTail(std::list<RIG_Node *>& list, RIG_Node *node)
+{
+   if (node->livei.isEmpty())
+      return;
+   // only the intervals of joined values don't necessarily arrive in order
+   std::list<RIG_Node *>::iterator prev, it;
+   for (it = list.end(); it != list.begin(); it = prev) {
+      prev = it;
+      --prev;
+      if ((*prev)->livei.begin() <= node->livei.begin())
+         break;
+   }
+   list.insert(it, node);
+}
+
+// Build the interference graph with a linear scan over the nodes ordered by
+// interval start: a node interferes with every active node of the same file
+// whose interval overlaps it.
+void
+GCRA::buildRIG(ArrayList& insns)
+{
+   std::list<RIG_Node *> values, active;
+
+   // function inputs are live from the start
+   for (std::deque<ValueDef>::iterator it = func->ins.begin();
+        it != func->ins.end(); ++it)
+      insertOrderedTail(values, getNode(it->get()->asLValue()));
+
+   // one node per representative (non-coalesced) def
+   for (int i = 0; i < insns.getSize(); ++i) {
+      Instruction *insn = reinterpret_cast<Instruction *>(insns.get(i));
+      for (int d = 0; insn->defExists(d); ++d)
+         if (insn->getDef(d)->rep() == insn->getDef(d))
+            insertOrderedTail(values, getNode(insn->getDef(d)->asLValue()));
+   }
+   checkList(values);
+
+   while (!values.empty()) {
+      RIG_Node *cur = values.front();
+
+      // retire nodes whose interval ended before cur starts
+      for (std::list<RIG_Node *>::iterator it = active.begin();
+           it != active.end();) {
+         RIG_Node *node = *it;
+
+         if (node->livei.end() <= cur->livei.begin()) {
+            it = active.erase(it);
+         } else {
+            if (node->f == cur->f && node->livei.overlaps(cur->livei))
+               cur->addInterference(node);
+            ++it;
+         }
+      }
+      values.pop_front();
+      active.push_back(cur);
+   }
+}
+
+// Compute a spill weight (refcount^2 / interval length; infinity for
+// unspillable values) for every node and seed the simplify() work lists:
+// trivially colourable nodes go to lo[0]/lo[1], the rest to hi. Nodes with
+// fixed registers only update the occupancy high-water mark.
+void
+GCRA::calculateSpillWeights()
+{
+   for (unsigned int i = 0; i < nodeCount; ++i) {
+      RIG_Node *const n = &nodes[i];
+      if (!nodes[i].colors || nodes[i].livei.isEmpty())
+         continue;
+      if (nodes[i].reg >= 0) {
+         // update max reg
+         regs.occupy(n->f, n->reg, n->colors);
+         continue;
+      }
+      LValue *val = nodes[i].getValue();
+
+      if (!val->noSpill) {
+         int rc = 0;
+         for (Value::DefIterator it = val->defs.begin();
+              it != val->defs.end();
+              ++it)
+            rc += (*it)->get()->refCount();
+
+         nodes[i].weight =
+            (float)rc * (float)rc / (float)nodes[i].livei.extent();
+      }
+
+      if (nodes[i].degree < nodes[i].degreeLimit) {
+         int l = 0;
+         if (val->reg.size > 4)
+            l = 1;
+         DLLIST_ADDHEAD(&lo[l], &nodes[i]);
+      } else {
+         DLLIST_ADDHEAD(&hi, &nodes[i]);
+      }
+   }
+   if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+      printNodeInfo();
+}
+
+// When a is removed from the RIG, lower b's degree; if that makes b
+// trivially colourable, move it from the hi list to the appropriate lo list.
+void
+GCRA::simplifyEdge(RIG_Node *a, RIG_Node *b)
+{
+   bool move = b->degree >= b->degreeLimit;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC,
+            "edge: (%%%i, deg %u/%u) >-< (%%%i, deg %u/%u)\n",
+            a->getValue()->id, a->degree, a->degreeLimit,
+            b->getValue()->id, b->degree, b->degreeLimit);
+
+   b->degree -= relDegree[a->colors][b->colors];
+
+   move = move && b->degree < b->degreeLimit;
+   if (move && !DLLIST_EMPTY(b)) {
+      int l = (b->getValue()->reg.size > 4) ? 1 : 0;
+      DLLIST_DEL(b);
+      DLLIST_ADDTAIL(&lo[l], b);
+   }
+}
+
+// Remove node from the RIG: update all neighbours' degrees and push the
+// node's value id on the selection stack.
+void
+GCRA::simplifyNode(RIG_Node *node)
+{
+   for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
+      simplifyEdge(node, RIG_Node::get(ei));
+
+   for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next())
+      simplifyEdge(node, RIG_Node::get(ei));
+
+   DLLIST_DEL(node);
+   stack.push(node->getValue()->id);
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "SIMPLIFY: pushed %%%i%s\n",
+            node->getValue()->id,
+            (node->degree < node->degreeLimit) ? "" : "(spill)");
+}
+
+// Simplify phase: repeatedly remove trivially colourable nodes (small
+// values first, then large); when only constrained nodes remain, push the
+// one with the lowest weight/degree ratio as a spill candidate. Stops with
+// an error if only unspillable (infinite weight) candidates are left.
+void
+GCRA::simplify()
+{
+   for (;;) {
+      if (!DLLIST_EMPTY(&lo[0])) {
+         do {
+            simplifyNode(lo[0].next);
+         } while (!DLLIST_EMPTY(&lo[0]));
+      } else
+      if (!DLLIST_EMPTY(&lo[1])) {
+         simplifyNode(lo[1].next);
+      } else
+      if (!DLLIST_EMPTY(&hi)) {
+         RIG_Node *best = hi.next;
+         float bestScore = best->weight / (float)best->degree;
+         // spill candidate
+         for (RIG_Node *it = best->next; it != &hi; it = it->next) {
+            float score = it->weight / (float)it->degree;
+            if (score < bestScore) {
+               best = it;
+               bestScore = score;
+            }
+         }
+         if (isinf(bestScore)) {
+            ERROR("no viable spill candidates left\n");
+            break;
+         }
+         simplifyNode(best);
+      } else {
+         break;
+      }
+   }
+}
+
+// During selection: mark the registers of an already-coloured neighbour as
+// occupied. For compound values only the sub-register units that actually
+// overlap in time are blocked (per-component masks); otherwise the
+// neighbour's whole range is occupied.
+void
+GCRA::checkInterference(const RIG_Node *node, Graph::EdgeIterator& ei)
+{
+   const RIG_Node *intf = RIG_Node::get(ei);
+
+   if (intf->reg < 0)
+      return;
+   const LValue *vA = node->getValue();
+   const LValue *vB = intf->getValue();
+
+   const uint8_t intfMask = ((1 << intf->colors) - 1) << (intf->reg & 7);
+
+   if (vA->compound | vB->compound) {
+      // NOTE: this only works for >aligned< register tuples !
+      for (Value::DefCIterator D = vA->defs.begin(); D != vA->defs.end(); ++D) {
+         for (Value::DefCIterator d = vB->defs.begin(); d != vB->defs.end(); ++d) {
+            const LValue *vD = (*D)->get()->asLValue();
+            const LValue *vd = (*d)->get()->asLValue();
+
+            if (!vD->livei.overlaps(vd->livei)) {
+               INFO_DBG(prog->dbgFlags, REG_ALLOC, "(%%%i) X (%%%i): no overlap\n",
+                        vD->id, vd->id);
+               continue;
+            }
+
+            uint8_t mask = vD->compound ? vD->compMask : ~0;
+            if (vd->compound) {
+               assert(vB->compound);
+               mask &= vd->compMask & vB->compMask;
+            } else {
+               mask &= intfMask;
+            }
+
+            INFO_DBG(prog->dbgFlags, REG_ALLOC,
+                     "(%%%i)%02x X (%%%i)%02x & %02x: $r%i.%02x\n",
+                     vD->id,
+                     vD->compound ? vD->compMask : 0xff,
+                     vd->id,
+                     vd->compound ? vd->compMask : intfMask,
+                     vB->compMask, intf->reg & ~7, mask);
+            if (mask)
+               regs.occupyMask(node->f, intf->reg & ~7, mask);
+         }
+      }
+   } else {
+      INFO_DBG(prog->dbgFlags, REG_ALLOC,
+               "(%%%i) X (%%%i): $r%i + %u\n",
+               vA->id, vB->id, intf->reg, intf->colors);
+      regs.occupy(node->f, intf->reg, intf->colors);
+   }
+}
+
+// SELECT phase: pop nodes in reverse simplification order and try to give
+// each a register none of its already-colored neighbours occupies.
+// Values that cannot be colored are queued in 'mustSpill' (with a stack
+// slot pre-assigned for GPRs); returns false in that case so the caller can
+// insert spill code and retry.  On success, writes the final register ids
+// back to all LValues.
+bool
+GCRA::selectRegisters()
+{
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nSELECT phase\n");
+
+   while (!stack.empty()) {
+      RIG_Node *node = &nodes[stack.top()];
+      stack.pop();
+
+      regs.reset(node->f);
+
+      INFO_DBG(prog->dbgFlags, REG_ALLOC, "\nNODE[%%%i, %u colors]\n",
+               node->getValue()->id, node->colors);
+
+      // mark every colored neighbour's registers as unavailable
+      for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next())
+         checkInterference(node, ei);
+      for (Graph::EdgeIterator ei = node->incident(); !ei.end(); ei.next())
+         checkInterference(node, ei);
+
+      // first try the registers of preferred (e.g. move-related) nodes
+      if (!node->prefRegs.empty()) {
+         for (std::list<RIG_Node *>::const_iterator it = node->prefRegs.begin();
+              it != node->prefRegs.end();
+              ++it) {
+            if ((*it)->reg >= 0 &&
+                regs.testOccupy(node->f, (*it)->reg, node->colors)) {
+               node->reg = (*it)->reg;
+               break;
+            }
+         }
+      }
+      if (node->reg >= 0)
+         continue;
+      LValue *lval = node->getValue();
+      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+         regs.print();
+      bool ret = regs.assign(node->reg, node->f, node->colors);
+      if (ret) {
+         INFO_DBG(prog->dbgFlags, REG_ALLOC, "assigned reg %i\n", node->reg);
+         lval->compMask = node->getCompMask();
+      } else {
+         INFO_DBG(prog->dbgFlags, REG_ALLOC, "must spill: %%%i (size %u)\n",
+                  lval->id, lval->reg.size);
+         Symbol *slot = NULL;
+         if (lval->reg.file == FILE_GPR)
+            slot = spill.assignSlot(node->livei, lval->reg.size);
+         mustSpill.push_back(ValuePair(lval, slot));
+      }
+   }
+   if (!mustSpill.empty())
+      return false;
+   // success: convert allocation units back to register file ids
+   for (unsigned int i = 0; i < nodeCount; ++i) {
+      LValue *lval = nodes[i].getValue();
+      if (nodes[i].reg >= 0 && nodes[i].colors > 0)
+         lval->reg.data.id =
+            regs.unitsToId(nodes[i].f, nodes[i].reg, lval->reg.size);
+   }
+   return true;
+}
+
+// Top-level driver of one graph-coloring allocation attempt:
+// coalesce -> build RIG -> spill weights -> simplify -> select.
+// Returns false if spill code had to be inserted; the caller (execFunc)
+// rebuilds live information and retries.
+bool
+GCRA::allocateRegisters(ArrayList& insns)
+{
+   bool ret;
+
+   INFO_DBG(prog->dbgFlags, REG_ALLOC,
+            "allocateRegisters to %u instructions\n", insns.getSize());
+
+   nodeCount = func->allLValues.getSize();
+   nodes = new RIG_Node[nodeCount];
+   // NOTE(review): plain operator new[] throws on failure, so this NULL
+   // check is likely dead — confirm no custom non-throwing allocator
+   if (!nodes)
+      return false;
+   for (unsigned int i = 0; i < nodeCount; ++i) {
+      LValue *lval = reinterpret_cast<LValue *>(func->allLValues.get(i));
+      if (lval) {
+         nodes[i].init(regs, lval);
+         RIG.insert(&nodes[i]);
+      }
+   }
+
+   // coalesce first, we use only 1 RIG node for a group of joined values
+   ret = coalesce(insns);
+   if (!ret)
+      goto out;
+
+   if (func->getProgram()->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+      func->printLiveIntervals();
+
+   buildRIG(insns);
+   calculateSpillWeights();
+   simplify();
+
+   ret = selectRegisters();
+   if (!ret) {
+      // coloring failed: insert loads/stores for the uncolorable values
+      INFO_DBG(prog->dbgFlags, REG_ALLOC,
+               "selectRegisters failed, inserting spill code ...\n");
+      regs.reset(FILE_GPR, true);
+      spill.run(mustSpill);
+      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+         func->print();
+   } else {
+      prog->maxGPR = std::max(prog->maxGPR, regs.getMaxAssigned(FILE_GPR));
+   }
+
+out:
+   cleanup(ret);
+   return ret;
+}
+
+// Release per-attempt allocation state.  On success, coalesced values
+// inherit the register id of their representative and split/merge ops are
+// resolved; on failure, coalescing is undone so the next attempt starts
+// from a clean slate.
+void
+GCRA::cleanup(const bool success)
+{
+   mustSpill.clear();
+
+   for (ArrayList::Iterator it = func->allLValues.iterator();
+        !it.end(); it.next()) {
+      LValue *lval = reinterpret_cast<LValue *>(it.get());
+
+      lval->livei.clear();
+
+      lval->compound = 0;
+      lval->compMask = 0;
+
+      // values that are their own representative need no fix-up
+      if (lval->join == lval)
+         continue;
+
+      if (success) {
+         lval->reg.data.id = lval->join->reg.data.id;
+      } else {
+         // un-coalesce: detach our defs from the representative again
+         for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
+              ++d)
+            lval->join->defs.remove(*d);
+         lval->join = lval;
+      }
+   }
+
+   if (success)
+      resolveSplitsAndMerges();
+   splits.clear(); // avoid duplicate entries on next coalesce pass
+   merges.clear();
+
+   delete[] nodes;
+   nodes = NULL;
+}
+
+// Find (or create) a spill slot of the given size whose current occupancy
+// does not overlap the live interval of the value being spilled.
+// Existing slots are scanned first so values with disjoint live ranges can
+// share stack space; if none fits, the stack is grown by one size-aligned
+// entry.  Returns the local-memory symbol addressing the slot.
+Symbol *
+SpillCodeInserter::assignSlot(const Interval &livei, const unsigned int size)
+{
+   SpillSlot slot;
+   int32_t offsetBase = stackSize;
+   int32_t offset;
+   std::list<SpillSlot>::iterator pos = slots.end(), it = slots.begin();
+
+   // align the search start to the slot size
+   if (offsetBase % size)
+      offsetBase += size - (offsetBase % size);
+
+   slot.sym = NULL;
+
+   // scan existing slots for a size-aligned window free over 'livei'
+   for (offset = offsetBase; offset < stackSize; offset += size) {
+      const int32_t entryEnd = offset + size;
+      while (it != slots.end() && it->offset < offset)
+         ++it;
+      if (it == slots.end()) // no slots left
+         break;
+      std::list<SpillSlot>::iterator bgn = it;
+
+      while (it != slots.end() && it->offset < entryEnd) {
+         if (it->occup.overlaps(livei))
+            break;
+         ++it;
+      }
+      if (it == slots.end() || it->offset >= entryEnd) {
+         // fits: mark all covered sub-slots as occupied over 'livei'
+         for (; bgn != slots.end() && bgn->offset < entryEnd; ++bgn) {
+            bgn->occup.insert(livei);
+            if (bgn->size() == size)
+               slot.sym = bgn->sym;
+         }
+         break;
+      }
+   }
+   if (!slot.sym) {
+      // no reusable slot: allocate a new one at the end of the stack
+      stackSize = offset + size;
+      slot.offset = offset;
+      slot.sym = new_Symbol(func->getProgram(), FILE_MEMORY_LOCAL);
+      if (!func->stackPtr)
+         offset += func->tlsBase;
+      slot.sym->setAddress(NULL, offset);
+      slot.sym->reg.size = size;
+      slots.insert(pos, slot)->occup.insert(livei);
+   }
+   return slot.sym;
+}
+
+// Emit code saving 'lval' to its spill 'slot' right after its definition
+// 'defi'.  A local-memory slot gets an OP_STORE (src0 = address symbol,
+// src1 = value); otherwise the value is copied into the replacement
+// register via OP_CVT.
+void
+SpillCodeInserter::spill(Instruction *defi, Value *slot, LValue *lval)
+{
+   const DataType ty = typeOfSize(slot->reg.size);
+
+   Instruction *st;
+   if (slot->reg.file == FILE_MEMORY_LOCAL) {
+      st = new_Instruction(func, OP_STORE, ty);
+      st->setSrc(0, slot);
+      st->setSrc(1, lval);
+      lval->noSpill = 1; // the spilled value itself must not be spilled again
+   } else {
+      st = new_Instruction(func, OP_CVT, ty);
+      st->setDef(0, slot);
+      st->setSrc(0, lval);
+   }
+   defi->bb->insertAfter(defi, st);
+}
+
+// Reload a spilled value right before its use 'usei'.  Returns a fresh
+// clone of 'lval' defined by the inserted OP_LOAD (local-memory slot) or
+// OP_CVT (register slot).
+LValue *
+SpillCodeInserter::unspill(Instruction *usei, LValue *lval, Value *slot)
+{
+   const DataType ty = typeOfSize(slot->reg.size);
+
+   lval = cloneShallow(func, lval);
+
+   Instruction *ld;
+   if (slot->reg.file == FILE_MEMORY_LOCAL) {
+      lval->noSpill = 1; // don't spill the reload temporary again
+      ld = new_Instruction(func, OP_LOAD, ty);
+   } else {
+      ld = new_Instruction(func, OP_CVT, ty);
+   }
+   ld->setDef(0, lval);
+   ld->setSrc(0, slot);
+
+   usei->bb->insertBefore(usei, ld);
+   return lval;
+}
+
+// Insert spill stores and reloads for every (value, slot) pair collected by
+// selectRegisters().  A NULL slot means the value is "spilled" to a new GPR
+// instead of memory.  Per definition: rewrite all uses (reloading at most
+// once per consecutive-use run), then store after the def; PHI defs/uses
+// reference the slot value directly.
+bool
+SpillCodeInserter::run(const std::list<ValuePair>& lst)
+{
+   for (std::list<ValuePair>::const_iterator it = lst.begin(); it != lst.end();
+        ++it) {
+      LValue *lval = it->first->asLValue();
+      Symbol *mem = it->second ? it->second->asSym() : NULL;
+
+      for (Value::DefIterator d = lval->defs.begin(); d != lval->defs.end();
+           ++d) {
+         Value *slot = mem ?
+            static_cast<Value *>(mem) : new_LValue(func, FILE_GPR);
+         Value *tmp = NULL;
+         Instruction *last = NULL;
+
+         LValue *dval = (*d)->get()->asLValue();
+         Instruction *defi = (*d)->getInsn();
+
+         // handle uses first or they'll contain the spill stores
+         while (!dval->uses.empty()) {
+            ValueRef *u = dval->uses.front();
+            Instruction *usei = u->getInsn();
+            assert(usei);
+            if (usei->op == OP_PHI) {
+               // PHIs read the slot (register) value directly, no reload
+               tmp = (slot->reg.file == FILE_MEMORY_LOCAL) ? NULL : slot;
+               last = NULL;
+            } else
+            if (!last || usei != last->next) { // TODO: sort uses
+               tmp = unspill(usei, dval, slot);
+               last = usei;
+            }
+            u->set(tmp);
+         }
+
+         assert(defi);
+         if (defi->op == OP_PHI) {
+            // erase this def; step back so the loop's ++d lands correctly
+            d = lval->defs.erase(d);
+            --d;
+            if (slot->reg.file == FILE_MEMORY_LOCAL)
+               delete_Instruction(func->getProgram(), defi);
+            else
+               defi->setDef(0, slot);
+         } else {
+            spill(defi, slot, dval);
+         }
+      }
+
+   }
+
+   // TODO: We're not trying to reuse old slots in a potential next iteration.
+   // We have to update the slots' livei intervals to be able to do that.
+   stackBase = stackSize;
+   slots.clear();
+   return true;
+}
+
+// Allocate registers for all functions of the program, walking the call
+// graph depth-first and accumulating each function's thread-local storage
+// (spill) requirement into the program total.
+bool
+RegAlloc::exec()
+{
+   for (IteratorRef it = prog->calls.iteratorDFS(false);
+        !it->end(); it->next()) {
+      func = Function::get(reinterpret_cast<Graph::Node *>(it->get()));
+
+      func->tlsBase = prog->tlsSize;
+      if (!execFunc())
+         return false;
+      prog->tlsSize += func->tlsSize;
+   }
+   return true;
+}
+
+// Run register allocation for a single function: insert constraints and
+// phi/argument moves, then iterate (live sets -> intervals -> coloring) up
+// to 3 times, letting spill code be inserted between failed attempts.
+bool
+RegAlloc::execFunc()
+{
+   InsertConstraintsPass insertConstr;
+   PhiMovesPass insertPhiMoves;
+   ArgumentMovesPass insertArgMoves;
+   BuildIntervalsPass buildIntervals;
+   SpillCodeInserter insertSpills(func);
+
+   GCRA gcra(func, insertSpills);
+
+   unsigned int i, retries;
+   bool ret;
+
+   if (!func->ins.empty()) {
+      // Insert a nop at the entry so inputs only used by the first instruction
+      // don't count as having an empty live range.
+      Instruction *nop = new_Instruction(func, OP_NOP, TYPE_NONE);
+      BasicBlock::get(func->cfg.getRoot())->insertHead(nop);
+   }
+
+   ret = insertConstr.exec(func);
+   if (!ret)
+      goto out;
+
+   ret = insertPhiMoves.run(func);
+   if (!ret)
+      goto out;
+
+   ret = insertArgMoves.run(func);
+   if (!ret)
+      goto out;
+
+   // TODO: need to fix up spill slot usage ranges to support > 1 retry
+   for (retries = 0; retries < 3; ++retries) {
+      if (retries && (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC))
+         INFO("Retry: %i\n", retries);
+      if (prog->dbgFlags & NV50_IR_DEBUG_REG_ALLOC)
+         func->print();
+
+      // spilling to registers may add live ranges, need to rebuild everything
+      ret = true;
+      // iterate to fixed point; loopNestingBound bounds the passes needed
+      for (sequence = func->cfg.nextSequence(), i = 0;
+           ret && i <= func->loopNestingBound;
+           sequence = func->cfg.nextSequence(), ++i)
+         ret = buildLiveSets(BasicBlock::get(func->cfg.getRoot()));
+      if (!ret)
+         break;
+      func->orderInstructions(this->insns);
+
+      ret = buildIntervals.run(func);
+      if (!ret)
+         break;
+      ret = gcra.allocateRegisters(insns);
+      if (ret)
+         break; // success
+   }
+   INFO_DBG(prog->dbgFlags, REG_ALLOC, "RegAlloc done: %i\n", ret);
+
+   func->tlsSize = insertSpills.getStackSize();
+out:
+   return ret;
+}
+
+// TODO: check if modifying Instruction::join here breaks anything
+//
+// After successful coloring, propagate register assignments through SPLIT
+// and MERGE pseudo-ops: each def of a split / src of a merge receives
+// consecutive byte offsets within the compound register.
+void
+GCRA::resolveSplitsAndMerges()
+{
+   for (std::list<Instruction *>::iterator it = splits.begin();
+        it != splits.end();
+        ++it) {
+      Instruction *split = *it;
+      unsigned int reg = regs.idToBytes(split->getSrc(0));
+      for (int d = 0; split->defExists(d); ++d) {
+         Value *v = split->getDef(d);
+         v->reg.data.id = regs.bytesToId(v, reg);
+         v->join = v;
+         reg += v->reg.size;
+      }
+   }
+   splits.clear();
+
+   for (std::list<Instruction *>::iterator it = merges.begin();
+        it != merges.end();
+        ++it) {
+      Instruction *merge = *it;
+      unsigned int reg = regs.idToBytes(merge->getDef(0));
+      for (int s = 0; merge->srcExists(s); ++s) {
+         Value *v = merge->getSrc(s);
+         v->reg.data.id = regs.bytesToId(v, reg);
+         v->join = v;
+         reg += v->reg.size;
+      }
+   }
+   merges.clear();
+}
+
+// Entry point: run register allocation over the whole program.
+bool Program::registerAllocation()
+{
+   RegAlloc ra(this);
+   return ra.exec();
+}
+
+// Collect register-placement constraints over the whole function (via
+// visit()), then insert the moves needed to make conflicting constraints
+// satisfiable.
+bool
+RegAlloc::InsertConstraintsPass::exec(Function *ir)
+{
+   constrList.clear();
+
+   bool ret = run(ir, true, true);
+   if (ret)
+      ret = insertConstraintMoves();
+   return ret;
+}
+
+// TODO: make part of texture insn
+//
+// Shrink a texture instruction's write mask to the components whose results
+// are actually used, and compact the remaining defs to the front.
+void
+RegAlloc::InsertConstraintsPass::textureMask(TexInstruction *tex)
+{
+   Value *def[4];
+   int c, k, d;
+   uint8_t mask = 0;
+
+   // k indexes existing defs, c the logical components, d the kept defs
+   for (d = 0, k = 0, c = 0; c < 4; ++c) {
+      if (!(tex->tex.mask & (1 << c)))
+         continue;
+      if (tex->getDef(k)->refCount()) {
+         mask |= 1 << c;
+         def[d++] = tex->getDef(k);
+      }
+      ++k;
+   }
+   tex->tex.mask = mask;
+
+   for (c = 0; c < d; ++c)
+      tex->setDef(c, def[c]);
+   for (; c < 4; ++c)
+      tex->setDef(c, NULL);
+}
+
+// Return true if source 's' of constraint op 'cst' cannot safely share its
+// value: the value is used by another instruction, appears more than once
+// in 'cst', or its defining instruction already has constrained defs.
+bool
+RegAlloc::InsertConstraintsPass::detectConflict(Instruction *cst, int s)
+{
+   Value *v = cst->getSrc(s);
+
+   // current register allocation can't handle it if a value participates in
+   // multiple constraints
+   for (Value::UseIterator it = v->uses.begin(); it != v->uses.end(); ++it) {
+      if (cst != (*it)->getInsn())
+         return true;
+   }
+
+   // can start at s + 1 because detectConflict is called on all sources
+   for (int c = s + 1; cst->srcExists(c); ++c)
+      if (v == cst->getSrc(c))
+         return true;
+
+   Instruction *defi = v->getInsn();
+
+   return (!defi || defi->constrainedDefs());
+}
+
+// Route sources [s, s+n) of 'i' through an OP_CONSTRAINT pseudo-op so the
+// allocator can place them in consecutive registers.  Reuses an existing,
+// dominating constraint op with identical sources when possible.
+void
+RegAlloc::InsertConstraintsPass::addConstraint(Instruction *i, int s, int n)
+{
+   Instruction *cst;
+   int d;
+
+   // first, look for an existing identical constraint op
+   for (std::list<Instruction *>::iterator it = constrList.begin();
+        it != constrList.end();
+        ++it) {
+      cst = (*it);
+      if (!i->bb->dominatedBy(cst->bb))
+         break;
+      for (d = 0; d < n; ++d)
+         if (cst->getSrc(d) != i->getSrc(d + s))
+            break;
+      if (d >= n) {
+         // full match: just reuse its defs
+         for (d = 0; d < n; ++d, ++s)
+            i->setSrc(s, cst->getDef(d));
+         return;
+      }
+   }
+   cst = new_Instruction(func, OP_CONSTRAINT, i->dType);
+
+   for (d = 0; d < n; ++s, ++d) {
+      cst->setDef(d, new_LValue(func, FILE_GPR));
+      cst->setSrc(d, i->getSrc(s));
+      i->setSrc(s, cst->getDef(d));
+   }
+   i->bb->insertBefore(i, cst);
+
+   constrList.push_back(cst);
+}
+
+// Add a dummy use of the pointer source of >= 8 byte loads after the load
+// to prevent it from being assigned a register which overlaps the load's
+// destination, which would produce random corruptions.
+void
+RegAlloc::InsertConstraintsPass::addHazard(Instruction *i, const ValueRef *src)
+{
+   Instruction *hzd = new_Instruction(func, OP_NOP, TYPE_NONE);
+   hzd->setSrc(0, src->get());
+   i->bb->insertAfter(i, hzd);
+
+}
+
+// b32 { %r0 %r1 %r2 %r3 } -> b128 %r0q
+//
+// Fuse an instruction's consecutive GPR defs into a single wide value which
+// an OP_SPLIT (inserted after the instruction) breaks back apart, so the
+// allocator only has to place one register tuple.
+void
+RegAlloc::InsertConstraintsPass::condenseDefs(Instruction *insn)
+{
+   uint8_t size = 0;
+   int n;
+   for (n = 0; insn->defExists(n) && insn->def(n).getFile() == FILE_GPR; ++n)
+      size += insn->getDef(n)->reg.size;
+   if (n < 2) // nothing to condense
+      return;
+   LValue *lval = new_LValue(func, FILE_GPR);
+   lval->reg.size = size;
+
+   Instruction *split = new_Instruction(func, OP_SPLIT, typeOfSize(size));
+   split->setSrc(0, lval);
+   for (int d = 0; d < n; ++d) {
+      split->setDef(d, insn->getDef(d));
+      insn->setDef(d, NULL);
+   }
+   insn->setDef(0, lval);
+
+   // shift any remaining (non-GPR) defs down behind the new wide def
+   for (int k = 1, d = n; insn->defExists(d); ++d, ++k) {
+      insn->setDef(k, insn->getDef(d));
+      insn->setDef(d, NULL);
+   }
+   // carry over predicate if any (mainly for OP_UNION uses)
+   split->setPredicate(insn->cc, insn->getPredicate());
+
+   insn->bb->insertAfter(insn, split);
+   constrList.push_back(split);
+}
+// Counterpart of condenseDefs for sources: fuse sources [a, b] into one
+// wide value produced by an OP_MERGE inserted before the instruction.
+void
+RegAlloc::InsertConstraintsPass::condenseSrcs(Instruction *insn,
+                                              const int a, const int b)
+{
+   uint8_t size = 0;
+   if (a >= b)
+      return;
+   for (int s = a; s <= b; ++s)
+      size += insn->getSrc(s)->reg.size;
+   if (!size)
+      return;
+   LValue *lval = new_LValue(func, FILE_GPR);
+   lval->reg.size = size;
+
+   // temporarily detach predicate/indirect sources so indices stay stable
+   Value *save[3];
+   insn->takeExtraSources(0, save);
+
+   Instruction *merge = new_Instruction(func, OP_MERGE, typeOfSize(size));
+   merge->setDef(0, lval);
+   for (int s = a, i = 0; s <= b; ++s, ++i) {
+      merge->setSrc(i, insn->getSrc(s));
+      insn->setSrc(s, NULL);
+   }
+   insn->setSrc(a, lval);
+
+   // close the gap left by the merged sources
+   for (int k = a + 1, s = b + 1; insn->srcExists(s); ++s, ++k) {
+      insn->setSrc(k, insn->getSrc(s));
+      insn->setSrc(s, NULL);
+   }
+   insn->bb->insertBefore(insn, merge);
+
+   insn->putExtraSources(0, save);
+
+   constrList.push_back(merge);
+}
+
+// Register constraints for texture/surface ops on NVE0 (Kepler): defs are
+// condensed into one tuple, sources into at most two tuples of up to 4
+// registers each.
+void
+RegAlloc::InsertConstraintsPass::texConstraintNVE0(TexInstruction *tex)
+{
+   if (isTextureOp(tex->op))
+      textureMask(tex);
+   condenseDefs(tex);
+
+   if (tex->op == OP_SUSTB || tex->op == OP_SUSTP) {
+      // surface stores: condense the data operand starting at src 3
+      condenseSrcs(tex, 3, (3 + typeSizeof(tex->dType) / 4) - 1);
+   } else
+   if (isTextureOp(tex->op)) {
+      int n = tex->srcCount(0xff, true);
+      if (n > 4) {
+         condenseSrcs(tex, 0, 3);
+         if (n > 5) // NOTE: first call modified positions already
+            condenseSrcs(tex, 4 - (4 - 1), n - 1 - (4 - 1));
+      } else
+      if (n > 1) {
+         condenseSrcs(tex, 0, n - 1);
+      }
+   }
+}
+
+// Register constraints for texture ops on NVC0 (Fermi): the first 's'
+// sources (coordinates/indices) form one tuple, the remaining 'n' extra
+// arguments a second one, and the defs are condensed as well.
+void
+RegAlloc::InsertConstraintsPass::texConstraintNVC0(TexInstruction *tex)
+{
+   int n, s;
+
+   textureMask(tex);
+
+   if (tex->op == OP_TXQ) {
+      s = tex->srcCount(0xff);
+      n = 0;
+   } else {
+      s = tex->tex.target.getArgCount();
+      // layer-index / indirect handles occupy an extra coordinate slot
+      if (!tex->tex.target.isArray() &&
+          (tex->tex.rIndirectSrc >= 0 || tex->tex.sIndirectSrc >= 0))
+         ++s;
+      if (tex->op == OP_TXD && tex->tex.useOffsets)
+         ++s;
+      n = tex->srcCount(0xff) - s;
+      assert(n <= 4);
+   }
+
+   if (s > 1)
+      condenseSrcs(tex, 0, s - 1);
+   if (n > 1) // NOTE: first call modified positions already
+      condenseSrcs(tex, 1, n);
+
+   condenseDefs(tex);
+}
+
+// Register constraints for texture ops on NV50: sources and defs must form
+// equally sized tuples (quads), so dummy values are added to pad the
+// shorter side before condensing.  The predicate is detached while padding
+// so the new dummy values don't get predicated defs/uses.
+void
+RegAlloc::InsertConstraintsPass::texConstraintNV50(TexInstruction *tex)
+{
+   Value *pred = tex->getPredicate();
+   if (pred)
+      tex->setPredicate(tex->cc, NULL);
+
+   textureMask(tex);
+
+   assert(tex->defExists(0) && tex->srcExists(0));
+   // make src and def count match
+   int c;
+   for (c = 0; tex->srcExists(c) || tex->defExists(c); ++c) {
+      if (!tex->srcExists(c))
+         tex->setSrc(c, new_LValue(func, tex->getSrc(0)->asLValue()));
+      if (!tex->defExists(c))
+         tex->setDef(c, new_LValue(func, tex->getDef(0)->asLValue()));
+   }
+   if (pred)
+      tex->setPredicate(tex->cc, pred);
+   condenseDefs(tex);
+   condenseSrcs(tex, 0, c - 1);
+}
+
+// Insert constraint markers for instructions whose multiple sources must be
+// located in consecutive registers.
+bool
+RegAlloc::InsertConstraintsPass::visit(BasicBlock *bb)
+{
+   TexInstruction *tex;
+   Instruction *next;
+   int s, size;
+
+   targ = bb->getProgram()->getTarget();
+
+   // iterate via 'next' since constraint insertion adds instructions
+   for (Instruction *i = bb->getEntry(); i; i = next) {
+      next = i->next;
+
+      if ((tex = i->asTex())) {
+         // texture constraints differ per hardware generation
+         switch (targ->getChipset() & ~0xf) {
+         case 0x50:
+         case 0x80:
+         case 0x90:
+         case 0xa0:
+            texConstraintNV50(tex);
+            break;
+         case 0xc0:
+         case 0xd0:
+            texConstraintNVC0(tex);
+            break;
+         case 0xe0:
+         case NVISA_GK110_CHIPSET:
+            texConstraintNVE0(tex);
+            break;
+         default:
+            break;
+         }
+      } else
+      if (i->op == OP_EXPORT || i->op == OP_STORE) {
+         // stored data (sources from 1 on) must be contiguous
+         for (size = typeSizeof(i->dType), s = 1; size > 0; ++s) {
+            assert(i->srcExists(s));
+            size -= i->getSrc(s)->reg.size;
+         }
+         condenseSrcs(i, 1, s - 1);
+      } else
+      if (i->op == OP_LOAD || i->op == OP_VFETCH) {
+         condenseDefs(i);
+         if (i->src(0).isIndirect(0) && typeSizeof(i->dType) >= 8)
+            addHazard(i, i->src(0).getIndirect(0));
+      } else
+      if (i->op == OP_UNION ||
+          i->op == OP_MERGE ||
+          i->op == OP_SPLIT) {
+         constrList.push_back(i);
+      }
+   }
+   return true;
+}
+
+// Insert extra moves so that, if multiple register constraints on a value are
+// in conflict, these conflicts can be resolved.
+bool
+RegAlloc::InsertConstraintsPass::insertConstraintMoves()
+{
+   for (std::list<Instruction *>::iterator it = constrList.begin();
+        it != constrList.end();
+        ++it) {
+      Instruction *cst = *it;
+      Instruction *mov;
+
+      // NOTE: the '&& 0' deliberately disables this branch; splits are
+      // handled without extra moves for now
+      if (cst->op == OP_SPLIT && 0) {
+         // spilling splits is annoying, just make sure they're separate
+         for (int d = 0; cst->defExists(d); ++d) {
+            if (!cst->getDef(d)->refCount())
+               continue;
+            LValue *lval = new_LValue(func, cst->def(d).getFile());
+            const uint8_t size = cst->def(d).getSize();
+            lval->reg.size = size;
+
+            mov = new_Instruction(func, OP_MOV, typeOfSize(size));
+            mov->setSrc(0, lval);
+            mov->setDef(0, cst->getDef(d));
+            cst->setDef(d, mov->getSrc(0));
+            cst->bb->insertAfter(cst, mov);
+
+            cst->getSrc(0)->asLValue()->noSpill = 1;
+            mov->getSrc(0)->asLValue()->noSpill = 1;
+         }
+      } else
+      if (cst->op == OP_MERGE || cst->op == OP_UNION) {
+         for (int s = 0; cst->srcExists(s); ++s) {
+            const uint8_t size = cst->src(s).getSize();
+
+            // undefined source: give it a definition via a dummy NOP
+            if (!cst->getSrc(s)->defs.size()) {
+               mov = new_Instruction(func, OP_NOP, typeOfSize(size));
+               mov->setDef(0, cst->getSrc(s));
+               cst->bb->insertBefore(cst, mov);
+               continue;
+            }
+            assert(cst->getSrc(s)->defs.size() == 1); // still SSA
+
+            Instruction *defi = cst->getSrc(s)->defs.front()->getInsn();
+            // catch some cases where don't really need MOVs
+            if (cst->getSrc(s)->refCount() == 1 && !defi->constrainedDefs())
+               continue;
+
+            LValue *lval = new_LValue(func, cst->src(s).getFile());
+            lval->reg.size = size;
+
+            mov = new_Instruction(func, OP_MOV, typeOfSize(size));
+            mov->setDef(0, lval);
+            mov->setSrc(0, cst->getSrc(s));
+            cst->setSrc(s, mov->getDef(0));
+            cst->bb->insertBefore(cst, mov);
+
+            cst->getDef(0)->asLValue()->noSpill = 1; // doesn't help
+
+            if (cst->op == OP_UNION)
+               mov->setPredicate(defi->cc, defi->getPredicate());
+         }
+      }
+   }
+
+   return true;
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
new file mode 100644
index 0000000..2e43234
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ssa.cpp
@@ -0,0 +1,552 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+// Converts nv50 IR generated from TGSI to SSA form.
+
+// DominatorTree implements an algorithm for finding immediate dominators,
+// as described by T. Lengauer & R. Tarjan.
+class DominatorTree : public Graph
+{
+public:
+   DominatorTree(Graph *cfg);
+   ~DominatorTree() { }
+
+   bool dominates(BasicBlock *, BasicBlock *);
+
+   void findDominanceFrontiers();
+
+private:
+   void build();
+   void buildDFS(Node *);
+
+   void squash(int);
+   inline void link(int, int);
+   inline int eval(int);
+
+   void debugPrint();
+
+   Graph *cfg;
+
+   // vert[i] is the CFG node with DFS number i; 'data' packs five
+   // per-node integer arrays of the Lengauer-Tarjan algorithm, accessed
+   // through the macros below.  Both are only alive during construction.
+   Node **vert;
+   int *data;
+   const int count;
+
+   #define SEMI(i)     (data[(i) + 0 * count])
+   #define ANCESTOR(i) (data[(i) + 1 * count])
+   #define PARENT(i)   (data[(i) + 2 * count])
+   #define LABEL(i)    (data[(i) + 3 * count])
+   #define DOM(i)      (data[(i) + 4 * count])
+};
+
+// Dump the Lengauer-Tarjan working arrays for all nodes (debugging aid).
+void DominatorTree::debugPrint()
+{
+   for (int i = 0; i < count; ++i) {
+      INFO("SEMI(%i) = %i\n", i, SEMI(i));
+      INFO("ANCESTOR(%i) = %i\n", i, ANCESTOR(i));
+      INFO("PARENT(%i) = %i\n", i, PARENT(i));
+      INFO("LABEL(%i) = %i\n", i, LABEL(i));
+      INFO("DOM(%i) = %i\n", i, DOM(i));
+   }
+}
+
+// Number the CFG nodes in DFS order, run the dominator computation, and
+// free the working arrays again — the resulting tree lives in the
+// BasicBlocks' 'dom' graph nodes.
+DominatorTree::DominatorTree(Graph *cfgraph) : cfg(cfgraph),
+                                               count(cfg->getSize())
+{
+   int i = 0;
+
+   vert = new Node * [count];
+   data = new int[5 * count];
+
+   for (IteratorRef it = cfg->iteratorDFS(true); !it->end(); it->next(), ++i) {
+      vert[i] = reinterpret_cast<Node *>(it->get());
+      vert[i]->tag = i; // node->tag holds the DFS number from here on
+      LABEL(i) = i;
+      SEMI(i) = ANCESTOR(i) = -1;
+   }
+
+   build();
+
+   delete[] vert;
+   delete[] data;
+}
+
+// Record the DFS spanning tree: initialize SEMI to the node's own DFS
+// number (also serving as the "visited" flag) and PARENT to its tree parent.
+void DominatorTree::buildDFS(Graph::Node *node)
+{
+   SEMI(node->tag) = node->tag;
+
+   for (Graph::EdgeIterator ei = node->outgoing(); !ei.end(); ei.next()) {
+      if (SEMI(ei.getNode()->tag) < 0) {
+         buildDFS(ei.getNode());
+         PARENT(ei.getNode()->tag) = node->tag;
+      }
+   }
+}
+
+// Path compression on the forest maintained by link/eval: after this, v's
+// ancestor chain is flattened and LABEL(v) holds the chain vertex with
+// minimal semidominator.
+void DominatorTree::squash(int v)
+{
+   if (ANCESTOR(ANCESTOR(v)) >= 0) {
+      squash(ANCESTOR(v));
+
+      if (SEMI(LABEL(ANCESTOR(v))) < SEMI(LABEL(v)))
+         LABEL(v) = LABEL(ANCESTOR(v));
+      ANCESTOR(v) = ANCESTOR(ANCESTOR(v));
+   }
+}
+
+// EVAL of Lengauer-Tarjan: return v itself if it is a forest root,
+// otherwise the vertex with minimal semidominator on the path to the root.
+int DominatorTree::eval(int v)
+{
+   if (ANCESTOR(v) < 0)
+      return v;
+   squash(v);
+   return LABEL(v);
+}
+
+// LINK of Lengauer-Tarjan: make v the forest ancestor of w.
+void DominatorTree::link(int v, int w)
+{
+   ANCESTOR(w) = v;
+}
+
+// Core of the Lengauer-Tarjan algorithm: compute semidominators in reverse
+// DFS order, derive immediate dominators (DOM), then materialize the tree
+// by attaching each block's 'dom' node to its idom's.  The final loop
+// iterates because vert[] order does not guarantee a parent is attached
+// before its children.
+void DominatorTree::build()
+{
+   DLList *bucket = new DLList[count];
+   Node *nv, *nw;
+   int p, u, v, w;
+
+   buildDFS(cfg->getRoot());
+
+   for (w = count - 1; w >= 1; --w) {
+      nw = vert[w];
+      assert(nw->tag == w);
+      // semidominator of w = min over predecessors v of SEMI(EVAL(v))
+      for (Graph::EdgeIterator ei = nw->incident(); !ei.end(); ei.next()) {
+         nv = ei.getNode();
+         v = nv->tag;
+         u = eval(v);
+         if (SEMI(u) < SEMI(w))
+            SEMI(w) = SEMI(u);
+      }
+      p = PARENT(w);
+      bucket[SEMI(w)].insert(nw);
+      link(p, w);
+
+      // implicitly compute idoms for vertices whose semidominator is p
+      for (DLList::Iterator it = bucket[p].iterator(); !it.end(); it.erase()) {
+         v = reinterpret_cast<Node *>(it.get())->tag;
+         u = eval(v);
+         DOM(v) = (SEMI(u) < SEMI(v)) ? u : p;
+      }
+   }
+   // final pass turns deferred answers into immediate dominators
+   for (w = 1; w < count; ++w) {
+      if (DOM(w) != SEMI(w))
+         DOM(w) = DOM(DOM(w));
+   }
+   DOM(0) = 0;
+
+   insert(&BasicBlock::get(cfg->getRoot())->dom);
+   do {
+      p = 0;
+      for (v = 1; v < count; ++v) {
+         nw = &BasicBlock::get(vert[DOM(v)])->dom;
+         nv = &BasicBlock::get(vert[v])->dom;
+         if (nw->getGraph() && !nv->getGraph()) {
+            ++p;
+            nw->attach(nv, Graph::Edge::TREE);
+         }
+      }
+   } while (p);
+
+   delete[] bucket;
+}
+
+#undef SEMI
+#undef ANCESTOR
+#undef PARENT
+#undef LABEL
+#undef DOM
+
+// Compute each block's dominance frontier (Cytron et al.): CFG successors
+// not immediately dominated by the block (DF-local), plus frontier members
+// of dominator-tree children not dominated by the block either (DF-up).
+// Relies on DFS order so children are processed before their parents.
+void DominatorTree::findDominanceFrontiers()
+{
+   BasicBlock *bb;
+
+   for (IteratorRef dtIt = iteratorDFS(false); !dtIt->end(); dtIt->next()) {
+      EdgeIterator succIt, chldIt;
+
+      bb = BasicBlock::get(reinterpret_cast<Node *>(dtIt->get()));
+      bb->getDF().clear();
+
+      // DF-local: CFG successors we don't immediately dominate
+      for (succIt = bb->cfg.outgoing(); !succIt.end(); succIt.next()) {
+         BasicBlock *dfLocal = BasicBlock::get(succIt.getNode());
+         if (dfLocal->idom() != bb)
+            bb->getDF().insert(dfLocal);
+      }
+
+      // DF-up: propagate from dominator-tree children
+      for (chldIt = bb->dom.outgoing(); !chldIt.end(); chldIt.next()) {
+         BasicBlock *cb = BasicBlock::get(chldIt.getNode());
+
+         DLList::Iterator dfIt = cb->getDF().iterator();
+         for (; !dfIt.end(); dfIt.next()) {
+            BasicBlock *dfUp = BasicBlock::get(dfIt);
+            if (dfUp->idom() != bb)
+               bb->getDF().insert(dfUp);
+         }
+      }
+   }
+}
+
+// liveIn(bb) = usedBeforeAssigned(bb) U (liveOut(bb) - assigned(bb))
+//
+// Backward dataflow over the CFG (recursing into unvisited successors
+// first); on return, bb->liveSet holds the block's live-in set.  Function
+// outputs count as used at the exit block.
+void
+Function::buildLiveSetsPreSSA(BasicBlock *bb, const int seq)
+{
+   Function *f = bb->getFunction();
+   BitSet usedBeforeAssigned(allLValues.getSize(), true);
+   BitSet assigned(allLValues.getSize(), true);
+
+   bb->liveSet.allocate(allLValues.getSize(), false);
+
+   // liveOut = union of the successors' liveIn sets
+   int n = 0;
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      BasicBlock *out = BasicBlock::get(ei.getNode());
+      if (out == bb)
+         continue;
+      if (out->cfg.visit(seq))
+         buildLiveSetsPreSSA(out, seq);
+      if (!n++)
+         bb->liveSet = out->liveSet;
+      else
+         bb->liveSet |= out->liveSet;
+   }
+   if (!n && !bb->liveSet.marker)
+      bb->liveSet.fill(0);
+   bb->liveSet.marker = true;
+
+   for (Instruction *i = bb->getEntry(); i; i = i->next) {
+      for (int s = 0; i->srcExists(s); ++s)
+         if (i->getSrc(s)->asLValue() && !assigned.test(i->getSrc(s)->id))
+            usedBeforeAssigned.set(i->getSrc(s)->id);
+      for (int d = 0; i->defExists(d); ++d)
+         assigned.set(i->getDef(d)->id);
+   }
+
+   // function outputs are implicitly used at the exit block
+   if (bb == BasicBlock::get(f->cfgExit)) {
+      for (std::deque<ValueRef>::iterator it = f->outs.begin();
+           it != f->outs.end(); ++it) {
+         if (!assigned.test(it->get()->id))
+            usedBeforeAssigned.set(it->get()->id);
+      }
+   }
+
+   bb->liveSet.andNot(assigned);
+   bb->liveSet |= usedBeforeAssigned;
+}
+
+// Forward dataflow: bb->defSet accumulates all values defined in bb or any
+// of its (transitive) CFG predecessors.  liveSet.marker doubles as the
+// "defSet already allocated" flag here.
+void
+Function::buildDefSetsPreSSA(BasicBlock *bb, const int seq)
+{
+   bb->defSet.allocate(allLValues.getSize(), !bb->liveSet.marker);
+   bb->liveSet.marker = true;
+
+   for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
+      BasicBlock *in = BasicBlock::get(ei.getNode());
+
+      if (in->cfg.visit(seq))
+         buildDefSetsPreSSA(in, seq);
+
+      bb->defSet |= in->defSet;
+   }
+
+   for (Instruction *i = bb->getEntry(); i; i = i->next) {
+      for (int d = 0; i->defExists(d); ++d)
+         bb->defSet.set(i->getDef(d)->id);
+   }
+}
+
+// Renaming phase of SSA construction: walks the dominator tree giving every
+// definition a fresh LValue and rewriting sources to the innermost
+// dominating definition, tracked with one value stack per pre-SSA value.
+class RenamePass
+{
+public:
+   RenamePass(Function *);
+   ~RenamePass();
+
+   bool run();
+   void search(BasicBlock *);
+
+   inline LValue *getStackTop(Value *);
+
+   LValue *mkUndefined(Value *);
+
+private:
+   Stack *stack;    // per pre-SSA value id: stack of current SSA values
+   Function *func;
+   Program *prog;
+};
+
+// Convert every function of the program to SSA form.
+bool
+Program::convertToSSA()
+{
+   for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) {
+      Function *fn = reinterpret_cast<Function *>(fi.get());
+      if (!fn->convertToSSA())
+         return false;
+   }
+   return true;
+}
+
+// XXX: add edge from entry to exit ?
+
+// Efficiently Computing Static Single Assignment Form and
+// the Control Dependence Graph,
+// R. Cytron, J. Ferrante, B. K. Rosen, M. N. Wegman, F. K. Zadeck
+//
+// Build the dominator tree, place (pruned) PHI functions on the dominance
+// frontiers of every value's definition blocks, then rename all values.
+bool
+Function::convertToSSA()
+{
+   // 0. calculate live in variables (for pruned SSA)
+   buildLiveSets();
+
+   // 1. create the dominator tree
+   domTree = new DominatorTree(&cfg);
+   reinterpret_cast<DominatorTree *>(domTree)->findDominanceFrontiers();
+
+   // 2. insert PHI functions
+   DLList workList;
+   LValue *lval;
+   BasicBlock *bb;
+   int var;
+   int iterCount = 0;
+   // hasAlready/work track, per BB, the iterCount of the last variable for
+   // which a phi was placed / the BB was put on the work list, avoiding a
+   // per-variable reset of the arrays
+   int *hasAlready = new int[allBBlocks.getSize() * 2];
+   int *work = &hasAlready[allBBlocks.getSize()];
+
+   memset(hasAlready, 0, allBBlocks.getSize() * 2 * sizeof(int));
+
+   // for each variable
+   for (var = 0; var < allLValues.getSize(); ++var) {
+      if (!allLValues.get(var))
+         continue;
+      lval = reinterpret_cast<Value *>(allLValues.get(var))->asLValue();
+      if (!lval || lval->defs.empty())
+         continue;
+      ++iterCount;
+
+      // TODO: don't add phi functions for values that aren't used outside
+      //  the BB they're defined in
+
+      // gather blocks with assignments to lval in workList
+      for (Value::DefIterator d = lval->defs.begin();
+           d != lval->defs.end(); ++d) {
+         bb = ((*d)->getInsn() ? (*d)->getInsn()->bb : NULL);
+         if (!bb)
+            continue; // instruction likely been removed but not XXX deleted
+
+         if (work[bb->getId()] == iterCount)
+            continue;
+         work[bb->getId()] = iterCount;
+         workList.insert(bb);
+      }
+
+      // for each block in workList, insert a phi for lval in the block's
+      // dominance frontier (if we haven't already done so)
+      for (DLList::Iterator wI = workList.iterator(); !wI.end(); wI.erase()) {
+         bb = BasicBlock::get(wI);
+
+         DLList::Iterator dfIter = bb->getDF().iterator();
+         for (; !dfIter.end(); dfIter.next()) {
+            Instruction *phi;
+            BasicBlock *dfBB = BasicBlock::get(dfIter);
+
+            if (hasAlready[dfBB->getId()] >= iterCount)
+               continue;
+            hasAlready[dfBB->getId()] = iterCount;
+
+            // pruned SSA: don't need a phi if the value is not live-in
+            if (!dfBB->liveSet.test(lval->id))
+               continue;
+
+            phi = new_Instruction(this, OP_PHI, typeOfSize(lval->reg.size));
+            dfBB->insertTail(phi);
+
+            // sources are renamed later when the predecessors are visited
+            phi->setDef(0, lval);
+            for (int s = 0; s < dfBB->cfg.incidentCount(); ++s)
+               phi->setSrc(s, lval);
+
+            // the phi is itself a new definition of lval
+            if (work[dfBB->getId()] < iterCount) {
+               work[dfBB->getId()] = iterCount;
+               wI.insert(dfBB);
+            }
+         }
+      }
+   }
+   delete[] hasAlready;
+
+   RenamePass rename(this);
+   return rename.run();
+}
+
+// Allocate one renaming stack per pre-SSA value.
+RenamePass::RenamePass(Function *fn) : func(fn), prog(fn->getProgram())
+{
+   stack = new Stack[func->allLValues.getSize()];
+}
+
+// NOTE: the NULL check is redundant (delete[] on NULL is a no-op), kept
+// byte-identical here.
+RenamePass::~RenamePass()
+{
+   if (stack)
+      delete[] stack;
+}
+
+// Current SSA value for pre-SSA value 'val', or NULL if it has no reaching
+// definition on the current dominator-tree path.
+LValue *
+RenamePass::getStackTop(Value *val)
+{
+   if (!stack[val->id].getSize())
+      return 0;
+   return reinterpret_cast<LValue *>(stack[val->id].peek().u.p);
+}
+
+// Create a defined-but-undefined replacement for a use without any reaching
+// definition: a fresh LValue defined by an OP_NOP at the function entry.
+LValue *
+RenamePass::mkUndefined(Value *val)
+{
+   LValue *lval = val->asLValue();
+   assert(lval);
+   LValue *ud = new_LValue(func, lval);
+   Instruction *nop = new_Instruction(func, OP_NOP, typeOfSize(lval->reg.size));
+   nop->setDef(0, ud);
+   BasicBlock::get(func->cfg.getRoot())->insertHead(nop);
+   return ud;
+}
+
+// Rename the whole function starting at the dominator-tree root.
+bool RenamePass::run()
+{
+   if (!stack)
+      return false;
+   search(BasicBlock::get(func->domTree->getRoot()));
+
+   return true;
+}
+
+// Go through BBs in dominance order, create new values for each definition,
+// and replace all sources with their current new values.
+//
+// NOTE: The values generated for function inputs/outputs have no connection
+// to their corresponding outputs/inputs in other functions. Only allocation
+// of physical registers will establish this connection.
+//
+void RenamePass::search(BasicBlock *bb)
+{
+   LValue *lval, *ssa;
+   int d, s;
+   const Target *targ = prog->getTarget();
+
+   // Put current definitions for function inputs values on the stack.
+   // They can be used before any redefinitions are pushed.
+   if (bb == BasicBlock::get(func->cfg.getRoot())) {
+      for (std::deque<ValueDef>::iterator it = func->ins.begin();
+           it != func->ins.end(); ++it) {
+         lval = it->get()->asLValue();
+         assert(lval);
+
+         ssa = new_LValue(func, targ->nativeFile(lval->reg.file));
+         ssa->reg.size = lval->reg.size;
+         ssa->reg.data.id = lval->reg.data.id;
+
+         it->setSSA(ssa);
+         stack[lval->id].push(ssa);
+      }
+   }
+
+   for (Instruction *stmt = bb->getFirst(); stmt; stmt = stmt->next) {
+      // PHI sources get definitions from the passes through the incident BBs,
+      // so skip them here.
+      if (stmt->op != OP_PHI) {
+         for (s = 0; stmt->srcExists(s); ++s) {
+            lval = stmt->getSrc(s)->asLValue();
+            if (!lval)
+               continue;
+            // Values on the stack created in previously visited blocks, and
+            // function inputs, will be valid because they dominate this one.
+            lval = getStackTop(lval);
+            if (!lval)
+               lval = mkUndefined(stmt->getSrc(s));
+            stmt->setSrc(s, lval);
+         }
+      }
+      // each def gets a fresh SSA value, pushed as the new current value
+      for (d = 0; stmt->defExists(d); ++d) {
+         lval = stmt->def(d).get()->asLValue();
+         assert(lval);
+         stmt->def(d).setSSA(
+            new_LValue(func, targ->nativeFile(lval->reg.file)));
+         stmt->def(d).get()->reg.size = lval->reg.size;
+         stmt->def(d).get()->reg.data.id = lval->reg.data.id;
+         stack[lval->id].push(stmt->def(d).get());
+      }
+   }
+
+   // Update sources of PHI ops corresponding to this BB in outgoing BBs.
+   for (Graph::EdgeIterator ei = bb->cfg.outgoing(); !ei.end(); ei.next()) {
+      Instruction *phi;
+      int p = 0;
+      BasicBlock *sb = BasicBlock::get(ei.getNode());
+
+      // which predecessor of sb is bb ?
+      for (Graph::EdgeIterator ei = sb->cfg.incident(); !ei.end(); ei.next()) {
+         if (ei.getNode() == &bb->cfg)
+            break;
+         ++p;
+      }
+      assert(p < sb->cfg.incidentCount());
+
+      for (phi = sb->getPhi(); phi && phi->op == OP_PHI; phi = phi->next) {
+         lval = getStackTop(phi->getSrc(p));
+         if (!lval)
+            lval = mkUndefined(phi->getSrc(p));
+         phi->setSrc(p, lval);
+      }
+   }
+
+   // Visit the BBs we dominate.
+   for (Graph::EdgeIterator ei = bb->dom.outgoing(); !ei.end(); ei.next())
+      search(BasicBlock::get(ei.getNode()));
+
+   // Update function outputs to the last definitions of their pre-SSA values.
+   // I hope they're unique, i.e. that we get PHIs for all of them ...
+   if (bb == BasicBlock::get(func->cfgExit)) {
+      for (std::deque<ValueRef>::iterator it = func->outs.begin();
+           it != func->outs.end(); ++it) {
+         lval = it->get()->asLValue();
+         if (!lval)
+            continue;
+         lval = getStackTop(lval);
+         if (!lval)
+            lval = mkUndefined(it->get());
+         it->set(lval);
+      }
+   }
+
+   // Pop the values we created in this block from the stack because we will
+   // return to blocks that we do not dominate.
+   for (Instruction *stmt = bb->getFirst(); stmt; stmt = stmt->next) {
+      if (stmt->op == OP_NOP)
+         continue;
+      for (d = 0; stmt->defExists(d); ++d)
+         stack[stmt->def(d).preSSA()->id].pop();
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
new file mode 100644
index 0000000..443acfc
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.cpp
@@ -0,0 +1,469 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir.h"
+#include "codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+// Number of sources each operation consumes, indexed by the operation enum.
+// Each row's trailing comment names the ops it covers; keep the table in
+// strict enum order or every entry after an insertion shifts.
+const uint8_t Target::operationSrcNr[OP_LAST + 1] =
+{
+   0, 0,                   // NOP, PHI
+   0, 0, 0, 0,             // UNION, SPLIT, MERGE, CONSTRAINT
+   1, 1, 2,                // MOV, LOAD, STORE
+   2, 2, 2, 2, 2, 3, 3, 3, // ADD, SUB, MUL, DIV, MOD, MAD, FMA, SAD
+   1, 1, 1,                // ABS, NEG, NOT
+   2, 2, 2, 2, 2,          // AND, OR, XOR, SHL, SHR
+   2, 2, 1,                // MAX, MIN, SAT
+   1, 1, 1, 1,             // CEIL, FLOOR, TRUNC, CVT
+   3, 3, 3, 2, 3, 3,       // SET_AND,OR,XOR, SET, SELP, SLCT
+   1, 1, 1, 1, 1, 1,       // RCP, RSQ, LG2, SIN, COS, EX2
+   1, 1, 1, 1, 1, 2,       // EXP, LOG, PRESIN, PREEX2, SQRT, POW
+   0, 0, 0, 0, 0,          // BRA, CALL, RET, CONT, BREAK,
+   0, 0, 0,                // PRERET,CONT,BREAK
+   0, 0, 0, 0, 0, 0,       // BRKPT, JOINAT, JOIN, DISCARD, EXIT, MEMBAR
+   1, 1, 2, 1, 2,          // VFETCH, PFETCH, EXPORT, LINTERP, PINTERP
+   1, 1,                   // EMIT, RESTART
+   1, 1, 1,                // TEX, TXB, TXL,
+   1, 1, 1, 1, 1, 2,       // TXF, TXQ, TXD, TXG, TEXCSAA, TEXPREP
+   1, 1, 2, 2, 2, 2, 2,    // SULDB, SULDP, SUSTB, SUSTP, SUREDB, SUREDP, SULEA
+   3, 3, 3, 3,             // SUBFM, SUCLAMP, SUEAU, MADSP
+   0,                      // TEXBAR
+   1, 1,                   // DFDX, DFDY
+   1, 2, 2, 0, 0,          // RDSV, WRSV, QUADOP, QUADON, QUADPOP
+   2, 3, 2, 3,             // POPCNT, INSBF, EXTBF, PERMT
+   2, 2,                   // ATOM, BAR
+   2, 2, 2, 2, 3, 2,       // VADD, VAVG, VMIN, VMAX, VSAD, VSET,
+   2, 2, 2, 1,             // VSHR, VSHL, VSEL, CCTL
+   0
+};
+
+// Coarse classification of each operation (issue unit / optimization class),
+// indexed by the operation enum.  Like operationSrcNr above, order must match
+// the enum exactly; the comments group the entries by opcode.
+const OpClass Target::operationClass[OP_LAST + 1] =
+{
+   // NOP; PHI; UNION, SPLIT, MERGE, CONSTRAINT
+   OPCLASS_OTHER,
+   OPCLASS_PSEUDO,
+   OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO, OPCLASS_PSEUDO,
+   // MOV; LOAD; STORE
+   OPCLASS_MOVE,
+   OPCLASS_LOAD,
+   OPCLASS_STORE,
+   // ADD, SUB, MUL; DIV, MOD; MAD, FMA, SAD
+   OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
+   OPCLASS_ARITH, OPCLASS_ARITH,
+   OPCLASS_ARITH, OPCLASS_ARITH, OPCLASS_ARITH,
+   // ABS, NEG; NOT, AND, OR, XOR; SHL, SHR
+   OPCLASS_CONVERT, OPCLASS_CONVERT,
+   OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC, OPCLASS_LOGIC,
+   OPCLASS_SHIFT, OPCLASS_SHIFT,
+   // MAX, MIN
+   OPCLASS_COMPARE, OPCLASS_COMPARE,
+   // SAT, CEIL, FLOOR, TRUNC; CVT
+   OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT, OPCLASS_CONVERT,
+   OPCLASS_CONVERT,
+   // SET(AND,OR,XOR); SELP, SLCT
+   OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE, OPCLASS_COMPARE,
+   OPCLASS_COMPARE, OPCLASS_COMPARE,
+   // RCP, RSQ, LG2, SIN, COS; EX2, EXP, LOG, PRESIN, PREEX2; SQRT, POW
+   OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU,
+   OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU, OPCLASS_SFU,
+   OPCLASS_SFU, OPCLASS_SFU,
+   // BRA, CALL, RET; CONT, BREAK, PRE(RET,CONT,BREAK); BRKPT, JOINAT, JOIN
+   OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
+   OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
+   OPCLASS_FLOW, OPCLASS_FLOW, OPCLASS_FLOW,
+   // DISCARD, EXIT
+   OPCLASS_FLOW, OPCLASS_FLOW,
+   // MEMBAR
+   OPCLASS_CONTROL,
+   // VFETCH, PFETCH, EXPORT
+   OPCLASS_LOAD, OPCLASS_OTHER, OPCLASS_STORE,
+   // LINTERP, PINTERP
+   OPCLASS_SFU, OPCLASS_SFU,
+   // EMIT, RESTART
+   OPCLASS_CONTROL, OPCLASS_CONTROL,
+   // TEX, TXB, TXL, TXF; TXQ, TXD, TXG, TEXCSAA; TEXPREP
+   OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
+   OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE, OPCLASS_TEXTURE,
+   OPCLASS_TEXTURE,
+   // SULDB, SULDP, SUSTB, SUSTP; SUREDB, SUREDP, SULEA
+   // NOTE(review): SUSTB is classed OPCLASS_ATOMIC while its neighbours are
+   // OPCLASS_SURFACE -- looks inconsistent, confirm against the emitters.
+   OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_ATOMIC, OPCLASS_SURFACE,
+   OPCLASS_SURFACE, OPCLASS_SURFACE, OPCLASS_SURFACE,
+   // SUBFM, SUCLAMP, SUEAU, MADSP
+   OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_ARITH,
+   // TEXBAR
+   OPCLASS_OTHER,
+   // DFDX, DFDY, RDSV, WRSV; QUADOP, QUADON, QUADPOP
+   OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER, OPCLASS_OTHER,
+   OPCLASS_OTHER, OPCLASS_CONTROL, OPCLASS_CONTROL,
+   // POPCNT, INSBF, EXTBF, PERMT
+   OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD, OPCLASS_BITFIELD,
+   // ATOM, BAR
+   OPCLASS_ATOMIC, OPCLASS_CONTROL,
+   // VADD, VAVG, VMIN, VMAX
+   OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR,
+   // VSAD, VSET, VSHR, VSHL
+   OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR, OPCLASS_VECTOR,
+   // VSEL, CCTL
+   OPCLASS_VECTOR, OPCLASS_CONTROL,
+   OPCLASS_PSEUDO // LAST
+};
+
+
+extern Target *getTargetNVC0(unsigned int chipset);
+extern Target *getTargetNV50(unsigned int chipset);
+
+// Instantiate the Target subclass matching @chipset: the NVC0 backend for
+// Fermi/Kepler families (0xc0-0xe0 and GK110), the NV50 backend for Tesla
+// families (0x50, 0x80, 0x90, 0xa0).  Returns NULL for anything else, so
+// callers must check the result.
+Target *Target::create(unsigned int chipset)
+{
+   switch (chipset & 0xf0) {
+   case 0xc0:
+   case 0xd0:
+   case 0xe0:
+   case NVISA_GK110_CHIPSET:
+      return getTargetNVC0(chipset);
+   case 0x50:
+   case 0x80:
+   case 0x90:
+   case 0xa0:
+      return getTargetNV50(chipset);
+   default:
+      ERROR("unsupported target: NV%x\n", chipset);
+      return 0;
+   }
+}
+
+// Counterpart to Target::create(); deleting a NULL pointer is a no-op.
+void Target::destroy(Target *targ)
+{
+   delete targ;
+}
+
+// Construct an emitter with all state zeroed.  The original only initialized
+// @targ: relocInfo was left indeterminate, so addReloc() ("relocInfo ?
+// relocInfo->count : 0") and getRelocInfo() could read garbage if called
+// before the first allocation; code/codeSize were likewise undefined before
+// setCodeLocation().
+CodeEmitter::CodeEmitter(const Target *target)
+   : targ(target), code(NULL), codeSize(0), codeSizeLimit(0), relocInfo(NULL)
+{
+}
+
+// Point the emitter at the output buffer @ptr of @size bytes and reset the
+// running byte count; must be called before emitting any instructions.
+void
+CodeEmitter::setCodeLocation(void *ptr, uint32_t size)
+{
+   code = reinterpret_cast<uint32_t *>(ptr);
+   codeSize = 0;
+   codeSizeLimit = size;
+}
+
+// Dump the emitted machine code as 32-bit hex words, 8 per line.  @code has
+// been advanced past the last emitted word by this point, hence the
+// subtraction to recover the start of the buffer.
+void
+CodeEmitter::printBinary() const
+{
+   uint32_t *bin = code - codeSize / 4;
+   INFO("program binary (%u bytes)", codeSize);
+   for (unsigned int pos = 0; pos < codeSize / 4; ++pos) {
+      if ((pos % 8) == 0)
+         INFO("\n");
+      INFO("%08x ", bin[pos]);
+   }
+   INFO("\n");
+}
+
+// How many NVE4 scheduling bundles are needed for @size bytes of code.
+// Each 64-byte block holds one 8-byte scheduling word plus 56 bytes of real
+// instructions, so this is ceil(size / 56).
+static inline uint32_t sizeToBundlesNVE4(uint32_t size)
+{
+   const uint32_t codeBytesPerBundle = 56;
+   return (size + codeBytesPerBundle - 1) / codeBytesPerBundle;
+}
+
+// Lay out all functions of @prog in the binary: assign each function its
+// position, size its basic blocks, and, on targets with software scheduling
+// (NVE4+), grow every block to make room for the 8-byte scheduling word that
+// precedes each 64-byte bundle.
+void
+CodeEmitter::prepareEmission(Program *prog)
+{
+   for (ArrayList::Iterator fi = prog->allFuncs.iterator();
+        !fi.end(); fi.next()) {
+      Function *func = reinterpret_cast<Function *>(fi.get());
+      func->binPos = prog->binSize;
+      prepareEmission(func);
+
+      // adjust sizes & positions for schedulding info:
+      if (prog->getTarget()->hasSWSched) {
+         uint32_t adjPos = func->binPos;
+         BasicBlock *bb = NULL;
+         for (int i = 0; i < func->bbCount; ++i) {
+            bb = func->bbArray[i];
+            int32_t adjSize = bb->binSize;
+            // If the block does not start on a 64-byte bundle boundary, the
+            // head of the block shares the current bundle and needs no extra
+            // scheduling word of its own.
+            if (adjPos % 64) {
+               adjSize -= 64 - adjPos % 64;
+               if (adjSize < 0)
+                  adjSize = 0;
+            }
+            // One 8-byte scheduling word per bundle started by this block.
+            adjSize = bb->binSize + sizeToBundlesNVE4(adjSize) * 8;
+            bb->binPos = adjPos;
+            bb->binSize = adjSize;
+            adjPos += adjSize;
+         }
+         if (bb)
+            func->binSize = adjPos - func->binPos;
+      }
+
+      prog->binSize += func->binSize;
+   }
+}
+
+// Size and order the basic blocks of @func.  The bbArray is filled in CFG
+// iteration order by the per-BB prepareEmission() below.
+// NOTE(review): bbArray is allocated with new[] here; ownership/release is
+// outside this file -- confirm it is freed by Function's teardown.
+void
+CodeEmitter::prepareEmission(Function *func)
+{
+   func->bbCount = 0;
+   func->bbArray = new BasicBlock * [func->cfg.getSize()];
+
+   BasicBlock::get(func->cfg.getRoot())->binPos = func->binPos;
+
+   for (IteratorRef it = func->cfg.iteratorCFG(); !it->end(); it->next())
+      prepareEmission(BasicBlock::get(*it));
+}
+
+// Assign @bb its binary position and compute its encoded size.
+// Two jobs:
+//  1. Drop no-op branches: if the previously laid out block ends in an
+//     unconditional BRA straight to @bb, remove it and shift positions back.
+//  2. Decide per-instruction encoding sizes, pairing 4-byte (short)
+//     encodings so they always occupy aligned 8-byte slots; instructions
+//     that cannot be paired are widened to the 8-byte form.
+void
+CodeEmitter::prepareEmission(BasicBlock *bb)
+{
+   Instruction *i, *next;
+   Function *func = bb->getFunction();
+   int j;
+   unsigned int nShort;
+
+   // Find the last already-emitted block with a non-zero size.
+   for (j = func->bbCount - 1; j >= 0 && !func->bbArray[j]->binSize; --j);
+
+   for (; j >= 0; --j) {
+      BasicBlock *in = func->bbArray[j];
+      Instruction *exit = in->getExit();
+
+      // A branch that targets the immediately following block is a no-op:
+      // delete it and pull all later blocks back by its 8 bytes.
+      if (exit && exit->op == OP_BRA && exit->asFlow()->target.bb == bb) {
+         in->binSize -= 8;
+         func->binSize -= 8;
+
+         for (++j; j < func->bbCount; ++j)
+            func->bbArray[j]->binPos -= 8;
+
+         in->remove(exit);
+      }
+      bb->binPos = in->binPos + in->binSize;
+      if (in->binSize) // no more no-op branches to bb
+         break;
+   }
+   func->bbArray[func->bbCount++] = bb;
+
+   if (!bb->getExit())
+      return;
+
+   // determine encoding size, try to group short instructions
+   nShort = 0;
+   for (i = bb->getEntry(); i; i = next) {
+      next = i->next;
+
+      // MEMBAR that the target cannot encode is dropped entirely.
+      if (i->op == OP_MEMBAR && !targ->isOpSupported(OP_MEMBAR, TYPE_NONE)) {
+         bb->remove(i);
+         continue;
+      }
+
+      i->encSize = getMinEncodingSize(i);
+      if (next && i->encSize < 8)
+         ++nShort;
+      else
+      // An odd number of shorts so far and a short successor: try to swap
+      // instructions so the two shorts become adjacent and pair up.
+      if ((nShort & 1) && next && getMinEncodingSize(next) == 4) {
+         if (i->isCommutationLegal(i->next)) {
+            bb->permuteAdjacent(i, next);
+            next->encSize = 4;
+            next = i;
+            i = i->prev;
+            ++nShort;
+         } else
+         if (i->isCommutationLegal(i->prev) && next->next) {
+            bb->permuteAdjacent(i->prev, i);
+            next->encSize = 4;
+            next = next->next;
+            bb->binSize += 4;
+            ++nShort;
+         } else {
+            // Cannot pair: widen both the dangling short and this one.
+            i->encSize = 8;
+            i->prev->encSize = 8;
+            bb->binSize += 4;
+            nShort = 0;
+         }
+      } else {
+         i->encSize = 8;
+         if (nShort & 1) {
+            i->prev->encSize = 8;
+            bb->binSize += 4;
+         }
+         nShort = 0;
+      }
+      bb->binSize += i->encSize;
+   }
+
+   // The block's exit instruction must always use the long encoding.
+   if (bb->getExit()->encSize == 4) {
+      assert(nShort);
+      bb->getExit()->encSize = 8;
+      bb->binSize += 4;
+
+      if ((bb->getExit()->prev->encSize == 4) && !(nShort & 1)) {
+         bb->binSize += 8;
+         bb->getExit()->prev->encSize = 8;
+      }
+   }
+   assert(!bb->getEntry() || (bb->getExit() && bb->getExit()->encSize == 8));
+
+   func->binSize += bb->binSize;
+}
+
+// Fill info->bin.syms with one (label, binary offset) entry per function so
+// the driver can resolve cross-function calls.
+// NOTE(review): the MALLOC result is not checked here -- confirm callers
+// tolerate a NULL syms array on allocation failure.
+void
+Program::emitSymbolTable(struct nv50_ir_prog_info *info)
+{
+   unsigned int n = 0, nMax = allFuncs.getSize();
+
+   info->bin.syms =
+      (struct nv50_ir_prog_symbol *)MALLOC(nMax * sizeof(*info->bin.syms));
+
+   for (ArrayList::Iterator fi = allFuncs.iterator();
+        !fi.end();
+        fi.next(), ++n) {
+      Function *f = (Function *)fi.get();
+      assert(n < nMax);
+
+      info->bin.syms[n].label = f->getLabel();
+      info->bin.syms[n].offset = f->binPos;
+   }
+
+   info->bin.numSyms = n;
+}
+
+// Emit the whole program into a freshly allocated buffer (this->code) and
+// fill @info with relocation data and the symbol table.  Returns false on
+// an empty program or allocation failure.
+// Fixed: the original leaked @emit on both early-return paths.
+bool
+Program::emitBinary(struct nv50_ir_prog_info *info)
+{
+   CodeEmitter *emit = target->getCodeEmitter(progType);
+
+   emit->prepareEmission(this);
+
+   if (dbgFlags & NV50_IR_DEBUG_BASIC)
+      this->print();
+
+   if (!binSize) {
+      code = NULL;
+      delete emit; // fix: don't leak the emitter on the empty-program path
+      return false;
+   }
+   code = reinterpret_cast<uint32_t *>(MALLOC(binSize));
+   if (!code) {
+      delete emit; // fix: don't leak the emitter on allocation failure
+      return false;
+   }
+   emit->setCodeLocation(code, binSize);
+
+   for (ArrayList::Iterator fi = allFuncs.iterator(); !fi.end(); fi.next()) {
+      Function *fn = reinterpret_cast<Function *>(fi.get());
+
+      // prepareEmission laid functions out back to back; emission must land
+      // exactly where each function was positioned.
+      assert(emit->getCodeSize() == fn->binPos);
+
+      for (int b = 0; b < fn->bbCount; ++b)
+         for (Instruction *i = fn->bbArray[b]->getEntry(); i; i = i->next)
+            emit->emitInstruction(i);
+   }
+   info->bin.relocData = emit->getRelocInfo();
+
+   emitSymbolTable(info);
+
+   // the nvc0 driver will print the binary itself together with the header
+   if ((dbgFlags & NV50_IR_DEBUG_BASIC) && getTarget()->getChipset() < 0xc0)
+      emit->printBinary();
+
+   delete emit;
+   return true;
+}
+
+#define RELOC_ALLOC_INCREMENT 8
+
+// Append a relocation entry for the word at (current position + @w words):
+// @ty selects the base address added at relocation time, @data the addend,
+// @m the bitmask patched in the instruction word, @s the (signed) bit shift.
+// The entry array grows in RELOC_ALLOC_INCREMENT chunks via REALLOC.
+// Returns false on allocation failure.
+bool
+CodeEmitter::addReloc(RelocEntry::Type ty, int w, uint32_t data, uint32_t m,
+                      int s)
+{
+   unsigned int n = relocInfo ? relocInfo->count : 0;
+
+   // Grow only when the current chunk is exactly full.
+   if (!(n % RELOC_ALLOC_INCREMENT)) {
+      size_t size = sizeof(RelocInfo) + n * sizeof(RelocEntry);
+      relocInfo = reinterpret_cast<RelocInfo *>(
+         REALLOC(relocInfo, n ? size : 0,
+                 size + RELOC_ALLOC_INCREMENT * sizeof(RelocEntry)));
+      if (!relocInfo)
+         return false;
+      if (n == 0)
+         memset(relocInfo, 0, sizeof(RelocInfo));
+   }
+   ++relocInfo->count;
+
+   relocInfo->entry[n].data = data;
+   relocInfo->entry[n].mask = m;
+   relocInfo->entry[n].offset = codeSize + w * 4;
+   relocInfo->entry[n].bitPos = s;
+   relocInfo->entry[n].type = ty;
+
+   return true;
+}
+
+// Patch one instruction word in @binary: take the base address selected by
+// this entry's type, add the stored addend, shift it into position (negative
+// bitPos shifts right), and merge it under the entry's mask.
+void
+RelocEntry::apply(uint32_t *binary, const RelocInfo *info) const
+{
+   uint32_t value = 0;
+
+   switch (type) {
+   case TYPE_CODE: value = info->codePos; break;
+   case TYPE_BUILTIN: value = info->libPos; break;
+   case TYPE_DATA: value = info->dataPos; break;
+   default:
+      assert(0);
+      break;
+   }
+   value += data;
+   value = (bitPos < 0) ? (value >> -bitPos) : (value << bitPos);
+
+   binary[offset / 4] &= ~mask;
+   binary[offset / 4] |= value & mask;
+}
+
+} // namespace nv50_ir
+
+
+#include "codegen/nv50_ir_driver.h"
+
+extern "C" {
+
+// C entry point used by the driver after it knows where code, the builtin
+// library and data were uploaded: record those base addresses in the reloc
+// info and apply every pending relocation to @code.
+void
+nv50_ir_relocate_code(void *relocData, uint32_t *code,
+                      uint32_t codePos,
+                      uint32_t libPos,
+                      uint32_t dataPos)
+{
+   nv50_ir::RelocInfo *info = reinterpret_cast<nv50_ir::RelocInfo *>(relocData);
+
+   info->codePos = codePos;
+   info->libPos = libPos;
+   info->dataPos = dataPos;
+
+   for (unsigned int i = 0; i < info->count; ++i)
+      info->entry[i].apply(code, info);
+}
+
+// C entry point: hand the driver the target's built-in code library so it
+// can be uploaded once and shared by all programs.
+// Fixed: Target::create() returns NULL for unsupported chipsets; the
+// original dereferenced it unconditionally.  Report an empty library
+// instead of crashing.
+void
+nv50_ir_get_target_library(uint32_t chipset,
+                           const uint32_t **code, uint32_t *size)
+{
+   nv50_ir::Target *targ = nv50_ir::Target::create(chipset);
+   if (!targ) {
+      *code = NULL;
+      *size = 0;
+      return;
+   }
+   targ->getBuiltinCode(code, size);
+   nv50_ir::Target::destroy(targ);
+}
+
+}
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
new file mode 100644
index 0000000..9913ca1
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target.h
@@ -0,0 +1,235 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_TARGET_H__
+#define __NV50_IR_TARGET_H__
+
+#include "codegen/nv50_ir.h"
+
+namespace nv50_ir {
+
+struct RelocInfo;
+
+// One pending patch of an emitted instruction word: at @offset, OR in
+// ((base(type) + data) shifted by bitPos) under @mask.  A negative bitPos
+// means a right shift.
+struct RelocEntry
+{
+   enum Type
+   {
+      TYPE_CODE,
+      TYPE_BUILTIN,
+      TYPE_DATA
+   };
+
+   uint32_t data;
+   uint32_t mask;
+   uint32_t offset;
+   int8_t bitPos;
+   Type type;
+
+   inline void apply(uint32_t *binary, const RelocInfo *info) const;
+};
+
+// Header for the relocation table handed to the driver: the three base
+// addresses filled in by nv50_ir_relocate_code(), followed inline by
+// @count entries (flexible array member; allocated with MALLOC/REALLOC,
+// never with new).
+struct RelocInfo
+{
+   uint32_t codePos;
+   uint32_t libPos;
+   uint32_t dataPos;
+
+   uint32_t count;
+
+   RelocEntry entry[0];
+};
+
+// Abstract base for the per-generation instruction encoders.  Subclasses
+// implement emitInstruction()/getMinEncodingSize(); the base class provides
+// layout (prepareEmission), relocation bookkeeping and debug dumping.
+class CodeEmitter
+{
+public:
+   CodeEmitter(const Target *);
+   virtual ~CodeEmitter() { }
+
+   // returns whether the instruction was encodable and written
+   virtual bool emitInstruction(Instruction *) = 0;
+
+   virtual uint32_t getMinEncodingSize(const Instruction *) const = 0;
+
+   void setCodeLocation(void *, uint32_t size);
+   inline void *getCodeLocation() const { return code; }
+   inline uint32_t getCodeSize() const { return codeSize; }
+
+   bool addReloc(RelocEntry::Type, int w, uint32_t data, uint32_t m,
+                 int s);
+
+   inline void *getRelocInfo() const { return relocInfo; }
+
+   void prepareEmission(Program *);
+   virtual void prepareEmission(Function *);
+   virtual void prepareEmission(BasicBlock *);
+
+   void printBinary() const;
+
+protected:
+   const Target *targ;
+
+   uint32_t *code;          // current write position in the output buffer
+   uint32_t codeSize;       // bytes emitted so far
+   uint32_t codeSizeLimit;  // capacity of the output buffer
+
+   RelocInfo *relocInfo;
+};
+
+
+// Coarse operation categories used by Target::operationClass.
+// NOTE: value 13 is intentionally unassigned (OPCLASS_FLOW = 12 is followed
+// by OPCLASS_PSEUDO = 14); keep explicit values when extending.
+enum OpClass
+{
+   OPCLASS_MOVE          = 0,
+   OPCLASS_LOAD          = 1,
+   OPCLASS_STORE         = 2,
+   OPCLASS_ARITH         = 3,
+   OPCLASS_SHIFT         = 4,
+   OPCLASS_SFU           = 5,
+   OPCLASS_LOGIC         = 6,
+   OPCLASS_COMPARE       = 7,
+   OPCLASS_CONVERT       = 8,
+   OPCLASS_ATOMIC        = 9,
+   OPCLASS_TEXTURE       = 10,
+   OPCLASS_SURFACE       = 11,
+   OPCLASS_FLOW          = 12,
+   OPCLASS_PSEUDO        = 14,
+   OPCLASS_VECTOR        = 15,
+   OPCLASS_BITFIELD      = 16,
+   OPCLASS_CONTROL       = 17,
+   OPCLASS_OTHER         = 18
+};
+
+// Abstract description of a GPU generation: capability queries used by the
+// optimizer/legalizer, per-opcode encoding data (OpInfo), and the factory
+// for the matching CodeEmitter.  Concrete subclasses: TargetNV50, TargetNVC0.
+class Target
+{
+public:
+   Target(bool j, bool s) : joinAnterior(j), hasSWSched(s) { }
+   virtual ~Target() { }
+
+   static Target *create(uint32_t chipset);
+   static void destroy(Target *);
+
+   // 0x50 and 0x84 to 0xaf for nv50
+   // 0xc0 to 0xdf for nvc0
+   inline uint32_t getChipset() const { return chipset; }
+
+   virtual CodeEmitter *getCodeEmitter(Program::Type) = 0;
+
+   // Drivers should upload this so we can use it from all programs.
+   // The address chosen is supplied to the relocation routine.
+   virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const = 0;
+
+   virtual void parseDriverInfo(const struct nv50_ir_prog_info *info) { }
+
+   virtual bool runLegalizePass(Program *, CGStage stage) const = 0;
+
+public:
+   // Per-opcode capability record: which source/destination files and
+   // modifiers the hardware accepts, how many sources, minimum encoding
+   // size, and a few classification flags.
+   struct OpInfo
+   {
+      OpInfo *variants;
+      operation op;
+      uint16_t srcTypes;
+      uint16_t dstTypes;
+      uint32_t immdBits;
+      uint8_t srcNr;
+      uint8_t srcMods[3];
+      uint8_t dstMods;
+      uint8_t srcFiles[3];
+      uint8_t dstFiles;
+      unsigned int minEncSize  : 4;
+      unsigned int vector      : 1;
+      unsigned int predicate   : 1;
+      unsigned int commutative : 1;
+      unsigned int pseudo      : 1;
+      unsigned int flow        : 1;
+      unsigned int hasDest     : 1;
+      unsigned int terminator  : 1;
+   };
+
+   inline const OpInfo& getOpInfo(const Instruction *) const;
+   inline const OpInfo& getOpInfo(const operation) const;
+
+   inline DataFile nativeFile(DataFile f) const;
+
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const Instruction *ld) const = 0;
+   virtual bool isOpSupported(operation, DataType) const = 0;
+   virtual bool isAccessSupported(DataFile, DataType) const = 0;
+   virtual bool isModSupported(const Instruction *,
+                               int s, Modifier) const = 0;
+   virtual bool isSatSupported(const Instruction *) const = 0;
+   virtual bool isPostMultiplySupported(operation op, float f,
+                                        int& e) const { return false; }
+   virtual bool mayPredicate(const Instruction *,
+                             const Value *) const = 0;
+
+   // whether @insn can be issued together with @next (order matters)
+   virtual bool canDualIssue(const Instruction *insn,
+                             const Instruction *next) const { return false; }
+   virtual int getLatency(const Instruction *) const { return 1; }
+   virtual int getThroughput(const Instruction *) const { return 1; }
+
+   virtual unsigned int getFileSize(DataFile) const = 0;
+   virtual unsigned int getFileUnit(DataFile) const = 0;
+
+   virtual uint32_t getSVAddress(DataFile, const Symbol *) const = 0;
+
+public:
+   const bool joinAnterior; // true if join is executed before the op
+   const bool hasSWSched;   // true if code should provide scheduling data
+
+   static const uint8_t operationSrcNr[OP_LAST + 1];
+   static const OpClass operationClass[OP_LAST + 1];
+
+   static inline uint8_t getOpSrcNr(operation op)
+   {
+      return operationSrcNr[op];
+   }
+   static inline OpClass getOpClass(operation op)
+   {
+      return operationClass[op];
+   }
+
+protected:
+   uint32_t chipset;
+
+   // Maps generic register files to what the hardware actually has
+   // (e.g. FILE_PREDICATE -> FILE_FLAGS on nv50).
+   DataFile nativeFileMap[DATA_FILE_COUNT];
+
+   OpInfo opInfo[OP_LAST + 1];
+};
+
+// OpInfo lookup by instruction; clamps to OP_LAST so even out-of-range
+// opcodes get the pseudo-op entry rather than reading past the table.
+const Target::OpInfo& Target::getOpInfo(const Instruction *insn) const
+{
+   return opInfo[MIN2(insn->op, OP_LAST)];
+}
+
+// OpInfo lookup by opcode.  NOTE(review): unlike the Instruction overload,
+// this does not clamp with MIN2 -- callers must pass a valid operation.
+const Target::OpInfo& Target::getOpInfo(const operation op) const
+{
+   return opInfo[op];
+}
+
+// Translate a generic register file to the target's native equivalent.
+inline DataFile Target::nativeFile(DataFile f) const
+{
+   return nativeFileMap[f];
+}
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_TARGET_H__
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
new file mode 100644
index 0000000..ade9be0
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -0,0 +1,552 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target_nv50.h"
+
+namespace nv50_ir {
+
+// Factory used by Target::create() for the NV50 (Tesla) family.
+Target *getTargetNV50(unsigned int chipset)
+{
+   return new TargetNV50(chipset);
+}
+
+// NV50 target: join executes before the op (Target arg 1 = true), and there
+// is no software scheduling data (arg 2 = false).  System value locations
+// start unassigned (~0) until parseDriverInfo() records them.
+TargetNV50::TargetNV50(unsigned int card) : Target(true, false)
+{
+   chipset = card;
+
+   wposMask = 0;
+   for (unsigned int i = 0; i <= SV_LAST; ++i)
+      sysvalLocation[i] = ~0;
+
+   initOpInfo();
+}
+
+#if 0
+// BULTINS / LIBRARY FUNCTIONS:
+
+// TODO
+static const uint32_t nvc0_builtin_code[] =
+{
+};
+
+static const uint16_t nvc0_builtin_offsets[NV50_BUILTIN_COUNT] =
+{
+};
+#endif
+
+// No built-in code library exists for nv50 yet (see the disabled table
+// above); report an empty library.
+void
+TargetNV50::getBuiltinCode(const uint32_t **code, uint32_t *size) const
+{
+   *code = NULL;
+   *size = 0;
+}
+
+// Offset of @builtin inside the (currently empty) builtin library.
+uint32_t
+TargetNV50::getBuiltinOffset(int builtin) const
+{
+   return 0;
+}
+
+// Per-opcode capability row for _initProps: each m*/f* field is a bitmask
+// over source slots (bit s = source s), except mSat where bit 3 marks
+// saturation on the destination.
+struct opProperties
+{
+   operation op;
+   unsigned int mNeg   : 4;
+   unsigned int mAbs   : 4;
+   unsigned int mNot   : 4;
+   unsigned int mSat   : 4;
+   unsigned int fConst : 3;
+   unsigned int fShared : 3;
+   unsigned int fAttrib : 3;
+   unsigned int fImm   : 3;
+};
+
+// Modifier / source-file capabilities per opcode; ops not listed here get
+// only the defaults set up in initOpInfo() (GPR sources, no modifiers).
+static const struct opProperties _initProps[] =
+{
+   //           neg  abs  not  sat  c[]  s[], a[], imm
+   { OP_ADD,    0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
+   { OP_SUB,    0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 },
+   { OP_MUL,    0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 },
+   { OP_MAX,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
+   { OP_MIN,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
+   { OP_MAD,    0x7, 0x0, 0x0, 0x0, 0x6, 0x1, 0x1, 0x0 }, // special constraint
+   { OP_ABS,    0x0, 0x0, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 },
+   { OP_NEG,    0x0, 0x1, 0x0, 0x0, 0x0, 0x1, 0x1, 0x0 },
+   { OP_CVT,    0x1, 0x1, 0x0, 0x8, 0x0, 0x1, 0x1, 0x0 },
+   { OP_AND,    0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_OR,     0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_XOR,    0x0, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_SHL,    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_SHR,    0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2 },
+   { OP_SET,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
+   { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_LG2,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_RCP,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_RSQ,    0x1, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_DFDX,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_DFDY,   0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0 },
+};
+
+// Build the opInfo table: first fill in conservative defaults for every
+// opcode, then strip dest/predicate from the listed exceptions, and finally
+// merge the per-opcode modifier and source-file capabilities from
+// _initProps.  Also sets up the generic->native register file map.
+void TargetNV50::initOpInfo()
+{
+   unsigned int i, j;
+
+   // Bitmasks indexed by opcode (bit i%32 of word i/32).
+   static const uint32_t commutative[(OP_LAST + 31) / 32] =
+   {
+      // ADD,MAD,MUL,AND,OR,XOR,MAX,MIN
+      0x0670ca00, 0x0000003f, 0x00000000, 0x00000000
+   };
+   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
+   {
+      // MOV,ADD,SUB,MUL,SAD,L/PINTERP,RCP,TEX,TXF
+      0x00010e40, 0x00000040, 0x00000498, 0x00000000
+   };
+   static const operation noDestList[] =
+   {
+      OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
+      OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
+      OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
+      OP_QUADON, OP_QUADPOP, OP_TEXBAR, OP_SUSTB, OP_SUSTP, OP_SUREDP,
+      OP_SUREDB, OP_BAR
+   };
+   static const operation noPredList[] =
+   {
+      OP_CALL, OP_PREBREAK, OP_PRERET, OP_QUADON, OP_QUADPOP, OP_JOINAT
+   };
+
+   for (i = 0; i < DATA_FILE_COUNT; ++i)
+      nativeFileMap[i] = (DataFile)i;
+   // nv50 has no dedicated predicate registers; predicates live in $cX flags.
+   nativeFileMap[FILE_PREDICATE] = FILE_FLAGS;
+
+   // Conservative defaults: F32, GPR-only sources/dest, no modifiers.
+   for (i = 0; i < OP_LAST; ++i) {
+      opInfo[i].variants = NULL;
+      opInfo[i].op = (operation)i;
+      opInfo[i].srcTypes = 1 << (int)TYPE_F32;
+      opInfo[i].dstTypes = 1 << (int)TYPE_F32;
+      opInfo[i].immdBits = 0xffffffff;
+      opInfo[i].srcNr = operationSrcNr[i];
+
+      for (j = 0; j < opInfo[i].srcNr; ++j) {
+         opInfo[i].srcMods[j] = 0;
+         opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
+      }
+      opInfo[i].dstMods = 0;
+      opInfo[i].dstFiles = 1 << (int)FILE_GPR;
+
+      opInfo[i].hasDest = 1;
+      opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
+      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
+      opInfo[i].pseudo = (i < OP_MOV);
+      opInfo[i].predicate = !opInfo[i].pseudo;
+      opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
+      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
+   }
+   for (i = 0; i < sizeof(noDestList) / sizeof(noDestList[0]); ++i)
+      opInfo[noDestList[i]].hasDest = 0;
+   for (i = 0; i < sizeof(noPredList) / sizeof(noPredList[0]); ++i)
+      opInfo[noPredList[i]].predicate = 0;
+
+   // Merge per-opcode capabilities; see opProperties for field semantics.
+   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+      const struct opProperties *prop = &_initProps[i];
+
+      for (int s = 0; s < 3; ++s) {
+         if (prop->mNeg & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
+         if (prop->mAbs & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
+         if (prop->mNot & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
+         if (prop->fConst & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
+         if (prop->fShared & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_SHARED;
+         if (prop->fAttrib & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_SHADER_INPUT;
+         if (prop->fImm & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
+      }
+      if (prop->mSat & 8)
+         opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
+   }
+}
+
+// Capacity of each register/memory file on nv50, in that file's addressing
+// units (see getFileUnit for the unit sizes).
+unsigned int
+TargetNV50::getFileSize(DataFile file) const
+{
+   switch (file) {
+   case FILE_NULL:          return 0;
+   case FILE_GPR:           return 256; // in 16-bit units **
+   case FILE_PREDICATE:     return 0;
+   case FILE_FLAGS:         return 4;
+   case FILE_ADDRESS:       return 4;
+   case FILE_IMMEDIATE:     return 0;
+   case FILE_MEMORY_CONST:  return 65536;
+   case FILE_SHADER_INPUT:  return 0x200;
+   case FILE_SHADER_OUTPUT: return 0x200;
+   case FILE_MEMORY_GLOBAL: return 0xffffffff;
+   case FILE_MEMORY_SHARED: return 16 << 10;
+   case FILE_MEMORY_LOCAL:  return 48 << 10;
+   case FILE_SYSTEM_VALUE:  return 16;
+   default:
+      assert(!"invalid file");
+      return 0;
+   }
+   // ** only first 128 units encodable for 16-bit regs
+}
+
+// Addressing unit of @file, expressed as a shift: GPRs and address regs are
+// indexed in 2-byte halves (1), system values in 4-byte words (2), anything
+// else in single bytes (0).
+unsigned int
+TargetNV50::getFileUnit(DataFile file) const
+{
+   switch (file) {
+   case FILE_GPR:
+   case FILE_ADDRESS:
+      return 1;
+   case FILE_SYSTEM_VALUE:
+      return 2;
+   default:
+      return 0;
+   }
+}
+
+// Input address of a system value.  Most come straight from sysvalLocation
+// (filled by parseDriverInfo); compute-shader SVs live at fixed offsets,
+// and WPOS components are packed according to wposMask.
+uint32_t
+TargetNV50::getSVAddress(DataFile shaderFile, const Symbol *sym) const
+{
+   switch (sym->reg.data.sv.sv) {
+   case SV_FACE:
+      return 0x3fc;
+   case SV_POSITION:
+   {
+      uint32_t addr = sysvalLocation[sym->reg.data.sv.sv];
+      // Skip over the position components the driver actually assigned
+      // before the one we want.
+      for (int c = 0; c < sym->reg.data.sv.index; ++c)
+         if (wposMask & (1 << c))
+            addr += 4;
+      return addr;
+   }
+   case SV_NCTAID:
+      return 0x8 + 2 * sym->reg.data.sv.index;
+   case SV_CTAID:
+      return 0xc + 2 * sym->reg.data.sv.index;
+   case SV_NTID:
+      return 0x2 + 2 * sym->reg.data.sv.index;
+   case SV_TID:
+      return 0;
+   default:
+      return sysvalLocation[sym->reg.data.sv.sv];
+   }
+}
+
+// long: rrr, arr, rcr, acr, rrc, arc, gcr, grr
+// short: rr, ar, rc, gr
+// immd: ri, gi
+// Whether instruction @i can consume the value defined by load @ld directly
+// as its source @s, i.e. whether the memory operand can be folded into the
+// instruction encoding instead of going through a GPR.
+// Fixed: the geometry-program case (mode 0x0d) previously fell through into
+// "default: return false;", so it could never succeed; a break was missing.
+bool
+TargetNV50::insnCanLoad(const Instruction *i, int s,
+                        const Instruction *ld) const
+{
+   DataFile sf = ld->src(0).getFile();
+
+   // immediates with 32-bit ops cannot have a predicate or flags output
+   if (sf == FILE_IMMEDIATE && (i->predSrc >= 0 || i->flagsDef >= 0))
+      return false;
+   if (s >= opInfo[i->op].srcNr)
+      return false;
+   if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
+      return false;
+   // only the first two sources can come from memory/immediates
+   if (s == 2 && i->src(1).getFile() != FILE_GPR)
+      return false;
+
+   // NOTE: don't rely on flagsDef
+   for (int d = 0; i->defExists(d); ++d)
+      if (i->def(d).getFile() == FILE_FLAGS)
+         return false;
+
+   // Encode the file of every source (with @s hypothetically replaced by
+   // the load's file) as a 2-bit code per slot, then check the combination
+   // against what the nv50 long encoding allows.
+   unsigned mode = 0;
+
+   for (int z = 0; z < Target::operationSrcNr[i->op]; ++z) {
+      DataFile zf = (z == s) ? sf : i->src(z).getFile();
+      switch (zf) {
+      case FILE_GPR:
+         break;
+      case FILE_MEMORY_SHARED:
+      case FILE_SHADER_INPUT:
+         mode |= 1 << (z * 2);
+         break;
+      case FILE_MEMORY_CONST:
+         mode |= 2 << (z * 2);
+         break;
+      case FILE_IMMEDIATE:
+         mode |= 3 << (z * 2);
+         break;
+      default:
+         break;
+      }
+   }
+
+   switch (mode) {
+   case 0x00:
+   case 0x01:
+   case 0x03:
+   case 0x08:
+   case 0x09:
+   case 0x0c:
+   case 0x20:
+   case 0x21:
+      break;
+   case 0x0d:
+      if (ld->bb->getProgram()->getType() != Program::TYPE_GEOMETRY)
+         return false;
+      break; // fixed: fall-through made this combination unreachable for GPs
+   default:
+      return false;
+   }
+
+   uint8_t ldSize;
+
+   if ((i->op == OP_MUL || i->op == OP_MAD) && !isFloatType(i->dType)) {
+      // 32-bit MUL will be split into 16-bit MULs
+      if (ld->src(0).isIndirect(0))
+         return false;
+      if (sf == FILE_IMMEDIATE)
+         return false;
+      ldSize = 2;
+   } else {
+      ldSize = typeSizeof(ld->dType);
+   }
+
+   if (sf == FILE_IMMEDIATE)
+      return true;
+
+
+   // Check if memory access is encodable:
+
+   if (ldSize < 4 && sf == FILE_SHADER_INPUT) // no < 4-byte aligned a[] access
+      return false;
+   // offsets are encoded as a 7-bit index scaled by the access size
+   if (ld->getSrc(0)->reg.data.offset > (int32_t)(127 * ldSize))
+      return false;
+
+   if (ld->src(0).isIndirect(0)) {
+      // only one source can be indirect
+      for (int z = 0; i->srcExists(z); ++z)
+         if (i->src(z).isIndirect(0))
+            return false;
+
+      // s[] access only possible in CP, $aX always applies
+      if (sf == FILE_MEMORY_SHARED)
+         return true;
+      if (!ld->bb) // can't check type ...
+         return false;
+      Program::Type pt = ld->bb->getProgram()->getType();
+
+      // $aX applies to c[] only in VP, FP, GP if p[] is not accessed
+      if (pt == Program::TYPE_COMPUTE)
+         return false;
+      if (pt == Program::TYPE_GEOMETRY) {
+         if (sf == FILE_MEMORY_CONST)
+            return i->src(s).getFile() != FILE_SHADER_INPUT;
+         return sf == FILE_SHADER_INPUT;
+      }
+      return sf == FILE_MEMORY_CONST;
+   }
+   return true;
+}
+
+// Whether a memory access of type @ty is encodable for @file on nv50.
+// 96-bit and untyped accesses never are; accesses wider than 4 bytes only
+// work on local and global memory.
+bool
+TargetNV50::isAccessSupported(DataFile file, DataType ty) const
+{
+   if (ty == TYPE_NONE || ty == TYPE_B96)
+      return false;
+   if (typeSizeof(ty) <= 4)
+      return true;
+   return file == FILE_MEMORY_GLOBAL || file == FILE_MEMORY_LOCAL;
+}
+
+// Whether the nv50 family natively supports @op with result type @ty.
+// Unsupported ops are lowered elsewhere (e.g. DIV/MOD/POW via the builtin
+// library, SET_AND etc. via separate SET + logic op).
+bool
+TargetNV50::isOpSupported(operation op, DataType ty) const
+{
+   // F64 only exists on NVA0+ (GT200).
+   if (ty == TYPE_F64 && chipset < 0xa0)
+      return false;
+
+   switch (op) {
+   case OP_PRERET:
+      return chipset >= 0xa0;
+   case OP_TXG:
+      return chipset >= 0xa3;
+   case OP_POW:
+   case OP_SQRT:
+   case OP_DIV:
+   case OP_MOD:
+   case OP_SET_AND:
+   case OP_SET_OR:
+   case OP_SET_XOR:
+   case OP_SLCT:
+   case OP_SELP:
+   case OP_POPCNT:
+   case OP_INSBF:
+   case OP_EXTBF:
+   case OP_EXIT: // want exit modifier instead (on NOP if required)
+   case OP_MEMBAR:
+      return false;
+   case OP_SAD:
+      return ty == TYPE_S32;
+   default:
+      return true;
+   }
+}
+
+// Whether modifier @mod is supported on source @s of @insn.  Integer
+// instructions only allow modifiers on the small whitelist below; the final
+// check consults the per-opcode srcMods capability table.
+// Fixed: the guard was "s > 3", which let s == 3 index srcMods[3] -- one
+// past the end of the 3-element srcMods array (out-of-bounds read).
+bool
+TargetNV50::isModSupported(const Instruction *insn, int s, Modifier mod) const
+{
+   if (!isFloatType(insn->dType)) {
+      switch (insn->op) {
+      case OP_ABS:
+      case OP_NEG:
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+         break;
+      case OP_ADD:
+         // integer ADD can negate only one of its operands
+         if (insn->src(s ? 0 : 1).mod.neg())
+            return false;
+         break;
+      case OP_SUB:
+         if (s == 0)
+            return insn->src(1).mod.neg() ? false : true;
+         break;
+      case OP_SET:
+         if (insn->sType != TYPE_F32)
+            return false;
+         break;
+      default:
+         return false;
+      }
+   }
+   if (s >= 3) // srcMods only covers the first 3 sources
+      return false;
+   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
+}
+
+// Whether @insn may be given a predicate: it must not already be predicated
+// or read flags, must not use immediate sources, and its opcode must be
+// predicable according to the opInfo table.
+bool
+TargetNV50::mayPredicate(const Instruction *insn, const Value *pred) const
+{
+   if (insn->getPredicate())
+      return false;
+   if (insn->flagsSrc >= 0)
+      return false;
+   for (int i = 0; insn->srcExists(i); ++i) {
+      if (insn->src(i).getFile() == FILE_IMMEDIATE)
+         return false;
+   }
+   return opInfo[insn->op].predicate;
+}
+
+// Whether @insn can take a saturate modifier on its result: CVT always can;
+// everything else needs an F32 result and a SAT-capable opcode.
+bool
+TargetNV50::isSatSupported(const Instruction *insn) const
+{
+   if (insn->op == OP_CVT)
+      return true;
+   return insn->dType == TYPE_F32 &&
+      (opInfo[insn->op].dstMods & NV50_IR_MOD_SAT);
+}
+
+// Rough result latency in cycles: local/global memory loads are treated as
+// 100 (in reality 400-800); everything else as 22.
+int TargetNV50::getLatency(const Instruction *i) const
+{
+   // TODO: tune these values
+   if (i->op == OP_LOAD) {
+      const DataFile lf = i->src(0).getFile();
+      if (lf == FILE_MEMORY_LOCAL || lf == FILE_MEMORY_GLOBAL)
+         return 100; // really 400 to 800
+   }
+   return 22;
+}
+
+// These are "inverse" throughput values, i.e. the number of cycles required
+// to issue a specific instruction for a full warp (32 threads).
+//
+// Assuming we have more than 1 warp in flight, a higher issue latency results
+// in a lower result latency since the MP will have spent more time with other
+// warps.
+// This also helps to determine the number of cycles between instructions in
+// a single warp.
+//
+int TargetNV50::getThroughput(const Instruction *i) const
+{
+   // TODO: tune these values
+   switch (i->dType) {
+   case TYPE_F32:
+      switch (i->op) {
+      case OP_RCP:
+      case OP_RSQ:
+      case OP_LG2:
+      case OP_SIN:
+      case OP_COS:
+      case OP_PRESIN:
+      case OP_PREEX2:
+         return 16; // transcendentals issue on the SFU
+      default:
+         return 4;
+      }
+   case TYPE_U32:
+   case TYPE_S32:
+      return 4;
+   case TYPE_F64:
+      return 32;
+   default:
+      return 1;
+   }
+}
+
+// Note the input/output address of a varying that maps to a system value;
+// for WPOS also record which components the driver assigned (@masks).
+static void
+recordLocation(uint16_t *locs, uint8_t *masks,
+               const struct nv50_ir_varying *var)
+{
+   uint16_t addr = var->slot[0] * 4;
+
+   switch (var->sn) {
+   case TGSI_SEMANTIC_POSITION:   locs[SV_POSITION] = addr; break;
+   case TGSI_SEMANTIC_INSTANCEID: locs[SV_INSTANCE_ID] = addr; break;
+   case TGSI_SEMANTIC_VERTEXID:   locs[SV_VERTEX_ID] = addr; break;
+   case TGSI_SEMANTIC_PRIMID:     locs[SV_PRIMITIVE_ID] = addr; break;
+   case NV50_SEMANTIC_LAYER:      locs[SV_LAYER] = addr; break;
+   case NV50_SEMANTIC_VIEWPORTINDEX: locs[SV_VIEWPORT_INDEX] = addr; break;
+   default:
+      break;
+   }
+   if (var->sn == TGSI_SEMANTIC_POSITION && masks)
+      masks[0] = var->mask;
+}
+
+// Harvest system value locations from the driver's shader interface so
+// getSVAddress() can resolve them later.  Only inputs carry a WPOS mask.
+void
+TargetNV50::parseDriverInfo(const struct nv50_ir_prog_info *info)
+{
+   unsigned int i;
+   for (i = 0; i < info->numOutputs; ++i)
+      recordLocation(sysvalLocation, NULL, &info->out[i]);
+   for (i = 0; i < info->numInputs; ++i)
+      recordLocation(sysvalLocation, &wposMask, &info->in[i]);
+   for (i = 0; i < info->numSysVals; ++i)
+      recordLocation(sysvalLocation, NULL, &info->sv[i]);
+
+   if (sysvalLocation[SV_POSITION] >= 0x200) {
+      // not assigned by driver, but we need it internally
+      wposMask = 0x8;
+      sysvalLocation[SV_POSITION] = 0;
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
new file mode 100644
index 0000000..0cbf180
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+#define NVC0_BUILTIN_DIV_U32 0
+#define NVC0_BUILTIN_DIV_S32 1
+#define NVC0_BUILTIN_RCP_F64 2
+#define NVC0_BUILTIN_RSQ_F64 3
+
+#define NVC0_BUILTIN_COUNT 4
+
+// Code generation target for the NV50 (Tesla) ISA family.
+class TargetNV50 : public Target
+{
+public:
+   TargetNV50(unsigned int chipset);
+
+   virtual CodeEmitter *getCodeEmitter(Program::Type);
+
+   virtual bool runLegalizePass(Program *, CGStage stage) const;
+
+   virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
+
+   virtual void parseDriverInfo(const struct nv50_ir_prog_info *);
+
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const Instruction *ld) const;
+   virtual bool isOpSupported(operation, DataType) const;
+   virtual bool isAccessSupported(DataFile, DataType) const;
+   virtual bool isModSupported(const Instruction *, int s, Modifier) const;
+   virtual bool isSatSupported(const Instruction *) const;
+   virtual bool mayPredicate(const Instruction *, const Value *) const;
+
+   virtual int getLatency(const Instruction *) const;
+   virtual int getThroughput(const Instruction *) const;
+
+   virtual unsigned int getFileSize(DataFile) const;
+   virtual unsigned int getFileUnit(DataFile) const;
+
+   virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const;
+
+   uint32_t getBuiltinOffset(int builtin) const;
+
+private:
+   void initOpInfo();
+
+   // driver-assigned addresses per system value, filled by parseDriverInfo();
+   // values >= 0x200 are treated as "not assigned" (see parseDriverInfo)
+   uint16_t sysvalLocation[SV_LAST + 1];
+   uint8_t wposMask; // components of fragment position that are read
+};
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
new file mode 100644
index 0000000..47e9c55
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.cpp
@@ -0,0 +1,604 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target_nvc0.h"
+
+namespace nv50_ir {
+
+// Factory entry point for the NVC0+ code generation target.
+Target *getTargetNVC0(unsigned int chipset)
+{
+   return new TargetNVC0(chipset);
+}
+
+// NOTE(review): the Target() flags are positional booleans declared in the
+// base class; the second is enabled for >= nve4 (Kepler) -- confirm their
+// meaning against the Target constructor.
+TargetNVC0::TargetNVC0(unsigned int card) : Target(false, card >= 0xe4)
+{
+   chipset = card;
+   initOpInfo();
+}
+
+// BUILTINS / LIBRARY FUNCTIONS:
+
+// laziness -> will just hardcode everything for the time being
+
+#include "target_lib_nvc0.asm.h"
+#include "target_lib_nve4.asm.h"
+#include "target_lib_nvf0.asm.h"
+
+// Return the prebuilt builtin-function library matching the chipset
+// generation (div/rcp/rsq helpers; see getBuiltinOffset).
+void
+TargetNVC0::getBuiltinCode(const uint32_t **code, uint32_t *size) const
+{
+   const unsigned int gen = chipset & 0xf0;
+
+   if (gen == 0xe0) {
+      *code = (const uint32_t *)&nve4_builtin_code[0];
+      *size = sizeof(nve4_builtin_code);
+   } else
+   if (gen == 0xf0) {
+      *code = (const uint32_t *)&nvf0_builtin_code[0];
+      *size = sizeof(nvf0_builtin_code);
+   } else {
+      *code = (const uint32_t *)&nvc0_builtin_code[0];
+      *size = sizeof(nvc0_builtin_code);
+   }
+}
+
+// Offset of one builtin within the library returned by getBuiltinCode();
+// each generation's library is assembled separately, hence the switch.
+uint32_t
+TargetNVC0::getBuiltinOffset(int builtin) const
+{
+   assert(builtin < NVC0_BUILTIN_COUNT);
+
+   const unsigned int gen = chipset & 0xf0;
+   if (gen == 0xe0)
+      return nve4_builtin_offsets[builtin];
+   if (gen == 0xf0)
+      return nvf0_builtin_offsets[builtin];
+   return nvc0_builtin_offsets[builtin];
+}
+
+// Per-opcode legalization properties.  Each m*/f* field is a bitmask over
+// source positions: bit s set means the feature is allowed on source s.
+struct opProperties
+{
+   operation op;
+   unsigned int mNeg : 4; // sources accepting a negate modifier
+   unsigned int mAbs : 4; // sources accepting an absolute-value modifier
+   unsigned int mNot : 4; // sources accepting a bitwise-not modifier
+   unsigned int mSat : 4; // bit 3 set: destination supports saturation
+   unsigned int fConst : 3; // sources that may come from c[] space
+   unsigned int fImmd : 4; // last bit indicates if full immediate is supported
+};
+
+// Modifier/file legality per opcode; consumed by initOpInfo() below.
+// Bit s of each mask refers to source s (see struct opProperties).
+static const struct opProperties _initProps[] =
+{
+   // neg abs not sat c[] imm
+   { OP_ADD, 0x3, 0x3, 0x0, 0x8, 0x2, 0x2 | 0x8 },
+   { OP_SUB, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_MUL, 0x3, 0x0, 0x0, 0x8, 0x2, 0x2 | 0x8 },
+   { OP_MAX, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_MIN, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_MAD, 0x7, 0x0, 0x0, 0x8, 0x6, 0x2 | 0x8 }, // special c[] constraint
+   { OP_MADSP, 0x0, 0x0, 0x0, 0x0, 0x6, 0x2 },
+   { OP_ABS, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
+   { OP_NEG, 0x0, 0x1, 0x0, 0x0, 0x1, 0x0 },
+   { OP_CVT, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_CEIL, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_FLOOR, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_TRUNC, 0x1, 0x1, 0x0, 0x8, 0x1, 0x0 },
+   { OP_AND, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_OR, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_XOR, 0x0, 0x0, 0x3, 0x0, 0x2, 0x2 | 0x8 },
+   { OP_SHL, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SHR, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SLCT, 0x4, 0x0, 0x0, 0x0, 0x6, 0x2 }, // special c[] constraint
+   { OP_PREEX2, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
+   { OP_PRESIN, 0x1, 0x1, 0x0, 0x0, 0x1, 0x1 },
+   { OP_COS, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_SIN, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_EX2, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_LG2, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_RCP, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_RSQ, 0x1, 0x1, 0x0, 0x8, 0x0, 0x0 },
+   { OP_DFDX, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_DFDY, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0 },
+   { OP_CALL, 0x0, 0x0, 0x0, 0x0, 0x1, 0x0 },
+   { OP_INSBF, 0x0, 0x0, 0x0, 0x0, 0x0, 0x4 },
+   { OP_PERMT, 0x0, 0x0, 0x0, 0x0, 0x6, 0x2 },
+   { OP_SET_AND, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET_OR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SET_XOR, 0x3, 0x3, 0x0, 0x0, 0x2, 0x2 },
+   // saturate only:
+   { OP_LINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
+   { OP_PINTERP, 0x0, 0x0, 0x0, 0x8, 0x0, 0x0 },
+   // nve4 ops:
+   { OP_SULDB, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0 },
+   { OP_SUSTB, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0 },
+   { OP_SUSTP, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0 },
+   { OP_SUCLAMP, 0x0, 0x0, 0x0, 0x0, 0x2, 0x2 },
+   { OP_SUBFM, 0x0, 0x0, 0x0, 0x0, 0x6, 0x2 },
+   { OP_SUEAU, 0x0, 0x0, 0x0, 0x0, 0x6, 0x2 }
+};
+
+// Build the opInfo[] table: first fill in defaults for every opcode, then
+// apply the per-opcode exceptions from the static tables above.
+void TargetNVC0::initOpInfo()
+{
+   unsigned int i, j;
+
+   // one bit per opcode; bit (i % 32) of word (i / 32) marks opcode i
+   static const uint32_t commutative[(OP_LAST + 31) / 32] =
+   {
+      // ADD, MAD, MUL, AND, OR, XOR, MAX, MIN
+      0x0670ca00, 0x0000003f, 0x00000000, 0x00000000
+   };
+
+   // opcodes that have a short (4 byte) encoding variant
+   static const uint32_t shortForm[(OP_LAST + 31) / 32] =
+   {
+      // ADD, MAD, MUL, AND, OR, XOR, PRESIN, PREEX2, SFN, CVT, PINTERP, MOV
+      0x0670ca00, 0x00000000, 0x00000000, 0x00000000
+   };
+
+   static const operation noDest[] =
+   {
+      OP_STORE, OP_WRSV, OP_EXPORT, OP_BRA, OP_CALL, OP_RET, OP_EXIT,
+      OP_DISCARD, OP_CONT, OP_BREAK, OP_PRECONT, OP_PREBREAK, OP_PRERET,
+      OP_JOIN, OP_JOINAT, OP_BRKPT, OP_MEMBAR, OP_EMIT, OP_RESTART,
+      OP_QUADON, OP_QUADPOP, OP_TEXBAR, OP_SUSTB, OP_SUSTP, OP_SUREDP,
+      OP_SUREDB, OP_BAR
+   };
+
+   static const operation noPred[] =
+   {
+      OP_CALL, OP_PRERET, OP_QUADON, OP_QUADPOP,
+      OP_JOINAT, OP_PREBREAK, OP_PRECONT, OP_BRKPT
+   };
+
+   // no dedicated address registers on nvc0: map them to GPRs
+   for (i = 0; i < DATA_FILE_COUNT; ++i)
+      nativeFileMap[i] = (DataFile)i;
+   nativeFileMap[FILE_ADDRESS] = FILE_GPR;
+
+   // defaults: f32 GPR-only instruction with a destination
+   for (i = 0; i < OP_LAST; ++i) {
+      opInfo[i].variants = NULL;
+      opInfo[i].op = (operation)i;
+      opInfo[i].srcTypes = 1 << (int)TYPE_F32;
+      opInfo[i].dstTypes = 1 << (int)TYPE_F32;
+      opInfo[i].immdBits = 0;
+      opInfo[i].srcNr = operationSrcNr[i];
+
+      for (j = 0; j < opInfo[i].srcNr; ++j) {
+         opInfo[i].srcMods[j] = 0;
+         opInfo[i].srcFiles[j] = 1 << (int)FILE_GPR;
+      }
+      opInfo[i].dstMods = 0;
+      opInfo[i].dstFiles = 1 << (int)FILE_GPR;
+
+      opInfo[i].hasDest = 1;
+      opInfo[i].vector = (i >= OP_TEX && i <= OP_TEXCSAA);
+      opInfo[i].commutative = (commutative[i / 32] >> (i % 32)) & 1;
+      opInfo[i].pseudo = (i < OP_MOV);
+      opInfo[i].predicate = !opInfo[i].pseudo;
+      opInfo[i].flow = (i >= OP_BRA && i <= OP_JOIN);
+      opInfo[i].minEncSize = (shortForm[i / 32] & (1 << (i % 32))) ? 4 : 8;
+   }
+   for (i = 0; i < sizeof(noDest) / sizeof(noDest[0]); ++i)
+      opInfo[noDest[i]].hasDest = 0;
+   for (i = 0; i < sizeof(noPred) / sizeof(noPred[0]); ++i)
+      opInfo[noPred[i]].predicate = 0;
+
+   // apply modifier / source-file exceptions from _initProps
+   for (i = 0; i < sizeof(_initProps) / sizeof(_initProps[0]); ++i) {
+      const struct opProperties *prop = &_initProps[i];
+
+      for (int s = 0; s < 3; ++s) {
+         if (prop->mNeg & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NEG;
+         if (prop->mAbs & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_ABS;
+         if (prop->mNot & (1 << s))
+            opInfo[prop->op].srcMods[s] |= NV50_IR_MOD_NOT;
+         if (prop->fConst & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_MEMORY_CONST;
+         if (prop->fImmd & (1 << s))
+            opInfo[prop->op].srcFiles[s] |= 1 << (int)FILE_IMMEDIATE;
+         if (prop->fImmd & 8)
+            opInfo[prop->op].immdBits = 0xffffffff;
+      }
+      if (prop->mSat & 8)
+         opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
+   }
+}
+
+// Capacity of each data file: a register count for register files, a
+// byte size for memory spaces.
+unsigned int
+TargetNVC0::getFileSize(DataFile file) const
+{
+   switch (file) {
+   case FILE_NULL: return 0;
+   case FILE_GPR: return (chipset >= NVISA_GK110_CHIPSET) ? 255 : 63; // last GPR is the zero reg (see insnCanLoad)
+   case FILE_PREDICATE: return 7;
+   case FILE_FLAGS: return 1;
+   case FILE_ADDRESS: return 0;
+   case FILE_IMMEDIATE: return 0;
+   case FILE_MEMORY_CONST: return 65536;
+   case FILE_SHADER_INPUT: return 0x400;
+   case FILE_SHADER_OUTPUT: return 0x400;
+   case FILE_MEMORY_GLOBAL: return 0xffffffff;
+   case FILE_MEMORY_SHARED: return 16 << 10;
+   case FILE_MEMORY_LOCAL: return 48 << 10;
+   case FILE_SYSTEM_VALUE: return 32;
+   default:
+      assert(!"invalid file");
+      return 0;
+   }
+}
+
+// Addressing-unit shift per file; registers and system values are
+// addressed in 32-bit (1 << 2 byte) units, everything else in bytes.
+unsigned int
+TargetNVC0::getFileUnit(DataFile file) const
+{
+   switch (file) {
+   case FILE_GPR:
+   case FILE_ADDRESS:
+   case FILE_SYSTEM_VALUE:
+      return 2;
+   default:
+      return 0;
+   }
+}
+
+// Byte address of a system value within the shader input/output space.
+// Returns 0xffffffff (== ~0) for values without a fixed address or not
+// available on this chipset.
+uint32_t
+TargetNVC0::getSVAddress(DataFile shaderFile, const Symbol *sym) const
+{
+   const int idx = sym->reg.data.sv.index;
+   const SVSemantic sv = sym->reg.data.sv.sv;
+
+   // primitive id lives at a different address on the input side
+   const bool isInput = shaderFile == FILE_SHADER_INPUT;
+   const bool kepler = getChipset() >= NVISA_GK104_CHIPSET;
+
+   switch (sv) {
+   case SV_POSITION: return 0x070 + idx * 4;
+   case SV_INSTANCE_ID: return 0x2f8;
+   case SV_VERTEX_ID: return 0x2fc;
+   case SV_PRIMITIVE_ID: return isInput ? 0x060 : 0x040;
+   case SV_LAYER: return 0x064;
+   case SV_VIEWPORT_INDEX: return 0x068;
+   case SV_POINT_SIZE: return 0x06c;
+   case SV_CLIP_DISTANCE: return 0x2c0 + idx * 4;
+   case SV_POINT_COORD: return 0x2e0 + idx * 4;
+   case SV_FACE: return 0x3fc;
+   case SV_TESS_FACTOR: return 0x000 + idx * 4;
+   case SV_TESS_COORD: return 0x2f0 + idx * 4;
+   // compute-grid values are only addressable like this on Kepler
+   case SV_NTID: return kepler ? (0x00 + idx * 4) : ~0;
+   case SV_NCTAID: return kepler ? (0x0c + idx * 4) : ~0;
+   case SV_GRIDID: return kepler ? 0x18 : ~0;
+   default:
+      return 0xffffffff;
+   }
+}
+
+// Check whether the value produced by load 'ld' can be folded directly
+// into source s of instruction 'i' (i.e. encoded as a c[]/immediate
+// operand instead of going through a GPR).
+bool
+TargetNVC0::insnCanLoad(const Instruction *i, int s,
+                        const Instruction *ld) const
+{
+   DataFile sf = ld->src(0).getFile();
+
+   // immediate 0 can be represented by GPR $r63/$r255
+   if (sf == FILE_IMMEDIATE && ld->getSrc(0)->reg.data.u64 == 0)
+      return (!i->isPseudo() &&
+              !i->asTex() &&
+              i->op != OP_EXPORT && i->op != OP_STORE);
+
+   if (s >= opInfo[i->op].srcNr)
+      return false;
+   if (!(opInfo[i->op].srcFiles[s] & (1 << (int)sf)))
+      return false;
+
+   // indirect loads can only be done by OP_LOAD/VFETCH/INTERP on nvc0
+   if (ld->src(0).isIndirect(0))
+      return false;
+
+   // all other sources must be GPRs/predicates (or zero immediates)
+   for (int k = 0; i->srcExists(k); ++k) {
+      if (i->src(k).getFile() == FILE_IMMEDIATE) {
+         if (k == 2 && i->op == OP_SUCLAMP) // special case
+            continue;
+         if (i->getSrc(k)->reg.data.u64 != 0)
+            return false;
+      } else
+      if (i->src(k).getFile() != FILE_GPR &&
+          i->src(k).getFile() != FILE_PREDICATE) {
+         return false;
+      }
+   }
+
+   // not all instructions support full 32 bit immediates
+   if (sf == FILE_IMMEDIATE) {
+      Storage &reg = ld->getSrc(0)->asImm()->reg;
+
+      if (opInfo[i->op].immdBits != 0xffffffff) {
+         // only a 20-bit short immediate is available
+         if (i->sType == TYPE_F32) {
+            // floats keep the top 20 bits: low 12 bits must be zero
+            if (reg.data.u32 & 0xfff)
+               return false;
+         } else
+         if (i->sType == TYPE_S32 || i->sType == TYPE_U32) {
+            // 20-bit signed range; with u32, 0xfffff counts as 0xffffffff as well
+            if (reg.data.s32 > 0x7ffff || reg.data.s32 < -0x80000)
+               return false;
+         }
+      } else
+      if (i->op == OP_MAD || i->op == OP_FMA) {
+         // requires src == dst, cannot decide before RA
+         // (except if we implement more constraints)
+         if (ld->getSrc(0)->asImm()->reg.data.u32 & 0xfff)
+            return false;
+      } else
+      if (i->op == OP_ADD && i->sType == TYPE_F32) {
+         // add f32 LIMM cannot saturate
+         if (i->saturate && (reg.data.u32 & 0xfff))
+            return false;
+      }
+   }
+
+   return true;
+}
+
+// Check whether a load/store of type ty from/to 'file' can be encoded
+// on this chipset.
+bool
+TargetNVC0::isAccessSupported(DataFile file, DataType ty) const
+{
+   if (ty == TYPE_NONE)
+      return false;
+   if (file == FILE_MEMORY_CONST && getChipset() >= 0xe0) // wrong encoding ?
+      return typeSizeof(ty) <= 8;
+   if (ty == TYPE_B96)
+      return false;
+   if (getChipset() >= 0xf0) {
+      // XXX: find wide vfetch/export
+      if (ty == TYPE_B128)
+         return false;
+      if (ty == TYPE_U64)
+         return false;
+   }
+   return true;
+}
+
+// Whether the opcode exists natively for the given destination type;
+// unsupported ops must be lowered before emission.
+bool
+TargetNVC0::isOpSupported(operation op, DataType ty) const
+{
+   switch (op) {
+   case OP_MAD:
+   case OP_FMA:
+      return ty == TYPE_F32;
+   case OP_SAD:
+      return ty == TYPE_S32 || ty == TYPE_U32;
+   case OP_POW:
+   case OP_SQRT:
+   case OP_DIV:
+   case OP_MOD:
+      return false; // no native support
+   default:
+      return true;
+   }
+}
+
+// Check whether source s of insn may carry the given modifier.
+bool
+TargetNVC0::isModSupported(const Instruction *insn, int s, Modifier mod) const
+{
+   if (!isFloatType(insn->dType)) {
+      // integer destination: only a restricted set of ops takes modifiers
+      switch (insn->op) {
+      case OP_ABS:
+      case OP_NEG:
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+         break;
+      case OP_SET:
+         if (insn->sType != TYPE_F32)
+            return false;
+         break;
+      case OP_ADD:
+         if (mod.abs())
+            return false;
+         // cannot negate both sources of an integer add
+         if (insn->src(s ? 0 : 1).mod.neg())
+            return false;
+         break;
+      case OP_SUB:
+         // only the second source of SUB can be (implicitly) negated
+         if (s == 0)
+            return insn->src(1).mod.neg() ? false : true;
+         break;
+      default:
+         return false;
+      }
+   }
+   if (s > 3)
+      return false;
+   // finally check the per-opcode modifier table
+   return (mod & Modifier(opInfo[insn->op].srcMods[s])) == mod;
+}
+
+// An instruction can be predicated if it is not predicated already and
+// its opcode supports predication at all (cannot stack predicates).
+bool
+TargetNVC0::mayPredicate(const Instruction *insn, const Value *pred) const
+{
+   return insn->getPredicate() ? false : opInfo[insn->op].predicate;
+}
+
+// Check whether a saturating destination modifier is available for insn.
+bool
+TargetNVC0::isSatSupported(const Instruction *insn) const
+{
+   if (insn->op == OP_CVT)
+      return true;
+   if (!(opInfo[insn->op].dstMods & NV50_IR_MOD_SAT))
+      return false;
+
+   if (insn->dType == TYPE_U32)
+      return (insn->op == OP_ADD) || (insn->op == OP_MAD);
+
+   // add f32 LIMM cannot saturate
+   // (low 12 bits set means the long-immediate encoding is required)
+   if (insn->op == OP_ADD && insn->sType == TYPE_F32) {
+      if (insn->getSrc(1)->asImm() &&
+          insn->getSrc(1)->reg.data.u32 & 0xfff)
+         return false;
+   }
+
+   return insn->dType == TYPE_F32;
+}
+
+// Check whether a multiplication by constant f can be expressed as a
+// result post-multiply by 2^e; on success e receives the exponent.
+// Only exact powers of two with |e| <= 3 qualify.
+bool
+TargetNVC0::isPostMultiplySupported(operation op, float f, int& e) const
+{
+   if (op != OP_MUL)
+      return false;
+   f = fabsf(f);
+   // guard f == 0: log2f(0) is -inf and casting that to int is undefined
+   if (f == 0.0f)
+      return false;
+   e = static_cast<int>(log2f(f)); // truncation is fine, exactness is
+                                   // verified by the comparison below
+   if (e < -3 || e > 3)
+      return false;
+   return f == exp2f(static_cast<float>(e));
+}
+
+// TODO: better values
+// this could be more precise, e.g. depending on the issue-to-read/write delay
+// of the depending instruction, but it's good enough
+// Result latency in cycles (rough estimates; see comment above).
+int TargetNVC0::getLatency(const Instruction *i) const
+{
+   if (chipset >= 0xe4) {
+      if (i->dType == TYPE_F64 || i->sType == TYPE_F64)
+         return 20;
+      switch (i->op) {
+      case OP_LINTERP:
+      case OP_PINTERP:
+         return 15;
+      case OP_LOAD:
+         if (i->src(0).getFile() == FILE_MEMORY_CONST)
+            return 9;
+         // fall through
+      case OP_VFETCH:
+         return 24;
+      default:
+         if (Target::getOpClass(i->op) == OPCLASS_TEXTURE)
+            return 17;
+         if (i->op == OP_MUL && i->dType != TYPE_F32)
+            return 15;
+         return 9;
+      }
+   } else {
+      if (i->op == OP_LOAD) {
+         if (i->cache == CACHE_CV)
+            return 700;
+         return 48;
+      }
+      return 24;
+   }
+   // note: the trailing unreachable 'return 32;' was removed -- every
+   // path above already returns
+}
+
+// These are "inverse" throughput values, i.e. the number of cycles required
+// to issue a specific instruction for a full warp (32 threads).
+//
+// Assuming we have more than 1 warp in flight, a higher issue latency results
+// in a lower result latency since the MP will have spent more time with other
+// warps.
+// This also helps to determine the number of cycles between instructions in
+// a single warp.
+//
+// Inverse throughput: issue cycles per warp (see comment above).
+int TargetNVC0::getThroughput(const Instruction *i) const
+{
+   // TODO: better values
+   if (i->dType == TYPE_F32) {
+      switch (i->op) {
+      // cheapest ops:
+      case OP_ADD:
+      case OP_MUL:
+      case OP_MAD:
+      case OP_FMA:
+         return 1;
+      // remaining f32 ALU ops:
+      case OP_CVT:
+      case OP_CEIL:
+      case OP_FLOOR:
+      case OP_TRUNC:
+      case OP_SET:
+      case OP_SLCT:
+      case OP_MIN:
+      case OP_MAX:
+         return 2;
+      // special-function ops and anything unlisted:
+      case OP_RCP:
+      case OP_RSQ:
+      case OP_LG2:
+      case OP_SIN:
+      case OP_COS:
+      case OP_PRESIN:
+      case OP_PREEX2:
+      default:
+         return 8;
+      }
+   } else
+   if (i->dType == TYPE_U32 || i->dType == TYPE_S32) {
+      switch (i->op) {
+      case OP_ADD:
+      case OP_AND:
+      case OP_OR:
+      case OP_XOR:
+      case OP_NOT:
+         return 1;
+      // integer multiplies, shifts and everything else:
+      case OP_MUL:
+      case OP_MAD:
+      case OP_CVT:
+      case OP_SET:
+      case OP_SLCT:
+      case OP_SHL:
+      case OP_SHR:
+      case OP_NEG:
+      case OP_ABS:
+      case OP_MIN:
+      case OP_MAX:
+      default:
+         return 2;
+      }
+   } else
+   if (i->dType == TYPE_F64) {
+      return 2;
+   } else {
+      return 1;
+   }
+}
+
+// Whether instructions a and b may be issued together in the same cycle.
+bool TargetNVC0::canDualIssue(const Instruction *a, const Instruction *b) const
+{
+   const OpClass clA = operationClass[a->op];
+   const OpClass clB = operationClass[b->op];
+
+   if (getChipset() >= 0xe4) {
+      // not texturing
+      // not if the 2nd instruction isn't necessarily executed
+      if (clA == OPCLASS_TEXTURE || clA == OPCLASS_FLOW)
+         return false;
+      // anything with MOV
+      if (a->op == OP_MOV || b->op == OP_MOV)
+         return true;
+      if (clA == clB) {
+         // only F32 arith or integer additions
+         if (clA != OPCLASS_ARITH)
+            return false;
+         return (a->dType == TYPE_F32 || a->op == OP_ADD ||
+                 b->dType == TYPE_F32 || b->op == OP_ADD);
+      }
+      // nothing with TEXBAR
+      if (a->op == OP_TEXBAR || b->op == OP_TEXBAR)
+         return false;
+      // no loads and stores accessing the same space
+      if ((clA == OPCLASS_LOAD && clB == OPCLASS_STORE) ||
+          (clB == OPCLASS_LOAD && clA == OPCLASS_STORE))
+         if (a->src(0).getFile() == b->src(0).getFile())
+            return false;
+      // no > 32-bit ops
+      if (typeSizeof(a->dType) > 4 || typeSizeof(b->dType) > 4 ||
+          typeSizeof(a->sType) > 4 || typeSizeof(b->sType) > 4)
+         return false;
+      return true;
+   } else {
+      return false; // info not needed (yet)
+   }
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
new file mode 100644
index 0000000..7831af5
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nvc0.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_target.h"
+
+namespace nv50_ir {
+
+#define NVC0_BUILTIN_DIV_U32 0
+#define NVC0_BUILTIN_DIV_S32 1
+#define NVC0_BUILTIN_RCP_F64 2
+#define NVC0_BUILTIN_RSQ_F64 3
+
+#define NVC0_BUILTIN_COUNT 4
+
+// Code generation target for the NVC0+ (Fermi/Kepler) ISA families.
+class TargetNVC0 : public Target
+{
+public:
+   TargetNVC0(unsigned int chipset);
+
+   virtual CodeEmitter *getCodeEmitter(Program::Type);
+
+   // separate emitters: GK110 presumably uses a different instruction
+   // encoding (see nv50_ir_emit_gk110.cpp)
+   CodeEmitter *createCodeEmitterNVC0(Program::Type);
+   CodeEmitter *createCodeEmitterGK110(Program::Type);
+
+   virtual bool runLegalizePass(Program *, CGStage stage) const;
+
+   virtual void getBuiltinCode(const uint32_t **code, uint32_t *size) const;
+
+   virtual bool insnCanLoad(const Instruction *insn, int s,
+                            const Instruction *ld) const;
+   virtual bool isOpSupported(operation, DataType) const;
+   virtual bool isAccessSupported(DataFile, DataType) const;
+   virtual bool isModSupported(const Instruction *, int s, Modifier) const;
+   virtual bool isSatSupported(const Instruction *) const;
+   virtual bool isPostMultiplySupported(operation, float, int& e) const;
+   virtual bool mayPredicate(const Instruction *, const Value *) const;
+
+   virtual bool canDualIssue(const Instruction *, const Instruction *) const;
+   virtual int getLatency(const Instruction *) const;
+   virtual int getThroughput(const Instruction *) const;
+
+   virtual unsigned int getFileSize(DataFile) const;
+   virtual unsigned int getFileUnit(DataFile) const;
+
+   virtual uint32_t getSVAddress(DataFile shaderFile, const Symbol *sv) const;
+
+   // offset of a builtin within the library from getBuiltinCode()
+   uint32_t getBuiltinOffset(int builtin) const;
+
+private:
+   void initOpInfo();
+};
+
+bool calculateSchedDataNVC0(const Target *, Function *);
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
new file mode 100644
index 0000000..8959777
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.cpp
@@ -0,0 +1,390 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "codegen/nv50_ir_util.h"
+
+namespace nv50_ir {
+
+// Delete every item and restore the empty-list state (sentinel points
+// back to itself).
+void DLList::clear()
+{
+   Item *item = head.next;
+   while (item != &head) {
+      Item *follower = item->next;
+      delete item;
+      item = follower;
+   }
+   head.next = head.prev = &head;
+}
+
+// Delete the item under the iterator and advance to the next one.
+// A no-op when positioned at the end marker.
+void
+DLList::Iterator::erase()
+{
+   Item *rem = pos;
+
+   if (rem == term)
+      return;
+   pos = pos->next;
+
+   DLLIST_DEL(rem); // unlink before freeing
+   delete rem;
+}
+
+// Unlink the current item and prepend it to 'dest', advancing the
+// iterator.  Must not be called at the end position, and 'dest' must not
+// be the list this iterator walks.
+void DLList::Iterator::moveToList(DLList& dest)
+{
+   Item *item = pos;
+
+   assert(term != &dest.head);
+   assert(pos != term);
+
+   pos = pos->next;
+
+   DLLIST_DEL(item);
+   DLLIST_ADDHEAD(&dest.head, item);
+}
+
+// Insert a new item holding 'data' directly after the current position.
+// When positioned at the end, the new item becomes the new end marker,
+// so the current iteration will not visit it.
+// Always returns true (operator new throws on allocation failure).
+bool
+DLList::Iterator::insert(void *data)
+{
+   Item *ins = new Item(data);
+
+   ins->next = pos->next;
+   ins->prev = pos;
+   pos->next->prev = ins;
+   pos->next = ins;
+
+   if (pos == term)
+      term = ins;
+
+   return true;
+}
+
+// Append this stack's items on top of 'that' (order preserved) and
+// leave this stack empty.
+void
+Stack::moveTo(Stack& that)
+{
+   unsigned int newSize = this->size + that.size;
+
+   while (newSize > that.limit)
+      that.resize(); // NOTE(review): assumes resize() grows 'limit' -- confirm
+   memcpy(&that.array[that.size], &array[0], this->size * sizeof(Item));
+
+   that.size = newSize;
+   this->size = 0;
+}
+
+// Deep-copy constructor: duplicates every range of 'that'.
+Interval::Interval(const Interval& that) : head(NULL), tail(NULL)
+{
+   this->insert(that);
+}
+
+Interval::~Interval()
+{
+   clear();
+}
+
+// Free all ranges and reset to the empty interval.
+void
+Interval::clear()
+{
+   Range *r = head;
+   while (r) {
+      Range *follower = r->next;
+      delete r;
+      r = follower;
+   }
+   head = tail = NULL;
+}
+
+// Extend the interval to cover [a, b), merging into an existing range
+// where possible.  The range list stays sorted by 'bgn'.
+bool
+Interval::extend(int a, int b)
+{
+   Range *r, **nextp = &head;
+
+   // NOTE: we need empty intervals for fixed registers
+   // if (a == b)
+   //    return false;
+   assert(a <= b);
+
+   // find an overlapping range, or the insertion point
+   for (r = head; r; r = r->next) {
+      if (b < r->bgn)
+         break; // insert before
+      if (a > r->end) {
+         // insert after
+         nextp = &r->next;
+         continue;
+      }
+
+      // overlap
+      if (a < r->bgn) {
+         r->bgn = a;
+         if (b > r->end)
+            r->end = b;
+         r->coalesce(&tail); // merge following ranges we now touch
+         return true;
+      }
+      if (b > r->end) {
+         r->end = b;
+         r->coalesce(&tail);
+         return true;
+      }
+      assert(a >= r->bgn);
+      assert(b <= r->end);
+      // [a, b) was already fully covered
+      return true;
+   }
+
+   // no overlap: link a new range at the insertion point
+   (*nextp) = new Range(a, b);
+   (*nextp)->next = r;
+
+   // re-establish tail (the new range may be the last one now)
+   for (r = (*nextp); r->next; r = r->next);
+   tail = r;
+   return true;
+}
+
+// Whether position pos lies inside any range; ranges are half-open
+// [bgn, end) and sorted by bgn, so we can stop early.
+bool Interval::contains(int pos) const
+{
+   for (const Range *r = head; r && r->bgn <= pos; r = r->next) {
+      if (pos < r->end)
+         return true;
+   }
+   return false;
+}
+
+// Whether any range of this interval intersects a range of 'that'.
+// Both lists are sorted by 'bgn', so a single merge-style walk suffices.
+// (The old disabled #if 0 fallback referenced an undeclared 'iv' and
+// would not have compiled; it has been removed.)
+bool Interval::overlaps(const Interval &that) const
+{
+   Range *a = this->head;
+   Range *b = that.head;
+
+   while (a && b) {
+      // half-open ranges [bgn, end) intersect iff each starts before
+      // the other ends
+      if (b->bgn < a->end &&
+          b->end > a->bgn)
+         return true;
+      if (a->end <= b->bgn)
+         a = a->next;
+      else
+         b = b->next;
+   }
+   return false;
+}
+
+// Add a copy of every range of 'that' to this interval.
+void Interval::insert(const Interval &that)
+{
+   const Range *src = that.head;
+   while (src) {
+      extend(src->bgn, src->end);
+      src = src->next;
+   }
+}
+
+// Merge all ranges of 'that' into this interval, leaving 'that' empty.
+void Interval::unify(Interval &that)
+{
+   assert(this != &that);
+   for (Range *next, *r = that.head; r; r = next) {
+      next = r->next;
+      this->extend(r->bgn, r->end);
+      delete r;
+   }
+   // reset both list pointers: the original left 'tail' dangling at
+   // freed memory, a use-after-free hazard for later extend() calls
+   that.head = NULL;
+   that.tail = NULL;
+}
+
+// Total number of positions covered by all ranges.
+int Interval::length() const
+{
+   int len = 0;
+   for (Range *r = head; r; r = r->next)
+      len += r->end - r->bgn; // was (bgn - end): always non-positive
+   return len;
+}
+
+// Dump the interval as a space-separated list of half-open ranges.
+void Interval::print() const
+{
+   const Range *r = head;
+   if (!r)
+      return;
+   INFO("[%i %i)", r->bgn, r->end);
+   for (r = r->next; r; r = r->next)
+      INFO(" [%i %i)", r->bgn, r->end);
+   INFO("\n");
+}
+
+// Clear in this set every bit that is set in 'set' (this &= ~set).
+void
+BitSet::andNot(const BitSet &set)
+{
+   assert(data && set.data);
+   assert(size >= set.size);
+   const unsigned int words = (set.size + 31) / 32;
+   for (unsigned int w = 0; w < words; ++w)
+      data[w] &= ~set.data[w];
+}
+
+// Set in this set every bit that is set in 'set' (this |= set).
+BitSet& BitSet::operator|=(const BitSet &set)
+{
+   assert(data && set.data);
+   assert(size >= set.size);
+   const unsigned int words = (set.size + 31) / 32;
+   for (unsigned int w = 0; w < words; ++w)
+      data[w] |= set.data[w];
+   return *this;
+}
+
+// Resize the set to nBits, preserving existing bits; newly added words
+// are zeroed.  Returns false (and empties the set) on allocation failure.
+bool BitSet::resize(unsigned int nBits)
+{
+   if (!data || !nBits)
+      return allocate(nBits, true);
+   const unsigned int p = (size + 31) / 32;
+   const unsigned int n = (nBits + 31) / 32;
+   if (n == p) {
+      size = nBits; // same word count, but the bit count still changes
+      return true;
+   }
+
+   data = (uint32_t *)REALLOC(data, 4 * p, 4 * n);
+   if (!data) {
+      size = 0;
+      return false;
+   }
+   if (n > p)
+      memset(&data[p], 0, (n - p) * 4);
+      // was &data[4 * p + 4]: indexed the uint32_t array as if it were
+      // byte-addressed, writing far beyond the allocation
+
+   size = nBits;
+   return true;
+}
+
+// (Re)allocate storage for nBits bits; existing storage is replaced only
+// when too small.  With 'zero' set, all bits are cleared; otherwise only
+// the last word is reset so unused high bits don't pollute popCount().
+// Returns false on allocation failure.
+// NOTE(review): when reusing a larger old buffer with zero=true, only
+// (size + 7) / 8 bytes are cleared, so stale bits can remain in the
+// trailing bytes of the last word -- confirm callers don't rely on them.
+bool BitSet::allocate(unsigned int nBits, bool zero)
+{
+   if (data && size < nBits) {
+      FREE(data);
+      data = NULL;
+   }
+   size = nBits;
+
+   if (!data)
+      data = reinterpret_cast<uint32_t *>(CALLOC((size + 31) / 32, 4));
+
+   if (zero)
+      memset(data, 0, (size + 7) / 8);
+   else
+   if (nBits)
+      data[(size + 31) / 32 - 1] = 0; // clear unused bits (e.g. for popCount)
+
+   return data;
+}
+
+// Number of set bits in the whole set.
+unsigned int BitSet::popCount() const
+{
+   unsigned int total = 0;
+
+   for (unsigned int w = 0; w < (size + 31) / 32; ++w)
+      total += data[w] ? util_bitcount(data[w]) : 0;
+   return total;
+}
+
+// Set every word to val, then clear the unused high bits of the last
+// word so popCount()/findFreeRange() don't see bits beyond 'size'.
+void BitSet::fill(uint32_t val)
+{
+   unsigned int i;
+   for (i = 0; i < (size + 31) / 32; ++i)
+      data[i] = val;
+   // fix: the original indexed data[i] (one past the last word, an
+   // out-of-bounds write) and shifted by size % 32 even when that is 0,
+   // which is UB and would wrongly clear a fully used last word
+   if (val && (size % 32))
+      data[i - 1] &= ~(0xffffffff << (size % 32)); // BE ?
+}
+
+// this = *pA | *pB; pB may be NULL, in which case this becomes a copy
+// of *pA.  All sets involved are assumed to have (at least) this size.
+void BitSet::setOr(BitSet *pA, BitSet *pB)
+{
+   if (!pB) {
+      *this = *pA; // NOTE(review): relies on BitSet assignment semantics
+                   // from the header -- confirm it copies the bits rather
+                   // than aliasing the storage
+   } else {
+      for (unsigned int i = 0; i < (size + 31) / 32; ++i)
+         data[i] = pA->data[i] | pB->data[i];
+   }
+}
+
+// Find a free (all-zero), naturally aligned run of 'count' bits inside a
+// single 32-bit word; returns its bit position or -1 if none exists.
+// Requests larger than 4 are rounded up to the next slot size (8/16/32).
+int BitSet::findFreeRange(unsigned int count) const
+{
+   // NOTE(review): 'm' is built from the original count, so for rounded-up
+   // requests only that many bits of each candidate slot are tested, and
+   // count == 32 would shift by the full width -- confirm callers' range.
+   const uint32_t m = (1 << count) - 1;
+   int pos = size;
+   unsigned int i;
+   const unsigned int end = (size + 31) / 32;
+
+   if (count == 1) {
+      for (i = 0; i < end; ++i) {
+         pos = ffs(~data[i]) - 1; // first zero bit, or -1 if word is full
+         if (pos >= 0)
+            break;
+      }
+   } else
+   if (count == 2) {
+      // find 2 zero bits at an even position
+      for (i = 0; i < end; ++i) {
+         if (data[i] != 0xffffffff) {
+            uint32_t b = data[i] | (data[i] >> 1) | 0xaaaaaaaa;
+            pos = ffs(~b) - 1;
+            if (pos >= 0)
+               break;
+         }
+      }
+   } else
+   if (count == 4 || count == 3) {
+      // find 4 zero bits at a multiple of 4
+      for (i = 0; i < end; ++i) {
+         if (data[i] != 0xffffffff) {
+            uint32_t b =
+               (data[i] >> 0) | (data[i] >> 1) |
+               (data[i] >> 2) | (data[i] >> 3) | 0xeeeeeeee;
+            pos = ffs(~b) - 1;
+            if (pos >= 0)
+               break;
+         }
+      }
+   } else {
+      // larger requests: scan slot-aligned positions explicitly
+      if (count <= 8)
+         count = 8;
+      else
+      if (count <= 16)
+         count = 16;
+      else
+         count = 32;
+
+      for (i = 0; i < end; ++i) {
+         if (data[i] != 0xffffffff) {
+            for (pos = 0; pos < 32; pos += count)
+               if (!(data[i] & (m << pos)))
+                  break;
+            if (pos < 32)
+               break;
+         }
+      }
+   }
+   pos += i * 32;
+
+   // reject ranges that would extend past the end of the set
+   return ((pos + count) <= size) ? pos : -1;
+}
+
+// Dump the indices of all set bits, 16 per line.
+void BitSet::print() const
+{
+   unsigned int printed = 0;
+
+   INFO("BitSet of size %u:\n", size);
+   for (unsigned int w = 0; w < (size + 31) / 32; ++w) {
+      for (uint32_t bits = data[w]; bits; ) {
+         const int pos = ffs(bits) - 1;
+         bits &= ~(1 << pos);
+         INFO(" %i", w * 32 + pos);
+         if ((++printed % 16) == 0)
+            INFO("\n");
+      }
+   }
+   if (printed % 16)
+      INFO("\n");
+}
+
+} // namespace nv50_ir
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
new file mode 100644
index 0000000..a4ea9d9
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_util.h
@@ -0,0 +1,788 @@
+/*
+ * Copyright 2011 Christoph Bumiller
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef __NV50_IR_UTIL_H__
+#define __NV50_IR_UTIL_H__
+
+#include <new>
+#include <assert.h>
+#include <stdio.h>
+#include <memory>
+#include <map>
+
+#ifndef NDEBUG
+# include <typeinfo>
+#endif
+
+#include "util/u_inlines.h"
+#include "util/u_memory.h"
+
+// Logging helpers (GNU-style named variadic macros; 'args...' is a GCC
+// extension).
+#define ERROR(args...) debug_printf("ERROR: " args)
+#define WARN(args...) debug_printf("WARNING: " args)
+#define INFO(args...) debug_printf(args)
+
+// Debug logging gated on the NV50_IR_DEBUG_* flag bits in 'm'.
+#define INFO_DBG(m, f, args...)           \
+   do {                                   \
+      if (m & NV50_IR_DEBUG_##f)          \
+         debug_printf(args);              \
+   } while(0)
+
+#define FATAL(args...)           \
+   do {                          \
+      fprintf(stderr, args);     \
+      abort();                   \
+   } while(0)
+
+
+// Allocate IR objects via placement new out of the per-class MemoryPool
+// (Program::mem_<class>) reached through the Function 'f'.
+#define NV50_IR_FUNC_ALLOC_OBJ_DEF(obj, f, args...)            \
+   new ((f)->getProgram()->mem_##obj.allocate()) obj(f, args)
+
+#define new_Instruction(f, args...)                      \
+   NV50_IR_FUNC_ALLOC_OBJ_DEF(Instruction, f, args)
+#define new_CmpInstruction(f, args...)                   \
+   NV50_IR_FUNC_ALLOC_OBJ_DEF(CmpInstruction, f, args)
+#define new_TexInstruction(f, args...)                   \
+   NV50_IR_FUNC_ALLOC_OBJ_DEF(TexInstruction, f, args)
+#define new_FlowInstruction(f, args...)                  \
+   NV50_IR_FUNC_ALLOC_OBJ_DEF(FlowInstruction, f, args)
+
+#define new_LValue(f, args...)                  \
+   NV50_IR_FUNC_ALLOC_OBJ_DEF(LValue, f, args)
+
+
+// Same, but for objects pooled directly on the Program 'p'.
+#define NV50_IR_PROG_ALLOC_OBJ_DEF(obj, p, args...)   \
+   new ((p)->mem_##obj.allocate()) obj(p, args)
+
+#define new_Symbol(p, args...)                  \
+   NV50_IR_PROG_ALLOC_OBJ_DEF(Symbol, p, args)
+#define new_ImmediateValue(p, args...)                  \
+   NV50_IR_PROG_ALLOC_OBJ_DEF(ImmediateValue, p, args)
+
+
+// Return pooled objects to the owning Program's pools.
+#define delete_Instruction(p, insn) (p)->releaseInstruction(insn)
+#define delete_Value(p, val) (p)->releaseValue(val)
+
+
+namespace nv50_ir {
+
+// Abstract forward-iteration interface shared by the IR container classes.
+class Iterator
+{
+public:
+   virtual ~Iterator() { };
+   virtual void next() = 0;
+   virtual void *get() const = 0;
+   virtual bool end() const = 0; // if true, get will return 0
+   virtual void reset() { assert(0); } // only for graph iterators
+};
+
+// NOTE(review): std::auto_ptr is deprecated in C++11 and removed in C++17;
+// switch to std::unique_ptr if/when the build moves to C++11 or later.
+typedef std::auto_ptr<Iterator> IteratorRef;
+
+// Iterator that can also modify the sequence it walks.
+class ManipIterator : public Iterator
+{
+public:
+   virtual bool insert(void *) = 0; // insert after current position
+   virtual void erase() = 0;
+};
+
+// WARNING: do not use a->prev/next for __item or __list
+
+// Unlink __item from its list and re-link it to itself.
+#define DLLIST_DEL(__item)                      \
+   do {                                         \
+      (__item)->prev->next = (__item)->next;    \
+      (__item)->next->prev = (__item)->prev;    \
+      (__item)->next = (__item);                \
+      (__item)->prev = (__item);                \
+   } while(0)
+
+// Insert __item directly before __list, i.e. at the tail of a circular
+// list whose sentinel/anchor node is __list.
+#define DLLIST_ADDTAIL(__list, __item)          \
+   do {                                         \
+      (__item)->next = (__list);                \
+      (__item)->prev = (__list)->prev;          \
+      (__list)->prev->next = (__item);          \
+      (__list)->prev = (__item);                \
+   } while(0)
+
+// Insert __item directly after __list (head insertion).
+#define DLLIST_ADDHEAD(__list, __item)          \
+   do {                                         \
+      (__item)->prev = (__list);                \
+      (__item)->next = (__list)->next;          \
+      (__list)->next->prev = (__item);          \
+      (__list)->next = (__item);                \
+   } while(0)
+
+// Splice two circular lists into one; 'ty' is the node pointer type used
+// for the temporary. Arguments are evaluated multiple times.
+#define DLLIST_MERGE(__listA, __listB, ty)      \
+   do {                                         \
+      ty prevB = (__listB)->prev;               \
+      (__listA)->prev->next = (__listB);        \
+      (__listB)->prev->next = (__listA);        \
+      (__listB)->prev = (__listA)->prev;        \
+      (__listA)->prev = prevB;                  \
+   } while(0)
+
+#define DLLIST_EMPTY(__list) ((__list)->next == (__list))
+
+// Iterate a DLList (declared below) with a forward iterator named 'it'.
+#define DLLIST_FOR_EACH(list, it) \
+   for (DLList::Iterator (it) = (list)->iterator(); !(it).end(); (it).next())
+
+// Circular doubly-linked list of void * payloads with an embedded sentinel
+// node ('head'). Items are heap-allocated on insertion and freed by
+// clear() / the destructor.
+class DLList
+{
+public:
+   class Item
+   {
+   public:
+      // A fresh item is self-linked until spliced into a list.
+      Item(void *priv) : next(this), prev(this), data(priv) { }
+
+   public:
+      Item *next;
+      Item *prev;
+      void *data;
+   };
+
+   DLList() : head(0) { }
+   ~DLList() { clear(); }
+
+   // Splice a new item directly after the sentinel (front of the list).
+   inline void insertHead(void *data)
+   {
+      Item *item = new Item(data);
+
+      assert(data);
+
+      item->prev = &head;
+      item->next = head.next;
+      head.next->prev = item;
+      head.next = item;
+   }
+
+   // Splice a new item directly before the sentinel (back of the list).
+   inline void insertTail(void *data)
+   {
+      Item *item = new Item(data);
+
+      assert(data);
+
+      DLLIST_ADDTAIL(&head, item);
+   }
+
+   inline void insert(void *data) { insertTail(data); }
+
+   void clear();
+
+   // Bidirectional iterator; the direction ('rev') is fixed at creation.
+   class Iterator : public ManipIterator
+   {
+   public:
+      Iterator(Item *head, bool r) : rev(r), pos(r ? head->prev : head->next),
+                                     term(head) { }
+
+      virtual void next() { if (!end()) pos = rev ? pos->prev : pos->next; }
+      virtual void *get() const { return pos->data; }
+      virtual bool end() const { return pos == term; }
+
+      // caution: if you're at end-2 and erase it, then do next, you're at end
+      virtual void erase();
+      virtual bool insert(void *data);
+
+      // move item to another list, no consistency with its iterators though
+      void moveToList(DLList&);
+
+   private:
+      const bool rev;
+      Item *pos;
+      Item *term; // the sentinel: reaching it means the iteration is done
+
+      friend class DLList;
+   };
+
+   inline void erase(Iterator& pos)
+   {
+      pos.erase();
+   }
+
+   Iterator iterator()
+   {
+      return Iterator(&head, false);
+   }
+
+   Iterator revIterator()
+   {
+      return Iterator(&head, true);
+   }
+
+private:
+   Item head;
+};
+
+// Small LIFO stack of POD items (pointer/int/uint/float/double union).
+// Storage grows by doubling and is released by the destructor or by
+// clear(true).
+class Stack
+{
+public:
+   class Item {
+   public:
+      union {
+         void *p;
+         int i;
+         unsigned int u;
+         float f;
+         double d;
+      } u;
+
+      Item() { memset(&u, 0, sizeof(u)); }
+   };
+
+   Stack() : size(0), limit(0), array(0) { }
+   ~Stack() { if (array) FREE(array); }
+
+   inline void push(int i)          { Item data; data.u.i = i; push(data); }
+   inline void push(unsigned int u) { Item data; data.u.u = u; push(data); }
+   inline void push(void *p)        { Item data; data.u.p = p; push(data); }
+   inline void push(float f)        { Item data; data.u.f = f; push(data); }
+
+   inline void push(Item data)
+   {
+      if (size == limit)
+         resize();
+      array[size++] = data;
+   }
+
+   // Popping an empty stack is a programming error (asserts); release
+   // builds return a zero-initialized Item.
+   inline Item pop()
+   {
+      if (!size) {
+         Item data;
+         assert(0);
+         return data;
+      }
+      return array[--size];
+   }
+
+   inline unsigned int getSize() { return size; }
+
+   inline Item& peek() { assert(size); return array[size - 1]; }
+
+   void clear(bool releaseStorage = false)
+   {
+      if (releaseStorage && array) {
+         FREE(array);
+         // Previously the pointer was left dangling here: with limit reset
+         // to 0, the next push() would REALLOC the freed pointer and the
+         // destructor would free it a second time.
+         array = NULL;
+      }
+      size = limit = 0;
+   }
+
+   void moveTo(Stack&); // move all items to target (not like push(pop()))
+
+private:
+   // Double the capacity (minimum 4 items).
+   void resize()
+   {
+      unsigned int sizeOld, sizeNew;
+
+      sizeOld = limit * sizeof(Item);
+      limit = MAX2(4, limit + limit);
+      sizeNew = limit * sizeof(Item);
+
+      array = (Item *)REALLOC(array, sizeOld, sizeNew);
+   }
+
+   unsigned int size;  // number of items on the stack
+   unsigned int limit; // allocated capacity in items
+   Item *array;
+};
+
+// Self-growing array of 32-bit/pointer union cells.
+class DynArray
+{
+public:
+   class Item
+   {
+   public:
+      union {
+         uint32_t u32;
+         void *p;
+      };
+   };
+
+   DynArray() : data(NULL), size(0) { }
+
+   ~DynArray() { if (data) FREE(data); }
+
+   // Grows the array on demand; the returned reference may point to fresh
+   // storage. NOTE(review): REALLOC does not appear to zero new cells, so
+   // grown slots are uninitialized until written -- TODO confirm against
+   // u_memory.h.
+   inline Item& operator[](unsigned int i)
+   {
+      if (i >= size)
+         resize(i);
+      return data[i];
+   }
+
+   // No bounds check: the caller must guarantee i < size.
+   inline const Item operator[](unsigned int i) const
+   {
+      return data[i];
+   }
+
+   // Double the capacity (starting at 8) until it exceeds 'index'.
+   void resize(unsigned int index)
+   {
+      const unsigned int oldSize = size * sizeof(Item);
+
+      if (!size)
+         size = 8;
+      while (size <= index)
+         size <<= 1;
+
+      data = (Item *)REALLOC(data, oldSize, size * sizeof(Item));
+   }
+
+   void clear()
+   {
+      FREE(data);
+      data = NULL;
+      size = 0;
+   }
+
+private:
+   Item *data;
+   unsigned int size; // allocated capacity in items
+};
+
+// Dense array of void * with stable integer ids: removed slots' ids are
+// recycled through a stack so live entries keep their ids.
+class ArrayList
+{
+public:
+   ArrayList() : size(0) { }
+
+   // Store 'item' and return its id; reuses a previously removed id if one
+   // is available, otherwise appends.
+   void insert(void *item, int& id)
+   {
+      id = ids.getSize() ? ids.pop().u.i : size++;
+      data[id].p = item;
+   }
+
+   // Clear the slot, recycle its id, and invalidate 'id' (set to -1).
+   // Note: 'size' is a high-water mark and is not decremented.
+   void remove(int& id)
+   {
+      const unsigned int uid = id;
+      assert(uid < size && data[id].p);
+      ids.push(uid);
+      data[uid].p = NULL;
+      id = -1;
+   }
+
+   inline int getSize() const { return size; }
+
+   inline void *get(unsigned int id) { assert(id < size); return data[id].p; }
+
+   // Iterator that transparently skips removed (NULL) slots.
+   class Iterator : public nv50_ir::Iterator
+   {
+   public:
+      Iterator(const ArrayList *array) : pos(0), data(array->data)
+      {
+         size = array->getSize();
+         if (size)
+            nextValid();
+      }
+
+      // Advance 'pos' to the next occupied slot (or past the end).
+      void nextValid() { while ((pos < size) && !data[pos].p) ++pos; }
+
+      void next() { if (pos < size) { ++pos; nextValid(); } }
+      void *get() const { assert(pos < size); return data[pos].p; }
+      bool end() const { return pos >= size; }
+
+   private:
+      unsigned int pos;
+      unsigned int size;
+      const DynArray& data;
+
+      friend class ArrayList;
+   };
+
+   Iterator iterator() const { return Iterator(this); }
+
+   void clear()
+   {
+      data.clear();
+      ids.clear(true);
+      size = 0;
+   }
+
+private:
+   DynArray data; // id -> item
+   Stack ids;     // recycled ids
+   unsigned int size;
+};
+
+// Set of integer positions kept as a sorted, coalesced singly-linked list
+// of half-open [bgn, end) ranges.
+class Interval
+{
+public:
+   Interval() : head(0), tail(0) { }
+   Interval(const Interval&);
+   ~Interval();
+
+   bool extend(int, int);
+   void insert(const Interval&);
+   void unify(Interval&); // clears source interval
+   void clear();
+
+   inline int begin() const { return head ? head->bgn : -1; }
+   inline int end() const { checkTail(); return tail ? tail->end : -1; }
+   inline bool isEmpty() const { return !head; }
+   bool overlaps(const Interval&) const;
+   bool contains(int pos) const;
+
+   // Total span from first begin to last end, including any gaps.
+   inline int extent() const { return end() - begin(); }
+   int length() const; // presumably the number of covered positions -- see .cpp
+
+   void print() const;
+
+   inline void checkTail() const;
+
+private:
+   class Range
+   {
+   public:
+      Range(int a, int b) : next(0), bgn(a), end(b) { }
+
+      Range *next;
+      int bgn;
+      int end;
+
+      // Merge any following ranges this one now touches or overlaps;
+      // updates *ptail when this range becomes the last one.
+      void coalesce(Range **ptail)
+      {
+         Range *rnn;
+
+         while (next && end >= next->bgn) {
+            assert(bgn <= next->bgn);
+            rnn = next->next;
+            end = MAX2(end, next->end);
+            delete next;
+            next = rnn;
+         }
+         if (!next)
+            *ptail = this;
+      }
+   };
+
+   Range *head;
+   Range *tail;
+};
+
+// Fixed-size bit vector stored as an array of 32-bit words.
+class BitSet
+{
+public:
+   BitSet() : marker(false), data(0), size(0) { }
+   BitSet(unsigned int nBits, bool zero) : marker(false), data(0), size(0)
+   {
+      allocate(nBits, zero);
+   }
+   ~BitSet()
+   {
+      if (data)
+         FREE(data);
+   }
+
+   bool allocate(unsigned int nBits, bool zero);
+   bool resize(unsigned int nBits); // keep old data, zero additional bits
+
+   inline unsigned int getSize() const { return size; }
+
+   // Set every word of the set to 'val' (e.g. 0 or 0xffffffff).
+   void fill(uint32_t val);
+
+   void setOr(BitSet *, BitSet *); // second BitSet may be NULL
+
+   inline void set(unsigned int i)
+   {
+      assert(i < size);
+      data[i / 32] |= 1 << (i % 32);
+   }
+   // NOTE: range may not cross 32 bit boundary (implies n <= 32)
+   // NOTE(review): the assert admits n == 32 (when i is word-aligned), but
+   // (1 << 32) is undefined behaviour -- callers should keep n < 32.
+   inline void setRange(unsigned int i, unsigned int n)
+   {
+      assert((i + n) <= size && (((i % 32) + n) <= 32));
+      data[i / 32] |= ((1 << n) - 1) << (i % 32);
+   }
+   // OR the mask 'm' into the word containing bit i.
+   inline void setMask(unsigned int i, uint32_t m)
+   {
+      assert(i < size);
+      data[i / 32] |= m;
+   }
+
+   inline void clr(unsigned int i)
+   {
+      assert(i < size);
+      data[i / 32] &= ~(1 << (i % 32));
+   }
+   // NOTE: range may not cross 32 bit boundary (implies n <= 32)
+   inline void clrRange(unsigned int i, unsigned int n)
+   {
+      assert((i + n) <= size && (((i % 32) + n) <= 32));
+      data[i / 32] &= ~(((1 << n) - 1) << (i % 32));
+   }
+
+   inline bool test(unsigned int i) const
+   {
+      assert(i < size);
+      return data[i / 32] & (1 << (i % 32));
+   }
+   // NOTE: range may not cross 32 bit boundary (implies n <= 32)
+   // True if ANY bit in [i, i + n) is set.
+   inline bool testRange(unsigned int i, unsigned int n) const
+   {
+      assert((i + n) <= size && (((i % 32) + n) <= 32));
+      return data[i / 32] & (((1 << n) - 1) << (i % 32));
+   }
+
+   // Find a range of size (<= 32) clear bits aligned to roundup_pow2(size).
+   int findFreeRange(unsigned int size) const;
+
+   BitSet& operator|=(const BitSet&);
+
+   // Copies the bit data only (not 'marker'); both sets must be allocated
+   // and of equal size. Only ceil(size / 8) bytes are copied.
+   BitSet& operator=(const BitSet& set)
+   {
+      assert(data && set.data);
+      assert(size == set.size);
+      memcpy(data, set.data, (set.size + 7) / 8);
+      return *this;
+   }
+
+   void andNot(const BitSet&);
+
+   // bits = (bits | setMask) & ~clrMask, applied to every 32-bit word
+   inline void periodicMask32(uint32_t setMask, uint32_t clrMask)
+   {
+      for (unsigned int i = 0; i < (size + 31) / 32; ++i)
+         data[i] = (data[i] | setMask) & ~clrMask;
+   }
+
+   unsigned int popCount() const;
+
+   void print() const;
+
+public:
+   bool marker; // for user
+
+private:
+   uint32_t *data;
+   unsigned int size; // size in bits
+};
+
+// Debug-only consistency check: verify that 'tail' is the last node of the
+// range list. NOTE(review): dereferences 'head' without a null check, so
+// this assumes a non-empty interval when the check is compiled in.
+void Interval::checkTail() const
+{
+#if NV50_DEBUG & NV50_DEBUG_PROG_RA
+   Range *r = head;
+   while (r->next)
+      r = r->next;
+   assert(tail == r);
+#endif
+}
+
+// Pool allocator for fixed-size objects. Memory comes from slabs of
+// (1 << objStepLog2) objects; released objects go onto a free list that is
+// threaded through their first pointer-sized bytes, so objSize must be at
+// least sizeof(void *).
+class MemoryPool
+{
+private:
+   // Grow the slab-pointer array by 'nr' entries (current length 'id').
+   inline bool enlargeAllocationsArray(const unsigned int id, unsigned int nr)
+   {
+      const unsigned int size = sizeof(uint8_t *) * id;
+      const unsigned int incr = sizeof(uint8_t *) * nr;
+
+      uint8_t **alloc = (uint8_t **)REALLOC(allocArray, size, size + incr);
+      if (!alloc)
+         return false;
+      allocArray = alloc;
+      return true;
+   }
+
+   // Allocate a new slab; every 32 slabs, also grow the slab-pointer array.
+   inline bool enlargeCapacity()
+   {
+      const unsigned int id = count >> objStepLog2;
+
+      uint8_t *const mem = (uint8_t *)MALLOC(objSize << objStepLog2);
+      if (!mem)
+         return false;
+
+      if (!(id % 32)) {
+         if (!enlargeAllocationsArray(id, 32)) {
+            FREE(mem);
+            return false;
+         }
+      }
+      allocArray[id] = mem;
+      return true;
+   }
+
+public:
+   // 'size' is the object size in bytes; 'incr' is log2 of the number of
+   // objects per slab.
+   MemoryPool(unsigned int size, unsigned int incr) : objSize(size),
+                                                      objStepLog2(incr)
+   {
+      allocArray = NULL;
+      released = NULL;
+      count = 0;
+   }
+
+   // Frees the slabs only; object destructors are NOT run here.
+   ~MemoryPool()
+   {
+      unsigned int allocCount = (count + (1 << objStepLog2) - 1) >> objStepLog2;
+      for (unsigned int i = 0; i < allocCount && allocArray[i]; ++i)
+         FREE(allocArray[i]);
+      if (allocArray)
+         FREE(allocArray);
+   }
+
+   // Return a raw objSize-byte slot (reusing the free list first), or NULL
+   // on allocation failure. The caller constructs in place.
+   void *allocate()
+   {
+      void *ret;
+      const unsigned int mask = (1 << objStepLog2) - 1;
+
+      if (released) {
+         ret = released;
+         released = *(void **)released;
+         return ret;
+      }
+
+      if (!(count & mask))
+         if (!enlargeCapacity())
+            return NULL;
+
+      ret = allocArray[count >> objStepLog2] + (count & mask) * objSize;
+      ++count;
+      return ret;
+   }
+
+   // Push a slot onto the free list; no destructor is run.
+   void release(void *ptr)
+   {
+      *(void **)ptr = released;
+      released = ptr;
+   }
+
+private:
+   uint8_t **allocArray; // array (list) of MALLOC allocations
+
+   void *released; // list of released objects
+
+   unsigned int count; // highest allocated object
+
+   const unsigned int objSize;
+   const unsigned int objStepLog2;
+};
+
+/**
+ * Composite object cloning policy.
+ *
+ * Encapsulates how sub-objects are to be handled (if at all) when a
+ * composite object is being cloned.
+ */
+template<typename C>
+class ClonePolicy
+{
+protected:
+   C *c; // context the clones are created in (e.g. the target Function)
+
+public:
+   ClonePolicy(C *c) : c(c) {}
+
+   C *context() { return c; }
+
+   // Return this policy's clone of obj, creating it via obj->clone(*this)
+   // the first time (subject to the subclass's lookup()).
+   template<typename T> T *get(T *obj)
+   {
+      void *clone = lookup(obj);
+      if (!clone)
+         clone = obj->clone(*this);
+      return reinterpret_cast<T *>(clone);
+   }
+
+   // Pre-register 'clone' as the clone of 'obj'.
+   template<typename T> void set(const T *obj, T *clone)
+   {
+      insert(obj, clone);
+   }
+
+protected:
+   virtual void *lookup(void *obj) = 0;
+   virtual void insert(const void *obj, void *clone) = 0;
+};
+
+/**
+ * Shallow non-recursive cloning policy.
+ *
+ * Objects cloned with the "shallow" policy don't clone their
+ * children recursively, instead, the new copy shares its children
+ * with the original object.
+ */
+template<typename C>
+class ShallowClonePolicy : public ClonePolicy<C>
+{
+public:
+   ShallowClonePolicy(C *c) : ClonePolicy<C>(c) {}
+
+protected:
+   // Returning the object itself makes get() hand back the original
+   // child instead of cloning it.
+   virtual void *lookup(void *obj)
+   {
+      return obj;
+   }
+
+   // Nothing to remember for shallow clones.
+   virtual void insert(const void *obj, void *clone)
+   {
+   }
+};
+
+// Convenience wrapper: clone obj in context c without recursively cloning
+// its children.
+template<typename C, typename T>
+inline T *cloneShallow(C *c, T *obj)
+{
+   ShallowClonePolicy<C> pol(c);
+   return obj->clone(pol);
+}
+
+/**
+ * Recursive cloning policy.
+ *
+ * Objects cloned with the "deep" policy clone their children
+ * recursively, keeping track of what has already been cloned to
+ * avoid making several new copies of the same object.
+ */
+template<typename C>
+class DeepClonePolicy : public ClonePolicy<C>
+{
+public:
+   DeepClonePolicy(C *c) : ClonePolicy<C>(c) {}
+
+private:
+   std::map<const void *, void *> map;
+
+protected:
+   // std::map::operator[] default-inserts NULL for unseen keys, which
+   // ClonePolicy::get() interprets as "not cloned yet".
+   virtual void *lookup(void *obj)
+   {
+      return map[obj];
+   }
+
+   virtual void insert(const void *obj, void *clone)
+   {
+      map[obj] = clone;
+   }
+};
+
+// Bidirectional map: two mirrored std::maps, one per direction.
+// 'r' (S -> T) and 'l' (T -> S) are read-only reference views; the
+// reference members implicitly disable copy assignment.
+template<typename S, typename T>
+struct bimap
+{
+   std::map<S, T> forth;
+   std::map<T, S> back;
+
+public:
+   bimap() : l(back), r(forth) { }
+   // The copy constructor rebinds l/r to this object's own maps.
+   bimap(const bimap<S, T> &m)
+      : forth(m.forth), back(m.back), l(back), r(forth) { }
+
+   // Insert the pair in both directions; like std::map::insert, this is a
+   // no-op for keys that are already present.
+   void insert(const S &s, const T &t)
+   {
+      forth.insert(std::make_pair(s, t));
+      back.insert(std::make_pair(t, s));
+   }
+
+   typedef typename std::map<T, S>::const_iterator l_iterator;
+   const std::map<T, S> &l;
+   typedef typename std::map<S, T>::const_iterator r_iterator;
+   const std::map<S, T> &r;
+};
+
+} // namespace nv50_ir
+
+#endif // __NV50_IR_UTIL_H__
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm
new file mode 100644
index 0000000..f40becc
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm
@@ -0,0 +1,96 @@
+//
+// nvc0 builtin library source; presumably assembled into the
+// nvc0_builtin_code[] table in target_lib_nvc0.asm.h -- TODO confirm the
+// assembler used.
+//
+// DIV U32
+//
+// UNR recurrence (q = a / b):
+// look for z such that 2^32 - b <= b * z < 2^32
+// then q - 1 <= (a * z) / 2^32 <= q
+//
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p1
+// SIZE: 22 / 14 * 8 bytes
+//
+bfind u32 $r2 $r1
+xor b32 $r2 $r2 0x1f
+mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+cvt u32 $r1 neg u32 $r1
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+cvt u32 $r2 neg u32 $r1
+add $r1 (mul u32 $r1 u32 $r0) $r3
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+ret
+//
+// DIV S32, like DIV U32 after taking ABS(inputs)
+//
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p3
+//
+set $p2 0x1 lt s32 $r0 0x0
+set $p3 0x1 lt s32 $r1 0x0 xor $p2
+cvt s32 $r0 abs s32 $r0
+cvt s32 $r1 abs s32 $r1
+bfind u32 $r2 $r1
+xor b32 $r2 $r2 0x1f
+mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+cvt u32 $r1 neg u32 $r1
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+cvt u32 $r2 neg u32 $r1
+add $r1 (mul u32 $r1 u32 $r0) $r3
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p3 cvt s32 $r0 neg s32 $r0
+$p2 cvt s32 $r1 neg s32 $r1
+ret
+//
+// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
+//
+// INPUT: $r0d (x)
+// OUTPUT: $r0d (rcp(x))
+// CLOBBER: $r2 - $r7
+// SIZE: 9 * 8 bytes
+//
+// NOTE(review): placeholder only -- implemented as nop + ret; the SIZE
+// figure above describes the intended final routine.
+nop
+ret
+// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
+//
+// INPUT: $r0d (x)
+// OUTPUT: $r0d (rsqrt(x))
+// CLOBBER: $r2 - $r7
+// SIZE: 14 * 8 bytes
+//
+// NOTE(review): placeholder only -- implemented as nop + ret.
+nop
+ret
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h
new file mode 100644
index 0000000..3790504
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvc0.asm.h
@@ -0,0 +1,112 @@
+
+// Pre-assembled nvc0 builtin library, presumably generated from
+// target_lib_nvc0.asm (do not edit by hand); entry points are given by
+// nvc0_builtin_offsets below.
+static const uint32_t nvc0_builtin_code[] =
+{
+   0x04009c03,
+   0x78000000,
+   0x7c209cdd,
+   0x0010dd18,
+   0x08309c03,
+   0x60000000,
+   0x05605c18,
+   0x0810dc2a,
+   0x0c209c43,
+   0x20040000,
+   0x0810dc03,
+   0x50000000,
+   0x0c209c43,
+   0x20040000,
+   0x0810dc03,
+   0x50000000,
+   0x0c209c43,
+   0x20040000,
+   0x0810dc03,
+   0x50000000,
+   0x0c209c43,
+   0x20040000,
+   0x0810dc03,
+   0x50000000,
+   0x0c209c43,
+   0x20040000,
+   0x0000dde4,
+   0x28000000,
+   0x08001c43,
+   0x50000000,
+   0x05609c18,
+   0x0010430d,
+   0x0811dc03,
+   0x1b0e0000,
+   0x08104103,
+   0x48000000,
+   0x04000002,
+   0x08000000,
+   0x0811c003,
+   0x1b0e0000,
+   0x08104103,
+   0x48000000,
+   0x040000ac,
+   0x90001dff,
+   0xfc05dc23,
+   0x188e0000,
+   0xfc17dc23,
+   0x18c40000,
+   0x03301e18,
+   0x07305e18,
+   0x04009c03,
+   0x78000000,
+   0x7c209cdd,
+   0x0010dd18,
+   0x08309c03,
+   0x60000000,
+   0x05605c18,
+   0x0810dc2a,
+   0x0c209c43,
+   0x20040000,
+   0x0810dc03,
+   0x50000000,
+   0x0c209c43,
+   0x20040000,
+   0x0810dc03,
+   0x50000000,
+   0x0c209c43,
+   0x20040000,
+   0x0810dc03,
+   0x50000000,
+   0x0c209c43,
+   0x20040000,
+   0x0810dc03,
+   0x50000000,
+   0x0c209c43,
+   0x20040000,
+   0x0000dde4,
+   0x28000000,
+   0x08001c43,
+   0x50000000,
+   0x05609c18,
+   0x0010430d,
+   0x0811dc03,
+   0x1b0e0000,
+   0x08104103,
+   0x48000000,
+   0x04000002,
+   0x08000000,
+   0x0811c003,
+   0x1b0e0000,
+   0x08104103,
+   0x48000000,
+   0x040000ac,
+   0x01700e18,
+   0x05704a18,
+   0x90001dff,
+   0x00001c08,
+   0x90001dff,
+   0x00001c08,
+   0x90001dff,
+};
+
+// Byte offsets of each builtin's entry point within nvc0_builtin_code;
+// the order presumably matches target_lib_nvc0.asm: DIV U32, DIV S32,
+// RCP F64 (stub), RSQ F64 (stub) -- TODO confirm against the
+// NVC0_BUILTIN_* enum.
+static const uint16_t nvc0_builtin_offsets[NVC0_BUILTIN_COUNT] =
+{
+   0x0000,
+   0x00b0,
+   0x0180,
+   0x0188
+};
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm
new file mode 100644
index 0000000..5adc9ff
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm
@@ -0,0 +1,698 @@
+//
+// DIV U32
+//
+// UNR recurrence (q = a / b):
+// look for z such that 2^32 - b <= b * z < 2^32
+// then q - 1 <= (a * z) / 2^32 <= q
+//
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p1
+// SIZE: 22 / 14 * 8 bytes
+//
+sched 0x28 0x4 0x28 0x4 0x28 0x28 0x28
+bfind u32 $r2 $r1
+long xor b32 $r2 $r2 0x1f
+long mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+long cvt u32 $r1 neg u32 $r1
+long mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+sched 0x4 0x28 0x4 0x28 0x28 0x2c 0x4
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+long cvt u32 $r2 neg u32 $r1
+long add $r1 (mul u32 $r1 u32 $r0) $r3
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+sched 0x28 0x2c 0x4 0x20 0x2e 0x28 0x20
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+long ret
+//
+// DIV S32, like DIV U32 after taking ABS(inputs)
+//
+// INPUT: $r0: dividend, $r1: divisor
+// OUTPUT: $r0: result, $r1: modulus
+// CLOBBER: $r2 - $r3, $p0 - $p3
+//
+set $p2 0x1 lt s32 $r0 0x0
+set $p3 0x1 lt s32 $r1 0x0 xor $p2
+sched 0x20 0x28 0x28 0x4 0x28 0x04 0x28
+long cvt s32 $r0 abs s32 $r0
+long cvt s32 $r1 abs s32 $r1
+bfind u32 $r2 $r1
+long xor b32 $r2 $r2 0x1f
+long mov b32 $r3 0x1
+shl b32 $r2 $r3 clamp $r2
+cvt u32 $r1 neg u32 $r1
+sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+sched 0x28 0x28 0x4 0x28 0x04 0x28 0x28
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mul $r3 u32 $r1 u32 $r2
+add $r2 (mul high u32 $r2 u32 $r3) $r2
+mov b32 $r3 $r0
+mul high $r0 u32 $r0 u32 $r2
+long cvt u32 $r2 neg u32 $r1
+long add $r1 (mul u32 $r1 u32 $r0) $r3
+sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
+set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+$p0 add b32 $r0 $r0 0x1
+$p0 set $p0 0x1 ge u32 $r1 $r2
+$p0 sub b32 $r1 $r1 $r2
+long $p0 add b32 $r0 $r0 0x1
+long $p3 cvt s32 $r0 neg s32 $r0
+sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
+$p2 cvt s32 $r1 neg s32 $r1
+long ret
+//
+// SULDP [for each format]
+// $r4d: address
+// $r2: surface info (format)
+// $p0: access predicate
+// $p1, $p2: caching predicate (00: cv, 01: ca, 10: cg)
+//
+// RGBA32
+$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0
+long ret
+// RGBA16_UNORM
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b128 $r0q ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b128 $r0q cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b128 $r0q cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r3 u16 1 $r1
+cvt rn f32 $r2 u16 0 $r1
+mul f32 $r3 $r3 0x37800074
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt rn f32 $r1 u16 1 $r0
+mul f32 $r2 $r2 0x37800074
+cvt rn f32 $r0 u16 0 $r0
+mul f32 $r1 $r1 0x37800074
+mul f32 $r0 $r0 0x37800074
+long ret
+// RGBA16_SNORM
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r3 s16 1 $r1
+cvt rn f32 $r2 s16 0 $r1
+mul f32 $r3 $r3 0x38000187
+cvt rn f32 $r1 s16 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r2 $r2 0x38000187
+cvt rn f32 $r0 s16 0 $r0
+mul f32 $r1 $r1 0x38000187
+mul f32 $r0 $r0 0x38000187
+long ret
+// RGBA16_SINT
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+cvt s32 $r3 s16 1 $r1
+cvt s32 $r2 s16 0 $r1
+cvt s32 $r1 s16 1 $r0
+cvt s32 $r0 s16 0 $r0
+long ret
+// RGBA16_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+cvt u32 $r3 u16 1 $r1
+cvt u32 $r2 u16 0 $r1
+cvt u32 $r1 u16 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt u32 $r0 u16 0 $r0
+long ret
+// RGBA16_FLOAT
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+cvt f32 $r3 f16 $r1 1
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt f32 $r2 f16 $r1 0
+cvt f32 $r1 f16 $r0 1
+cvt f32 $r0 f16 $r0 0
+long ret
+// RG32_FLOAT
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r2 0x00000000
+long mov b32 $r3 0x3f800000
+long ret
+// RG32_xINT
+$p1 suldgb b64 $r0d ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b64 $r0d cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b64 $r0d cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r2 0x00000000
+long mov b32 $r3 0x00000001
+long ret
+// RGB10A2_UNORM
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+ext u32 $r1 $r0 0x0a0a
+long mov b32 $r3 0x3f800000
+ext u32 $r2 $r0 0x0a14
+long and b32 $r0 $r0 0x3ff
+cvt rn f32 $r2 u16 0 $r2
+cvt rn f32 $r1 u16 0 $r1
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r2 $r2 0x3a802007
+cvt rn f32 $r0 u16 0 $r0
+mul f32 $r1 $r1 0x3a802007
+mul f32 $r0 $r0 0x3a802007
+long ret
+// RGB10A2_UINT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+ext u32 $r1 $r0 0x0a0a
+long mov b32 $r3 0x00000001
+ext u32 $r2 $r0 0x0a14
+long and b32 $r0 $r0 0x3ff
+long ret
+// RGBA8_UNORM
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r3 u8 3 $r0
+cvt rn f32 $r2 u8 2 $r0
+mul f32 $r3 $r3 0x3b808081
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt rn f32 $r1 u8 1 $r0
+mul f32 $r2 $r2 0x3b808081
+cvt rn f32 $r0 u8 0 $r0
+mul f32 $r1 $r1 0x3b808081
+mul f32 $r0 $r0 0x3b808081
+long ret
+// RGBA8_SNORM
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r3 s8 3 $r0
+cvt rn f32 $r2 s8 2 $r0
+mul f32 $r3 $r3 0x3c010204
+cvt rn f32 $r1 s8 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r2 $r2 0x3c010204
+cvt rn f32 $r0 s8 0 $r0
+mul f32 $r1 $r1 0x3c010204
+mul f32 $r0 $r0 0x3c010204
+long ret
+// RGBA8_SINT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt s32 $r3 s8 3 $r0
+cvt s32 $r2 s8 2 $r0
+cvt s32 $r1 s8 1 $r0
+cvt s32 $r0 s8 0 $r0
+long ret
+// RGBA8_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt u32 $r3 u8 3 $r0
+cvt u32 $r2 u8 2 $r0
+cvt u32 $r1 u8 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt u32 $r0 u8 0 $r0
+long ret
+// R5G6B5_UNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+ext u32 $r1 $r0 0x0605
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long mov b32 $r3 0x3f800000
+ext u32 $r2 $r0 0x050b
+long and b32 $r0 $r0 0x1f
+cvt rn f32 $r2 u8 0 $r2
+cvt rn f32 $r1 u8 0 $r1
+mul f32 $r2 $r2 0x3d042108
+cvt rn f32 $r0 u8 0 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r1 $r1 0x3c820821
+mul f32 $r0 $r0 0x3d042108
+long ret
+// R5G5B5X1_UNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+ext u32 $r1 $r0 0x0505
+ext u32 $r2 $r0 0x050a
+long and b32 $r0 $r0 0x1f
+long mov b32 $r3 0x3f800000
+cvt rn f32 $r2 u8 0 $r2
+cvt rn f32 $r1 u8 0 $r1
+cvt rn f32 $r0 u8 0 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r2 $r2 0x3d042108
+mul f32 $r1 $r1 0x3d042108
+mul f32 $r0 $r0 0x3d042108
+long ret
+// RG16_UNORM
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+cvt rn f32 $r1 u16 1 $r0
+cvt rn f32 $r0 u16 0 $r0
+mul f32 $r1 $r1 0x37800074
+mul f32 $r0 $r0 0x37800074
+long mov b32 $r2 0x00000000
+long mov b32 $r3 0x3f800000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long ret
+// RG16_SNORM
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+cvt rn f32 $r1 s16 1 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mov b32 $r2 0x00000000
+cvt rn f32 $r0 s16 0 $r0
+mul f32 $r1 $r1 0x38000187
+mul f32 $r0 $r0 0x38000187
+long ret
+// RG16_SINT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x00000001
+cvt s32 $r1 s16 1 $r0
+mov b32 $r2 0x00000000
+cvt s32 $r0 s16 0 $r0
+long ret
+// RG16_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x00000001
+cvt u32 $r1 u16 1 $r0
+mov b32 $r2 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt u32 $r0 u16 0 $r0
+long ret
+// RG16_FLOAT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt f32 $r1 f16 $r0 1
+mov b32 $r2 0x00000000
+cvt f32 $r0 f16 $r0 0
+long ret
+// R32_FLOAT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x3f800000
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// R32_xINT
+$p1 suldgb b32 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb b32 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// RG8_UNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+cvt rn f32 $r1 u8 1 $r0
+mov b32 $r2 0x00000000
+cvt rn f32 $r0 u8 0 $r0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r1 $r1 0x3b808081
+mul f32 $r0 $r0 0x3b808081
+long ret
+// RG8_SNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long mov b32 $r3 0x3f800000
+cvt rn f32 $r1 s8 1 $r0
+long mov b32 $r2 0x00000000
+cvt rn f32 $r0 s8 0 $r0
+mul f32 $r1 $r1 0x3c010204
+mul f32 $r0 $r0 0x3c010204
+long ret
+// RG8_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+cvt u32 $r1 u8 1 $r0
+long mov b32 $r2 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt u32 $r0 u8 0 $r0
+long ret
+// RG8_SINT
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+cvt s32 $r1 s8 1 $r0
+long mov b32 $r2 0x00000000
+cvt s32 $r0 s8 0 $r0
+long ret
+// R16_UNORM
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x3f800000
+cvt rn f32 $r0 u16 0 $r0
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+mul f32 $r0 $r0 0x37800074
+long ret
+// R16_SNORM
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+cvt rn f32 $r0 s16 0 $r0
+long mov b32 $r2 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long mov b32 $r1 0x00000000
+mul f32 $r0 $r0 0x38000187
+long ret
+// R16_SINT
+$p1 suldgb s16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb s16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb s16 $r0 cv zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// R16_UINT
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// R16_FLOAT
+$p1 suldgb u16 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p2 suldgb u16 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u16 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x3f800000
+long mov b32 $r2 0x00000000
+cvt f32 $r0 f16 $r0 0
+mov b32 $r1 0x00000000
+long ret
+// R8_UNORM
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
+mov b32 $r3 0x3f800000
+cvt rn f32 $r0 u8 0 $r0
+mov b32 $r2 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mul f32 $r0 $r0 0x3b808081
+mov b32 $r1 0x00000000
+long ret
+// R8_SNORM
+$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+mov b32 $r3 0x3f800000
+cvt rn f32 $r0 s8 0 $r0
+mov b32 $r2 0x00000000
+mul f32 $r0 $r0 0x3c010204
+mov b32 $r1 0x00000000
+long ret
+// R8_SINT
+$p1 suldgb s8 $r0 ca zero u8 g[$r4d] $r2 $p0
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb s8 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb s8 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+long ret
+// R8_UINT
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+$p1 suldgb u8 $r0 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb u8 $r0 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb u8 $r0 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x00000001
+long mov b32 $r2 0x00000000
+long mov b32 $r1 0x00000000
+sched 0x00 0x00 0x00 0x00 0x00 0x00 0x00
+long ret
+// R11G11B10_FLOAT TODO
+$p1 suldgb b32 $r3 ca zero u8 g[$r4d] $r2 $p0
+set $p1 0x1 $p1 xor not $p2
+$p2 suldgb b32 $r3 cg zero u8 g[$r4d] $r2 $p0
+$p1 suldgb b32 $r3 cv zero u8 g[$r4d] $r2 $p0
+long mov b32 $r3 0x3f800000
+long nop
+long ret
+//
+// RCP F64: Newton Raphson reciprocal(x): r_{i+1} = r_i * (2.0 - x * r_i)
+//
+// INPUT: $r0d (x)
+// OUTPUT: $r0d (rcp(x))
+// CLOBBER: $r2 - $r7
+// SIZE: 9 * 8 bytes
+//
+long nop
+long ret
+// RSQ F64: Newton Raphson rsqrt(x): r_{i+1} = r_i * (1.5 - 0.5 * x * r_i * r_i)
+//
+// INPUT: $r0d (x)
+// OUTPUT: $r0d (rsqrt(x))
+// CLOBBER: $r2 - $r7
+// SIZE: 14 * 8 bytes
+//
+long nop
+long ret
+//
+// Trap handler.
+// Requires at least 4 GPRs and 32 bytes of l[] memory to temporarily save GPRs.
+// Low 32 bytes of l[] memory shouldn't be used if resumeability is required.
+//
+// Trap info:
+// 0x000: mutex
+// 0x004: PC
+// 0x008: trapstat
+// 0x00c: warperr
+// 0x010: tidx
+// 0x014: tidy
+// 0x018: tidz
+// 0x01c: ctaidx
+// 0x020: ctaidy
+// 0x024: ctaidz
+// 0x030: $r0q
+// 0x130: $flags
+// 0x140: s[]
+//
+st b128 wb l[0x00] $r0q
+// check state of the warp and continue if it didn't cause the trap
+long mov b32 $r1 $trapstat
+long mov b32 $r3 $warperr
+mov $r2 $flags mask 0xffff
+and b32 0 $c $r1 $r3
+e $c bra #end_cont
+// spill control flow stack to l[]
+long mov b32 $r3 16
+spill_cfstack:
+preret #end_exit
+sub b32 $r3 $c $r3 0x1
+lg $c bra #spill_cfstack
+// retrieve pointer to trap info
+mov b32 $r0 c0[0x1900]
+mov b32 $r1 c0[0x1904]
+// we only let a single faulting thread store its state
+mov b32 $r3 0x1
+exch b32 $r3 g[$r0d] $r3
+joinat #end_exit
+set $p0 0x1 eq u32 $r3 0x1
+join $p0 nop
+// store $c and $p registers
+st b32 wb g[$r0d+0x130] $r2
+// store $trapstat and $warperr
+long mov b32 $r2 $trapstat
+long mov b32 $r3 $warperr
+st b64 wb g[$r0d+0x8] $r2d
+// store registers
+st b128 wb g[$r0d+0x40] $r4q
+st b128 wb g[$r0d+0x50] $r8q
+st b128 wb g[$r0d+0x60] $r12q
+st b128 wb g[$r0d+0x70] $r16q
+st b128 wb g[$r0d+0x80] $r20q
+st b128 wb g[$r0d+0x90] $r24q
+st b128 wb g[$r0d+0xa0] $r28q
+st b128 wb g[$r0d+0xb0] $r32q
+st b128 wb g[$r0d+0xc0] $r36q
+st b128 wb g[$r0d+0xd0] $r40q
+st b128 wb g[$r0d+0xe0] $r44q
+st b128 wb g[$r0d+0xf0] $r48q
+st b128 wb g[$r0d+0x100] $r52q
+st b128 wb g[$r0d+0x110] $r56q
+st b128 wb g[$r0d+0x120] $r60q
+ld b64 $r2d cs l[0x0]
+st b64 wb g[$r0d+0x30] $r2d
+ld b64 $r2d cs l[0x8]
+st b64 wb g[$r0d+0x38] $r2d
+// store thread id
+long mov b32 $r2 $tidx
+long mov b32 $r3 $tidy
+st b64 wb g[$r0d+0x10] $r2d
+long mov b32 $r2 $tidz
+long mov b32 $r3 $ctaidx
+st b64 wb g[$r0d+0x18] $r2d
+long mov b32 $r2 $ctaidy
+long mov b32 $r3 $ctaidz
+st b64 wb g[$r0d+0x20] $r2d
+// store shared memory (in reverse order so $r0d is base again at the end)
+long mov b32 $r3 $smemsz
+sub b32 $r3 $c $r3 0x4
+s $c bra #shared_done
+add b32 $r0 $c $r0 $r3
+add b32 $r1 $r1 0x0 $c
+shared_loop:
+long ld b32 $r2 s[$r3]
+long st b32 wb g[$r0d+0x140] $r2
+sub b32 $r0 $c $r0 0x4
+sub b32 $r1 $r1 0x0 $c
+sub b32 $r3 $c $r3 0x4
+lg $c bra #shared_loop
+shared_done:
+// search the stack for trap entry to retrieve PC
+mov b32 $r0 c0[0x1908]
+mov b32 $r1 c0[0x190c]
+membar sys
+// invalidate caches so we can read stack entries via g[]
+cctl ivall 0 l[0]
+cctl ivall 0 g[$r0d]
+// get offsets
+mov b32 $r2 $physid
+ext u32 $r3 $r2 0x0814 // MP id
+ext u32 $r2 $r2 0x0608 // warp id
+mul $r2 u32 $r2 u32 c0[0x1914] // warp offset
+mul $r3 u32 $r3 u32 c0[0x1910] // MP offset
+add b32 $r2 $r2 $r3 // MP + warp offset
+add b32 $r0 $c $r0 $r2
+add b32 $r1 $r1 0x0 $c
+search_cstack:
+mov b32 $r3 c0[0x1918] // cstack size
+ld u8 $r2 cv g[$r0d+0x8]
+set $p0 0x1 eq u32 $r2 0xa
+$p0 bra #entry_found
+add b32 $r0 $c $r0 0x10
+add b32 $r1 $r1 0x0 $c
+sub b32 $r3 $c $r3 0x10
+lg $c bra #search_cstack
+bra #end_exit
+entry_found:
+// load PC (may be unaligned and spread out)
+ld b32 $r2 cv g[$r0d]
+mov b32 $r0 c0[0x1900]
+mov b32 $r1 c0[0x1904]
+st b32 wb g[$r0d+0x4] $r2
+join nop
+// invalidate caches and exit
+end_exit:
+cctl ivall 0 g[0]
+bpt pause 0x0
+rtt terminate
+end_cont:
+bpt pause 0x0
+mov $flags $r2 mask 0xffff
+ld b128 $r0q cs l[0x00]
+rtt
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h
new file mode 100644
index 0000000..53fa12c
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nve4.asm.h
@@ -0,0 +1,592 @@
+
+// Assembled from target_lib_nve4.asm by envyas -m nvc0 -V nve4 -W.
+
+static const uint64_t nve4_builtin_code[] =
+{
+ 0x2282828042804287ULL,
+ 0x7800000004009c03ULL,
+ 0x380000007c209c82ULL,
+ 0x180000000400dde2ULL,
+ 0x6000000008309c03ULL,
+ 0x1c00000005205d04ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x2282828282828287ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x2042c28280428047ULL,
+ 0x200400000c209c43ULL,
+ 0x280000000000dde4ULL,
+ 0x5000000008001c43ULL,
+ 0x1c00000005209d04ULL,
+ 0x2006000000105c03ULL,
+ 0x1b0e00000811dc03ULL,
+ 0x4800000008104103ULL,
+ 0x220282e20042c287ULL,
+ 0x0800000004000002ULL,
+ 0x1b0e00000811c003ULL,
+ 0x4800000008104103ULL,
+ 0x0800000004000002ULL,
+ 0x9000000000001de7ULL,
+ 0x188e0000fc05dc23ULL,
+ 0x18c40000fc17dc23ULL,
+ 0x2280428042828207ULL,
+ 0x1c00000001201ec4ULL,
+ 0x1c00000005205ec4ULL,
+ 0x7800000004009c03ULL,
+ 0x380000007c209c82ULL,
+ 0x180000000400dde2ULL,
+ 0x6000000008309c03ULL,
+ 0x1c00000005205d04ULL,
+ 0x2282828282828287ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x2282804280428287ULL,
+ 0x200400000c209c43ULL,
+ 0x500000000810dc03ULL,
+ 0x200400000c209c43ULL,
+ 0x280000000000dde4ULL,
+ 0x5000000008001c43ULL,
+ 0x1c00000005209d04ULL,
+ 0x2006000000105c03ULL,
+ 0x22028042c28042c7ULL,
+ 0x1b0e00000811dc03ULL,
+ 0x4800000008104103ULL,
+ 0x0800000004000002ULL,
+ 0x1b0e00000811c003ULL,
+ 0x4800000008104103ULL,
+ 0x0800000004000002ULL,
+ 0x1c00000001200f84ULL,
+ 0x22c200428042e047ULL,
+ 0x1c00000005204b84ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004c5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009c5ULL,
+ 0xd4004000084007c5ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084004c5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009c5ULL,
+ 0xd4004000084007c5ULL,
+ 0x1900000004a0dc04ULL,
+ 0x1800000004a09c04ULL,
+ 0x30de0001d030dc02ULL,
+ 0x2000000000000007ULL,
+ 0x1900000000a05c04ULL,
+ 0x30de0001d0209c02ULL,
+ 0x1800000000a01c04ULL,
+ 0x30de0001d0105c02ULL,
+ 0x30de0001d0001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x2000000000000007ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0xd4004000084007a5ULL,
+ 0x1900000004a0de04ULL,
+ 0x1800000004a09e04ULL,
+ 0x30e000061c30dc02ULL,
+ 0x1900000000a05e04ULL,
+ 0x2000000000000007ULL,
+ 0x30e000061c209c02ULL,
+ 0x1800000000a01e04ULL,
+ 0x30e000061c105c02ULL,
+ 0x30e000061c001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084009a5ULL,
+ 0xd4004000084007a5ULL,
+ 0x1d00000004a0de84ULL,
+ 0x1c00000004a09e84ULL,
+ 0x1d00000000a05e84ULL,
+ 0x1c00000000a01e84ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0xd4004000084007a5ULL,
+ 0x1d00000004a0dc04ULL,
+ 0x1c00000004a09c04ULL,
+ 0x1d00000000a05c04ULL,
+ 0x2000000000000007ULL,
+ 0x1c00000000a01c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0xd4004000084007a5ULL,
+ 0x1100000004a0dc04ULL,
+ 0x2000000000000007ULL,
+ 0x1000000004a09c04ULL,
+ 0x1100000000a05c04ULL,
+ 0x1000000000a01c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084007a5ULL,
+ 0x1800000000009de2ULL,
+ 0x18fe00000000dde2ULL,
+ 0x9000000000001de7ULL,
+ 0xd4004000084004a5ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd4004000084009a5ULL,
+ 0x2000000000000007ULL,
+ 0xd4004000084007a5ULL,
+ 0x1800000000009de2ULL,
+ 0x180000000400dde2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400785ULL,
+ 0x7000c02828005c03ULL,
+ 0x18fe00000000dde2ULL,
+ 0x7000c02850009c03ULL,
+ 0x3800000ffc001c02ULL,
+ 0x1800000008a09c04ULL,
+ 0x1800000004a05c04ULL,
+ 0x2000000000000007ULL,
+ 0x30ea00801c209c02ULL,
+ 0x1800000000a01c04ULL,
+ 0x30ea00801c105c02ULL,
+ 0x30ea00801c001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x7000c02828005c03ULL,
+ 0x180000000400dde2ULL,
+ 0x7000c02850009c03ULL,
+ 0x3800000ffc001c02ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x198000000020dc04ULL,
+ 0x1900000000209c04ULL,
+ 0x30ee02020430dc02ULL,
+ 0x2000000000000007ULL,
+ 0x1880000000205c04ULL,
+ 0x30ee020204209c02ULL,
+ 0x1800000000201c04ULL,
+ 0x30ee020204105c02ULL,
+ 0x30ee020204001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x2000000000000007ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x198000000020de04ULL,
+ 0x1900000000209e04ULL,
+ 0x30f004081030dc02ULL,
+ 0x1880000000205e04ULL,
+ 0x2000000000000007ULL,
+ 0x30f0040810209c02ULL,
+ 0x1800000000201e04ULL,
+ 0x30f0040810105c02ULL,
+ 0x30f0040810001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x1d8000000020de84ULL,
+ 0x1d00000000209e84ULL,
+ 0x1c80000000205e84ULL,
+ 0x1c00000000201e84ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x1d8000000020dc04ULL,
+ 0x1d00000000209c04ULL,
+ 0x1c80000000205c04ULL,
+ 0x2000000000000007ULL,
+ 0x1c00000000201c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x7000c01814005c03ULL,
+ 0x2000000000000007ULL,
+ 0x18fe00000000dde2ULL,
+ 0x7000c0142c009c03ULL,
+ 0x380000007c001c02ULL,
+ 0x1800000008209c04ULL,
+ 0x1800000004205c04ULL,
+ 0x30f4108420209c02ULL,
+ 0x1800000000201c04ULL,
+ 0x2000000000000007ULL,
+ 0x30f2082084105c02ULL,
+ 0x30f4108420001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x2000000000000007ULL,
+ 0x7000c01414005c03ULL,
+ 0x7000c01428009c03ULL,
+ 0x380000007c001c02ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000008209c04ULL,
+ 0x1800000004205c04ULL,
+ 0x1800000000201c04ULL,
+ 0x2000000000000007ULL,
+ 0x30f4108420209c02ULL,
+ 0x30f4108420105c02ULL,
+ 0x30f4108420001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400785ULL,
+ 0x1900000000a05c04ULL,
+ 0x1800000000a01c04ULL,
+ 0x30de0001d0105c02ULL,
+ 0x30de0001d0001c02ULL,
+ 0x1800000000009de2ULL,
+ 0x18fe00000000dde2ULL,
+ 0x2000000000000007ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1900000000a05e04ULL,
+ 0x2000000000000007ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000a01e04ULL,
+ 0x30e000061c105c02ULL,
+ 0x30e000061c001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x180000000400dde2ULL,
+ 0x1d00000000a05e84ULL,
+ 0x1800000000009de2ULL,
+ 0x1c00000000a01e84ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x180000000400dde2ULL,
+ 0x1d00000000a05c04ULL,
+ 0x1800000000009de2ULL,
+ 0x2000000000000007ULL,
+ 0x1c00000000a01c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x18fe00000000dde2ULL,
+ 0x2000000000000007ULL,
+ 0x1100000000a05c04ULL,
+ 0x1800000000009de2ULL,
+ 0x1000000000a01c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400985ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400785ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400485ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400985ULL,
+ 0xd400400008400785ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x2000000000000007ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1880000000205c04ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000201c04ULL,
+ 0x2000000000000007ULL,
+ 0x30ee020204105c02ULL,
+ 0x30ee020204001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x2000000000000007ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1880000000205e04ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000201e04ULL,
+ 0x30f0040810105c02ULL,
+ 0x30f0040810001c02ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x180000000400dde2ULL,
+ 0x1c80000000205c04ULL,
+ 0x1800000000009de2ULL,
+ 0x2000000000000007ULL,
+ 0x1c00000000201c04ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x180000000400dde2ULL,
+ 0x2000000000000007ULL,
+ 0x1c80000000205e84ULL,
+ 0x1800000000009de2ULL,
+ 0x1c00000000201e84ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400745ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000a01c04ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x30de0001d0001c02ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000a01e04ULL,
+ 0x1800000000009de2ULL,
+ 0x2000000000000007ULL,
+ 0x1800000000005de2ULL,
+ 0x30e000061c001c02ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400465ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400965ULL,
+ 0xd400400008400765ULL,
+ 0x2000000000000007ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400945ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400745ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400445ULL,
+ 0x0c5400000013dc04ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400945ULL,
+ 0xd400400008400745ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1000000000a01c04ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400405ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400905ULL,
+ 0xd400400008400705ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000201c04ULL,
+ 0x1800000000009de2ULL,
+ 0x2000000000000007ULL,
+ 0x30ee020204001c02ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400405ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400905ULL,
+ 0xd400400008400705ULL,
+ 0x2000000000000007ULL,
+ 0x18fe00000000dde2ULL,
+ 0x1800000000201e04ULL,
+ 0x1800000000009de2ULL,
+ 0x30f0040810001c02ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0xd400400008400425ULL,
+ 0x2000000000000007ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400925ULL,
+ 0xd400400008400725ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x9000000000001de7ULL,
+ 0x2000000000000007ULL,
+ 0xd400400008400405ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd400400008400905ULL,
+ 0xd400400008400705ULL,
+ 0x180000000400dde2ULL,
+ 0x1800000000009de2ULL,
+ 0x1800000000005de2ULL,
+ 0x2000000000000007ULL,
+ 0x9000000000001de7ULL,
+ 0xd40040000840c485ULL,
+ 0x0c5400000013dc04ULL,
+ 0xd40040000840c985ULL,
+ 0xd40040000840c785ULL,
+ 0x18fe00000000dde2ULL,
+ 0x4000000000001de4ULL,
+ 0x9000000000001de7ULL,
+ 0x4000000000001de4ULL,
+ 0x9000000000001de7ULL,
+ 0x4000000000001de4ULL,
+ 0x9000000000001de7ULL,
+ 0xc800000003f01cc5ULL,
+ 0x2c00000100005c04ULL,
+ 0x2c0000010800dc04ULL,
+ 0x3000c3fffff09c04ULL,
+ 0x680100000c1fdc03ULL,
+ 0x4000000a60001c47ULL,
+ 0x180000004000dde2ULL,
+ 0x78000009c0000007ULL,
+ 0x0c0000000430dd02ULL,
+ 0x4003ffffa0001ca7ULL,
+ 0x2800406400001de4ULL,
+ 0x2800406410005de4ULL,
+ 0x180000000400dde2ULL,
+ 0x547e18000000dd05ULL,
+ 0x60000008e0000007ULL,
+ 0x190ec0000431dc03ULL,
+ 0x40000000000001f4ULL,
+ 0x94000004c0009c85ULL,
+ 0x2c00000100009c04ULL,
+ 0x2c0000010800dc04ULL,
+ 0x9400000020009ca5ULL,
+ 0x9400000100011cc5ULL,
+ 0x9400000140021cc5ULL,
+ 0x9400000180031cc5ULL,
+ 0x94000001c0041cc5ULL,
+ 0x9400000200051cc5ULL,
+ 0x9400000240061cc5ULL,
+ 0x9400000280071cc5ULL,
+ 0x94000002c0081cc5ULL,
+ 0x9400000300091cc5ULL,
+ 0x94000003400a1cc5ULL,
+ 0x94000003800b1cc5ULL,
+ 0x94000003c00c1cc5ULL,
+ 0x94000004000d1cc5ULL,
+ 0x94000004400e1cc5ULL,
+ 0x94000004800f1cc5ULL,
+ 0xc000000003f09ea5ULL,
+ 0x94000000c0009ca5ULL,
+ 0xc000000023f09ea5ULL,
+ 0x94000000e0009ca5ULL,
+ 0x2c00000084009c04ULL,
+ 0x2c0000008800dc04ULL,
+ 0x9400000040009ca5ULL,
+ 0x2c0000008c009c04ULL,
+ 0x2c0000009400dc04ULL,
+ 0x9400000060009ca5ULL,
+ 0x2c00000098009c04ULL,
+ 0x2c0000009c00dc04ULL,
+ 0x9400000080009ca5ULL,
+ 0x2c000000c800dc04ULL,
+ 0x0c0000001030dd02ULL,
+ 0x4000000100001ea7ULL,
+ 0x480100000c001c03ULL,
+ 0x0800000000105c42ULL,
+ 0xc100000000309c85ULL,
+ 0x9400000500009c85ULL,
+ 0x0c00000010001d02ULL,
+ 0x0800000000105d42ULL,
+ 0x0c0000001030dd02ULL,
+ 0x4003ffff40001ca7ULL,
+ 0x2800406420001de4ULL,
+ 0x2800406430005de4ULL,
+ 0xe000000000001c45ULL,
+ 0xd000000003ffdcc5ULL,
+ 0x9c000000000fdcc5ULL,
+ 0x2c0000000c009c04ULL,
+ 0x7000c0205020dc03ULL,
+ 0x7000c01820209c03ULL,
+ 0x5000406450209c03ULL,
+ 0x500040644030dc03ULL,
+ 0x480000000c209c03ULL,
+ 0x4801000008001c03ULL,
+ 0x0800000000105c42ULL,
+ 0x280040646000dde4ULL,
+ 0x8400000020009f05ULL,
+ 0x190ec0002821dc03ULL,
+ 0x40000000800001e7ULL,
+ 0x0c00000040001c02ULL,
+ 0x0800000000105c42ULL,
+ 0x0c0000004030dd02ULL,
+ 0x00029dff0ffc5cbfULL,
+ 0x8400000000009f85ULL,
+ 0x2800406400001de4ULL,
+ 0x2800406410005de4ULL,
+ 0x9400000010009c85ULL,
+ 0x4000000000001df4ULL,
+ 0x9800000003ffdcc5ULL,
+ 0xd000000000008007ULL,
+ 0xa000000000004007ULL,
+ 0xd000000000008007ULL,
+ 0x3400c3fffc201c04ULL,
+ 0xc000000003f01ec5ULL,
+ 0xa000000000000007ULL
+};
+
+static const uint16_t nve4_builtin_offsets[NVC0_BUILTIN_COUNT] =
+{
+ 0x0000,
+ 0x00f0,
+ 0x0f08,
+ 0x0f18,
+};
diff --git a/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h
new file mode 100644
index 0000000..d10b6b0
--- /dev/null
+++ b/src/gallium/drivers/nouveau/codegen/target_lib_nvf0.asm.h
@@ -0,0 +1,13 @@
+
+static const uint64_t nvf0_builtin_code[] =
+{
+ 0x19000000001c003cULL,
+};
+
+static const uint16_t nvf0_builtin_offsets[NVC0_BUILTIN_COUNT] =
+{
+ 0,
+ 0,
+ 0,
+ 0
+};