1 files changed, 506 insertions, 278 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index e218a85..3bec728 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -43,6 +43,7 @@
 #include "brw_wm.h"
 #include "brw_fs.h"
 #include "brw_cs.h"
+#include "brw_vec4_gs_visitor.h"
 #include "brw_cfg.h"
 #include "brw_dead_control_flow.h"
 #include "main/uniforms.h"
@@ -75,8 +76,9 @@ fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
 
    /* This will be the case for almost all instructions. */
    switch (dst.file) {
-   case GRF:
-   case HW_REG:
+   case VGRF:
+   case ARF:
+   case FIXED_GRF:
    case MRF:
    case ATTR:
       this->regs_written = DIV_ROUND_UP(dst.component_size(exec_size),
@@ -203,7 +205,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
 
    int regs_written = 4 * (bld.dispatch_width() / 8) * scale;
-   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), dst.type);
+   fs_reg vec4_result = fs_reg(VGRF, alloc.allocate(regs_written), dst.type);
    fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
    inst->regs_written = regs_written;
 
@@ -232,7 +234,7 @@ fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
    const fs_builder ubld = bld.annotate("send dependency resolve")
                               .half(0);
 
-   ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
+   ubld.MOV(ubld.null_reg_f(), fs_reg(VGRF, grf, BRW_REGISTER_TYPE_F));
 }
 
 bool
@@ -283,14 +285,15 @@ fs_inst::is_send_from_grf() const
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
       return true;
    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
-      return src[1].file == GRF;
+      return src[1].file == VGRF;
    case FS_OPCODE_FB_WRITE:
-      return src[0].file == GRF;
+      return src[0].file == VGRF;
    default:
       if (is_tex())
-         return src[0].file == GRF;
+         return src[0].file == VGRF;
 
       return false;
    }
@@ -303,10 +306,10 @@ fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
       return false;
 
    fs_reg reg = this->src[0];
-   if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
+   if (reg.file != VGRF || reg.reg_offset != 0 || reg.stride == 0)
       return false;
 
-   if (grf_alloc.sizes[reg.reg] != this->regs_written)
+   if (grf_alloc.sizes[reg.nr] != this->regs_written)
       return false;
 
    for (int i = 0; i < this->sources; i++) {
@@ -378,7 +381,7 @@ fs_reg::fs_reg(float f)
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_F;
    this->stride = 0;
-   this->fixed_hw_reg.dw1.f = f;
+   this->f = f;
 }
 
 /** Immediate value constructor. */
@@ -388,7 +391,7 @@ fs_reg::fs_reg(int32_t i)
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_D;
    this->stride = 0;
-   this->fixed_hw_reg.dw1.d = i;
+   this->d = i;
 }
 
 /** Immediate value constructor. */
@@ -398,7 +401,7 @@ fs_reg::fs_reg(uint32_t u)
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_UD;
    this->stride = 0;
-   this->fixed_hw_reg.dw1.ud = u;
+   this->ud = u;
 }
 
 /** Vector float immediate value constructor. */
@@ -407,7 +410,7 @@ fs_reg::fs_reg(uint8_t vf[4])
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_VF;
-   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
+   memcpy(&this->ud, vf, sizeof(unsigned));
 }
 
 /** Vector float immediate value constructor. */
@@ -416,42 +419,38 @@ fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
    init();
    this->file = IMM;
    this->type = BRW_REGISTER_TYPE_VF;
-   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
-                               (vf1 <<  8) |
-                               (vf2 << 16) |
-                               (vf3 << 24);
+   this->ud = (vf0 <<  0) | (vf1 <<  8) | (vf2 << 16) | (vf3 << 24);
 }
 
-/** Fixed brw_reg. */
-fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
+fs_reg::fs_reg(struct brw_reg reg) :
+   backend_reg(reg)
 {
-   init();
-   this->file = HW_REG;
-   this->fixed_hw_reg = fixed_hw_reg;
-   this->type = fixed_hw_reg.type;
+   this->reg_offset = 0;
+   this->subreg_offset = 0;
+   this->reladdr = NULL;
+   this->stride = 1;
+   if (this->file == IMM &&
+       (this->type != BRW_REGISTER_TYPE_V &&
+        this->type != BRW_REGISTER_TYPE_UV &&
+        this->type != BRW_REGISTER_TYPE_VF)) {
+      this->stride = 0;
+   }
 }
 
 bool
 fs_reg::equals(const fs_reg &r) const
 {
-   return (file == r.file &&
-           reg == r.reg &&
+   return (memcmp((brw_reg *)this, (brw_reg *)&r, sizeof(brw_reg)) == 0 &&
            reg_offset == r.reg_offset &&
            subreg_offset == r.subreg_offset &&
-           type == r.type &&
-           negate == r.negate &&
-           abs == r.abs &&
            !reladdr && !r.reladdr &&
-           ((file != HW_REG && file != IMM) ||
-            memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
-                   sizeof(fixed_hw_reg)) == 0) &&
            stride == r.stride);
 }
 
 fs_reg &
 fs_reg::set_smear(unsigned subreg)
 {
-   assert(file != HW_REG && file != IMM);
+   assert(file != ARF && file != FIXED_GRF && file != IMM);
    subreg_offset = subreg * type_sz(type);
    stride = 0;
    return *this;
@@ -466,9 +465,9 @@ fs_reg::is_contiguous() const
 unsigned
 fs_reg::component_size(unsigned width) const
 {
-   const unsigned stride = (file != HW_REG ? this->stride :
-                            fixed_hw_reg.hstride == 0 ? 0 :
-                            1 << (fixed_hw_reg.hstride - 1));
+   const unsigned stride = ((file != ARF && file != FIXED_GRF) ? this->stride :
+                            hstride == 0 ? 0 :
+                            1 << (hstride - 1));
    return MAX2(width * stride, 1) * type_sz(type);
 }
 
@@ -514,6 +513,19 @@ type_size_scalar(const struct glsl_type *type)
 }
 
 /**
+ * Returns the number of scalar components needed to store type, assuming
+ * that every vector is padded out to a full vec4.
+ *
+ * The size is computed by type_size_vec4() and multiplied by 4, so the
+ * result counts components in the same way as type_size_scalar().
+ */
+extern "C" int
+type_size_vec4_times_4(const struct glsl_type *type)
+{
+   return 4 * type_size_vec4(type);
+}
+
+/**
  * Create a MOV to read the timestamp register.
  *
  * The caller is responsible for emitting the MOV.  The return value is
@@ -529,7 +541,7 @@ fs_visitor::get_timestamp(const fs_builder &bld)
                                           0),
                              BRW_REGISTER_TYPE_UD));
 
-   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   fs_reg dst = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 
    /* We want to read the 3 fields we care about even if it's not enabled in
     * the dispatch.
@@ -584,7 +596,7 @@ fs_visitor::emit_shader_time_end()
 
    fs_reg start = shader_start_time;
    start.negate = true;
-   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   fs_reg diff = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    diff.set_smear(0);
 
    const fs_builder cbld = ibld.group(1, 0);
@@ -706,7 +718,7 @@ fs_inst::components_read(unsigned i) const
       assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
       /* First/second FB write color. */
       if (i < 2)
-         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud;
+         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
       else
          return 1;
 
@@ -717,6 +729,7 @@ fs_inst::components_read(unsigned i) const
    case SHADER_OPCODE_TXS_LOGICAL:
    case FS_OPCODE_TXB_LOGICAL:
    case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
    case SHADER_OPCODE_TXF_UMS_LOGICAL:
    case SHADER_OPCODE_TXF_MCS_LOGICAL:
    case SHADER_OPCODE_LOD_LOGICAL:
@@ -725,13 +738,16 @@ fs_inst::components_read(unsigned i) const
       assert(src[8].file == IMM && src[9].file == IMM);
       /* Texture coordinates. */
       if (i == 0)
-         return src[8].fixed_hw_reg.dw1.ud;
+         return src[8].ud;
       /* Texture derivatives. */
       else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
-         return src[9].fixed_hw_reg.dw1.ud;
+         return src[9].ud;
       /* Texture offset. */
       else if (i == 7)
          return 2;
+      /* MCS */
+      else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
+         return 2;
       else
          return 1;
 
@@ -740,7 +756,7 @@ fs_inst::components_read(unsigned i) const
       assert(src[3].file == IMM);
       /* Surface coordinates. */
       if (i == 0)
-         return src[3].fixed_hw_reg.dw1.ud;
+         return src[3].ud;
       /* Surface operation source (ignored for reads). */
       else if (i == 1)
          return 0;
@@ -753,10 +769,10 @@ fs_inst::components_read(unsigned i) const
              src[4].file == IMM);
       /* Surface coordinates. */
       if (i == 0)
-         return src[3].fixed_hw_reg.dw1.ud;
+         return src[3].ud;
       /* Surface operation source. */
       else if (i == 1)
-         return src[4].fixed_hw_reg.dw1.ud;
+         return src[4].ud;
       else
          return 1;
 
@@ -764,10 +780,10 @@ fs_inst::components_read(unsigned i) const
    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
       assert(src[3].file == IMM &&
              src[4].file == IMM);
-      const unsigned op = src[4].fixed_hw_reg.dw1.ud;
+      const unsigned op = src[4].ud;
       /* Surface coordinates. */
       if (i == 0)
-         return src[3].fixed_hw_reg.dw1.ud;
+         return src[3].ud;
       /* Surface operation source. */
       else if (i == 1 && op == BRW_AOP_CMPWR)
          return 2;
@@ -793,6 +809,7 @@ fs_inst::regs_read(int arg) const
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
    case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
    case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
    case SHADER_OPCODE_UNTYPED_ATOMIC:
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
@@ -825,7 +842,7 @@ fs_inst::regs_read(int arg) const
       return 1;
 
    default:
-      if (is_tex() && arg == 0 && src[0].file == GRF)
+      if (is_tex() && arg == 0 && src[0].file == VGRF)
          return mlen;
       break;
    }
@@ -836,9 +853,10 @@ fs_inst::regs_read(int arg) const
    case UNIFORM:
    case IMM:
       return 1;
-   case GRF:
+   case ARF:
+   case FIXED_GRF:
+   case VGRF:
    case ATTR:
-   case HW_REG:
       return DIV_ROUND_UP(components_read(arg) *
                           src[arg].component_size(exec_size),
                           REG_SIZE);
@@ -896,6 +914,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case SHADER_OPCODE_TXD:
    case SHADER_OPCODE_TXF:
    case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
    case SHADER_OPCODE_TXF_MCS:
    case SHADER_OPCODE_TG4:
    case SHADER_OPCODE_TG4_OFFSET:
@@ -938,26 +957,24 @@ fs_reg
 fs_visitor::vgrf(const glsl_type *const type)
 {
    int reg_width = dispatch_width / 8;
-   return fs_reg(GRF, alloc.allocate(type_size_scalar(type) * reg_width),
+   return fs_reg(VGRF, alloc.allocate(type_size_scalar(type) * reg_width),
                  brw_type_for_base_type(type));
 }
 
-/** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int reg)
+fs_reg::fs_reg(enum brw_reg_file file, int nr)
 {
    init();
    this->file = file;
-   this->reg = reg;
+   this->nr = nr;
    this->type = BRW_REGISTER_TYPE_F;
    this->stride = (file == UNIFORM ? 0 : 1);
 }
 
-/** Fixed HW reg constructor. */
-fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
+fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
 {
    init();
    this->file = file;
-   this->reg = reg;
+   this->nr = nr;
    this->type = type;
    this->stride = (file == UNIFORM ? 0 : 1);
 }
@@ -1285,9 +1302,9 @@ fs_visitor::emit_sampleid_setup()
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
 
    if (key->compute_sample_id) {
-      fs_reg t1(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
+      fs_reg t1(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_D);
       t1.set_smear(0);
-      fs_reg t2(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+      fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
 
       /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
        * 8x multisampling, subspan 0 will represent sample N (where N
@@ -1308,9 +1325,15 @@ fs_visitor::emit_sampleid_setup()
        * are sample 1 of subspan 0; the third group is sample 0 of
        * subspan 1, and finally sample 1 of subspan 1.
        */
+
+      /* SKL+ has an extra bit for the Starting Sample Pair Index to
+       * accommodate 16x MSAA.
+       */
+      unsigned sspi_mask = devinfo->gen >= 9 ? 0x1c0 : 0xc0;
+
       abld.exec_all().group(1, 0)
           .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
-               fs_reg(0xc0));
+               fs_reg(sspi_mask));
       abld.exec_all().group(1, 0).SHR(t1, t1, fs_reg(5));
 
       /* This works for both SIMD8 and SIMD16 */
@@ -1362,6 +1385,57 @@ fs_visitor::emit_discard_jump()
 }
 
 void
+fs_visitor::emit_gs_thread_end()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+   /* Ends the GS thread by setting eot on an old or newly emitted URB write. */
+   struct brw_gs_prog_data *gs_prog_data =
+      (struct brw_gs_prog_data *) prog_data;
+   /* Emit any control data bits for the final vertex count first. */
+   if (gs_compile->control_data_header_size_bits > 0) {
+      emit_gs_control_data_bits(this->final_gs_vertex_count);
+   }
+
+   const fs_builder abld = bld.annotate("thread end");
+   fs_inst *inst;
+   /* With static_vertex_count != -1 the final vertex count is not sent. */
+   if (gs_prog_data->static_vertex_count != -1) {
+      foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
+         if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
+             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
+             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
+             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
+            prev->eot = true;
+
+            /* Delete the now-dead instructions that follow prev. */
+            foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
+               if (dead == prev)
+                  break;
+               dead->remove();
+            }
+            return;
+         } else if (prev->is_control_flow() || prev->has_side_effects()) {
+            break; /* stop searching; a new URB write is emitted below */
+         }
+      }
+      fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
+      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
+      inst->mlen = 1; /* just the copy of g1 */
+   } else {
+      fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+      fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
+      sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+      sources[1] = this->final_gs_vertex_count;
+      abld.LOAD_PAYLOAD(payload, sources, 2, 2);
+      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+      inst->mlen = 2; /* copy of g1 plus the final vertex count */
+   }
+   inst->eot = true;
+   inst->offset = 0;
+}
+
+void
 fs_visitor::assign_curb_setup()
 {
    if (dispatch_width == 8) {
@@ -1384,7 +1458,7 @@ fs_visitor::assign_curb_setup()
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       for (unsigned int i = 0; i < inst->sources; i++) {
 	 if (inst->src[i].file == UNIFORM) {
-            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
+            int uniform_nr = inst->src[i].nr + inst->src[i].reg_offset;
             int constant_nr;
             if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
                constant_nr = push_constant_loc[uniform_nr];
@@ -1400,10 +1474,11 @@ fs_visitor::assign_curb_setup()
 	    struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
 						  constant_nr / 8,
 						  constant_nr % 8);
+            brw_reg.abs = inst->src[i].abs;
+            brw_reg.negate = inst->src[i].negate;
 
             assert(inst->src[i].stride == 0);
-	    inst->src[i].file = HW_REG;
-	    inst->src[i].fixed_hw_reg = byte_offset(
+            inst->src[i] = byte_offset(
                retype(brw_reg, inst->src[i].type),
                inst->src[i].subreg_offset);
 	 }
@@ -1518,13 +1593,13 @@ fs_visitor::assign_urb_setup()
     */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       if (inst->opcode == FS_OPCODE_LINTERP) {
-	 assert(inst->src[1].file == HW_REG);
-	 inst->src[1].fixed_hw_reg.nr += urb_start;
+	 assert(inst->src[1].file == FIXED_GRF);
+         inst->src[1].nr += urb_start;
       }
 
       if (inst->opcode == FS_OPCODE_CINTERP) {
-	 assert(inst->src[0].file == HW_REG);
-	 inst->src[0].fixed_hw_reg.nr += urb_start;
+	 assert(inst->src[0].file == FIXED_GRF);
+         inst->src[0].nr += urb_start;
       }
    }
 
@@ -1533,6 +1608,30 @@ fs_visitor::assign_urb_setup()
 }
 
 void
+fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
+{ /* Replaces each ATTR-file source of inst with the hardware GRF it maps to. */
+   for (int i = 0; i < inst->sources; i++) {
+      if (inst->src[i].file == ATTR) {
+         int grf = payload.num_regs +
+                   prog_data->curb_read_length +
+                   inst->src[i].nr +
+                   inst->src[i].reg_offset;
+         /* Zero-stride sources get a region width of 1 instead of exec_size. */
+         unsigned width = inst->src[i].stride == 0 ? 1 : inst->exec_size;
+         struct brw_reg reg =
+            stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
+                               inst->src[i].subreg_offset),
+                   inst->exec_size * inst->src[i].stride,
+                   width, inst->src[i].stride);
+         reg.abs = inst->src[i].abs; /* carry the source modifiers over */
+         reg.negate = inst->src[i].negate;
+
+         inst->src[i] = reg;
+      }
+   }
+}
+
+void
 fs_visitor::assign_vs_urb_setup()
 {
    brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
@@ -1549,24 +1648,44 @@ fs_visitor::assign_vs_urb_setup()
 
    /* Rewrite all ATTR file references to the hw grf that they land in. */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == ATTR) {
-            int grf = payload.num_regs +
-                      prog_data->curb_read_length +
-                      inst->src[i].reg +
-                      inst->src[i].reg_offset;
-
-            inst->src[i].file = HW_REG;
-            inst->src[i].fixed_hw_reg =
-               stride(byte_offset(retype(brw_vec8_grf(grf, 0), inst->src[i].type),
-                                  inst->src[i].subreg_offset),
-                      inst->exec_size * inst->src[i].stride,
-                      inst->exec_size, inst->src[i].stride);
-         }
+      convert_attr_sources_to_hw_regs(inst);
+   }
+}
+
+void
+fs_visitor::assign_gs_urb_setup()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+   /* Fixes up GS instructions: URB_READ_SIMD8 operands and ATTR sources. */
+   brw_vue_prog_data *vue_prog_data = (brw_vue_prog_data *) prog_data;
+   /* Advance by 8 * urb_read_length registers for each input vertex. */
+   first_non_payload_grf +=
+      8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in;
+   /* If VUE handles are included, the first is vertices_in regs from the end. */
+   const unsigned first_icp_handle = payload.num_regs -
+      (vue_prog_data->include_vue_handles ? nir->info.gs.vertices_in : 0);
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      /* Lower URB_READ_SIMD8 opcodes into real messages. */
+      if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8) {
+         assert(inst->src[0].file == IMM);
+         inst->src[0] = retype(brw_vec8_grf(first_icp_handle +
+                                            inst->src[0].ud,
+                                            0), BRW_REGISTER_TYPE_UD);
+         /* For now, assume a constant offset; per-slot offsets can come later. */
+         assert(inst->src[1].file == IMM);
+         inst->offset = inst->src[1].ud;
+         inst->src[1] = fs_reg();
+         inst->mlen = 1;
+         inst->base_mrf = -1;
       }
+
+      /* Rewrite all ATTR file references to GRFs. */
+      convert_attr_sources_to_hw_regs(inst);
    }
 }
 
+
 /**
  * Split large virtual GRFs into separate components if we can.
  *
@@ -1609,30 +1728,30 @@ fs_visitor::split_virtual_grfs()
 
    /* Mark all used registers as fully splittable */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF) {
-         int reg = vgrf_to_reg[inst->dst.reg];
-         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
+      if (inst->dst.file == VGRF) {
+         int reg = vgrf_to_reg[inst->dst.nr];
+         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
             split_points[reg + j] = true;
       }
 
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF) {
-            int reg = vgrf_to_reg[inst->src[i].reg];
-            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
+         if (inst->src[i].file == VGRF) {
+            int reg = vgrf_to_reg[inst->src[i].nr];
+            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
                split_points[reg + j] = true;
          }
       }
    }
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF) {
-         int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
+      if (inst->dst.file == VGRF) {
+         int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
          for (int j = 1; j < inst->regs_written; j++)
             split_points[reg + j] = false;
       }
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF) {
-            int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
+         if (inst->src[i].file == VGRF) {
+            int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
             for (int j = 1; j < inst->regs_read(i); j++)
                split_points[reg + j] = false;
          }
@@ -1678,16 +1797,16 @@ fs_visitor::split_virtual_grfs()
    assert(reg == reg_count);
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF) {
-         reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
-         inst->dst.reg = new_virtual_grf[reg];
+      if (inst->dst.file == VGRF) {
+         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.reg_offset;
+         inst->dst.nr = new_virtual_grf[reg];
          inst->dst.reg_offset = new_reg_offset[reg];
          assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
       }
       for (int i = 0; i < inst->sources; i++) {
-	 if (inst->src[i].file == GRF) {
-            reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
-            inst->src[i].reg = new_virtual_grf[reg];
+	 if (inst->src[i].file == VGRF) {
+            reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].reg_offset;
+            inst->src[i].nr = new_virtual_grf[reg];
             inst->src[i].reg_offset = new_reg_offset[reg];
             assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
          }
@@ -1714,12 +1833,12 @@ fs_visitor::compact_virtual_grfs()
 
    /* Mark which virtual GRFs are used. */
    foreach_block_and_inst(block, const fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF)
-         remap_table[inst->dst.reg] = 0;
+      if (inst->dst.file == VGRF)
+         remap_table[inst->dst.nr] = 0;
 
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF)
-            remap_table[inst->src[i].reg] = 0;
+         if (inst->src[i].file == VGRF)
+            remap_table[inst->src[i].nr] = 0;
       }
    }
 
@@ -1743,12 +1862,12 @@ fs_visitor::compact_virtual_grfs()
 
    /* Patch all the instructions to use the newly renumbered registers */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->dst.file == GRF)
-         inst->dst.reg = remap_table[inst->dst.reg];
+      if (inst->dst.file == VGRF)
+         inst->dst.nr = remap_table[inst->dst.nr];
 
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF)
-            inst->src[i].reg = remap_table[inst->src[i].reg];
+         if (inst->src[i].file == VGRF)
+            inst->src[i].nr = remap_table[inst->src[i].nr];
       }
    }
 
@@ -1757,9 +1876,9 @@ fs_visitor::compact_virtual_grfs()
     * think some random VGRF is delta_xy.
     */
    for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
-      if (delta_xy[i].file == GRF) {
-         if (remap_table[delta_xy[i].reg] != -1) {
-            delta_xy[i].reg = remap_table[delta_xy[i].reg];
+      if (delta_xy[i].file == VGRF) {
+         if (remap_table[delta_xy[i].nr] != -1) {
+            delta_xy[i].nr = remap_table[delta_xy[i].nr];
          } else {
             delta_xy[i].file = BAD_FILE;
          }
@@ -1811,7 +1930,7 @@ fs_visitor::assign_constant_locations()
             continue;
 
          if (inst->src[i].reladdr) {
-            int uniform = inst->src[i].reg;
+            int uniform = inst->src[i].nr;
 
             /* If this array isn't already present in the pull constant buffer,
              * add it.
@@ -1823,7 +1942,7 @@ fs_visitor::assign_constant_locations()
             }
          } else {
             /* Mark the the one accessed uniform as live */
-            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
+            int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
             if (constant_nr >= 0 && constant_nr < (int) uniforms)
                is_live[constant_nr] = true;
          }
@@ -1899,7 +2018,7 @@ fs_visitor::demote_pull_constants()
 	    continue;
 
          int pull_index;
-         unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
+         unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
          if (location >= uniforms) /* Out of bounds access */
             pull_index = -1;
          else
@@ -1910,7 +2029,7 @@ fs_visitor::demote_pull_constants()
 
          /* Set up the annotation tracking for new generated instructions. */
          const fs_builder ibld(this, block, inst);
-         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
+         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
          fs_reg dst = vgrf(glsl_type::float_type);
 
          assert(inst->src[i].stride == 0);
@@ -1918,7 +2037,7 @@ fs_visitor::demote_pull_constants()
          /* Generate a pull load into dst. */
          if (inst->src[i].reladdr) {
             VARYING_PULL_CONSTANT_LOAD(ibld, dst,
-                                       surf_index,
+                                       fs_reg(index),
                                        *inst->src[i].reladdr,
                                        pull_index);
             inst->src[i].reladdr = NULL;
@@ -1927,13 +2046,14 @@ fs_visitor::demote_pull_constants()
             const fs_builder ubld = ibld.exec_all().group(8, 0);
             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
             ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                      dst, surf_index, offset);
+                      dst, fs_reg(index), offset);
             inst->src[i].set_smear(pull_index & 3);
          }
+         brw_mark_surface_used(prog_data, index);
 
          /* Rewrite the instruction to use the temporary VGRF. */
-         inst->src[i].file = GRF;
-         inst->src[i].reg = dst.reg;
+         inst->src[i].file = VGRF;
+         inst->src[i].nr = dst.nr;
          inst->src[i].reg_offset = 0;
       }
    }
@@ -1955,8 +2075,7 @@ fs_visitor::opt_algebraic()
             if (inst->dst.type != inst->src[0].type)
                assert(!"unimplemented: saturate mixed types");
 
-            if (brw_saturate_immediate(inst->dst.type,
-                                       &inst->src[0].fixed_hw_reg)) {
+            if (brw_saturate_immediate(inst->dst.type, &inst->src[0])) {
                inst->saturate = false;
                progress = true;
             }
@@ -1996,7 +2115,7 @@ fs_visitor::opt_algebraic()
          if (inst->src[0].file == IMM) {
             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
             inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
+            inst->src[0].f *= inst->src[1].f;
             inst->src[1] = reg_undef;
             progress = true;
             break;
@@ -2017,7 +2136,7 @@ fs_visitor::opt_algebraic()
          if (inst->src[0].file == IMM) {
             assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
             inst->opcode = BRW_OPCODE_MOV;
-            inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
+            inst->src[0].f += inst->src[1].f;
             inst->src[1] = reg_undef;
             progress = true;
             break;
@@ -2066,7 +2185,7 @@ fs_visitor::opt_algebraic()
             case BRW_CONDITIONAL_L:
                switch (inst->src[1].type) {
                case BRW_REGISTER_TYPE_F:
-                  if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
+                  if (inst->src[1].f >= 1.0f) {
                      inst->opcode = BRW_OPCODE_MOV;
                      inst->src[1] = reg_undef;
                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
@@ -2081,7 +2200,7 @@ fs_visitor::opt_algebraic()
             case BRW_CONDITIONAL_G:
                switch (inst->src[1].type) {
                case BRW_REGISTER_TYPE_F:
-                  if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
+                  if (inst->src[1].f <= 0.0f) {
                      inst->opcode = BRW_OPCODE_MOV;
                      inst->src[1] = reg_undef;
                      inst->conditional_mod = BRW_CONDITIONAL_NONE;
@@ -2118,7 +2237,7 @@ fs_visitor::opt_algebraic()
             progress = true;
          } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
             inst->opcode = BRW_OPCODE_ADD;
-            inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
+            inst->src[1].f *= inst->src[2].f;
             inst->src[2] = reg_undef;
             progress = true;
          }
@@ -2143,7 +2262,7 @@ fs_visitor::opt_algebraic()
          } else if (inst->src[1].file == IMM) {
             inst->opcode = BRW_OPCODE_MOV;
             inst->src[0] = component(inst->src[0],
-                                     inst->src[1].fixed_hw_reg.dw1.ud);
+                                     inst->src[1].ud);
             inst->sources = 1;
             inst->force_writemask_all = true;
             progress = true;
@@ -2344,31 +2463,31 @@ fs_visitor::opt_register_renaming()
 
       /* Rewrite instruction sources. */
       for (int i = 0; i < inst->sources; i++) {
-         if (inst->src[i].file == GRF &&
-             remap[inst->src[i].reg] != -1 &&
-             remap[inst->src[i].reg] != inst->src[i].reg) {
-            inst->src[i].reg = remap[inst->src[i].reg];
+         if (inst->src[i].file == VGRF &&
+             remap[inst->src[i].nr] != -1 &&
+             remap[inst->src[i].nr] != inst->src[i].nr) {
+            inst->src[i].nr = remap[inst->src[i].nr];
             progress = true;
          }
       }
 
-      const int dst = inst->dst.reg;
+      const int dst = inst->dst.nr;
 
       if (depth == 0 &&
-          inst->dst.file == GRF &&
-          alloc.sizes[inst->dst.reg] == inst->exec_size / 8 &&
+          inst->dst.file == VGRF &&
+          alloc.sizes[inst->dst.nr] == inst->exec_size / 8 &&
           !inst->is_partial_write()) {
          if (remap[dst] == -1) {
             remap[dst] = dst;
          } else {
             remap[dst] = alloc.allocate(inst->exec_size / 8);
-            inst->dst.reg = remap[dst];
+            inst->dst.nr = remap[dst];
             progress = true;
          }
-      } else if (inst->dst.file == GRF &&
+      } else if (inst->dst.file == VGRF &&
                  remap[dst] != -1 &&
                  remap[dst] != dst) {
-         inst->dst.reg = remap[dst];
+         inst->dst.nr = remap[dst];
          progress = true;
       }
    }
@@ -2377,8 +2496,8 @@ fs_visitor::opt_register_renaming()
       invalidate_live_intervals();
 
       for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
-         if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
-            delta_xy[i].reg = remap[delta_xy[i].reg];
+         if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1) {
+            delta_xy[i].nr = remap[delta_xy[i].nr];
          }
       }
    }
@@ -2445,7 +2564,7 @@ fs_visitor::compute_to_mrf()
 
       if (inst->opcode != BRW_OPCODE_MOV ||
 	  inst->is_partial_write() ||
-	  inst->dst.file != MRF || inst->src[0].file != GRF ||
+	  inst->dst.file != MRF || inst->src[0].file != VGRF ||
 	  inst->dst.type != inst->src[0].type ||
 	  inst->src[0].abs || inst->src[0].negate ||
           !inst->src[0].is_contiguous() ||
@@ -2455,9 +2574,9 @@ fs_visitor::compute_to_mrf()
       /* Work out which hardware MRF registers are written by this
        * instruction.
        */
-      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
+      int mrf_low = inst->dst.nr & ~BRW_MRF_COMPR4;
       int mrf_high;
-      if (inst->dst.reg & BRW_MRF_COMPR4) {
+      if (inst->dst.nr & BRW_MRF_COMPR4) {
 	 mrf_high = mrf_low + 4;
       } else if (inst->exec_size == 16) {
 	 mrf_high = mrf_low + 1;
@@ -2468,15 +2587,15 @@ fs_visitor::compute_to_mrf()
       /* Can't compute-to-MRF this GRF if someone else was going to
        * read it later.
        */
-      if (this->virtual_grf_end[inst->src[0].reg] > ip)
+      if (this->virtual_grf_end[inst->src[0].nr] > ip)
 	 continue;
 
       /* Found a move of a GRF to a MRF.  Let's see if we can go
        * rewrite the thing that made this GRF to write into the MRF.
        */
       foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
-	 if (scan_inst->dst.file == GRF &&
-	     scan_inst->dst.reg == inst->src[0].reg) {
+	 if (scan_inst->dst.file == VGRF &&
+            scan_inst->dst.nr == inst->src[0].nr) {
 	    /* Found the last thing to write our reg we want to turn
 	     * into a compute-to-MRF.
 	     */
@@ -2511,7 +2630,7 @@ fs_visitor::compute_to_mrf()
 	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
 	       /* Found the creator of our MRF's source value. */
 	       scan_inst->dst.file = MRF;
-	       scan_inst->dst.reg = inst->dst.reg;
+               scan_inst->dst.nr = inst->dst.nr;
 	       scan_inst->saturate |= inst->saturate;
 	       inst->remove(block);
 	       progress = true;
@@ -2531,8 +2650,8 @@ fs_visitor::compute_to_mrf()
 	  */
 	 bool interfered = false;
 	 for (int i = 0; i < scan_inst->sources; i++) {
-	    if (scan_inst->src[i].file == GRF &&
-		scan_inst->src[i].reg == inst->src[0].reg &&
+	    if (scan_inst->src[i].file == VGRF &&
+                scan_inst->src[i].nr == inst->src[0].nr &&
 		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
 	       interfered = true;
 	    }
@@ -2544,10 +2663,10 @@ fs_visitor::compute_to_mrf()
 	    /* If somebody else writes our MRF here, we can't
 	     * compute-to-MRF before that.
 	     */
-	    int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
+            int scan_mrf_low = scan_inst->dst.nr & ~BRW_MRF_COMPR4;
 	    int scan_mrf_high;
 
-	    if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
+            if (scan_inst->dst.nr & BRW_MRF_COMPR4) {
 	       scan_mrf_high = scan_mrf_low + 4;
 	    } else if (scan_inst->exec_size == 16) {
 	       scan_mrf_high = scan_mrf_low + 1;
@@ -2690,8 +2809,8 @@ fs_visitor::emit_repclear_shader()
 
    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
    if (uniforms == 1) {
-      assert(mov->src[0].file == HW_REG);
-      mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
+      assert(mov->src[0].file == FIXED_GRF);
+      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
    }
 }
 
@@ -2718,7 +2837,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       if (inst->opcode == BRW_OPCODE_MOV &&
 	  inst->dst.file == MRF) {
-	 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
+         fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
 	 if (prev_inst && inst->equals(prev_inst)) {
 	    inst->remove(block);
 	    progress = true;
@@ -2728,7 +2847,7 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       /* Clear out the last-write records for MRFs that were overwritten. */
       if (inst->dst.file == MRF) {
-	 last_mrf_move[inst->dst.reg] = NULL;
+         last_mrf_move[inst->dst.nr] = NULL;
       }
 
       if (inst->mlen > 0 && inst->base_mrf != -1) {
@@ -2741,10 +2860,10 @@ fs_visitor::remove_duplicate_mrf_writes()
       }
 
       /* Clear out any MRF move records whose sources got overwritten. */
-      if (inst->dst.file == GRF) {
+      if (inst->dst.file == VGRF) {
 	 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
 	    if (last_mrf_move[i] &&
-		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
+                last_mrf_move[i]->src[0].nr == inst->dst.nr) {
 	       last_mrf_move[i] = NULL;
 	    }
 	 }
@@ -2752,9 +2871,9 @@ fs_visitor::remove_duplicate_mrf_writes()
 
       if (inst->opcode == BRW_OPCODE_MOV &&
 	  inst->dst.file == MRF &&
-	  inst->src[0].file == GRF &&
+	  inst->src[0].file == VGRF &&
 	  !inst->is_partial_write()) {
-	 last_mrf_move[inst->dst.reg] = inst;
+         last_mrf_move[inst->dst.nr] = inst;
       }
    }
 
@@ -2770,11 +2889,8 @@ clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
    /* Clear the flag for registers that actually got read (as expected). */
    for (int i = 0; i < inst->sources; i++) {
       int grf;
-      if (inst->src[i].file == GRF) {
-         grf = inst->src[i].reg;
-      } else if (inst->src[i].file == HW_REG &&
-                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
-         grf = inst->src[i].fixed_hw_reg.nr;
+      if (inst->src[i].file == VGRF || inst->src[i].file == FIXED_GRF) {
+         grf = inst->src[i].nr;
       } else {
          continue;
       }
@@ -2809,7 +2925,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                         fs_inst *inst)
 {
    int write_len = inst->regs_written;
-   int first_write_grf = inst->dst.reg;
+   int first_write_grf = inst->dst.nr;
    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
    assert(write_len < (int)sizeof(needs_dep) - 1);
 
@@ -2840,9 +2956,9 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
        * instruction but a MOV that might have left us an outstanding
        * dependency has more latency than a MOV.
        */
-      if (scan_inst->dst.file == GRF) {
+      if (scan_inst->dst.file == VGRF) {
          for (int i = 0; i < scan_inst->regs_written; i++) {
-            int reg = scan_inst->dst.reg + i;
+            int reg = scan_inst->dst.nr + i;
 
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
@@ -2880,7 +2996,7 @@ void
 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
 {
    int write_len = inst->regs_written;
-   int first_write_grf = inst->dst.reg;
+   int first_write_grf = inst->dst.nr;
    bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
    assert(write_len < (int)sizeof(needs_dep) - 1);
 
@@ -2906,13 +3022,13 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       /* We insert our reads as late as possible since they're reading the
        * result of a SEND, which has massive latency.
        */
-      if (scan_inst->dst.file == GRF &&
-          scan_inst->dst.reg >= first_write_grf &&
-          scan_inst->dst.reg < first_write_grf + write_len &&
-          needs_dep[scan_inst->dst.reg - first_write_grf]) {
+      if (scan_inst->dst.file == VGRF &&
+          scan_inst->dst.nr >= first_write_grf &&
+          scan_inst->dst.nr < first_write_grf + write_len &&
+          needs_dep[scan_inst->dst.nr - first_write_grf]) {
          DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
-                         scan_inst->dst.reg);
-         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
+                         scan_inst->dst.nr);
+         needs_dep[scan_inst->dst.nr - first_write_grf] = false;
       }
 
       /* Continue the loop only if we haven't resolved all the dependencies */
@@ -2939,7 +3055,7 @@ fs_visitor::insert_gen4_send_dependency_workarounds()
     */
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      if (inst->mlen != 0 && inst->dst.file == GRF) {
+      if (inst->mlen != 0 && inst->dst.file == VGRF) {
          insert_gen4_pre_send_dependency_workarounds(block, inst);
          insert_gen4_post_send_dependency_workarounds(block, inst);
          progress = true;
@@ -2980,18 +3096,18 @@ fs_visitor::lower_uniform_pull_constant_loads()
          fs_reg const_offset_reg = inst->src[1];
          assert(const_offset_reg.file == IMM &&
                 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
-         const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
+         const_offset_reg.ud /= 4;
 
          fs_reg payload, offset;
          if (devinfo->gen >= 9) {
             /* We have to use a message header on Skylake to get SIMD4x2
              * mode.  Reserve space for the register.
             */
-            offset = payload = fs_reg(GRF, alloc.allocate(2));
+            offset = payload = fs_reg(VGRF, alloc.allocate(2));
             offset.reg_offset++;
             inst->mlen = 2;
          } else {
-            offset = payload = fs_reg(GRF, alloc.allocate(1));
+            offset = payload = fs_reg(VGRF, alloc.allocate(1));
             inst->mlen = 1;
          }
 
@@ -3038,13 +3154,13 @@ fs_visitor::lower_load_payload()
       if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
          continue;
 
-      assert(inst->dst.file == MRF || inst->dst.file == GRF);
+      assert(inst->dst.file == MRF || inst->dst.file == VGRF);
       assert(inst->saturate == false);
       fs_reg dst = inst->dst;
 
       /* Get rid of COMPR4.  We'll add it back in if we need it */
       if (dst.file == MRF)
-         dst.reg = dst.reg & ~BRW_MRF_COMPR4;
+         dst.nr = dst.nr & ~BRW_MRF_COMPR4;
 
       const fs_builder ibld(this, block, inst);
       const fs_builder hbld = ibld.exec_all().group(8, 0);
@@ -3058,7 +3174,7 @@ fs_visitor::lower_load_payload()
          dst = offset(dst, hbld, 1);
       }
 
-      if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
+      if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
           inst->exec_size > 8) {
          /* In this case, the payload portion of the LOAD_PAYLOAD isn't
           * a straightforward copy.  Instead, the result of the
@@ -3082,18 +3198,18 @@ fs_visitor::lower_load_payload()
             if (inst->src[i].file != BAD_FILE) {
                if (devinfo->has_compr4) {
                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
-                  compr4_dst.reg |= BRW_MRF_COMPR4;
+                  compr4_dst.nr |= BRW_MRF_COMPR4;
                   ibld.MOV(compr4_dst, inst->src[i]);
                } else {
                   /* Platform doesn't have COMPR4.  We have to fake it */
                   fs_reg mov_dst = retype(dst, inst->src[i].type);
                   ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
-                  mov_dst.reg += 4;
+                  mov_dst.nr += 4;
                   ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
                }
             }
 
-            dst.reg++;
+            dst.nr++;
          }
 
          /* The loop above only ever incremented us through the first set
@@ -3101,7 +3217,7 @@ fs_visitor::lower_load_payload()
           * actually wrote to the first 8 registers, so we need to take
           * that into account now.
           */
-         dst.reg += 4;
+         dst.nr += 4;
 
          /* The COMPR4 code took care of the first 4 sources.  We'll let
           * the regular path handle any remaining sources.  Yes, we are
@@ -3149,7 +3265,7 @@ fs_visitor::lower_integer_multiplication()
             continue;
 
          if (inst->src[1].file == IMM &&
-             inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+             inst->src[1].ud < (1 << 16)) {
             /* The MUL instruction isn't commutative. On Gen <= 6, only the low
              * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
              * src1 are used.
@@ -3158,7 +3274,7 @@ fs_visitor::lower_integer_multiplication()
              * single MUL instruction with that value in the proper location.
              */
             if (devinfo->gen < 7) {
-               fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+               fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
                           inst->dst.type);
                ibld.MOV(imm, inst->src[1]);
                ibld.MUL(inst->dst, imm, inst->src[0]);
@@ -3213,11 +3329,11 @@ fs_visitor::lower_integer_multiplication()
 
             fs_reg orig_dst = inst->dst;
             if (orig_dst.is_null() || orig_dst.file == MRF) {
-               inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+               inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                                   inst->dst.type);
             }
             fs_reg low = inst->dst;
-            fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
+            fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
                         inst->dst.type);
 
             if (devinfo->gen >= 7) {
@@ -3225,8 +3341,8 @@ fs_visitor::lower_integer_multiplication()
                fs_reg src1_1_w = inst->src[1];
 
                if (inst->src[1].file == IMM) {
-                  src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
-                  src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
+                  src1_0_w.ud &= 0xffff;
+                  src1_1_w.ud >>= 16;
                } else {
                   src1_0_w.type = BRW_REGISTER_TYPE_UW;
                   if (src1_0_w.stride != 0) {
@@ -3381,7 +3497,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
    fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
    const unsigned components =
-      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].fixed_hw_reg.dw1.ud;
+      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
 
    /* We can potentially have a message length of up to 15, so we have to set
     * base_mrf to either 0 or 1 in order to fit in m0..m15.
@@ -3411,7 +3527,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    }
 
    if (payload.aa_dest_stencil_reg) {
-      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1));
+      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
       bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
          .MOV(sources[length],
               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
@@ -3419,7 +3535,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    }
 
    if (prog_data->uses_omask) {
-      sources[length] = fs_reg(GRF, bld.shader->alloc.allocate(1),
+      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
                                BRW_REGISTER_TYPE_UD);
 
       /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
@@ -3485,9 +3601,9 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
    fs_inst *load;
    if (devinfo->gen >= 7) {
       /* Send from the GRF */
-      fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F);
+      fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
       load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
-      payload.reg = bld.shader->alloc.allocate(load->regs_written);
+      payload.nr = bld.shader->alloc.allocate(load->regs_written);
       load->dst = payload;
 
       inst->src[0] = payload;
@@ -3502,7 +3618,7 @@ lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
        * will do this for us if we just give it a COMPR4 destination.
        */
       if (devinfo->gen < 6 && bld.dispatch_width() == 16)
-         load->dst.reg |= BRW_MRF_COMPR4;
+         load->dst.nr |= BRW_MRF_COMPR4;
 
       inst->resize_sources(0);
       inst->base_mrf = 1;
@@ -3612,8 +3728,8 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
    inst->src[0] = reg_undef;
    inst->src[1] = sampler;
    inst->resize_sources(2);
-   inst->base_mrf = msg_begin.reg;
-   inst->mlen = msg_end.reg - msg_begin.reg;
+   inst->base_mrf = msg_begin.nr;
+   inst->mlen = msg_end.nr - msg_begin.nr;
    inst->header_size = 1;
 }
 
@@ -3637,7 +3753,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
        * go headerless.
        */
       header_size = 1;
-      message.reg--;
+      message.nr--;
    }
 
    for (unsigned i = 0; i < coord_components; i++) {
@@ -3707,8 +3823,8 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
    inst->src[0] = reg_undef;
    inst->src[1] = sampler;
    inst->resize_sources(2);
-   inst->base_mrf = message.reg;
-   inst->mlen = msg_end.reg - message.reg;
+   inst->base_mrf = message.nr;
+   inst->mlen = msg_end.nr - message.nr;
    inst->header_size = header_size;
 
    /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
@@ -3721,7 +3837,7 @@ is_high_sampler(const struct brw_device_info *devinfo, const fs_reg &sampler)
    if (devinfo->gen < 8 && !devinfo->is_haswell)
       return false;
 
-   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
+   return sampler.file != IMM || sampler.ud >= 16;
 }
 
 static void
@@ -3844,17 +3960,31 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
       coordinate_done = true;
       break;
    case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
    case SHADER_OPCODE_TXF_UMS:
    case SHADER_OPCODE_TXF_MCS:
-      if (op == SHADER_OPCODE_TXF_UMS || op == SHADER_OPCODE_TXF_CMS) {
+      if (op == SHADER_OPCODE_TXF_UMS ||
+          op == SHADER_OPCODE_TXF_CMS ||
+          op == SHADER_OPCODE_TXF_CMS_W) {
          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
          length++;
       }
 
-      if (op == SHADER_OPCODE_TXF_CMS) {
+      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
          /* Data from the multisample control surface. */
          bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
          length++;
+
+         /* On Gen9+ we'll use ld2dms_w instead which has two registers for
+          * the MCS data.
+          */
+         if (op == SHADER_OPCODE_TXF_CMS_W) {
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
+                    mcs.file == IMM ?
+                    mcs :
+                    offset(mcs, bld, 1));
+            length++;
+         }
       }
 
       /* There is no offsetting for this message; just copy in the integer
@@ -3912,7 +4042,7 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
    else
       mlen = length * reg_width;
 
-   const fs_reg src_payload = fs_reg(GRF, bld.shader->alloc.allocate(mlen),
+   const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
                                      BRW_REGISTER_TYPE_F);
    bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
 
@@ -3942,8 +4072,8 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    const fs_reg &sampler = inst->src[6];
    const fs_reg &offset_value = inst->src[7];
    assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
-   const unsigned coord_components = inst->src[8].fixed_hw_reg.dw1.ud;
-   const unsigned grad_components = inst->src[9].fixed_hw_reg.dw1.ud;
+   const unsigned coord_components = inst->src[8].ud;
+   const unsigned grad_components = inst->src[9].ud;
 
    if (devinfo->gen >= 7) {
       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
@@ -4068,6 +4198,10 @@ fs_visitor::lower_logical_sends()
          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
          break;
 
+      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
+         break;
+
       case SHADER_OPCODE_TXF_UMS_LOGICAL:
          lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
          break;
@@ -4260,6 +4394,21 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
       else
          return inst->exec_size;
 
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL: {
+      /* This opcode can take up to 6 arguments which means that in some
+       * circumstances it can end up with a message that is too long in SIMD16
+       * mode.
+       */
+      const unsigned coord_components = inst->src[8].ud;
+      /* First three arguments are the sample index and the two arguments for
+       * the MCS data.
+       */
+      if ((coord_components + 3) * 2 > MAX_SAMPLER_MESSAGE_SIZE)
+         return 8;
+      else
+         return inst->exec_size;
+   }
+
    case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
    case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
@@ -4473,51 +4622,48 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    }
 
    switch (inst->dst.file) {
-   case GRF:
-      fprintf(file, "vgrf%d", inst->dst.reg);
-      if (alloc.sizes[inst->dst.reg] != inst->regs_written ||
+   case VGRF:
+      fprintf(file, "vgrf%d", inst->dst.nr);
+      if (alloc.sizes[inst->dst.nr] != inst->regs_written ||
           inst->dst.subreg_offset)
          fprintf(file, "+%d.%d",
                  inst->dst.reg_offset, inst->dst.subreg_offset);
       break;
+   case FIXED_GRF:
+      fprintf(file, "g%d", inst->dst.nr);
+      break;
    case MRF:
-      fprintf(file, "m%d", inst->dst.reg);
+      fprintf(file, "m%d", inst->dst.nr);
       break;
    case BAD_FILE:
       fprintf(file, "(null)");
       break;
    case UNIFORM:
-      fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
+      fprintf(file, "***u%d***", inst->dst.nr + inst->dst.reg_offset);
       break;
    case ATTR:
-      fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
+      fprintf(file, "***attr%d***", inst->dst.nr + inst->dst.reg_offset);
       break;
-   case HW_REG:
-      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
-         switch (inst->dst.fixed_hw_reg.nr) {
-         case BRW_ARF_NULL:
-            fprintf(file, "null");
-            break;
-         case BRW_ARF_ADDRESS:
-            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
-            break;
-         case BRW_ARF_ACCUMULATOR:
-            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
-            break;
-         case BRW_ARF_FLAG:
-            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
-                             inst->dst.fixed_hw_reg.subnr);
-            break;
-         default:
-            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
-                               inst->dst.fixed_hw_reg.subnr);
-            break;
-         }
-      } else {
-         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
+   case ARF:
+      switch (inst->dst.nr) {
+      case BRW_ARF_NULL:
+         fprintf(file, "null");
+         break;
+      case BRW_ARF_ADDRESS:
+         fprintf(file, "a0.%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_ACCUMULATOR:
+         fprintf(file, "acc%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_FLAG:
+         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
+      default:
+         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
       }
-      if (inst->dst.fixed_hw_reg.subnr)
-         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
+      if (inst->dst.subnr)
+         fprintf(file, "+%d", inst->dst.subnr);
       break;
    case IMM:
       unreachable("not reached");
@@ -4530,21 +4676,24 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       if (inst->src[i].abs)
          fprintf(file, "|");
       switch (inst->src[i].file) {
-      case GRF:
-         fprintf(file, "vgrf%d", inst->src[i].reg);
-         if (alloc.sizes[inst->src[i].reg] != (unsigned)inst->regs_read(i) ||
+      case VGRF:
+         fprintf(file, "vgrf%d", inst->src[i].nr);
+         if (alloc.sizes[inst->src[i].nr] != (unsigned)inst->regs_read(i) ||
              inst->src[i].subreg_offset)
             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                     inst->src[i].subreg_offset);
          break;
+      case FIXED_GRF:
+         fprintf(file, "g%d", inst->src[i].nr);
+         break;
       case MRF:
-         fprintf(file, "***m%d***", inst->src[i].reg);
+         fprintf(file, "***m%d***", inst->src[i].nr);
          break;
       case ATTR:
-         fprintf(file, "attr%d+%d", inst->src[i].reg, inst->src[i].reg_offset);
+         fprintf(file, "attr%d+%d", inst->src[i].nr, inst->src[i].reg_offset);
          break;
       case UNIFORM:
-         fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
+         fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
          if (inst->src[i].reladdr) {
             fprintf(file, "+reladdr");
          } else if (inst->src[i].subreg_offset) {
@@ -4558,60 +4707,48 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
       case IMM:
          switch (inst->src[i].type) {
          case BRW_REGISTER_TYPE_F:
-            fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
+            fprintf(file, "%ff", inst->src[i].f);
             break;
          case BRW_REGISTER_TYPE_W:
          case BRW_REGISTER_TYPE_D:
-            fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
+            fprintf(file, "%dd", inst->src[i].d);
             break;
          case BRW_REGISTER_TYPE_UW:
          case BRW_REGISTER_TYPE_UD:
-            fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
+            fprintf(file, "%uu", inst->src[i].ud);
             break;
          case BRW_REGISTER_TYPE_VF:
             fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
-                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
+                    brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
             break;
          default:
             fprintf(file, "???");
             break;
          }
          break;
-      case HW_REG:
-         if (inst->src[i].fixed_hw_reg.negate)
-            fprintf(file, "-");
-         if (inst->src[i].fixed_hw_reg.abs)
-            fprintf(file, "|");
-         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
-            switch (inst->src[i].fixed_hw_reg.nr) {
-            case BRW_ARF_NULL:
-               fprintf(file, "null");
-               break;
-            case BRW_ARF_ADDRESS:
-               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
-               break;
-            case BRW_ARF_ACCUMULATOR:
-               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
-               break;
-            case BRW_ARF_FLAG:
-               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
-                                inst->src[i].fixed_hw_reg.subnr);
-               break;
-            default:
-               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
-                                  inst->src[i].fixed_hw_reg.subnr);
-               break;
-            }
-         } else {
-            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
+      case ARF:
+         switch (inst->src[i].nr) {
+         case BRW_ARF_NULL:
+            fprintf(file, "null");
+            break;
+         case BRW_ARF_ADDRESS:
+            fprintf(file, "a0.%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_ACCUMULATOR:
+            fprintf(file, "acc%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_FLAG:
+            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
+         default:
+            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
          }
-         if (inst->src[i].fixed_hw_reg.subnr)
-            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
-         if (inst->src[i].fixed_hw_reg.abs)
-            fprintf(file, "|");
+         if (inst->src[i].subnr)
+            fprintf(file, "+%d", inst->src[i].subnr);
          break;
       }
       if (inst->src[i].abs)
@@ -4627,6 +4764,9 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
 
    fprintf(file, " ");
 
+   if (inst->force_writemask_all)
+      fprintf(file, "NoMask ");
+
    if (dispatch_width == 16 && inst->exec_size == 8) {
       if (inst->force_sechalf)
          fprintf(file, "2ndhalf ");
@@ -4779,6 +4919,45 @@ fs_visitor::setup_vs_payload()
  *
  */
 void
+fs_visitor::setup_gs_payload()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+   /* Size the GS thread payload and cap how much input data is pushed. */
+   struct brw_gs_prog_data *gs_prog_data =
+      (struct brw_gs_prog_data *) prog_data;
+   struct brw_vue_prog_data *vue_prog_data =
+      (struct brw_vue_prog_data *) prog_data;
+
+   /* R0: thread header, R1: output URB handles */
+   payload.num_regs = 2;
+
+   if (gs_prog_data->include_primitive_id) {
+      /* R2: Primitive ID 0..7 */
+      payload.num_regs++;
+   }
+
+   /* Use a maximum of 32 registers for push-model inputs. */
+   const unsigned max_push_components = 32;
+
+   /* If pushing our inputs would take too many registers, reduce the URB read
+    * length (which is in HWords, or 8 registers), request the VUE handles and
+    * resort to pulling.  The check multiplies by VerticesIn because the GS
+    * reads <URB Read Length> HWords for every vertex, so that product is the
+    * total storage requirement.
+    */
+   if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
+       max_push_components) {
+      gs_prog_data->base.include_vue_handles = true;
+
+      /* R2 or R3 onward: one ICP handle register per vertex (pull model only). */
+      payload.num_regs += nir->info.gs.vertices_in;
+
+      vue_prog_data->urb_read_length =
+         ROUND_DOWN_TO(max_push_components / nir->info.gs.vertices_in, 8) / 8;
+   }
+}
+
+void
 fs_visitor::setup_cs_payload()
 {
    assert(devinfo->gen >= 7);
@@ -4925,7 +5104,7 @@ fs_visitor::fixup_3src_null_dest()
 {
    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
       if (inst->is_3src() && inst->dst.is_null()) {
-         inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+         inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
                             inst->dst.type);
       }
    }
@@ -5035,6 +5214,55 @@ fs_visitor::run_vs(gl_clip_plane *clip_planes)
 }
 
 bool
+fs_visitor::run_gs()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+   /* Emit, optimize and register-allocate the GS; returns false on failure. */
+   setup_gs_payload();
+
+   this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
+
+   if (gs_compile->control_data_header_size_bits > 0) {
+      /* Create a VGRF to store accumulated control data bits. */
+      this->control_data_bits = vgrf(glsl_type::uint_type);
+
+      /* If we're outputting more than 32 control data bits, then EmitVertex()
+       * will set control_data_bits to 0 after emitting the first vertex.
+       * Otherwise, we need to initialize it to 0 here.
+       */
+      if (gs_compile->control_data_header_size_bits <= 32) {
+         const fs_builder abld = bld.annotate("initialize control data bits");
+         abld.MOV(this->control_data_bits, fs_reg(0u));
+      }
+   }
+
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   emit_nir_code();
+
+   emit_gs_thread_end();
+   /* NOTE(review): this follows the thread end above - confirm the order. */
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   if (failed)
+      return false;
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_gs_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers();
+
+   return !failed;
+}
+
+bool
 fs_visitor::run_fs(bool do_rep_send)
 {
    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;