9 files changed, 371 insertions, 157 deletions
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5d7e867..9a9dbda 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -65,7 +65,21 @@ fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
    this->conditional_mod = BRW_CONDITIONAL_NONE;
 
    /* This will be the case for almost all instructions. */
-   this->regs_written = 1;
+   switch (dst.file) {
+   case GRF:
+   case HW_REG:
+   case MRF:
+      this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
+      break;
+   case BAD_FILE:
+      this->regs_written = 0;
+      break;
+   case IMM:
+   case UNIFORM:
+      unreachable("Invalid destination register file");
+   default:
+      unreachable("Invalid register file");
+   }
 
    this->writes_accumulator = false;
 }
@@ -252,7 +266,16 @@ fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
 {
    fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
                                         sources);
-   inst->regs_written = sources;
+   inst->regs_written = 0;
+   for (int i = 0; i < sources; ++i) {
+      /* The LOAD_PAYLOAD instruction only really makes sense if we are
+       * dealing with whole registers.  If this ever changes, we can deal
+       * with it later.
+       */
+      int size = src[i].effective_width(this) * type_sz(src[i].type);
+      assert(size % 32 == 0);
+      inst->regs_written += (size + 31) / 32;
+   }
 
    return inst;
 }
@@ -282,7 +305,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                               varying_offset, fs_reg(const_offset & ~3)));
 
    int scale = 1;
-   if (brw->gen == 4 && dispatch_width == 8) {
+   if (brw->gen == 4 && dst.width == 8) {
       /* Pre-gen5, we can either use a SIMD8 message that requires (header,
        * u, v, r) as parameters, or we can just use the SIMD16 message
        * consisting of (header, u).  We choose the second, at the cost of a
@@ -296,9 +319,13 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
    else
       op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
-   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
+
+   assert(dst.width % 8 == 0);
+   int regs_written = 4 * (dst.width / 8) * scale;
+   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
+                               dst.type, dst.width);
    inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
-   inst->regs_written = 4 * scale;
+   inst->regs_written = regs_written;
    instructions.push_tail(inst);
 
    if (brw->gen < 7) {
@@ -802,12 +829,27 @@ int
 fs_inst::regs_read(fs_visitor *v, int arg) const
 {
    if (is_tex() && arg == 0 && src[0].file == GRF) {
-      if (v->dispatch_width == 16)
-	 return (mlen + 1) / 2;
-      else
-	 return mlen;
+      return mlen;
+   }
+
+   switch (src[arg].file) {
+   case BAD_FILE:
+   case UNIFORM:
+   case IMM:
+      return 1;
+   case GRF:
+   case HW_REG:
+      if (src[arg].stride == 0) {
+         return 1;
+      } else {
+         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
+         return (size + 31) / 32;
+      }
+   case MRF:
+      unreachable("MRF registers are not allowed as sources");
+   default:
+      unreachable("Invalid register file");
    }
-   return 1;
 }
 
 bool
@@ -948,9 +990,10 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
 {
    init();
+   int reg_width = v->dispatch_width / 8;
 
    this->file = GRF;
-   this->reg = v->virtual_grf_alloc(v->type_size(type));
+   this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
    this->reg_offset = 0;
    this->type = brw_type_for_base_type(type);
    this->width = v->dispatch_width;
@@ -2096,6 +2139,7 @@ fs_visitor::demote_pull_constants()
          inst->src[i].file = GRF;
          inst->src[i].reg = dst.reg;
          inst->src[i].reg_offset = 0;
+         inst->src[i].width = dispatch_width;
       }
    }
    invalidate_live_intervals();
@@ -2241,12 +2285,12 @@ fs_visitor::opt_register_renaming()
 
       if (depth == 0 &&
           inst->dst.file == GRF &&
-          virtual_grf_sizes[inst->dst.reg] == 1 &&
+          virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
           !inst->is_partial_write()) {
          if (remap[dst] == -1) {
             remap[dst] = dst;
          } else {
-            remap[dst] = virtual_grf_alloc(1);
+            remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
             inst->dst.reg = remap[dst];
             progress = true;
          }
@@ -2338,7 +2382,7 @@ fs_visitor::compute_to_mrf()
             /* Things returning more than one register would need us to
              * understand coalescing out more than one MOV at a time.
              */
-            if (scan_inst->regs_written > 1)
+            if (scan_inst->regs_written > scan_inst->dst.width / 8)
                break;
 
 	    /* SEND instructions can't have MRF as a destination. */
@@ -2599,8 +2643,7 @@ void
 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                         fs_inst *inst)
 {
-   int reg_size = dispatch_width / 8;
-   int write_len = inst->regs_written * reg_size;
+   int write_len = inst->regs_written;
    int first_write_grf = inst->dst.reg;
    bool needs_dep[BRW_MAX_MRF];
    assert(write_len < (int)sizeof(needs_dep) - 1);
@@ -2639,7 +2682,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
        */
       if (scan_inst->dst.file == GRF) {
          for (int i = 0; i < scan_inst->regs_written; i++) {
-            int reg = scan_inst->dst.reg + i * reg_size;
+            int reg = scan_inst->dst.reg + i;
 
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
@@ -2677,7 +2720,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
 void
 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
 {
-   int write_len = inst->regs_written * dispatch_width / 8;
+   int write_len = inst->regs_written;
    int first_write_grf = inst->dst.reg;
    bool needs_dep[BRW_MAX_MRF];
    assert(write_len < (int)sizeof(needs_dep) - 1);
@@ -2829,19 +2872,77 @@ fs_visitor::lower_load_payload()
 {
    bool progress = false;
 
+   int vgrf_to_reg[virtual_grf_count];
+   int reg_count = 16; /* Leave room for MRF */
+   for (int i = 0; i < virtual_grf_count; ++i) {
+      vgrf_to_reg[i] = reg_count;
+      reg_count += virtual_grf_sizes[i];
+   }
+
+   struct {
+      bool written:1; /* Whether this register has ever been written */
+      bool force_writemask_all:1;
+      bool force_sechalf:1;
+   } metadata[reg_count];
+   memset(metadata, 0, sizeof(metadata));
+
    foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      int dst_reg;
+      if (inst->dst.file == MRF) {
+         dst_reg = inst->dst.reg;
+      } else if (inst->dst.file == GRF) {
+         dst_reg = vgrf_to_reg[inst->dst.reg];
+      }
+
+      if (inst->dst.file == MRF || inst->dst.file == GRF) {
+         bool force_sechalf = inst->force_sechalf;
+         bool toggle_sechalf = inst->dst.width == 16 &&
+                               type_sz(inst->dst.type) == 4;
+         for (int i = 0; i < inst->regs_written; ++i) {
+            metadata[dst_reg + i].written = true;
+            metadata[dst_reg + i].force_sechalf = force_sechalf;
+            metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
+            force_sechalf = (toggle_sechalf != force_sechalf);
+         }
+      }
+
       if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
+         assert(inst->dst.file == MRF || inst->dst.file == GRF);
          fs_reg dst = inst->dst;
 
-         /* src[0] represents the (optional) message header. */
-         if (inst->src[0].file != BAD_FILE) {
-            inst->insert_before(block, MOV(dst, inst->src[0]));
-         }
-         dst.reg_offset++;
+         for (int i = 0; i < inst->sources; i++) {
+            dst.width = inst->src[i].effective_width(this);
+            dst.type = inst->src[i].type;
+
+            if (inst->src[i].file == BAD_FILE) {
+               /* Do nothing but otherwise increment as normal */
+            } else {
+               fs_inst *mov = MOV(dst, inst->src[i]);
+               if (inst->src[i].file == GRF) {
+                  int src_reg = vgrf_to_reg[inst->src[i].reg] +
+                                inst->src[i].reg_offset;
+                  mov->force_sechalf = metadata[src_reg].force_sechalf;
+                  mov->force_writemask_all = metadata[src_reg].force_writemask_all;
+                  metadata[dst_reg] = metadata[src_reg];
+                  if (dst.width * type_sz(dst.type) > 32) {
+                     assert((!metadata[src_reg].written ||
+                             !metadata[src_reg].force_sechalf) &&
+                            (!metadata[src_reg + 1].written ||
+                             metadata[src_reg + 1].force_sechalf));
+                     metadata[dst_reg + 1] = metadata[src_reg + 1];
+                  }
+               } else {
+                  metadata[dst_reg].force_writemask_all = false;
+                  metadata[dst_reg].force_sechalf = false;
+                  if (dst.width == 16) {
+                     metadata[dst_reg + 1].force_writemask_all = false;
+                     metadata[dst_reg + 1].force_sechalf = true;
+                  }
+               }
+               inst->insert_before(block, mov);
+            }
 
-         for (int i = 1; i < inst->sources; i++) {
-            inst->insert_before(block, MOV(dst, inst->src[i]));
-            dst.reg_offset++;
+            dst = offset(dst, 1);
          }
 
          inst->remove(block);
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 0d3931e..05fb71d 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -124,18 +124,40 @@ retype(fs_reg reg, enum brw_reg_type type)
 }
 
 static inline fs_reg
-offset(fs_reg reg, unsigned delta)
+byte_offset(fs_reg reg, unsigned delta)
 {
-   assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM));
-   reg.reg_offset += delta;
+   switch (reg.file) {
+   case BAD_FILE:
+      break;
+   case GRF:
+      reg.reg_offset += delta / 32;
+      break;
+   case MRF:
+      reg.reg += delta / 32;
+      break;
+   default:
+      assert(delta == 0);
+   }
+   reg.subreg_offset += delta % 32;
    return reg;
 }
 
 static inline fs_reg
-byte_offset(fs_reg reg, unsigned delta)
+offset(fs_reg reg, unsigned delta)
 {
-   assert(delta == 0 || (reg.file != HW_REG && reg.file != IMM));
-   reg.subreg_offset += delta;
+   assert(reg.stride > 0);
+   switch (reg.file) {
+   case BAD_FILE:
+      break;
+   case GRF:
+   case MRF:
+      return byte_offset(reg, delta * reg.width * reg.stride * type_sz(reg.type));
+   case UNIFORM:
+      reg.reg_offset += delta;
+      break;
+   default:
+      assert(delta == 0);
+   }
    return reg;
 }
 
@@ -426,6 +448,8 @@ public:
    void emit_if_gen6(ir_if *ir);
    void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
                      uint32_t spill_offset, int count);
+   void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg,
+                   uint32_t spill_offset, int count);
 
    void emit_fragment_program_code();
    void setup_fp_regs();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index bd502c4..b4f4431 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -42,6 +42,7 @@ namespace { /* avoid conflict with opt_copy_propagation_elements */
 struct acp_entry : public exec_node {
    fs_reg dst;
    fs_reg src;
+   uint8_t regs_written;
    enum opcode opcode;
    bool saturate;
 };
@@ -295,11 +296,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    /* Bail if inst is reading a range that isn't contained in the range
     * that entry is writing.
     */
-   int reg_size = dispatch_width * sizeof(float);
    if (inst->src[arg].reg_offset < entry->dst.reg_offset ||
-       (inst->src[arg].reg_offset * reg_size + inst->src[arg].subreg_offset +
-        inst->regs_read(this, arg) * inst->src[arg].stride * reg_size) >
-       (entry->dst.reg_offset + 1) * reg_size)
+       (inst->src[arg].reg_offset * 32 + inst->src[arg].subreg_offset +
+        inst->regs_read(this, arg) * inst->src[arg].stride * 32) >
+       (entry->dst.reg_offset + entry->regs_written) * 32)
       return false;
 
    /* See resolve_ud_negate() and comment in brw_fs_emit.cpp. */
@@ -371,16 +371,25 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
    inst->saturate = inst->saturate || entry->saturate;
 
    switch (entry->src.file) {
+   case UNIFORM:
+      assert(entry->src.width == 1);
    case BAD_FILE:
    case HW_REG:
-   case UNIFORM:
+      inst->src[arg].width = entry->src.width;
       inst->src[arg].reg_offset = entry->src.reg_offset;
       inst->src[arg].subreg_offset = entry->src.subreg_offset;
       break;
    case GRF:
       {
-         /* In this case, we have to deal with mapping parts of vgrfs to
-          * other parts of vgrfs so we have to do some reg_offset magic.
+         assert(entry->src.width % inst->src[arg].width == 0);
+         /* In this case, we'll just leave the width alone.  The source
+          * register could have different widths depending on how it is
+          * being used.  For instance, if only half of the register was
+          * used then we want to preserve that and continue to only use
+          * half.
+          *
+          * Also, we have to deal with mapping parts of vgrfs to other
+          * parts of vgrfs so we have to do some reg_offset magic.
           */
 
          /* Compute the offset of inst->src[arg] relative to inst->dst */
@@ -389,10 +398,10 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
          int rel_suboffset = inst->src[arg].subreg_offset;
 
          /* Compute the final register offset (in bytes) */
-         int offset = entry->src.reg_offset * reg_size + entry->src.subreg_offset;
-         offset += rel_offset * reg_size + rel_suboffset;
-         inst->src[arg].reg_offset = offset / reg_size;
-         inst->src[arg].subreg_offset = offset % reg_size;
+         int offset = entry->src.reg_offset * 32 + entry->src.subreg_offset;
+         offset += rel_offset * 32 + rel_suboffset;
+         inst->src[arg].reg_offset = offset / 32;
+         inst->src[arg].subreg_offset = offset % 32;
       }
       break;
    default:
@@ -429,11 +438,10 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
       /* Bail if inst is reading a range that isn't contained in the range
        * that entry is writing.
        */
-      int reg_size = dispatch_width * sizeof(float);
       if (inst->src[i].reg_offset < entry->dst.reg_offset ||
-          (inst->src[i].reg_offset * reg_size + inst->src[i].subreg_offset +
-           inst->regs_read(this, i) * inst->src[i].stride * reg_size) >
-          (entry->dst.reg_offset + 1) * reg_size)
+          (inst->src[i].reg_offset * 32 + inst->src[i].subreg_offset +
+           inst->regs_read(this, i) * inst->src[i].stride * 32) >
+          (entry->dst.reg_offset + entry->regs_written) * 32)
          continue;
 
       /* Don't bother with cases that should have been taken care of by the
@@ -623,17 +631,23 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
 	 acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
 	 entry->dst = inst->dst;
 	 entry->src = inst->src[0];
+         entry->regs_written = inst->regs_written;
          entry->opcode = inst->opcode;
          entry->saturate = inst->saturate;
 	 acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
       } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
                  inst->dst.file == GRF) {
+         int offset = 0;
          for (int i = 0; i < inst->sources; i++) {
+            int regs_written = ((inst->src[i].effective_width(this) *
+                                 type_sz(inst->src[i].type)) + 31) / 32;
             if (inst->src[i].file == GRF) {
                acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
                entry->dst = inst->dst;
-               entry->dst.reg_offset = i;
+               entry->dst.reg_offset = offset;
+               entry->dst.width = inst->src[i].effective_width(this);
                entry->src = inst->src[i];
+               entry->regs_written = regs_written;
                entry->opcode = inst->opcode;
                if (!entry->dst.equals(inst->src[i])) {
                   acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
@@ -641,6 +655,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
                   ralloc_free(entry);
                }
             }
+            offset += regs_written;
          }
       }
    }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 7edbe19..817fc1f 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -202,19 +202,21 @@ fs_visitor::opt_cse_local(bblock_t *block)
             bool no_existing_temp = entry->tmp.file == BAD_FILE;
             if (no_existing_temp && !entry->generator->dst.is_null()) {
                int written = entry->generator->regs_written;
+               int dst_width = entry->generator->dst.width / 8;
+               assert(written % dst_width == 0);
 
                fs_reg orig_dst = entry->generator->dst;
                fs_reg tmp = fs_reg(GRF, virtual_grf_alloc(written),
-                                   orig_dst.type);
+                                   orig_dst.type, orig_dst.width);
                entry->tmp = tmp;
                entry->generator->dst = tmp;
 
                fs_inst *copy;
-               if (written > 1) {
-                  fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written);
-                  for (int i = 0; i < written; i++)
+               if (written > dst_width) {
+                  fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width);
+                  for (int i = 0; i < written / dst_width; i++)
                      sources[i] = offset(tmp, i);
-                  copy = LOAD_PAYLOAD(orig_dst, sources, written);
+                  copy = LOAD_PAYLOAD(orig_dst, sources, written / dst_width);
                } else {
                   copy = MOV(orig_dst, tmp);
                   copy->force_writemask_all =
@@ -226,16 +228,18 @@ fs_visitor::opt_cse_local(bblock_t *block)
             /* dest <- temp */
             if (!inst->dst.is_null()) {
                int written = inst->regs_written;
+               int dst_width = inst->dst.width / 8;
                assert(written == entry->generator->regs_written);
+               assert(dst_width == entry->generator->dst.width / 8);
                assert(inst->dst.type == entry->tmp.type);
                fs_reg dst = inst->dst;
                fs_reg tmp = entry->tmp;
                fs_inst *copy;
-               if (written > 1) {
-                  fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written);
-                  for (int i = 0; i < written; i++)
+               if (written > dst_width) {
+                  fs_reg *sources = ralloc_array(mem_ctx, fs_reg, written / dst_width);
+                  for (int i = 0; i < written / dst_width; i++)
                      sources[i] = offset(tmp, i);
-                  copy = LOAD_PAYLOAD(dst, sources, written);
+                  copy = LOAD_PAYLOAD(dst, sources, written / dst_width);
                } else {
                   copy = MOV(dst, tmp);
                   copy->force_writemask_all = inst->force_writemask_all;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index 122a43f..5bfc559 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -542,15 +542,8 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
       dst = vec16(dst);
    }
 
-   if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) {
-      /* The send-from-GRF for SIMD16 texturing with a header has an extra
-       * hardware register allocated to it, which we need to skip over (since
-       * our coordinates in the payload are in the even-numbered registers,
-       * and the header comes right before the first one).
-       */
-      assert(src.file == BRW_GENERAL_REGISTER_FILE);
-      src.nr++;
-   }
+   assert(brw->gen < 7 || !inst->header_present ||
+          src.file == BRW_GENERAL_REGISTER_FILE);
 
    assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index a627b64..095b45c 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -31,11 +31,11 @@
 #include "glsl/ir_optimization.h"
 
 static void
-assign_reg(int *reg_hw_locations, fs_reg *reg, int reg_width)
+assign_reg(int *reg_hw_locations, fs_reg *reg)
 {
    if (reg->file == GRF) {
       assert(reg->reg_offset >= 0);
-      reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset * reg_width;
+      reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset;
       reg->reg_offset = 0;
    }
 }
@@ -51,14 +51,14 @@ fs_visitor::assign_regs_trivial()
    hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
    for (i = 1; i <= this->virtual_grf_count; i++) {
       hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
-			   this->virtual_grf_sizes[i - 1] * reg_width);
+			   this->virtual_grf_sizes[i - 1]);
    }
    this->grf_used = hw_reg_mapping[this->virtual_grf_count];
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      assign_reg(hw_reg_mapping, &inst->dst, reg_width);
+      assign_reg(hw_reg_mapping, &inst->dst);
       for (i = 0; i < inst->sources; i++) {
-         assign_reg(hw_reg_mapping, &inst->src[i], reg_width);
+         assign_reg(hw_reg_mapping, &inst->src[i]);
       }
    }
 
@@ -75,7 +75,7 @@ static void
 brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
 {
    const struct brw_device_info *devinfo = screen->devinfo;
-   int base_reg_count = BRW_MAX_GRF / reg_width;
+   int base_reg_count = BRW_MAX_GRF;
    int index = reg_width - 1;
 
    /* The registers used to make up almost all values handled in the compiler
@@ -105,8 +105,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
    int class_sizes[BRW_MAX_MRF];
 
    if (devinfo->gen >= 7) {
-      for (class_count = 0; class_count < MAX_SAMPLER_MESSAGE_SIZE;
-           class_count++)
+      for (class_count = 0; class_count < BRW_MAX_MRF; class_count++)
          class_sizes[class_count] = class_count + 1;
    } else {
       for (class_count = 0; class_count < 4; class_count++)
@@ -117,7 +116,21 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
    /* Compute the total number of registers across all classes. */
    int ra_reg_count = 0;
    for (int i = 0; i < class_count; i++) {
-      ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+      if (devinfo->gen <= 5 && reg_width == 2) {
+         /* From the G45 PRM:
+          *
+          * In order to reduce the hardware complexity, the following
+          * rules and restrictions apply to the compressed instruction:
+          * ...
+          * * Operand Alignment Rule: With the exceptions listed below, a
+          *   source/destination operand in general should be aligned to
+          *   even 256-bit physical register with a region size equal to
+          *   two 256-bit physical register
+          */
+         ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
+      } else {
+         ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+      }
    }
 
    uint8_t *ra_reg_to_grf = ralloc_array(screen, uint8_t, ra_reg_count);
@@ -134,27 +147,48 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
    int pairs_base_reg = 0;
    int pairs_reg_count = 0;
    for (int i = 0; i < class_count; i++) {
-      int class_reg_count = base_reg_count - (class_sizes[i] - 1);
+      int class_reg_count;
+      if (devinfo->gen <= 5 && reg_width == 2) {
+         class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
+      } else {
+         class_reg_count = base_reg_count - (class_sizes[i] - 1);
+      }
       classes[i] = ra_alloc_reg_class(regs);
 
       /* Save this off for the aligned pair class at the end. */
       if (class_sizes[i] == 2) {
-	 pairs_base_reg = reg;
-	 pairs_reg_count = class_reg_count;
+         pairs_base_reg = reg;
+         pairs_reg_count = class_reg_count;
       }
 
-      for (int j = 0; j < class_reg_count; j++) {
-	 ra_class_add_reg(regs, classes[i], reg);
+      if (devinfo->gen <= 5 && reg_width == 2) {
+         for (int j = 0; j < class_reg_count; j++) {
+            ra_class_add_reg(regs, classes[i], reg);
 
-	 ra_reg_to_grf[reg] = j;
+            ra_reg_to_grf[reg] = j * 2;
 
-	 for (int base_reg = j;
-	      base_reg < j + class_sizes[i];
-	      base_reg++) {
-	    ra_add_transitive_reg_conflict(regs, base_reg, reg);
-	 }
+            for (int base_reg = j * 2;
+                 base_reg < j * 2 + class_sizes[i];
+                 base_reg++) {
+               ra_add_transitive_reg_conflict(regs, base_reg, reg);
+            }
+
+            reg++;
+         }
+      } else {
+         for (int j = 0; j < class_reg_count; j++) {
+            ra_class_add_reg(regs, classes[i], reg);
+
+            ra_reg_to_grf[reg] = j;
 
-	 reg++;
+            for (int base_reg = j;
+                 base_reg < j + class_sizes[i];
+                 base_reg++) {
+               ra_add_transitive_reg_conflict(regs, base_reg, reg);
+            }
+
+            reg++;
+         }
       }
    }
    assert(reg == ra_reg_count);
@@ -162,7 +196,7 @@ brw_alloc_reg_set(struct intel_screen *screen, int reg_width)
    /* Add a special class for aligned pairs, which we'll put delta_x/y
     * in on gen5 so that we can do PLN.
     */
-   if (devinfo->has_pln && reg_width == 1 && devinfo->gen < 6) {
+   if (devinfo->has_pln && devinfo->gen < 6) {
       aligned_pairs_class = ra_alloc_reg_class(regs);
 
       for (int i = 0; i < pairs_reg_count; i++) {
@@ -236,7 +270,6 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
                                        int payload_node_count,
                                        int first_payload_node)
 {
-   int reg_width = dispatch_width / 8;
    int loop_depth = 0;
    int loop_end_ip = 0;
 
@@ -276,7 +309,7 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
       for (int i = 0; i < inst->sources; i++) {
          if (inst->src[i].file == HW_REG &&
              inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
-            int node_nr = inst->src[i].fixed_hw_reg.nr / reg_width;
+            int node_nr = inst->src[i].fixed_hw_reg.nr;
             if (node_nr >= payload_node_count)
                continue;
 
@@ -292,25 +325,26 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
           * sideband.  It also really freaks out driver developers to see g0
           * used in unusual places, so just always reserve it.
           */
-         payload_last_use_ip[0 / reg_width] = use_ip;
-         payload_last_use_ip[1 / reg_width] = use_ip;
+         payload_last_use_ip[0] = use_ip;
+         payload_last_use_ip[1] = use_ip;
          break;
 
       case FS_OPCODE_LINTERP:
-         /* On gen6+ in SIMD16, there are 4 adjacent registers (so 2 nodes)
-          * used by PLN's sourcing of the deltas, while we list only the first
-          * two in the arguments (1 node).  Pre-gen6, the deltas are computed
-          * in normal VGRFs.
+         /* On gen6+ in SIMD16, there are 4 adjacent registers used by
+          * PLN's sourcing of the deltas, while we list only the first one
+          * in the arguments.  Pre-gen6, the deltas are computed in normal
+          * VGRFs.
           */
          if (brw->gen >= 6) {
             int delta_x_arg = 0;
             if (inst->src[delta_x_arg].file == HW_REG &&
                 inst->src[delta_x_arg].fixed_hw_reg.file ==
                 BRW_GENERAL_REGISTER_FILE) {
-               int sechalf_node = (inst->src[delta_x_arg].fixed_hw_reg.nr /
-                                   reg_width) + 1;
-               assert(sechalf_node < payload_node_count);
-               payload_last_use_ip[sechalf_node] = use_ip;
+               for (int i = 1; i < 4; ++i) {
+                  int node = inst->src[delta_x_arg].fixed_hw_reg.nr + i;
+                  assert(node < payload_node_count);
+                  payload_last_use_ip[node] = use_ip;
+               }
             }
          }
          break;
@@ -391,8 +425,6 @@ fs_visitor::get_used_mrfs(bool *mrf_used)
 void
 fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
 {
-   int reg_width = dispatch_width / 8;
-
    bool mrf_used[BRW_MAX_MRF];
    get_used_mrfs(mrf_used);
 
@@ -402,8 +434,7 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
        * The alternative would be to have per-physical-register classes, which
        * would just be silly.
        */
-      ra_set_node_reg(g, first_mrf_node + i,
-                      (GEN7_MRF_HACK_START + i) / reg_width);
+      ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i);
 
       /* Since we don't have any live/dead analysis on the MRFs, just mark all
        * that are used as conflicting with all virtual GRFs.
@@ -428,8 +459,7 @@ fs_visitor::assign_regs(bool allow_spilling)
     */
    int reg_width = dispatch_width / 8;
    int hw_reg_mapping[this->virtual_grf_count];
-   int payload_node_count = (ALIGN(this->first_non_payload_grf, reg_width) /
-                            reg_width);
+   int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width);
    int rsi = reg_width - 1; /* Which screen->wm_reg_sets[] to use */
    calculate_live_intervals();
 
@@ -478,6 +508,30 @@ fs_visitor::assign_regs(bool allow_spilling)
    if (brw->gen >= 7)
       setup_mrf_hack_interference(g, first_mrf_hack_node);
 
+   if (dispatch_width > 8) {
+      /* In 16-wide dispatch we have an issue where a compressed
+       * instruction is actually two instructions executed simultaneously.
+       * It's actually ok to have the source and destination registers be
+       * the same.  In this case, each instruction over-writes its own
+       * source and there's no problem.  The real problem here is if the
+       * source and destination registers are off by one.  Then you can end
+       * up in a scenario where the first instruction over-writes the
+       * source of the second instruction.  Since the compiler doesn't know
+       * about this level of granularity, we simply make the source and
+       * destination interfere.
+       */
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         if (inst->dst.file != GRF)
+            continue;
+
+         for (int i = 0; i < inst->sources; ++i) {
+            if (inst->src[i].file == GRF) {
+               ra_add_node_interference(g, inst->dst.reg, inst->src[i].reg);
+            }
+         }
+      }
+   }
+
    /* Debug of register spilling: Go spill everything. */
    if (0) {
       int reg = choose_spill_reg(g);
@@ -511,20 +565,19 @@ fs_visitor::assign_regs(bool allow_spilling)
     * regs in the register classes back down to real hardware reg
     * numbers.
     */
-   this->grf_used = payload_node_count * reg_width;
+   this->grf_used = payload_node_count;
    for (int i = 0; i < this->virtual_grf_count; i++) {
       int reg = ra_get_node_reg(g, i);
 
-      hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg] * reg_width;
+      hw_reg_mapping[i] = screen->wm_reg_sets[rsi].ra_reg_to_grf[reg];
       this->grf_used = MAX2(this->grf_used,
-			    hw_reg_mapping[i] + this->virtual_grf_sizes[i] *
-			    reg_width);
+			    hw_reg_mapping[i] + this->virtual_grf_sizes[i]);
    }
 
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
-      assign_reg(hw_reg_mapping, &inst->dst, reg_width);
+      assign_reg(hw_reg_mapping, &inst->dst);
       for (int i = 0; i < inst->sources; i++) {
-         assign_reg(hw_reg_mapping, &inst->src[i], reg_width);
+         assign_reg(hw_reg_mapping, &inst->src[i]);
       }
    }
 
@@ -539,7 +592,11 @@ void
 fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
                          uint32_t spill_offset, int count)
 {
-   for (int i = 0; i < count; i++) {
+   int reg_size = 1;
+   if (count % 2 == 0)
+      reg_size = 2;
+
+   for (int i = 0; i < count / reg_size; i++) {
       /* The gen7 descriptor-based offset is 12 bits of HWORD units. */
       bool gen7_read = brw->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE;
 
@@ -558,8 +615,32 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
       }
       inst->insert_before(block, unspill_inst);
 
-      dst = offset(dst, 1);
-      spill_offset += dispatch_width * sizeof(float);
+      dst.reg_offset += reg_size;
+      spill_offset += reg_size * 8 * sizeof(float);
+   }
+}
+
+void
+fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
+                       uint32_t spill_offset, int count)
+{
+   int spill_base_mrf = dispatch_width > 8 ? 13 : 14;
+
+   int reg_size = 1; /* registers written per scratch message */
+   if (count % 2 == 0)
+      reg_size = 2;
+
+   for (int i = 0; i < count / reg_size; i++) {
+      fs_inst *spill_inst =
+         new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
+                              reg_null_f, src);
+      src.reg_offset += reg_size;
+      spill_inst->offset = spill_offset + i * reg_size * REG_SIZE; /* bytes */
+      spill_inst->ir = inst->ir;
+      spill_inst->annotation = inst->annotation;
+      spill_inst->mlen = 1 + reg_size; /* header, value */
+      spill_inst->base_mrf = spill_base_mrf;
+      inst->insert_after(block, spill_inst);
    }
 }
 
@@ -712,18 +793,8 @@ fs_visitor::spill_reg(int spill_reg)
                          inst->regs_written);
 	 }
 
-	 for (int chan = 0; chan < inst->regs_written; chan++) {
-	    fs_inst *spill_inst =
-               new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
-                                    reg_null_f, spill_src);
-	    spill_src = offset(spill_src, 1);
-	    spill_inst->offset = subset_spill_offset + chan * reg_size;
-	    spill_inst->ir = inst->ir;
-	    spill_inst->annotation = inst->annotation;
-	    spill_inst->mlen = 1 + dispatch_width / 8; /* header, value */
-	    spill_inst->base_mrf = spill_base_mrf;
-	    inst->insert_after(block, spill_inst);
-	 }
+         emit_spill(block, inst, spill_src, subset_spill_offset,
+                    inst->regs_written);
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
index 73f18f9..9546dcd 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_register_coalesce.cpp
@@ -69,16 +69,12 @@ is_copy_payload(const fs_visitor *v, const fs_inst *inst)
    if (v->virtual_grf_sizes[inst->src[0].reg] != inst->regs_written)
       return false;
 
-   const int reg = inst->src[0].reg;
-   if (inst->src[0].reg_offset != 0)
-      return false;
+   fs_reg reg = inst->src[0];
 
-   for (int i = 1; i < inst->sources; i++) {
-      if (inst->src[i].reg != reg ||
-          inst->src[i].reg_offset != i) {
+   for (int i = 0; i < inst->sources; i++)
+      if (!inst->src[i].equals(offset(reg, i)))
          return false;
-      }
-   }
+
    return true;
 }
 
@@ -186,6 +182,7 @@ fs_visitor::register_coalesce()
          src_size = virtual_grf_sizes[inst->src[0].reg];
          assert(src_size <= MAX_SAMPLER_MESSAGE_SIZE);
 
+         assert(inst->src[0].width % 8 == 0);
          channels_remaining = src_size;
          memset(mov, 0, sizeof(mov));
 
@@ -200,12 +197,14 @@ fs_visitor::register_coalesce()
             reg_to_offset[i] = i;
          }
          mov[0] = inst;
-         channels_remaining -= inst->sources;
+         channels_remaining -= inst->regs_written;
       } else {
          const int offset = inst->src[0].reg_offset;
          reg_to_offset[offset] = inst->dst.reg_offset;
+         if (inst->src[0].width == 16)
+            reg_to_offset[offset + 1] = inst->dst.reg_offset + 1;
          mov[offset] = inst;
-         channels_remaining--;
+         channels_remaining -= inst->regs_written;
       }
 
       if (channels_remaining)
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 72ffe1f..3b31e34 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -287,7 +287,8 @@ fs_visitor::try_emit_saturate(ir_expression *ir)
     * src, just set the saturate flag instead of emmitting a separate mov.
     */
    fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
-   if (modify && modify->regs_written == 1 && modify->can_do_saturate()) {
+   if (modify && modify->regs_written == modify->dst.width / 8 &&
+       modify->can_do_saturate()) {
       modify->saturate = true;
       this->result = src;
       return true;
@@ -1434,7 +1435,7 @@ fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    inst->base_mrf = base_mrf;
    inst->mlen = mlen;
    inst->header_present = header_present;
-   inst->regs_written = 4;
+   inst->regs_written = 4 * reg_width;
 
    if (mlen > MAX_SAMPLER_MESSAGE_SIZE) {
       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
@@ -1480,7 +1481,7 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
        * need to offset the Sampler State Pointer in the header.
        */
       header_present = true;
-      sources[length] = reg_undef;
+      sources[0] = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD);
       length++;
    }
 
@@ -1618,7 +1619,13 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
       }
    }
 
-   fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(length),
+   int mlen;
+   if (reg_width == 2)
+      mlen = length * reg_width - header_present;
+   else
+      mlen = length * reg_width;
+
+   fs_reg src_payload = fs_reg(GRF, virtual_grf_alloc(mlen),
                                BRW_REGISTER_TYPE_F);
    emit(LOAD_PAYLOAD(src_payload, sources, length));
 
@@ -1645,12 +1652,9 @@ fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
    }
    fs_inst *inst = emit(opcode, dst, src_payload, sampler);
    inst->base_mrf = -1;
-   if (reg_width == 2)
-      inst->mlen = length * reg_width - header_present;
-   else
-      inst->mlen = length * reg_width;
+   inst->mlen = mlen;
    inst->header_present = header_present;
-   inst->regs_written = 4;
+   inst->regs_written = 4 * reg_width;
 
    if (inst->mlen > MAX_SAMPLER_MESSAGE_SIZE) {
       fail("Message length >" STRINGIFY(MAX_SAMPLER_MESSAGE_SIZE)
@@ -1784,7 +1788,7 @@ fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler)
 {
    int reg_width = dispatch_width / 8;
    int length = ir->coordinate->type->vector_elements;
-   fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length),
+   fs_reg payload = fs_reg(GRF, virtual_grf_alloc(length * reg_width),
                            BRW_REGISTER_TYPE_F);
    fs_reg dest = fs_reg(this, glsl_type::uvec4_type);
    fs_reg *sources = ralloc_array(mem_ctx, fs_reg, length);
@@ -1802,9 +1806,10 @@ fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, fs_reg sampler)
    inst->base_mrf = -1;
    inst->mlen = length * reg_width;
    inst->header_present = false;
-   inst->regs_written = 4; /* we only care about one reg of response,
-                            * but the sampler always writes 4/8
-                            */
+   inst->regs_written = 4 * reg_width; /* we only care about one reg of
+                                        * response, but the sampler always
+                                        * writes 4/8
+                                        */
 
    return dest;
 }
@@ -1979,14 +1984,15 @@ fs_visitor::visit(ir_texture *ir)
          emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
 
          fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
-         for (int i = 0; i < inst->regs_written; i++) {
+         int components = inst->regs_written / (dst.width / 8);
+         for (int i = 0; i < components; i++) {
             if (i == 2) {
                fixed_payload[i] = fixed_depth;
             } else {
                fixed_payload[i] = offset(dst, i);
             }
          }
-         emit(LOAD_PAYLOAD(dst, fixed_payload, inst->regs_written));
+         emit(LOAD_PAYLOAD(dst, fixed_payload, components));
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index b963bda..5e8c98a 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -795,7 +795,7 @@ fs_instruction_scheduler::calculate_deps()
       for (int i = 0; i < inst->sources; i++) {
 	 if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
-               for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
+               for (int r = 0; r < inst->regs_read(v, i); r++)
                   add_dep(last_grf_write[inst->src[i].reg + r], n);
             } else {
                for (int r = 0; r < inst->regs_read(v, i); r++) {
@@ -847,7 +847,7 @@ fs_instruction_scheduler::calculate_deps()
       /* write-after-write deps. */
       if (inst->dst.file == GRF) {
          if (post_reg_alloc) {
-            for (int r = 0; r < inst->regs_written * reg_width; r++) {
+            for (int r = 0; r < inst->regs_written; r++) {
                add_dep(last_grf_write[inst->dst.reg + r], n);
                last_grf_write[inst->dst.reg + r] = n;
             }
@@ -923,7 +923,7 @@ fs_instruction_scheduler::calculate_deps()
       for (int i = 0; i < inst->sources; i++) {
 	 if (inst->src[i].file == GRF) {
             if (post_reg_alloc) {
-               for (int r = 0; r < reg_width * inst->regs_read(v, i); r++)
+               for (int r = 0; r < inst->regs_read(v, i); r++)
                   add_dep(n, last_grf_write[inst->src[i].reg + r]);
             } else {
                for (int r = 0; r < inst->regs_read(v, i); r++) {
@@ -977,7 +977,7 @@ fs_instruction_scheduler::calculate_deps()
        */
       if (inst->dst.file == GRF) {
          if (post_reg_alloc) {
-            for (int r = 0; r < inst->regs_written * reg_width; r++)
+            for (int r = 0; r < inst->regs_written; r++)
                last_grf_write[inst->dst.reg + r] = n;
          } else {
             for (int r = 0; r < inst->regs_written; r++) {
@@ -1300,7 +1300,8 @@ fs_instruction_scheduler::choose_instruction_to_schedule()
                 * single-result send is probably actually reducing register
                 * pressure.
                 */
-               if (inst->regs_written <= 1 && chosen_inst->regs_written > 1) {
+               if (inst->regs_written <= inst->dst.width / 8 &&
+                   chosen_inst->regs_written > chosen_inst->dst.width / 8) {
                   chosen = n;
                   continue;
                } else if (inst->regs_written > chosen_inst->regs_written) {