33 files changed, 1101 insertions, 624 deletions
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index cea1e87..5d69039 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -1,6 +1,7 @@
 i965_compiler_FILES = \
 	brw_cfg.cpp \
 	brw_cfg.h \
+	brw_compiler.c \
 	brw_compiler.h \
 	brw_dead_control_flow.cpp \
 	brw_dead_control_flow.h \
@@ -51,6 +52,7 @@ i965_compiler_FILES = \
 	brw_shader.cpp \
 	brw_shader.h \
 	brw_surface_formats.c \
+	brw_surface_formats.h \
 	brw_util.c \
 	brw_util.h \
 	brw_vec4_builder.h \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
index c6ae3d8..fd23e23 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
@@ -85,7 +85,7 @@ brw_blorp_eu_emitter::emit_texture_lookup(const struct brw_reg &dst,
                                           unsigned msg_length)
 {
    fs_inst *inst = new (mem_ctx) fs_inst(op, 16, dst, brw_message_reg(base_mrf),
-                                         brw_imm_ud(0u));
+                                         brw_imm_ud(0u), brw_imm_ud(0u));
 
    inst->base_mrf = base_mrf;
    inst->mlen = msg_length;
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
new file mode 100644
index 0000000..3d93772
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -0,0 +1,180 @@
+/*
+ * Copyright © 2015-2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_compiler.h"
+#include "brw_context.h"
+#include "glsl/nir/nir.h"
+#include "main/errors.h"
+#include "util/debug.h"
+
+static void
+shader_debug_log_mesa(void *data, const char *fmt, ...)
+{
+   /* data is the brw_context whose GL context receives the message. */
+   struct brw_context *brw = (struct brw_context *)data;
+   GLuint msg_id = 0;
+   va_list ap;
+   va_start(ap, fmt);
+   _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                   MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                   MESA_DEBUG_TYPE_OTHER,
+                   MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, ap);
+   va_end(ap);
+}
+
+static void
+shader_perf_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+   va_list ap;
+   va_start(ap, fmt);
+
+   /* Print to stderr from a copy so that ap is still usable afterwards. */
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+      va_list stderr_ap;
+      va_copy(stderr_ap, ap);
+      vfprintf(stderr, fmt, stderr_ap);
+      va_end(stderr_ap);
+   }
+   /* Also report through _mesa_gl_vdebug() when brw->perf_debug is set. */
+   if (brw->perf_debug) {
+      GLuint msg_id = 0;
+      _mesa_gl_vdebug(&brw->ctx, &msg_id,
+                      MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                      MESA_DEBUG_TYPE_PERFORMANCE,
+                      MESA_DEBUG_SEVERITY_MEDIUM, fmt, ap);
+   }
+   va_end(ap);
+}
+
+#define COMMON_OPTIONS                                                        \
+   /* In order to help allow for better CSE at the NIR level we tell NIR to   \
+    * split all ffma instructions during opt_algebraic and we then re-combine \
+    * them as a later step.  Every flag below appears exactly once so that    \
+    * no designated initializer overrides an earlier one.                     \
+    */                                                                        \
+   .lower_ffma = true,                                                        \
+   .lower_sub = true,                                                         \
+   .lower_fdiv = true,                                                        \
+   .lower_scmp = true,                                                        \
+   .lower_fmod = true,                                                        \
+   .lower_bitfield_extract = true,                                            \
+   .lower_bitfield_insert = true,                                             \
+   .lower_uadd_carry = true,                                                  \
+   .lower_usub_borrow = true,                                                 \
+   .native_integers = true,                                                   \
+   .vertex_id_zero_based = true
+
+static const struct nir_shader_compiler_options scalar_nir_options = {
+   COMMON_OPTIONS, /* flags also used by vector_nir_options */
+   .lower_pack_half_2x16 = true,
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_snorm_4x8 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_pack_unorm_4x8 = true,
+   .lower_unpack_half_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_snorm_4x8 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_unpack_unorm_4x8 = true,
+}; /* selected in brw_compiler_create() when scalar_stage[i] is set */
+
+static const struct nir_shader_compiler_options vector_nir_options = {
+   COMMON_OPTIONS, /* flags also used by scalar_nir_options */
+
+   /* In the vec4 backend, our dpN instruction replicates its result to all the
+    * components of a vec4.  We would like NIR to give us replicated fdot
+    * instructions because it can optimize better for us.
+    */
+   .fdot_replicates = true,
+
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+}; /* selected in brw_compiler_create() when scalar_stage[i] is not set */
+
+/**
+ * Allocate a brw_compiler with rzalloc() on mem_ctx, then set its logging
+ * callbacks, per-stage scalar_stage flags and per-stage GLSL compiler options.
+ */
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
+{
+   struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
+   const bool gen8_plus = devinfo->gen >= 8;
+
+   compiler->devinfo = devinfo;
+   compiler->shader_debug_log = shader_debug_log_mesa;
+   compiler->shader_perf_log = shader_perf_log_mesa;
+   brw_fs_alloc_reg_sets(compiler);
+   brw_vec4_alloc_reg_set(compiler);
+
+   /* Decide, for each shader stage, whether scalar_stage[] is set. */
+   compiler->scalar_stage[MESA_SHADER_VERTEX] =
+      gen8_plus && !(INTEL_DEBUG & DEBUG_VEC4VS);
+   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
+   compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
+      gen8_plus && env_var_as_boolean("INTEL_SCALAR_TES", true);
+   compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
+      gen8_plus && env_var_as_boolean("INTEL_SCALAR_GS", true);
+   compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
+   compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;
+
+   /* Defaults applied to the GLSL compiler options of every stage. */
+   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_shader_compiler_options *opts =
+         &compiler->glsl_compiler_options[i];
+      const bool is_scalar = compiler->scalar_stage[i];
+
+      opts->MaxUnrollIterations = 32;
+      opts->MaxIfDepth = devinfo->gen < 6 ? 16 : UINT_MAX;
+      /* We want the GLSL compiler to emit code that uses condition codes */
+      opts->EmitCondCodes = true;
+      opts->EmitNoNoise = true;
+      opts->EmitNoMainReturn = true;
+      opts->EmitNoIndirectInput = true;
+      opts->EmitNoIndirectUniform = false;
+      opts->LowerClipDistance = true;
+      opts->LowerBufferInterfaceBlocks = true;
+
+      opts->EmitNoIndirectOutput = is_scalar;
+      opts->EmitNoIndirectTemp = is_scalar;
+      opts->OptimizeForAOS = !is_scalar;
+      opts->NirOptions = is_scalar ? &scalar_nir_options : &vector_nir_options;
+      /* !ARB_gpu_shader5 */
+      if (devinfo->gen < 7)
+         opts->EmitNoIndirectSampler = true;
+   }
+
+   /* Stages that differ from the defaults set in the loop above. */
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
+   if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
+      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_COMPUTE]
+      .LowerShaderSharedVariables = true;
+   return compiler;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 748ffe5..cd28bbb 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -94,6 +94,9 @@ struct brw_compiler {
    struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
 };
 
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);
+
 
 /**
  * Program key structures.
@@ -687,6 +690,9 @@ struct brw_gs_prog_data
 
 /** @} */
 
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);
+
 /**
  * Compile a vertex shader.
  *
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 10a6d39..9edb6f5 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -63,6 +63,7 @@
 # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL (0 << 8)
 # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     (1 << 8)
 
+#ifndef _3DPRIM_POINTLIST /* FIXME: Avoid clashing with defines from bdw_pack.h */
 #define _3DPRIM_POINTLIST         0x01
 #define _3DPRIM_LINELIST          0x02
 #define _3DPRIM_LINESTRIP         0x03
@@ -86,6 +87,7 @@
 #define _3DPRIM_TRIFAN_NOSTIPPLE  0x16
 #define _3DPRIM_PATCHLIST(n) ({ assert(n > 0 && n <= 32); 0x20 + (n - 1); })
 
+#endif /* bdw_pack.h */
 
 /* We use this offset to be able to pass native primitive types in struct
  * _mesa_prim::mode.  Native primitive types are BRW_PRIM_OFFSET +
@@ -1085,6 +1087,18 @@ enum opcode {
     */
    SHADER_OPCODE_BROADCAST,
 
+   /**
+    * Pick the byte from its first source register given by the index
+    * specified as second source.
+    */
+   SHADER_OPCODE_EXTRACT_BYTE,
+
+   /**
+    * Pick the word from its first source register given by the index
+    * specified as second source.
+    */
+   SHADER_OPCODE_EXTRACT_WORD,
+
    VEC4_OPCODE_MOV_BYTES,
    VEC4_OPCODE_PACK_BYTES,
    VEC4_OPCODE_UNPACK_UNIFORM,
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c
index 7ab70fe..0587225 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.c
+++ b/src/mesa/drivers/dri/i965/brw_device_info.c
@@ -483,3 +483,15 @@ brw_get_device_info(int devid)
 
    return devinfo;
 }
+
+const char *
+brw_get_device_name(int devid)
+{
+   switch (devid) { /* cases are generated by CHIPSET() below */
+#undef CHIPSET
+#define CHIPSET(id, family, name) case id: return name;
+#include "pci_ids/i965_pci_ids.h"
+   default:
+      return NULL; /* devid matched no CHIPSET() case */
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h
index 73d6820..48e0dee 100644
--- a/src/mesa/drivers/dri/i965/brw_device_info.h
+++ b/src/mesa/drivers/dri/i965/brw_device_info.h
@@ -98,3 +98,4 @@ struct brw_device_info
 };
 
 const struct brw_device_info *brw_get_device_info(int devid);
+const char *brw_get_device_name(int devid);
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 8737c64..23e71fd 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -54,23 +54,6 @@
 
 #define FILE_DEBUG_FLAG DEBUG_PRIMS
 
-static const GLuint prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = {
-   [GL_POINTS] =_3DPRIM_POINTLIST,
-   [GL_LINES] = _3DPRIM_LINELIST,
-   [GL_LINE_LOOP] = _3DPRIM_LINELOOP,
-   [GL_LINE_STRIP] = _3DPRIM_LINESTRIP,
-   [GL_TRIANGLES] = _3DPRIM_TRILIST,
-   [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
-   [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
-   [GL_QUADS] = _3DPRIM_QUADLIST,
-   [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
-   [GL_POLYGON] = _3DPRIM_POLYGON,
-   [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
-   [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
-   [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
-   [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
-};
-
 
 static const GLenum reduced_prim[GL_POLYGON+1] = {
    [GL_POINTS] = GL_POINTS,
@@ -85,18 +68,6 @@ static const GLenum reduced_prim[GL_POLYGON+1] = {
    [GL_POLYGON] = GL_TRIANGLES
 };
 
-uint32_t
-get_hw_prim_for_gl_prim(int mode)
-{
-   if (mode >= BRW_PRIM_OFFSET)
-      return mode - BRW_PRIM_OFFSET;
-   else {
-      assert(mode < ARRAY_SIZE(prim_to_hw_prim));
-      return prim_to_hw_prim[mode];
-   }
-}
-
-
 /* When the primitive changes, set a state bit and re-validate.  Not
  * the nicest and would rather deal with this by having all the
  * programs be immune to the active primitive (ie. cope with all
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index cbeab6f..922f720 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -174,7 +174,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
     * CSE can later notice that those loads are all the same and eliminate
     * the redundant ones.
     */
-   fs_reg vec4_offset = vgrf(glsl_type::int_type);
+   fs_reg vec4_offset = vgrf(glsl_type::uint_type);
    bld.ADD(vec4_offset, varying_offset, brw_imm_ud(const_offset & ~0xf));
 
    int scale = 1;
@@ -433,7 +433,6 @@ fs_reg::fs_reg(struct ::brw_reg reg) :
 {
    this->reg_offset = 0;
    this->subreg_offset = 0;
-   this->reladdr = NULL;
    this->stride = 1;
    if (this->file == IMM &&
        (this->type != BRW_REGISTER_TYPE_V &&
@@ -448,7 +447,6 @@ fs_reg::equals(const fs_reg &r) const
 {
    return (this->backend_reg::equals(r) &&
            subreg_offset == r.subreg_offset &&
-           !reladdr && !r.reladdr &&
            stride == r.stride);
 }
 
@@ -510,6 +508,7 @@ type_size_scalar(const struct glsl_type *type)
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
    case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_FUNCTION:
       unreachable("not reached");
    }
 
@@ -739,15 +738,15 @@ fs_inst::components_read(unsigned i) const
    case SHADER_OPCODE_LOD_LOGICAL:
    case SHADER_OPCODE_TG4_LOGICAL:
    case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
-      assert(src[8].file == IMM && src[9].file == IMM);
+      assert(src[9].file == IMM && src[10].file == IMM);
       /* Texture coordinates. */
       if (i == 0)
-         return src[8].ud;
+         return src[9].ud;
       /* Texture derivatives. */
       else if ((i == 2 || i == 3) && opcode == SHADER_OPCODE_TXD_LOGICAL)
-         return src[9].ud;
+         return src[10].ud;
       /* Texture offset. */
-      else if (i == 7)
+      else if (i == 8)
          return 2;
       /* MCS */
       else if (i == 5 && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
@@ -850,7 +849,10 @@ fs_inst::regs_read(int arg) const
          assert(src[2].file == IMM);
          unsigned region_length = src[2].ud;
 
-         if (src[0].file == FIXED_GRF) {
+         if (src[0].file == UNIFORM) {
+            assert(region_length % 4 == 0);
+            return region_length / 4;
+         } else if (src[0].file == FIXED_GRF) {
             /* If the start of the region is not register aligned, then
              * there's some portion of the register that's technically
              * unread at the beginning.
@@ -864,7 +866,7 @@ fs_inst::regs_read(int arg) const
              * unread portion at the beginning.
              */
             if (src[0].subnr)
-               region_length += src[0].subnr * type_sz(src[0].type);
+               region_length += src[0].subnr;
 
             return DIV_ROUND_UP(region_length, REG_SIZE);
          } else {
@@ -1020,7 +1022,6 @@ fs_visitor::import_uniforms(fs_visitor *v)
    this->push_constant_loc = v->push_constant_loc;
    this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
-   this->param_size = v->param_size;
 }
 
 fs_reg *
@@ -1923,9 +1924,7 @@ fs_visitor::compact_virtual_grfs()
  * maximum number of fragment shader uniform components (64).  If
  * there are too many of these, they'd fill up all of register space.
  * So, this will push some of them out to the pull constant buffer and
- * update the program to load them.  We also use pull constants for all
- * indirect constant loads because we don't support indirect accesses in
- * registers yet.
+ * update the program to load them.
  */
 void
 fs_visitor::assign_constant_locations()
@@ -1934,20 +1933,21 @@ fs_visitor::assign_constant_locations()
    if (dispatch_width != 8)
       return;
 
-   unsigned int num_pull_constants = 0;
-
-   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
-   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
-
    bool is_live[uniforms];
    memset(is_live, 0, sizeof(is_live));
 
+   /* For each uniform slot, a value of true indicates that the given slot and
+    * the next slot must remain contiguous.  This is used to keep us from
+    * splitting arrays apart.
+    */
+   bool contiguous[uniforms];
+   memset(contiguous, 0, sizeof(contiguous));
+
    /* First, we walk through the instructions and do two things:
     *
     *  1) Figure out which uniforms are live.
     *
-    *  2) Find all indirect access of uniform arrays and flag them as needing
-    *     to go into the pull constant buffer.
+    *  2) Mark any indirectly used ranges of registers as contiguous.
     *
     * Note that we don't move constant-indexed accesses to arrays.  No
     * testing has been done of the performance impact of this choice.
@@ -1957,20 +1957,19 @@ fs_visitor::assign_constant_locations()
          if (inst->src[i].file != UNIFORM)
             continue;
 
-         if (inst->src[i].reladdr) {
-            int uniform = inst->src[i].nr;
+         int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
 
-            /* If this array isn't already present in the pull constant buffer,
-             * add it.
-             */
-            if (pull_constant_loc[uniform] == -1) {
-               assert(param_size[uniform]);
-               for (int j = 0; j < param_size[uniform]; j++)
-                  pull_constant_loc[uniform + j] = num_pull_constants++;
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+            assert(inst->src[2].ud % 4 == 0);
+            unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+            assert(last < uniforms);
+
+            for (unsigned j = constant_nr; j < last; j++) {
+               is_live[j] = true;
+               contiguous[j] = true;
             }
+            is_live[last] = true;
          } else {
-            /* Mark the the one accessed uniform as live */
-            int constant_nr = inst->src[i].nr + inst->src[i].reg_offset;
             if (constant_nr >= 0 && constant_nr < (int) uniforms)
                is_live[constant_nr] = true;
          }
@@ -1985,29 +1984,48 @@ fs_visitor::assign_constant_locations()
     * If changing this value, note the limitation about total_regs in
     * brw_curbe.c.
     */
-   unsigned int max_push_components = 16 * 8;
+   const unsigned int max_push_components = 16 * 8;
+
+   /* For Vulkan we effectively don't limit the chunk size: max_chunk_size is
+    * 32 floats = 128 bytes, which is the maximum Vulkan push constant size.
+    */
+   const unsigned int max_chunk_size = 32;
+
    unsigned int num_push_constants = 0;
+   unsigned int num_pull_constants = 0;
 
    push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
 
-   for (unsigned int i = 0; i < uniforms; i++) {
-      if (!is_live[i] || pull_constant_loc[i] != -1) {
-         /* This UNIFORM register is either dead, or has already been demoted
-          * to a pull const.  Mark it as no longer living in the param[] array.
-          */
-         push_constant_loc[i] = -1;
+   int chunk_start = -1;
+   for (unsigned u = 0; u < uniforms; u++) {
+      push_constant_loc[u] = -1;
+      pull_constant_loc[u] = -1;
+
+      if (!is_live[u])
          continue;
-      }
 
-      if (num_push_constants < max_push_components) {
-         /* Retain as a push constant.  Record the location in the params[]
-          * array.
-          */
-         push_constant_loc[i] = num_push_constants++;
-      } else {
-         /* Demote to a pull constant. */
-         push_constant_loc[i] = -1;
-         pull_constant_loc[i] = num_pull_constants++;
+      /* This is the first live uniform in the chunk */
+      if (chunk_start < 0)
+         chunk_start = u;
+
+      /* If this element does not need to be contiguous with the next, we
+       * split at this point and everything between chunk_start and u forms a
+       * single chunk.
+       */
+      if (!contiguous[u]) {
+         unsigned chunk_size = u - chunk_start + 1;
+
+         if (num_push_constants + chunk_size <= max_push_components &&
+             chunk_size <= max_chunk_size) {
+            for (unsigned j = chunk_start; j <= u; j++)
+               push_constant_loc[j] = num_push_constants++;
+         } else {
+            for (unsigned j = chunk_start; j <= u; j++)
+               pull_constant_loc[j] = num_pull_constants++;
+         }
+
+         chunk_start = -1;
       }
    }
 
@@ -2038,51 +2056,67 @@ fs_visitor::assign_constant_locations()
  * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
  */
 void
-fs_visitor::demote_pull_constants()
+fs_visitor::lower_constant_loads()
 {
-   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+   const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      /* Set up the annotation tracking for new generated instructions. */
+      const fs_builder ibld(this, block, inst);
+
       for (int i = 0; i < inst->sources; i++) {
 	 if (inst->src[i].file != UNIFORM)
 	    continue;
 
-         int pull_index;
+         /* We'll handle this case later */
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+            continue;
+
          unsigned location = inst->src[i].nr + inst->src[i].reg_offset;
-         if (location >= uniforms) /* Out of bounds access */
-            pull_index = -1;
-         else
-            pull_index = pull_constant_loc[location];
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
 
          if (pull_index == -1)
 	    continue;
 
-         /* Set up the annotation tracking for new generated instructions. */
-         const fs_builder ibld(this, block, inst);
-         const unsigned index = stage_prog_data->binding_table.pull_constants_start;
-         fs_reg dst = vgrf(glsl_type::float_type);
-
          assert(inst->src[i].stride == 0);
 
-         /* Generate a pull load into dst. */
-         if (inst->src[i].reladdr) {
-            VARYING_PULL_CONSTANT_LOAD(ibld, dst,
-                                       brw_imm_ud(index),
-                                       *inst->src[i].reladdr,
-                                       pull_index * 4);
-            inst->src[i].reladdr = NULL;
-            inst->src[i].stride = 1;
-         } else {
-            const fs_builder ubld = ibld.exec_all().group(8, 0);
-            struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
-            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
-                      dst, brw_imm_ud(index), offset);
-            inst->src[i].set_smear(pull_index & 3);
-         }
-         brw_mark_surface_used(prog_data, index);
+         fs_reg dst = vgrf(glsl_type::float_type);
+         const fs_builder ubld = ibld.exec_all().group(8, 0);
+         struct brw_reg offset = brw_imm_ud((unsigned)(pull_index * 4) & ~15);
+         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                   dst, brw_imm_ud(index), offset);
 
          /* Rewrite the instruction to use the temporary VGRF. */
          inst->src[i].file = VGRF;
          inst->src[i].nr = dst.nr;
          inst->src[i].reg_offset = 0;
+         inst->src[i].set_smear(pull_index & 3);
+
+         brw_mark_surface_used(prog_data, index);
+      }
+
+      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+          inst->src[0].file == UNIFORM) {
+
+         unsigned location = inst->src[0].nr + inst->src[0].reg_offset;
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
+
+         if (pull_index == -1)
+	    continue;
+
+         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+                                    brw_imm_ud(index),
+                                    inst->src[1],
+                                    pull_index * 4);
+         inst->remove(block);
+
+         brw_mark_surface_used(prog_data, index);
       }
    }
    invalidate_live_intervals();
@@ -2792,10 +2826,23 @@ fs_visitor::emit_repclear_shader()
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    int base_mrf = 1;
    int color_mrf = base_mrf + 2;
+   fs_inst *mov;
 
-   fs_inst *mov = bld.exec_all().group(4, 0)
-                     .MOV(brw_message_reg(color_mrf),
-                          fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   if (uniforms == 1) {
+      mov = bld.exec_all().group(4, 0)
+               .MOV(brw_message_reg(color_mrf),
+                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   } else {
+      struct brw_reg reg =
+         brw_reg(BRW_GENERAL_REGISTER_FILE,
+                 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+                 BRW_VERTICAL_STRIDE_8,
+                 BRW_WIDTH_2,
+                 BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+      mov = bld.exec_all().group(4, 0)
+               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+   }
 
    fs_inst *write;
    if (key->nr_color_regions == 1) {
@@ -2824,8 +2871,10 @@ fs_visitor::emit_repclear_shader()
    assign_curb_setup();
 
    /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
-   assert(mov->src[0].file == FIXED_GRF);
-   mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   if (uniforms == 1) {
+      assert(mov->src[0].file == FIXED_GRF);
+      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   }
 }
 
 /**
@@ -3651,6 +3700,7 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &coordinate,
                                 const fs_reg &shadow_c,
                                 const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &surface,
                                 const fs_reg &sampler,
                                 unsigned coord_components,
                                 unsigned grad_components)
@@ -3743,8 +3793,9 @@ lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
 
    inst->opcode = op;
    inst->src[0] = reg_undef;
-   inst->src[1] = sampler;
-   inst->resize_sources(2);
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
    inst->base_mrf = msg_begin.nr;
    inst->mlen = msg_end.nr - msg_begin.nr;
    inst->header_size = 1;
@@ -3756,6 +3807,7 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &shadow_c,
                                 fs_reg lod, fs_reg lod2,
                                 const fs_reg &sample_index,
+                                const fs_reg &surface,
                                 const fs_reg &sampler,
                                 const fs_reg &offset_value,
                                 unsigned coord_components,
@@ -3838,8 +3890,9 @@ lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
 
    inst->opcode = op;
    inst->src[0] = reg_undef;
-   inst->src[1] = sampler;
-   inst->resize_sources(2);
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
    inst->base_mrf = message.nr;
    inst->mlen = msg_end.nr - message.nr;
    inst->header_size = header_size;
@@ -3863,7 +3916,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
                                 const fs_reg &shadow_c,
                                 fs_reg lod, fs_reg lod2,
                                 const fs_reg &sample_index,
-                                const fs_reg &mcs, const fs_reg &sampler,
+                                const fs_reg &mcs,
+                                const fs_reg &surface,
+                                const fs_reg &sampler,
                                 fs_reg offset_value,
                                 unsigned coord_components,
                                 unsigned grad_components)
@@ -4066,8 +4121,9 @@ lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
    /* Generate the SEND. */
    inst->opcode = op;
    inst->src[0] = src_payload;
-   inst->src[1] = sampler;
-   inst->resize_sources(2);
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
    inst->base_mrf = -1;
    inst->mlen = mlen;
    inst->header_size = header_size;
@@ -4086,25 +4142,27 @@ lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
    const fs_reg &lod2 = inst->src[3];
    const fs_reg &sample_index = inst->src[4];
    const fs_reg &mcs = inst->src[5];
-   const fs_reg &sampler = inst->src[6];
-   const fs_reg &offset_value = inst->src[7];
-   assert(inst->src[8].file == IMM && inst->src[9].file == IMM);
-   const unsigned coord_components = inst->src[8].ud;
-   const unsigned grad_components = inst->src[9].ud;
+   const fs_reg &surface = inst->src[6];
+   const fs_reg &sampler = inst->src[7];
+   const fs_reg &offset_value = inst->src[8];
+   assert(inst->src[9].file == IMM && inst->src[10].file == IMM);
+   const unsigned coord_components = inst->src[9].ud;
+   const unsigned grad_components = inst->src[10].ud;
 
    if (devinfo->gen >= 7) {
       lower_sampler_logical_send_gen7(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, sample_index,
-                                      mcs, sampler, offset_value,
+                                      mcs, surface, sampler, offset_value,
                                       coord_components, grad_components);
    } else if (devinfo->gen >= 5) {
       lower_sampler_logical_send_gen5(bld, inst, op, coordinate,
                                       shadow_c, lod, lod2, sample_index,
-                                      sampler, offset_value,
+                                      surface, sampler, offset_value,
                                       coord_components, grad_components);
    } else {
       lower_sampler_logical_send_gen4(bld, inst, op, coordinate,
-                                      shadow_c, lod, lod2, sampler,
+                                      shadow_c, lod, lod2,
+                                      surface, sampler,
                                       coord_components, grad_components);
    }
 }
@@ -4431,6 +4489,10 @@ get_lowered_simd_width(const struct brw_device_info *devinfo,
    case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
       return 8;
 
+   case SHADER_OPCODE_MOV_INDIRECT:
+      /* Prior to Broadwell, we only have 8 address subregisters */
+      return devinfo->gen < 8 ? 8 : inst->exec_size;
+
    default:
       return inst->exec_size;
    }
@@ -4713,9 +4775,7 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
          break;
       case UNIFORM:
          fprintf(file, "u%d", inst->src[i].nr + inst->src[i].reg_offset);
-         if (inst->src[i].reladdr) {
-            fprintf(file, "+reladdr");
-         } else if (inst->src[i].subreg_offset) {
+         if (inst->src[i].subreg_offset) {
             fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                     inst->src[i].subreg_offset);
          }
@@ -4826,7 +4886,6 @@ fs_visitor::get_instruction_generating_reg(fs_inst *start,
 {
    if (end == start ||
        end->is_partial_write() ||
-       reg.reladdr ||
        !reg.equals(end->dst)) {
       return NULL;
    } else {
@@ -5039,7 +5098,7 @@ fs_visitor::optimize()
    bld = fs_builder(this, 64);
 
    assign_constant_locations();
-   demote_pull_constants();
+   lower_constant_loads();
 
    validate();
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 9a54c2d..c931910 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -139,7 +139,7 @@ public:
    void split_virtual_grfs();
    bool compact_virtual_grfs();
    void assign_constant_locations();
-   void demote_pull_constants();
+   void lower_constant_loads();
    void invalidate_live_intervals();
    void calculate_live_intervals();
    void calculate_register_pressure();
@@ -207,6 +207,8 @@ public:
                      fs_reg mcs,
                      int gather_component,
                      bool is_cube_array,
+                     uint32_t surface,
+                     fs_reg surface_reg,
                      uint32_t sampler,
                      fs_reg sampler_reg);
    fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
@@ -222,7 +224,7 @@ public:
    void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
                      uint32_t spill_offset, int count);
    void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg,
-                   uint32_t spill_offset, int count);
+                   uint32_t spill_offset, int count, bool we_all);
 
    void emit_nir_code();
    void nir_setup_inputs();
@@ -321,8 +323,6 @@ public:
 
    const struct brw_vue_map *input_vue_map;
 
-   int *param_size;
-
    int *virtual_grf_start;
    int *virtual_grf_end;
    brw::fs_live_variables *live_intervals;
@@ -448,6 +448,7 @@ private:
    void generate_linterp(fs_inst *inst, struct brw_reg dst,
 			 struct brw_reg *src);
    void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+                     struct brw_reg surface_index,
                      struct brw_reg sampler_index);
    void generate_get_buffer_size(fs_inst *inst, struct brw_reg dst,
                                  struct brw_reg src,
diff --git a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
index 21f0b70..cbad47e 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_channel_expressions.cpp
@@ -72,6 +72,13 @@ channel_expressions_predicate(ir_instruction *ir)
       return false;
 
    switch (expr->operation) {
+      case ir_unop_pack_half_2x16:
+      case ir_unop_pack_snorm_2x16:
+      case ir_unop_pack_snorm_4x8:
+      case ir_unop_pack_unorm_2x16:
+      case ir_unop_pack_unorm_4x8:
+         return false;
+
       /* these opcodes need to act on the whole vector,
        * just like texturing.
        */
@@ -162,6 +169,11 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
       return visit_continue;
 
    switch (expr->operation) {
+      case ir_unop_pack_half_2x16:
+      case ir_unop_pack_snorm_2x16:
+      case ir_unop_pack_snorm_4x8:
+      case ir_unop_pack_unorm_2x16:
+      case ir_unop_pack_unorm_4x8:
       case ir_unop_interpolate_at_centroid:
       case ir_binop_interpolate_at_offset:
       case ir_binop_interpolate_at_sample:
@@ -399,9 +411,6 @@ ir_channel_expressions_visitor::visit_leave(ir_assignment *ir)
    case ir_unop_ssbo_unsized_array_length:
       unreachable("should have been lowered");
 
-   case ir_unop_unpack_half_2x16_split_x:
-   case ir_unop_unpack_half_2x16_split_y:
-   case ir_binop_pack_half_2x16_split:
    case ir_unop_interpolate_at_centroid:
    case ir_binop_interpolate_at_offset:
    case ir_binop_interpolate_at_sample:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index 3b65a38..cde6566 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -78,6 +78,8 @@ is_expression(const fs_visitor *v, const fs_inst *const inst)
    case FS_OPCODE_LINTERP:
    case SHADER_OPCODE_FIND_LIVE_CHANNEL:
    case SHADER_OPCODE_BROADCAST:
+   case SHADER_OPCODE_EXTRACT_BYTE:
+   case SHADER_OPCODE_EXTRACT_WORD:
    case SHADER_OPCODE_MOV_INDIRECT:
       return true;
    case SHADER_OPCODE_RCP:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index e05622a..cac92b3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -351,23 +351,47 @@ fs_generator::generate_mov_indirect(fs_inst *inst,
 
    unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
 
-   /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
-   struct brw_reg addr = vec8(brw_address_reg(0));
+   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
+      imm_byte_offset += indirect_byte_offset.ud;
 
-   /* The destination stride of an instruction (in bytes) must be greater
-    * than or equal to the size of the rest of the instruction.  Since the
-    * address register is of type UW, we can't use a D-type instruction.
-    * In order to get around this, re re-type to UW and use a stride.
-    */
-   indirect_byte_offset =
-      retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
+      reg.nr = imm_byte_offset / REG_SIZE;
+      reg.subnr = imm_byte_offset % REG_SIZE;
+      brw_MOV(p, dst, reg);
+   } else {
+      /* Prior to Broadwell, there are only 8 address registers. */
+      assert(inst->exec_size == 8 || devinfo->gen >= 8);
 
-   /* Prior to Broadwell, there are only 8 address registers. */
-   assert(inst->exec_size == 8 || devinfo->gen >= 8);
+      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+      struct brw_reg addr = vec8(brw_address_reg(0));
 
-   brw_MOV(p, addr, indirect_byte_offset);
-   brw_inst_set_mask_control(devinfo, brw_last_inst, BRW_MASK_DISABLE);
-   brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type));
+      /* The destination stride of an instruction (in bytes) must be greater
+       * than or equal to the size of the rest of the instruction.  Since the
+       * address register is of type UW, we can't use a D-type instruction.
+       * In order to get around this, we re-type to UW and use a stride.
+       */
+      indirect_byte_offset =
+         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
+
+      if (devinfo->gen < 8) {
+         /* Prior to Broadwell, we have a restriction that the bottom 5 bits
+          * of the base offset and the bottom 5 bits of the indirect must add
+          * to less than 32.  In other words, the hardware needs to be able to
+          * add the bottom five bits of the two to get the subnumber and add
+          * the next 7 bits of each to get the actual register number.  Since
+          * the indirect may cause us to cross a register boundary, this makes
+          * it almost useless.  We could try and do something clever where we
+          * use an actual base offset if base_offset % 32 == 0 but that would
+          * mean we were generating different code depending on the base
+          * offset.  Instead, for the sake of consistency, we'll just do the
+          * add ourselves.
+          */
+         brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
+         brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), dst.type));
+      } else {
+         brw_MOV(p, addr, indirect_byte_offset);
+         brw_MOV(p, dst, retype(brw_VxH_indirect(0, imm_byte_offset), dst.type));
+      }
+   }
 }
 
 void
@@ -678,6 +702,7 @@ fs_generator::generate_get_buffer_size(fs_inst *inst,
 
 void
 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+                           struct brw_reg surface_index,
                            struct brw_reg sampler_index)
 {
    int msg_type = -1;
@@ -933,14 +958,16 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
          ? prog_data->binding_table.gather_texture_start
          : prog_data->binding_table.texture_start;
 
-   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
+   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
+       sampler_index.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t surface = surface_index.ud;
       uint32_t sampler = sampler_index.ud;
 
       brw_SAMPLE(p,
                  retype(dst, BRW_REGISTER_TYPE_UW),
                  inst->base_mrf,
                  src,
-                 sampler + base_binding_table_index,
+                 surface + base_binding_table_index,
                  sampler % 16,
                  msg_type,
                  rlen,
@@ -949,19 +976,24 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
                  simd_mode,
                  return_format);
 
-      brw_mark_surface_used(prog_data, sampler + base_binding_table_index);
+      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
    } else {
       /* Non-const sampler index */
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
-      brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      if (memcmp(&surface_reg, &sampler_reg, sizeof(surface_reg)) == 0) {
+         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      } else {
+         brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+         brw_OR(p, addr, addr, surface_reg);
+      }
       if (base_binding_table_index)
          brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
       brw_AND(p, addr, addr, brw_imm_ud(0xfff));
@@ -2070,7 +2102,7 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
       case SHADER_OPCODE_TG4:
       case SHADER_OPCODE_TG4_OFFSET:
       case SHADER_OPCODE_SAMPLEINFO:
-	 generate_tex(inst, dst, src[0], src[1]);
+	 generate_tex(inst, dst, src[0], src[1], src[2]);
 	 break;
       case FS_OPCODE_DDX_COARSE:
       case FS_OPCODE_DDX_FINE:
@@ -2201,6 +2233,28 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          brw_broadcast(p, dst, src[0], src[1]);
          break;
 
+      case SHADER_OPCODE_EXTRACT_BYTE: {
+         assert(src[0].type == BRW_REGISTER_TYPE_D ||
+                src[0].type == BRW_REGISTER_TYPE_UD);
+
+         enum brw_reg_type type =
+            src[0].type == BRW_REGISTER_TYPE_D ? BRW_REGISTER_TYPE_B
+                                               : BRW_REGISTER_TYPE_UB;
+         brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 4));
+         break;
+      }
+
+      case SHADER_OPCODE_EXTRACT_WORD: {
+         assert(src[0].type == BRW_REGISTER_TYPE_D ||
+                src[0].type == BRW_REGISTER_TYPE_UD);
+
+         enum brw_reg_type type =
+            src[0].type == BRW_REGISTER_TYPE_D ? BRW_REGISTER_TYPE_W
+                                               : BRW_REGISTER_TYPE_UW;
+         brw_MOV(p, dst, spread(suboffset(retype(src[0], type), src[1].ud), 2));
+         break;
+      }
+
       case FS_OPCODE_SET_SAMPLE_ID:
          generate_set_sample_id(inst, dst, src[0], src[1]);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index f9df2a4..48cdaf6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -179,15 +179,6 @@ fs_visitor::nir_setup_uniforms()
       return;
 
    uniforms = nir->num_uniforms / 4;
-
-   nir_foreach_variable(var, &nir->uniforms) {
-      /* UBO's and atomics don't take up space in the uniform file */
-      if (var->interface_type != NULL || var->type->contains_atomic())
-         continue;
-
-      if (type_size_scalar(var->type) > 0)
-         param_size[var->data.driver_location / 4] = type_size_scalar(var->type);
-   }
 }
 
 static bool
@@ -728,15 +719,29 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
-   case nir_op_fsin:
-      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
-      inst->saturate = instr->dest.saturate;
+   case nir_op_fsin: {
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
+      inst = bld.emit(SHADER_OPCODE_SIN, tmp, op[0]);
+      if (instr->dest.saturate) {
+         inst->dst = result;
+         inst->saturate = true;
+      } else {
+         bld.MUL(result, tmp, brw_imm_f(0.99997));
+      }
       break;
+   }
 
-   case nir_op_fcos:
-      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
-      inst->saturate = instr->dest.saturate;
+   case nir_op_fcos: {
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_F);
+      inst = bld.emit(SHADER_OPCODE_COS, tmp, op[0]);
+      if (instr->dest.saturate) {
+         inst->dst = result;
+         inst->saturate = true;
+      } else {
+         bld.MUL(result, tmp, brw_imm_f(0.99997));
+      }
       break;
+   }
 
    case nir_op_fddx:
       if (fs_key->high_quality_derivatives) {
@@ -807,9 +812,41 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       unreachable("Should have been lowered by borrow_to_arith().");
 
    case nir_op_umod:
+   case nir_op_irem:
+      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+       * appears that our hardware just does the right thing for signed
+       * remainder.
+       */
       bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
       break;
 
+   case nir_op_imod: {
+      /* Get a regular C-style remainder.  If a % b != 0, set the predicate. */
+      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
+
+      /* Math instructions don't support conditional mod */
+      inst = bld.MOV(bld.null_reg_d(), result);
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+      /* Now, we need to determine if signs of the sources are different.
+       * When we XOR the sources, the top bit is 0 if they are the same and 1
+       * if they are different.  We can then use a conditional modifier to
+       * turn that into a predicate.  This leads us to an XOR.l instruction.
+       */
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
+      inst = bld.XOR(tmp, op[0], op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+
+      /* If the result of the initial remainder operation is non-zero and the
+       * two sources have different signs, add in a copy of op[1] to get the
+       * final integer modulus value.
+       */
+      inst = bld.ADD(result, result, op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   }
+
    case nir_op_flt:
    case nir_op_ilt:
    case nir_op_ult:
@@ -947,6 +984,34 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_fquantize2f16: {
+      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
+      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
+      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+      /* The destination stride must be at least as big as the source stride. */
+      tmp16.type = BRW_REGISTER_TYPE_W;
+      tmp16.stride = 2;
+
+      /* Check for denormal */
+      fs_reg abs_src0 = op[0];
+      abs_src0.abs = true;
+      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+              BRW_CONDITIONAL_L);
+      /* Get the appropriately signed zero */
+      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
+              retype(op[0], BRW_REGISTER_TYPE_UD),
+              brw_imm_ud(0x80000000));
+      /* Do the actual F32 -> F16 -> F32 conversion */
+      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
+      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
+      /* Select that or zero based on normal status */
+      inst = bld.SEL(result, zero, tmp32);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
    case nir_op_fmin:
    case nir_op_imin:
    case nir_op_umin:
@@ -1079,6 +1144,22 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
 
+   case nir_op_extract_ubyte:
+   case nir_op_extract_ibyte: {
+      nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
+      bld.emit(SHADER_OPCODE_EXTRACT_BYTE,
+               result, op[0], brw_imm_ud(byte->u[0]));
+      break;
+   }
+
+   case nir_op_extract_uword:
+   case nir_op_extract_iword: {
+      nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
+      bld.emit(SHADER_OPCODE_EXTRACT_WORD,
+               result, op[0], brw_imm_ud(word->u[0]));
+      break;
+   }
+
    default:
       unreachable("unhandled instruction");
    }
@@ -1154,6 +1235,8 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
 {
    fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
                 BRW_REGISTER_TYPE_UD);
+   fs_reg indirect;
+   unsigned indirect_max = 0;
 
    for (const nir_deref *tail = &deref->deref; tail->child;
         tail = tail->child) {
@@ -1165,7 +1248,7 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
       image = offset(image, bld, base * element_size);
 
       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
-         fs_reg tmp = vgrf(glsl_type::int_type);
+         fs_reg tmp = vgrf(glsl_type::uint_type);
 
          if (devinfo->gen == 7 && !devinfo->is_haswell) {
             /* IVB hangs when trying to access an invalid surface index with
@@ -1183,15 +1266,31 @@ fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
             bld.MOV(tmp, get_nir_src(deref_array->indirect));
          }
 
+         indirect_max += element_size * (tail->type->length - 1);
+
          bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
-         if (image.reladdr)
-            bld.ADD(*image.reladdr, *image.reladdr, tmp);
-         else
-            image.reladdr = new(mem_ctx) fs_reg(tmp);
+         if (indirect.file == BAD_FILE) {
+            indirect = tmp;
+         } else {
+            bld.ADD(indirect, indirect, tmp);
+         }
       }
    }
 
-   return image;
+   if (indirect.file == BAD_FILE) {
+      return image;
+   } else {
+      /* Emit a pile of MOVs to load the uniform into a temporary.  The
+       * dead-code elimination pass will get rid of what we don't use.
+       */
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
+      for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+                  offset(tmp, bld, j), offset(image, bld, j),
+                  indirect, brw_imm_ud((indirect_max + 1) * 4));
+      }
+      return tmp;
+   }
 }
 
 void
@@ -2280,6 +2379,82 @@ fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
       nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
       break;
 
+   case nir_intrinsic_load_shared: {
+      assert(devinfo->gen >= 7);
+
+      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+      /* Get the offset to read from */
+      fs_reg offset_reg;
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      if (const_offset) {
+         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]);
+      } else {
+         offset_reg = vgrf(glsl_type::uint_type);
+         bld.ADD(offset_reg,
+                 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(instr->const_index[0]));
+      }
+
+      /* Read the vector */
+      fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
+                                             1 /* dims */,
+                                             instr->num_components,
+                                             BRW_PREDICATE_NONE);
+      read_result.type = dest.type;
+      for (int i = 0; i < instr->num_components; i++)
+         bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
+
+      break;
+   }
+
+   case nir_intrinsic_store_shared: {
+      assert(devinfo->gen >= 7);
+
+      /* Block index */
+      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+      /* Value */
+      fs_reg val_reg = get_nir_src(instr->src[0]);
+
+      /* Writemask */
+      unsigned writemask = instr->const_index[1];
+
+      /* Combine groups of consecutive enabled channels in one write
+       * message. We use ffs to find the first enabled channel and then ffs on
+       * the bit-inverse, down-shifted writemask to determine the length of
+       * the block of enabled bits.
+       */
+      while (writemask) {
+         unsigned first_component = ffs(writemask) - 1;
+         unsigned length = ffs(~(writemask >> first_component)) - 1;
+         fs_reg offset_reg;
+
+         nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+         if (const_offset) {
+            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] +
+                                    4 * first_component);
+         } else {
+            offset_reg = vgrf(glsl_type::uint_type);
+            bld.ADD(offset_reg,
+                    retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
+                    brw_imm_ud(instr->const_index[0] + 4 * first_component));
+         }
+
+         emit_untyped_write(bld, surf_index, offset_reg,
+                            offset(val_reg, bld, first_component),
+                            1 /* dims */, length,
+                            BRW_PREDICATE_NONE);
+
+         /* Clear the bits in the writemask that we just wrote, then try
+          * again to see if more channels are left.
+          */
+         writemask &= (15 << (first_component + length));
+      }
+
+      break;
+   }
+
    default:
       nir_emit_intrinsic(bld, instr);
       break;
@@ -2492,12 +2667,28 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
          /* Offsets are in bytes but they should always be multiples of 4 */
          assert(const_offset->u[0] % 4 == 0);
          src.reg_offset = const_offset->u[0] / 4;
+
+         for (unsigned j = 0; j < instr->num_components; j++) {
+            bld.MOV(offset(dest, bld, j), offset(src, bld, j));
+         }
       } else {
-         src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
-      }
+         fs_reg indirect = retype(get_nir_src(instr->src[0]),
+                                  BRW_REGISTER_TYPE_UD);
 
-      for (unsigned j = 0; j < instr->num_components; j++) {
-         bld.MOV(offset(dest, bld, j), offset(src, bld, j));
+         /* We need to pass a size to the MOV_INDIRECT but we don't want it to
+          * go past the end of the uniform.  In order to keep the n'th
+          * component from running past, we subtract off the size of all but
+          * one component of the vector.
+          */
+         assert(instr->const_index[1] >= instr->num_components * 4);
+         unsigned read_size = instr->const_index[1] -
+                              (instr->num_components - 1) * 4;
+
+         for (unsigned j = 0; j < instr->num_components; j++) {
+            bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+                     offset(dest, bld, j), offset(src, bld, j),
+                     indirect, brw_imm_ud(read_size));
+         }
       }
       break;
    }
@@ -2605,82 +2796,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
       break;
    }
 
-   case nir_intrinsic_load_shared: {
-      assert(devinfo->gen >= 7);
-
-      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
-
-      /* Get the offset to read from */
-      fs_reg offset_reg;
-      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
-      if (const_offset) {
-         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0]);
-      } else {
-         offset_reg = vgrf(glsl_type::uint_type);
-         bld.ADD(offset_reg,
-                 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
-                 brw_imm_ud(instr->const_index[0]));
-      }
-
-      /* Read the vector */
-      fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
-                                             1 /* dims */,
-                                             instr->num_components,
-                                             BRW_PREDICATE_NONE);
-      read_result.type = dest.type;
-      for (int i = 0; i < instr->num_components; i++)
-         bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
-
-      break;
-   }
-
-   case nir_intrinsic_store_shared: {
-      assert(devinfo->gen >= 7);
-
-      /* Block index */
-      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
-
-      /* Value */
-      fs_reg val_reg = get_nir_src(instr->src[0]);
-
-      /* Writemask */
-      unsigned writemask = instr->const_index[1];
-
-      /* Combine groups of consecutive enabled channels in one write
-       * message. We use ffs to find the first enabled channel and then ffs on
-       * the bit-inverse, down-shifted writemask to determine the length of
-       * the block of enabled bits.
-       */
-      while (writemask) {
-         unsigned first_component = ffs(writemask) - 1;
-         unsigned length = ffs(~(writemask >> first_component)) - 1;
-         fs_reg offset_reg;
-
-         nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
-         if (const_offset) {
-            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u[0] +
-                                    4 * first_component);
-         } else {
-            offset_reg = vgrf(glsl_type::uint_type);
-            bld.ADD(offset_reg,
-                    retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
-                    brw_imm_ud(instr->const_index[0] + 4 * first_component));
-         }
-
-         emit_untyped_write(bld, surf_index, offset_reg,
-                            offset(val_reg, bld, first_component),
-                            1 /* dims */, length,
-                            BRW_PREDICATE_NONE);
-
-         /* Clear the bits in the writemask that we just wrote, then try
-          * again to see if more channels are left.
-          */
-         writemask &= (15 << (first_component + length));
-      }
-
-      break;
-   }
-
    case nir_intrinsic_load_input: {
       fs_reg src;
       if (stage == MESA_SHADER_VERTEX) {
@@ -2924,7 +3039,9 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
 void
 fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
 {
+   unsigned texture = instr->texture_index;
    unsigned sampler = instr->sampler_index;
+   fs_reg texture_reg(brw_imm_ud(texture));
    fs_reg sampler_reg(brw_imm_ud(sampler));
 
    int gather_component = instr->component;
@@ -2937,6 +3054,10 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
 
    fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, tex_offset;
 
+   /* Our hardware requires a LOD for buffer textures */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      lod = brw_imm_d(0);
+
    for (unsigned i = 0; i < instr->num_srcs; i++) {
       fs_reg src = get_nir_src(instr->src[i].src);
       switch (instr->src[i].src_type) {
@@ -2991,9 +3112,9 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       case nir_tex_src_projector:
          unreachable("should be lowered");
 
-      case nir_tex_src_sampler_offset: {
-         /* Figure out the highest possible sampler index and mark it as used */
-         uint32_t max_used = sampler + instr->sampler_array_size - 1;
+      case nir_tex_src_texture_offset: {
+         /* Figure out the highest possible texture index and mark it as used */
+         uint32_t max_used = texture + instr->texture_array_size - 1;
          if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
             max_used += stage_prog_data->binding_table.gather_texture_start;
          } else {
@@ -3002,6 +3123,14 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
          brw_mark_surface_used(prog_data, max_used);
 
          /* Emit code to evaluate the actual indexing expression */
+         texture_reg = vgrf(glsl_type::uint_type);
+         bld.ADD(texture_reg, src, brw_imm_ud(texture));
+         texture_reg = bld.emit_uniformize(texture_reg);
+         break;
+      }
+
+      case nir_tex_src_sampler_offset: {
+         /* Emit code to evaluate the actual indexing expression */
          sampler_reg = vgrf(glsl_type::uint_type);
          bld.ADD(sampler_reg, src, brw_imm_ud(sampler));
          sampler_reg = bld.emit_uniformize(sampler_reg);
@@ -3016,8 +3145,8 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
    if (instr->op == nir_texop_txf_ms ||
        instr->op == nir_texop_samples_identical) {
       if (devinfo->gen >= 7 &&
-          key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
-         mcs = emit_mcs_fetch(coordinate, instr->coord_components, sampler_reg);
+          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
+         mcs = emit_mcs_fetch(coordinate, instr->coord_components, texture_reg);
       } else {
          mcs = brw_imm_ud(0u);
       }
@@ -3054,7 +3183,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
       fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
       fs_inst *inst = bld.emit(SHADER_OPCODE_SAMPLEINFO, dst,
                                bld.vgrf(BRW_REGISTER_TYPE_D, 1),
-                               sampler_reg);
+                               texture_reg, texture_reg);
       inst->mlen = 1;
       inst->header_size = 1;
       inst->base_mrf = -1;
@@ -3068,7 +3197,7 @@ fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
    emit_texture(op, dest_type, coordinate, instr->coord_components,
                 shadow_comparitor, lod, lod2, lod_components, sample_index,
                 tex_offset, mcs, gather_component,
-                is_cube_array, sampler, sampler_reg);
+                is_cube_array, texture, texture_reg, sampler, sampler_reg);
 
    fs_reg dest = get_nir_dest(instr->dest);
    dest.type = this->result.type;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 2347cd5..8396854 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -751,6 +751,7 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
                                         dst);
       unspill_inst->offset = spill_offset;
       unspill_inst->regs_written = reg_size;
+      unspill_inst->force_writemask_all = true;
 
       if (!gen7_read) {
          unspill_inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
@@ -764,11 +765,11 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
 
 void
 fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
-                       uint32_t spill_offset, int count)
+                       uint32_t spill_offset, int count, bool we_all)
 {
    int reg_size = 1;
    int spill_base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
-   if (dispatch_width == 16 && count % 2 == 0) {
+   if (inst->exec_size == 16 && count % 2 == 0) {
       spill_base_mrf = FIRST_SPILL_MRF(devinfo->gen);
       reg_size = 2;
    }
@@ -784,6 +785,8 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
       spill_inst->offset = spill_offset + i * reg_size * REG_SIZE;
       spill_inst->mlen = 1 + reg_size; /* header, value */
       spill_inst->base_mrf = spill_base_mrf;
+      spill_inst->force_writemask_all = we_all;
+      spill_inst->force_sechalf = inst->force_sechalf;
    }
 }
 
@@ -805,30 +808,13 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)
     */
    foreach_block_and_inst(block, fs_inst, inst, cfg) {
       for (unsigned int i = 0; i < inst->sources; i++) {
-	 if (inst->src[i].file == VGRF) {
+	 if (inst->src[i].file == VGRF)
             spill_costs[inst->src[i].nr] += loop_scale;
-
-            /* Register spilling logic assumes full-width registers; smeared
-             * registers have a width of 1 so if we try to spill them we'll
-             * generate invalid assembly.  This shouldn't be a problem because
-             * smeared registers are only used as short-term temporaries when
-             * loading pull constants, so spilling them is unlikely to reduce
-             * register pressure anyhow.
-             */
-            if (!inst->src[i].is_contiguous()) {
-               no_spill[inst->src[i].nr] = true;
-            }
-	 }
       }
 
-      if (inst->dst.file == VGRF) {
+      if (inst->dst.file == VGRF)
          spill_costs[inst->dst.nr] += inst->regs_written * loop_scale;
 
-         if (!inst->dst.is_contiguous()) {
-            no_spill[inst->dst.nr] = true;
-         }
-      }
-
       switch (inst->opcode) {
 
       case BRW_OPCODE_DO:
@@ -938,12 +924,15 @@ fs_visitor::spill_reg(int spill_reg)
           * inst->regs_written(), then we need to unspill the destination
           * since we write back out all of the regs_written().
 	  */
-	 if (inst->is_partial_write())
+         bool need_unspill = inst->is_partial_write() ||
+                             type_sz(inst->dst.type) != 4;
+         if (need_unspill)
             emit_unspill(block, inst, spill_src, subset_spill_offset,
                          inst->regs_written);
 
          emit_spill(block, inst, spill_src, subset_spill_offset,
-                    inst->regs_written);
+                    inst->regs_written,
+                    need_unspill || inst->force_writemask_all);
       }
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
index 45694ec..f2facee 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_surface_builder.cpp
@@ -717,6 +717,15 @@ namespace {
                   bld.emit_minmax(offset(dst, bld, c), offset(dst, bld, c),
                                   brw_imm_d(-(int)scale(widths[c] - s) - 1),
                                   BRW_CONDITIONAL_GE);
+
+               /* Mask off all but the bits we actually want.  Otherwise, if
+                * we pass a negative number into the hardware when it's
+                * expecting something like UINT8, it will happily clamp it to
+                * +255 for us.
+                */
+               if (is_signed && widths[c] < 32)
+                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
+                          brw_imm_d((1 << widths[c]) - 1));
             }
          }
 
@@ -787,6 +796,15 @@ namespace {
                /* Convert to integer. */
                bld.RNDE(offset(fdst, bld, c), offset(fdst, bld, c));
                bld.MOV(offset(dst, bld, c), offset(fdst, bld, c));
+
+               /* Mask off all but the bits we actually want.  Otherwise, if
+                * we pass a negative number into the hardware when it's
+                * expecting something like UINT8, it will happily clamp it to
+                * +255 for us.
+                */
+               if (is_signed && widths[c] < 32)
+                  bld.AND(offset(dst, bld, c), offset(dst, bld, c),
+                          brw_imm_d((1 << widths[c]) - 1));
             }
          }
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 25240ad..4011bf5 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -79,12 +79,12 @@ fs_visitor::emit_vs_system_value(int location)
 /* Sample from the MCS surface attached to this multisample texture. */
 fs_reg
 fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
-                           const fs_reg &sampler)
+                           const fs_reg &texture)
 {
    const fs_reg dest = vgrf(glsl_type::uvec4_type);
    const fs_reg srcs[] = {
       coordinate, fs_reg(), fs_reg(), fs_reg(), fs_reg(), fs_reg(),
-      sampler, fs_reg(), brw_imm_ud(components), brw_imm_d(0)
+      texture, texture, fs_reg(), brw_imm_ud(components), brw_imm_d(0)
    };
    fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, dest, srcs,
                             ARRAY_SIZE(srcs));
@@ -108,6 +108,8 @@ fs_visitor::emit_texture(ir_texture_opcode op,
                          fs_reg mcs,
                          int gather_component,
                          bool is_cube_array,
+                         uint32_t surface,
+                         fs_reg surface_reg,
                          uint32_t sampler,
                          fs_reg sampler_reg)
 {
@@ -147,7 +149,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
    fs_reg dst = vgrf(glsl_type::get_instance(dest_type->base_type, 4, 1));
    const fs_reg srcs[] = {
       coordinate, shadow_c, lod, lod2,
-      sample_index, mcs, sampler_reg, offset_value,
+      sample_index, mcs, surface_reg, sampler_reg, offset_value,
       brw_imm_d(coord_components), brw_imm_d(grad_components)
    };
    enum opcode opcode;
@@ -200,7 +202,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
 
    if (op == ir_tg4) {
       if (gather_component == 1 &&
-          key_tex->gather_channel_quirk_mask & (1 << sampler)) {
+          key_tex->gather_channel_quirk_mask & (1 << surface)) {
          /* gather4 sampler is broken for green channel on RG32F --
           * we must ask for blue instead.
           */
@@ -210,7 +212,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
       }
 
       if (devinfo->gen == 6)
-         emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], dst);
+         emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], dst);
    }
 
    /* fixup #layers for cube map arrays */
@@ -924,6 +926,8 @@ void
 fs_visitor::emit_barrier()
 {
    assert(devinfo->gen >= 7);
+   const uint32_t barrier_id_mask =
+      devinfo->gen >= 9 ? 0x8f000000u : 0x0f000000u;
 
    /* We are getting the barrier ID from the compute shader header */
    assert(stage == MESA_SHADER_COMPUTE);
@@ -937,7 +941,7 @@ fs_visitor::emit_barrier()
 
    /* Copy bits 27:24 of r0.2 (barrier id) to the message payload reg.2 */
    fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
-   pbld.AND(component(payload, 2), r0_2, brw_imm_ud(0x0f000000u));
+   pbld.AND(component(payload, 2), r0_2, brw_imm_ud(barrier_id_mask));
 
    /* Emit a gateway "barrier" message using the payload we set up, followed
     * by a wait instruction.
@@ -1035,9 +1039,6 @@ fs_visitor::init()
 
    this->spilled_any_registers = false;
    this->do_dual_src = false;
-
-   if (dispatch_width == 8)
-      this->param_size = rzalloc_array(mem_ctx, int, stage_prog_data->nr_params);
 }
 
 fs_visitor::~fs_visitor()
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index c3eec2e..e4f20f4 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -58,8 +58,6 @@ public:
     */
    int subreg_offset;
 
-   fs_reg *reladdr;
-
    /** Register region horizontal stride */
    uint8_t stride;
 };
@@ -136,8 +134,7 @@ component(fs_reg reg, unsigned idx)
 static inline bool
 is_uniform(const fs_reg &reg)
 {
-   return (reg.stride == 0 || reg.is_null()) &&
-          (!reg.reladdr || is_uniform(*reg.reladdr));
+   return (reg.stride == 0 || reg.is_null());
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index 234afd5..ab9d792 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -73,36 +73,13 @@ brw_lower_packing_builtins(struct brw_context *brw,
                            gl_shader_stage shader_type,
                            exec_list *ir)
 {
-   const struct brw_compiler *compiler = brw->intelScreen->compiler;
-
-   int ops = LOWER_PACK_SNORM_2x16
-           | LOWER_UNPACK_SNORM_2x16
-           | LOWER_PACK_UNORM_2x16
-           | LOWER_UNPACK_UNORM_2x16;
-
-   if (compiler->scalar_stage[shader_type]) {
-      ops |= LOWER_UNPACK_UNORM_4x8
-           | LOWER_UNPACK_SNORM_4x8
-           | LOWER_PACK_UNORM_4x8
-           | LOWER_PACK_SNORM_4x8;
-   }
-
-   if (brw->gen >= 7) {
-      /* Gen7 introduced the f32to16 and f16to32 instructions, which can be
-       * used to execute packHalf2x16 and unpackHalf2x16. For AOS code, no
-       * lowering is needed. For SOA code, the Half2x16 ops must be
-       * scalarized.
-       */
-      if (compiler->scalar_stage[shader_type]) {
-         ops |= LOWER_PACK_HALF_2x16_TO_SPLIT
-             |  LOWER_UNPACK_HALF_2x16_TO_SPLIT;
-      }
-   } else {
-      ops |= LOWER_PACK_HALF_2x16
-          |  LOWER_UNPACK_HALF_2x16;
-   }
+   /* Gens < 7 don't have instructions to convert to or from half-precision,
+    * and Gens < 6 don't expose that functionality.
+    */
+   if (brw->gen != 6)
+      return;
 
-   lower_packing_builtins(ir, ops);
+   lower_packing_builtins(ir, LOWER_PACK_HALF_2x16 | LOWER_UNPACK_HALF_2x16);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 935529a..d6987c8 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -409,6 +409,14 @@ brw_nir_lower_uniforms(nir_shader *nir, bool is_scalar)
    }
 }
 
+/* Assign locations to nir->shared, then lower nir_var_shared I/O. */
+static void
+brw_nir_lower_shared(nir_shader *nir)
+{
+   nir_assign_var_locations(&nir->shared, &nir->num_shared, type_size_scalar_bytes);
+   nir_lower_io(nir, nir_var_shared, type_size_scalar_bytes);
+}
+
 #define OPT(pass, ...) ({                                  \
    bool this_progress = false;                             \
    NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
@@ -488,7 +496,7 @@ brw_preprocess_nir(nir_shader *nir, bool is_scalar)
    /* Get rid of split copies */
    nir = nir_optimize(nir, is_scalar);
 
-   OPT(nir_remove_dead_variables);
+   OPT(nir_remove_dead_variables, nir_var_local);
 
    return nir;
 }
@@ -504,6 +512,8 @@ brw_nir_lower_io(nir_shader *nir,
 
    OPT_V(brw_nir_lower_inputs, devinfo, is_scalar);
    OPT_V(brw_nir_lower_outputs, devinfo, is_scalar);
+   if (nir->stage == MESA_SHADER_COMPUTE)
+      OPT_V(brw_nir_lower_shared);
    OPT_V(nir_lower_io, nir_var_all, is_scalar ? type_size_scalar : type_size_vec4);
 
    return nir_optimize(nir, is_scalar);
@@ -529,7 +539,7 @@ brw_postprocess_nir(nir_shader *nir,
 
    if (devinfo->gen >= 6) {
       /* Try and fuse multiply-adds */
-      OPT(brw_nir_opt_peephole_ffma);
+//      OPT(brw_nir_opt_peephole_ffma);
    }
 
    OPT(nir_opt_algebraic_late);
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index 20d4e0d..94ceb52 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -279,7 +279,7 @@ brw_get_scratch_bo(struct brw_context *brw,
 
 void brwInitFragProgFuncs( struct dd_function_table *functions )
 {
-   assert(functions->ProgramStringNotify == _tnl_program_string);
+   /* assert(functions->ProgramStringNotify == _tnl_program_string); */
 
    functions->NewProgram = brwNewProgram;
    functions->DeleteProgram = brwDeleteProgram;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index d92bad2..e4ce8cb 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -29,136 +29,6 @@
 #include "brw_vec4_tes.h"
 #include "main/shaderobj.h"
 #include "main/uniforms.h"
-#include "util/debug.h"
-
-static void
-shader_debug_log_mesa(void *data, const char *fmt, ...)
-{
-   struct brw_context *brw = (struct brw_context *)data;
-   va_list args;
-
-   va_start(args, fmt);
-   GLuint msg_id = 0;
-   _mesa_gl_vdebug(&brw->ctx, &msg_id,
-                   MESA_DEBUG_SOURCE_SHADER_COMPILER,
-                   MESA_DEBUG_TYPE_OTHER,
-                   MESA_DEBUG_SEVERITY_NOTIFICATION, fmt, args);
-   va_end(args);
-}
-
-static void
-shader_perf_log_mesa(void *data, const char *fmt, ...)
-{
-   struct brw_context *brw = (struct brw_context *)data;
-
-   va_list args;
-   va_start(args, fmt);
-
-   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
-      va_list args_copy;
-      va_copy(args_copy, args);
-      vfprintf(stderr, fmt, args_copy);
-      va_end(args_copy);
-   }
-
-   if (brw->perf_debug) {
-      GLuint msg_id = 0;
-      _mesa_gl_vdebug(&brw->ctx, &msg_id,
-                      MESA_DEBUG_SOURCE_SHADER_COMPILER,
-                      MESA_DEBUG_TYPE_PERFORMANCE,
-                      MESA_DEBUG_SEVERITY_MEDIUM, fmt, args);
-   }
-   va_end(args);
-}
-
-struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
-{
-   struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
-
-   compiler->devinfo = devinfo;
-   compiler->shader_debug_log = shader_debug_log_mesa;
-   compiler->shader_perf_log = shader_perf_log_mesa;
-
-   brw_fs_alloc_reg_sets(compiler);
-   brw_vec4_alloc_reg_set(compiler);
-
-   compiler->scalar_stage[MESA_SHADER_VERTEX] =
-      devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS);
-   compiler->scalar_stage[MESA_SHADER_TESS_CTRL] = false;
-   compiler->scalar_stage[MESA_SHADER_TESS_EVAL] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_TES", true);
-   compiler->scalar_stage[MESA_SHADER_GEOMETRY] =
-      devinfo->gen >= 8 && env_var_as_boolean("INTEL_SCALAR_GS", false);
-   compiler->scalar_stage[MESA_SHADER_FRAGMENT] = true;
-   compiler->scalar_stage[MESA_SHADER_COMPUTE] = true;
-
-   nir_shader_compiler_options *nir_options =
-      rzalloc(compiler, nir_shader_compiler_options);
-   nir_options->native_integers = true;
-   nir_options->lower_fdiv = true;
-   /* In order to help allow for better CSE at the NIR level we tell NIR
-    * to split all ffma instructions during opt_algebraic and we then
-    * re-combine them as a later step.
-    */
-   nir_options->lower_ffma = true;
-   nir_options->lower_sub = true;
-   nir_options->lower_fdiv = true;
-   nir_options->lower_scmp = true;
-   nir_options->lower_fmod = true;
-   nir_options->lower_bitfield_extract = true;
-   nir_options->lower_bitfield_insert = true;
-   nir_options->lower_uadd_carry = true;
-   nir_options->lower_usub_borrow = true;
-
-   /* In the vec4 backend, our dpN instruction replicates its result to all
-    * the components of a vec4.  We would like NIR to give us replicated fdot
-    * instructions because it can optimize better for us.
-    *
-    * For the FS backend, it should be lowered away by the scalarizing pass so
-    * we should never see fdot anyway.
-    */
-   nir_options->fdot_replicates = true;
-
-   /* We want the GLSL compiler to emit code that uses condition codes */
-   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
-      compiler->glsl_compiler_options[i].MaxUnrollIterations = 32;
-      compiler->glsl_compiler_options[i].MaxIfDepth =
-         devinfo->gen < 6 ? 16 : UINT_MAX;
-
-      compiler->glsl_compiler_options[i].EmitCondCodes = true;
-      compiler->glsl_compiler_options[i].EmitNoNoise = true;
-      compiler->glsl_compiler_options[i].EmitNoMainReturn = true;
-      compiler->glsl_compiler_options[i].EmitNoIndirectInput = true;
-      compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
-      compiler->glsl_compiler_options[i].LowerClipDistance = true;
-
-      bool is_scalar = compiler->scalar_stage[i];
-
-      compiler->glsl_compiler_options[i].EmitNoIndirectOutput = is_scalar;
-      compiler->glsl_compiler_options[i].EmitNoIndirectTemp = is_scalar;
-      compiler->glsl_compiler_options[i].OptimizeForAOS = !is_scalar;
-
-      /* !ARB_gpu_shader5 */
-      if (devinfo->gen < 7)
-         compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
-
-      compiler->glsl_compiler_options[i].NirOptions = nir_options;
-
-      compiler->glsl_compiler_options[i].LowerBufferInterfaceBlocks = true;
-   }
-
-   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
-   compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
-
-   if (compiler->scalar_stage[MESA_SHADER_GEOMETRY])
-      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
-
-   compiler->glsl_compiler_options[MESA_SHADER_COMPUTE]
-      .LowerShaderSharedVariables = true;
-
-   return compiler;
-}
 
 extern "C" struct gl_shader *
 brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
@@ -214,6 +84,7 @@ brw_type_for_base_type(const struct glsl_type *type)
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
    case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_FUNCTION:
       unreachable("not reached");
    }
 
@@ -442,6 +313,10 @@ brw_instruction_name(enum opcode op)
    case SHADER_OPCODE_BROADCAST:
       return "broadcast";
 
+   case SHADER_OPCODE_EXTRACT_BYTE:
+      return "extract_byte";
+   case SHADER_OPCODE_EXTRACT_WORD:
+      return "extract_word";
    case VEC4_OPCODE_MOV_BYTES:
       return "mov_bytes";
    case VEC4_OPCODE_PACK_BYTES:
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 5933613..82374a4 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -259,9 +259,6 @@ struct brw_gs_compile
    unsigned control_data_header_size_bits;
 };
 
-struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo);
-
 void
 brw_assign_common_binding_table_offsets(gl_shader_stage stage,
                                         const struct brw_device_info *devinfo,
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c
index b5c1a35..f42a953 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -25,21 +25,8 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
-
-struct surface_format_info {
-   bool exists;
-   int sampling;
-   int filtering;
-   int shadow_compare;
-   int chroma_key;
-   int render_target;
-   int alpha_blend;
-   int input_vb;
-   int streamed_output_vb;
-   int color_processing;
-   int lossless_compression;
-   const char *name;
-};
+#include "brw_wm.h"
+#include "brw_surface_formats.h"
 
 /* This macro allows us to write the table almost as it appears in the PRM,
  * while restructuring it to turn it into the C code we want.
@@ -86,7 +73,7 @@ struct surface_format_info {
  * - VOL4_Part1 section 3.9.11 Render Target Write.
  * - Render Target Surface Types [SKL+]
  */
-const struct surface_format_info surface_formats[] = {
+const struct brw_surface_format_info surface_formats[] = {
 /* smpl filt shad CK  RT  AB  VB  SO  color ccs_e */
    SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x,   90,   R32G32B32A32_FLOAT)
    SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x,   90,   R32G32B32A32_SINT)
@@ -218,7 +205,7 @@ const struct surface_format_info surface_formats[] = {
    SF(50, 50,  x,  x,  x,  x,  x,  x,  x,    x,   P8A8_UNORM_PALETTE0)
    SF(50, 50,  x,  x,  x,  x,  x,  x,  x,    x,   P8A8_UNORM_PALETTE1)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   A1B5G5R5_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   A4B4G4R4_UNORM)
+   SF( Y,  Y,  x,  Y, 90,  x,  x,  x,  x,    x,   A4B4G4R4_UNORM)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   L8A8_UINT)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   L8A8_SINT)
    SF( Y,  Y,  x, 45,  Y,  Y,  Y,  x,  x,    x,   R8_UNORM)
@@ -281,13 +268,13 @@ const struct surface_format_info surface_formats[] = {
    SF(70, 70,  x,  x,  x,  x,  x,  x,  x,    x,   BC6H_UF16)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   PLANAR_420_8)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   R8G8B8_UNORM_SRGB)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   ETC1_RGB8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_RGB8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   EAC_R11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   EAC_RG11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   EAC_SIGNED_R11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   EAC_SIGNED_RG11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_SRGB8)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   ETC1_RGB8)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_RGB8)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   EAC_R11)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   EAC_RG11)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   EAC_SIGNED_R11)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   EAC_SIGNED_RG11)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_SRGB8)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   R16G16B16_UINT)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   R16G16B16_SINT)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   R32_SFIXED)
@@ -302,10 +289,10 @@ const struct surface_format_info surface_formats[] = {
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   B10G10R10A2_SINT)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   R64G64B64A64_PASSTHRU)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   R64G64B64_PASSTHRU)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_RGB8_PTA)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_SRGB8_PTA)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_EAC_RGBA8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_EAC_SRGB8_A8)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_RGB8_PTA)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_SRGB8_PTA)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_EAC_RGBA8)
+   SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   ETC2_EAC_SRGB8_A8)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   R8G8B8_UINT)
    SF( x,  x,  x,  x,  x,  x,  x,  x,  x,    x,   R8G8B8_SINT)
    SF(80, 80,  x,  x,  x,  x,  x,  x,  x,    x,   ASTC_LDR_2D_4x4_FLT16)
@@ -618,7 +605,7 @@ brw_init_surface_formats(struct brw_context *brw)
 
    for (format = MESA_FORMAT_NONE + 1; format < MESA_FORMAT_COUNT; format++) {
       uint32_t texture, render;
-      const struct surface_format_info *rinfo, *tinfo;
+      const struct brw_surface_format_info *rinfo, *tinfo;
       bool is_integer = _mesa_is_format_integer_color(format);
 
       render = texture = brw_format_for_mesa_format(format);
@@ -827,7 +814,7 @@ bool
 brw_losslessly_compressible_format(struct brw_context *brw,
                                    uint32_t brw_format)
 {
-   const struct surface_format_info * const sinfo =
+   const struct brw_surface_format_info * const sinfo =
       &surface_formats[brw_format];
    const int gen = brw->gen * 10;
 
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.h b/src/mesa/drivers/dri/i965/brw_surface_formats.h
new file mode 100644
index 0000000..a5cd49f
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+struct brw_surface_format_info {   /* element type of surface_formats[] */
+   bool exists;
+   int sampling;               /* "smpl" -- columns assumed by order */
+   int filtering;              /* "filt" */
+   int shadow_compare;         /* "shad" */
+   int chroma_key;             /* "CK" */
+   int render_target;          /* "RT" */
+   int alpha_blend;            /* "AB" */
+   int input_vb;               /* "VB" */
+   int streamed_output_vb;     /* "SO" */
+   int color_processing;       /* "color" */
+   int lossless_compression;   /* "ccs_e"; TODO confirm against SF() */
+   const char *name;
+};
+
+extern const struct brw_surface_format_info surface_formats[];
diff --git a/src/mesa/drivers/dri/i965/brw_util.c b/src/mesa/drivers/dri/i965/brw_util.c
index bf7f9c6..934b6b8 100644
--- a/src/mesa/drivers/dri/i965/brw_util.c
+++ b/src/mesa/drivers/dri/i965/brw_util.c
@@ -98,3 +98,31 @@ GLuint brw_translate_blend_factor( GLenum factor )
       unreachable("not reached");
    }
 }
+
+/* Lookup table indexed by GL primitive mode, yielding a _3DPRIM_* value. */
+static const GLuint prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = {
+   [GL_POINTS] = _3DPRIM_POINTLIST,
+   [GL_LINES] = _3DPRIM_LINELIST,
+   [GL_LINE_LOOP] = _3DPRIM_LINELOOP,
+   [GL_LINE_STRIP] = _3DPRIM_LINESTRIP,
+   [GL_TRIANGLES] = _3DPRIM_TRILIST,
+   [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
+   [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
+   [GL_QUADS] = _3DPRIM_QUADLIST,
+   [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
+   [GL_POLYGON] = _3DPRIM_POLYGON,
+   [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
+   [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
+   [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
+   [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
+};
+
+/* mode >= BRW_PRIM_OFFSET: strip the offset; otherwise look up the table. */
+uint32_t
+get_hw_prim_for_gl_prim(int mode)
+{
+   if (mode >= BRW_PRIM_OFFSET)
+      return mode - BRW_PRIM_OFFSET;
+   assert(mode < ARRAY_SIZE(prim_to_hw_prim));
+   return prim_to_hw_prim[mode];
+}
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index e7aec1f..394e321 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -477,11 +477,6 @@ vec4_visitor::split_uniform_registers()
 	 inst->src[i].reg_offset = 0;
       }
    }
-
-   /* Update that everything is now vector-sized. */
-   for (int i = 0; i < this->uniforms; i++) {
-      this->uniform_size[i] = 1;
-   }
 }
 
 void
@@ -539,7 +534,6 @@ vec4_visitor::pack_uniform_registers()
     * push constants.
     */
    for (int src = 0; src < uniforms; src++) {
-      assert(src < uniform_array_size);
       int size = chans_used[src];
 
       if (size == 0)
@@ -786,7 +780,7 @@ vec4_visitor::move_push_constants_to_pull_constants()
 	 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
 
 	 emit_pull_constant_load(block, inst, temp, inst->src[i],
-				 pull_constant_loc[uniform]);
+				 pull_constant_loc[uniform], src_reg());
 
 	 inst->src[i].file = temp.file;
          inst->src[i].nr = temp.nr;
@@ -1606,8 +1600,6 @@ vec4_visitor::setup_uniforms(int reg)
     * matter what, or the GPU would hang.
     */
    if (devinfo->gen < 6 && this->uniforms == 0) {
-      assert(this->uniforms < this->uniform_array_size);
-
       stage_prog_data->param =
          reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
       for (unsigned int i = 0; i < 4; i++) {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index ddfd87d..83d9eda 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -115,8 +115,6 @@ public:
     */
    dst_reg output_reg[BRW_VARYING_SLOT_COUNT];
    const char *output_reg_annotation[BRW_VARYING_SLOT_COUNT];
-   int *uniform_size;
-   int uniform_array_size; /*< Size of the uniform_size array */
    int uniforms;
 
    src_reg shader_start_time;
@@ -260,6 +258,7 @@ public:
                      src_reg offset_value,
                      src_reg mcs,
                      bool is_cube_array,
+                     uint32_t surface, src_reg surface_reg,
                      uint32_t sampler, src_reg sampler_reg);
 
    src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
@@ -284,8 +283,6 @@ public:
 
    src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
 			      src_reg *reladdr, int reg_offset);
-   src_reg get_pull_constant_offset(bblock_t *block, vec4_instruction *inst,
-				    src_reg *reladdr, int reg_offset);
    void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
 			  dst_reg dst,
 			  src_reg orig_src,
@@ -295,7 +292,8 @@ public:
    void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
 				dst_reg dst,
 				src_reg orig_src,
-				int base_offset);
+				int base_offset,
+                                src_reg indirect);
    void emit_pull_constant_load_reg(dst_reg dst,
                                     src_reg surf_index,
                                     src_reg offset,
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 730be21..237534d 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -109,6 +109,7 @@ generate_tex(struct brw_codegen *p,
              vec4_instruction *inst,
              struct brw_reg dst,
              struct brw_reg src,
+             struct brw_reg surface_index,
              struct brw_reg sampler_index)
 {
    const struct brw_device_info *devinfo = p->devinfo;
@@ -264,14 +265,16 @@ generate_tex(struct brw_codegen *p,
          ? prog_data->base.binding_table.gather_texture_start
          : prog_data->base.binding_table.texture_start;
 
-   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
+   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
+       sampler_index.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t surface = surface_index.ud;
       uint32_t sampler = sampler_index.ud;
 
       brw_SAMPLE(p,
                  dst,
                  inst->base_mrf,
                  src,
-                 sampler + base_binding_table_index,
+                 surface + base_binding_table_index,
                  sampler % 16,
                  msg_type,
                  1, /* response length */
@@ -285,14 +288,19 @@ generate_tex(struct brw_codegen *p,
       /* Non-constant sampler index. */
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
-      brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      if (memcmp(&surface_reg, &sampler_reg, sizeof(surface_reg)) == 0) {
+         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      } else {
+         brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+         brw_OR(p, addr, addr, surface_reg);
+      }
       if (base_binding_table_index)
          brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
       brw_AND(p, addr, addr, brw_imm_ud(0xfff));
@@ -1383,6 +1391,48 @@ generate_set_simd4x2_header_gen9(struct brw_codegen *p,
 }
 
 static void
+generate_mov_indirect(struct brw_codegen *p,
+                      vec4_instruction *inst,
+                      struct brw_reg dst, struct brw_reg reg,
+                      struct brw_reg indirect, struct brw_reg length)
+{
+   assert(indirect.type == BRW_REGISTER_TYPE_UD);
+
+   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);
+
+   /* This instruction acts in align1 mode */
+   assert(inst->force_writemask_all || reg.writemask == 0xf);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   struct brw_reg addr = vec2(brw_address_reg(0));
+
+   /* We need to move the indirect value into the address register.  In order
+    * to make things make some sense, we want to respect at least the X
+    * component of the swizzle.  In order to do that, we need to convert the
+    * subnr (probably 0) to an align1 subnr and add in the swizzle.  We then
+    * use a region of <8,4,0>:uw to pick off the first 2 bytes of the indirect
+    * and splat it out to all four channels of the given half of a0.
+    */
+   assert(brw_is_single_value_swizzle(indirect.swizzle));
+   indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0)) * 2;
+   indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
+
+   brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));
+
+   /* Use a <4,1> Vx1 region */
+   struct brw_reg src = brw_VxH_indirect(0, 0);
+   src.width = BRW_WIDTH_4;
+   src.hstride = BRW_HORIZONTAL_STRIDE_1;
+
+   brw_MOV(p, dst, retype(src, reg.type));
+
+   brw_pop_insn_state(p);
+}
+
+static void
 generate_code(struct brw_codegen *p,
               const struct brw_compiler *compiler,
               void *log_data,
@@ -1664,7 +1714,7 @@ generate_code(struct brw_codegen *p,
       case SHADER_OPCODE_TG4:
       case SHADER_OPCODE_TG4_OFFSET:
       case SHADER_OPCODE_SAMPLEINFO:
-         generate_tex(p, prog_data, inst, dst, src[0], src[1]);
+         generate_tex(p, prog_data, inst, dst, src[0], src[1], src[2]);
          break;
 
       case VS_OPCODE_URB_WRITE:
@@ -1928,6 +1978,10 @@ generate_code(struct brw_codegen *p,
          brw_WAIT(p);
          break;
 
+      case SHADER_OPCODE_MOV_INDIRECT:
+         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
+         break;
+
       default:
          unreachable("Unsupported opcode");
       }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
index 1b87e30..2b261fc 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp
@@ -132,15 +132,6 @@ void
 vec4_visitor::nir_setup_uniforms()
 {
    uniforms = nir->num_uniforms / 16;
-
-   nir_foreach_variable(var, &nir->uniforms) {
-      /* UBO's and atomics don't take up space in the uniform file */
-      if (var->interface_type != NULL || var->type->contains_atomic())
-         continue;
-
-      if (type_size_vec4(var->type) > 0)
-         uniform_size[var->data.driver_location / 16] = type_size_vec4(var->type);
-   }
 }
 
 void
@@ -710,12 +701,14 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          /* Offsets are in bytes but they should always be multiples of 16 */
          assert(const_offset->u[0] % 16 == 0);
          src.reg_offset = const_offset->u[0] / 16;
+
+         emit(MOV(dest, src));
       } else {
-         src_reg tmp = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_D, 1);
-         src.reladdr = new(mem_ctx) src_reg(tmp);
-      }
+         src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
 
-      emit(MOV(dest, src));
+         emit(SHADER_OPCODE_MOV_INDIRECT, dest, src,
+              indirect, brw_imm_ud(instr->const_index[1]));
+      }
       break;
    }
 
@@ -1093,15 +1086,29 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
-   case nir_op_fsin:
-      inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
-      inst->saturate = instr->dest.saturate;
+   case nir_op_fsin: {
+      src_reg tmp = src_reg(this, glsl_type::vec4_type);
+      inst = emit_math(SHADER_OPCODE_SIN, dst_reg(tmp), op[0]);
+      if (instr->dest.saturate) {
+         inst->dst = dst;
+         inst->saturate = true;
+      } else {
+         emit(MUL(dst, tmp, brw_imm_f(0.99997)));
+      }
       break;
+   }
 
-   case nir_op_fcos:
-      inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
-      inst->saturate = instr->dest.saturate;
+   case nir_op_fcos: {
+      src_reg tmp = src_reg(this, glsl_type::vec4_type);
+      inst = emit_math(SHADER_OPCODE_COS, dst_reg(tmp), op[0]);
+      if (instr->dest.saturate) {
+         inst->dst = dst;
+         inst->saturate = true;
+      } else {
+         emit(MUL(dst, tmp, brw_imm_f(0.99997)));
+      }
       break;
+   }
 
    case nir_op_idiv:
    case nir_op_udiv:
@@ -1109,9 +1116,41 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       break;
 
    case nir_op_umod:
+   case nir_op_irem:
+      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+       * appears that our hardware just does the right thing for signed
+       * remainder.
+       */
       emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
       break;
 
+   case nir_op_imod: {
+      /* Get a regular C-style remainder.  If a % b != 0, set the predicate. */
+      inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
+
+      /* Math instructions don't support conditional mod */
+      inst = emit(MOV(dst_null_d(), src_reg(dst)));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+      /* Now, we need to determine if signs of the sources are different.
+       * When we XOR the sources, the top bit is 0 if they are the same and 1
+       * if they are different.  We can then use a conditional modifier to
+       * turn that into a predicate.  This leads us to an XOR.l instruction.
+       */
+      src_reg tmp = src_reg(this, glsl_type::ivec4_type);
+      inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+
+      /* If the result of the initial remainder operation is non-zero and the
+       * two sources have different signs, add in a copy of op[1] to get the
+       * final integer modulus value.
+       */
+      inst = emit(ADD(dst, src_reg(dst), op[1]));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   }
+
    case nir_op_ldexp:
       unreachable("not reached: should be handled by ldexp_to_arith()");
 
@@ -1181,6 +1220,32 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
       inst->saturate = instr->dest.saturate;
       break;
 
+   case nir_op_fquantize2f16: {
+      /* See also vec4_visitor::emit_pack_half_2x16() */
+      src_reg tmp16 = src_reg(this, glsl_type::uvec4_type);
+      src_reg tmp32 = src_reg(this, glsl_type::vec4_type);
+      src_reg zero = src_reg(this, glsl_type::vec4_type);
+
+      /* Check for denormal */
+      src_reg abs_src0 = op[0];
+      abs_src0.abs = true;
+      emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+               BRW_CONDITIONAL_L));
+      /* Get the appropriately signed zero */
+      emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD),
+               retype(op[0], BRW_REGISTER_TYPE_UD),
+               brw_imm_ud(0x80000000)));
+      /* Do the actual F32 -> F16 -> F32 conversion */
+      emit(F32TO16(dst_reg(tmp16), op[0]));
+      emit(F16TO32(dst_reg(tmp32), tmp16));
+      /* Select that or zero based on normal status */
+      inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->predicate_inverse = true;
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
    case nir_op_fmin:
    case nir_op_imin:
    case nir_op_umin:
@@ -1325,6 +1390,24 @@ vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_pack_unorm_2x16:
       unreachable("not reached: should be handled by lower_packing_builtins");
 
+   case nir_op_pack_uvec4_to_uint:
+      unreachable("not reached");
+
+   case nir_op_pack_uvec2_to_uint: {
+      dst_reg tmp1 = dst_reg(this, glsl_type::uint_type);
+      tmp1.writemask = WRITEMASK_X;
+      op[0].swizzle = BRW_SWIZZLE_YYYY;
+      emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u))));
+
+      dst_reg tmp2 = dst_reg(this, glsl_type::uint_type);
+      tmp2.writemask = WRITEMASK_X;
+      op[0].swizzle = BRW_SWIZZLE_XXXX;
+      emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu))));
+
+      emit(OR(dst, src_reg(tmp1), src_reg(tmp2)));
+      break;
+   }
+
    case nir_op_unpack_half_2x16:
       /* As NIR does not guarantee that we have a correct swizzle outside the
        * boundaries of a vector, and the implementation of emit_unpack_half_2x16
@@ -1568,7 +1651,6 @@ vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
       break;
 
    case nir_jump_return:
-      /* fall through */
    default:
       unreachable("unknown jump");
    }
@@ -1621,7 +1703,9 @@ glsl_type_for_nir_alu_type(nir_alu_type alu_type,
 void
 vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
 {
+   unsigned texture = instr->texture_index;
    unsigned sampler = instr->sampler_index;
+   src_reg texture_reg = brw_imm_ud(texture);
    src_reg sampler_reg = brw_imm_ud(sampler);
    src_reg coordinate;
    const glsl_type *coord_type = NULL;
@@ -1636,6 +1720,10 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
                                  nir_tex_instr_dest_size(instr));
    dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
 
+   /* Our hardware requires a LOD for buffer textures */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      lod = brw_imm_d(0);
+
    /* Load the texture operation sources */
    for (unsigned i = 0; i < instr->num_srcs; i++) {
       switch (instr->src[i].src_type) {
@@ -1697,13 +1785,12 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
          offset_value = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
          break;
 
-      case nir_tex_src_sampler_offset: {
-         /* The highest sampler which may be used by this operation is
+      case nir_tex_src_texture_offset: {
+         /* The highest texture which may be used by this operation is
           * the last element of the array. Mark it here, because the generator
           * doesn't have enough information to determine the bound.
           */
-         uint32_t array_size = instr->sampler_array_size;
-         uint32_t max_used = sampler + array_size - 1;
+         uint32_t max_used = texture + instr->texture_array_size - 1;
          if (instr->op == nir_texop_tg4) {
             max_used += prog_data->base.binding_table.gather_texture_start;
          } else {
@@ -1715,6 +1802,15 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
          /* Emit code to evaluate the actual indexing expression */
          src_reg src = get_nir_src(instr->src[i].src, 1);
          src_reg temp(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(temp), src, brw_imm_ud(texture)));
+         texture_reg = emit_uniformize(temp);
+         break;
+      }
+
+      case nir_tex_src_sampler_offset: {
+         /* Emit code to evaluate the actual indexing expression */
+         src_reg src = get_nir_src(instr->src[i].src, 1);
+         src_reg temp(this, glsl_type::uint_type);
          emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler)));
          sampler_reg = emit_uniformize(temp);
          break;
@@ -1753,7 +1849,7 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
    /* Stuff the channel select bits in the top of the texture offset */
    if (instr->op == nir_texop_tg4) {
       if (instr->component == 1 &&
-          (key_tex->gather_channel_quirk_mask & (1 << sampler))) {
+          (key_tex->gather_channel_quirk_mask & (1 << texture))) {
          /* gather4 sampler is broken for green channel on RG32F --
           * we must ask for blue instead.
           */
@@ -1774,7 +1870,8 @@ vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
                 shadow_comparitor,
                 lod, lod2, sample_index,
                 constant_offset, offset_value,
-                mcs, is_cube_array, sampler, sampler_reg);
+                mcs, is_cube_array,
+                texture, texture_reg, sampler, sampler_reg);
 }
 
 void
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 9b75f45..b7d02e9 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -59,8 +59,6 @@ vec4_tcs_visitor::emit_nir_code()
        * copies VS outputs to TES inputs.
        */
       uniforms = 2;
-      uniform_size[0] = 1;
-      uniform_size[1] = 1;
 
       uint64_t varyings = key->outputs_written;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 443d0eb..0c5bfb8 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -622,6 +622,7 @@ type_size_vec4(const struct glsl_type *type)
    case GLSL_TYPE_DOUBLE:
    case GLSL_TYPE_ERROR:
    case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
       unreachable("not reached");
    }
 
@@ -877,6 +878,8 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
                            src_reg offset_value,
                            src_reg mcs,
                            bool is_cube_array,
+                           uint32_t surface,
+                           src_reg surface_reg,
                            uint32_t sampler,
                            src_reg sampler_reg)
 {
@@ -942,7 +945,8 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
    inst->dst.writemask = WRITEMASK_XYZW;
    inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
 
-   inst->src[1] = sampler_reg;
+   inst->src[1] = surface_reg;
+   inst->src[2] = sampler_reg;
 
    /* MRF for the first parameter */
    int param_base = inst->base_mrf + inst->header_size;
@@ -1068,7 +1072,7 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
    }
 
    if (devinfo->gen == 6 && op == ir_tg4) {
-      emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
+      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
    }
 
    if (op == ir_query_levels) {
@@ -1464,27 +1468,6 @@ vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
    }
 }
 
-src_reg
-vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
-				       src_reg *reladdr, int reg_offset)
-{
-   if (reladdr) {
-      src_reg index = src_reg(this, glsl_type::int_type);
-
-      emit_before(block, inst, ADD(dst_reg(index), *reladdr,
-                                   brw_imm_d(reg_offset * 16)));
-
-      return index;
-   } else if (devinfo->gen >= 8) {
-      /* Store the offset in a GRF so we can send-from-GRF. */
-      src_reg offset = src_reg(this, glsl_type::int_type);
-      emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
-      return offset;
-   } else {
-      return brw_imm_d(reg_offset * 16);
-   }
-}
-
 /**
  * Emits an instruction before @inst to load the value named by @orig_src
  * from scratch space at @base_offset to @temp.
@@ -1662,12 +1645,24 @@ vec4_visitor::move_grf_array_access_to_scratch()
 void
 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
 				      dst_reg temp, src_reg orig_src,
-				      int base_offset)
+				      int base_offset, src_reg indirect)
 {
    int reg_offset = base_offset + orig_src.reg_offset;
    const unsigned index = prog_data->base.binding_table.pull_constants_start;
-   src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
-                                             reg_offset);
+
+   src_reg offset;
+   if (indirect.file != BAD_FILE) {
+      offset = src_reg(this, glsl_type::int_type);
+
+      emit_before(block, inst, ADD(dst_reg(offset), indirect,
+                                   brw_imm_d(reg_offset * 16)));
+   } else if (devinfo->gen >= 8) {
+      /* Store the offset in a GRF so we can send-from-GRF. */
+      offset = src_reg(this, glsl_type::int_type);
+      emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
+   } else {
+      offset = brw_imm_d(reg_offset * 16);
+   }
 
    emit_pull_constant_load_reg(temp,
                                brw_imm_ud(index),
@@ -1694,59 +1689,55 @@ vec4_visitor::move_uniform_array_access_to_pull_constants()
 {
    int pull_constant_loc[this->uniforms];
    memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
-   bool nested_reladdr;
 
-   /* Walk through and find array access of uniforms.  Put a copy of that
-    * uniform in the pull constant buffer.
-    *
-    * Note that we don't move constant-indexed accesses to arrays.  No
-    * testing has been done of the performance impact of this choice.
+   /* First, walk through the instructions and determine which things need to
+    * be pulled.  We mark something as needing to be pulled by setting
+    * pull_constant_loc to 0.
     */
-   do {
-      nested_reladdr = false;
-
-      foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
-         for (int i = 0 ; i < 3; i++) {
-            if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
-               continue;
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      /* We only care about MOV_INDIRECT of a uniform */
+      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+          inst->src[0].file != UNIFORM)
+         continue;
 
-            int uniform = inst->src[i].nr;
+      int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
 
-            if (inst->src[i].reladdr->reladdr)
-               nested_reladdr = true;  /* will need another pass */
+      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
+         pull_constant_loc[uniform_nr + j] = 0;
+   }
 
-            /* If this array isn't already present in the pull constant buffer,
-             * add it.
-             */
-            if (pull_constant_loc[uniform] == -1) {
-               const gl_constant_value **values =
-                  &stage_prog_data->param[uniform * 4];
+   /* Next, we walk the list of uniforms and assign real pull constant
+    * locations and set their corresponding entries in pull_param.
+    */
+   for (int j = 0; j < this->uniforms; j++) {
+      if (pull_constant_loc[j] < 0)
+         continue;
 
-               pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
+      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
 
-               assert(uniform < uniform_array_size);
-               for (int j = 0; j < uniform_size[uniform] * 4; j++) {
-                  stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
-                     = values[j];
-               }
-            }
+      for (int i = 0; i < 4; i++) {
+         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
+            = stage_prog_data->param[j * 4 + i];
+      }
+   }
 
-            /* Set up the annotation tracking for new generated instructions. */
-            base_ir = inst->ir;
-            current_annotation = inst->annotation;
+   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
+    * instructions to actual uniform pulls.
+    */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      /* We only care about MOV_INDIRECT of a uniform */
+      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+          inst->src[0].file != UNIFORM)
+         continue;
 
-            dst_reg temp = dst_reg(this, glsl_type::vec4_type);
+      int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
 
-            emit_pull_constant_load(block, inst, temp, inst->src[i],
-                                    pull_constant_loc[uniform]);
+      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
 
-            inst->src[i].file = temp.file;
-            inst->src[i].nr = temp.nr;
-            inst->src[i].reg_offset = temp.reg_offset;
-            inst->src[i].reladdr = NULL;
-         }
-      }
-   } while (nested_reladdr);
+      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
+                              pull_constant_loc[uniform_nr], inst->src[1]);
+      inst->remove(block);
+   }
 
    /* Now there are no accesses of the UNIFORM file with a reladdr, so
     * no need to track them as larger-than-vec4 objects.  This will be
@@ -1799,17 +1790,6 @@ vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
 
    this->uniforms = 0;
-
-   /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
-    * at least one. See setup_uniforms() in brw_vec4.cpp.
-    */
-   this->uniform_array_size = 1;
-   if (prog_data) {
-      this->uniform_array_size =
-         MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
-   }
-
-   this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
 }
 
 vec4_visitor::~vec4_visitor()
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index 1d69149..86701f3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -270,7 +270,6 @@ void
 vec4_vs_visitor::setup_uniform_clipplane_values()
 {
    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
-      assert(this->uniforms < uniform_array_size);
       this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
       this->userplane[i].type = BRW_REGISTER_TYPE_F;
       for (int j = 0; j < 4; ++j) {