807 files changed, 41129 insertions, 30980 deletions
diff --git a/Android.common.mk b/Android.common.mk
index edf52d6..d662d60 100644
--- a/Android.common.mk
+++ b/Android.common.mk
@@ -68,7 +68,19 @@ LOCAL_CFLAGS += \
 endif
 endif
 
+# When LLVM is enabled, tell the sources which LLVM version to expect.
+# The patch level uses the MESA_-prefixed name so it matches the define
+# that configure.ac emits (-DMESA_LLVM_VERSION_PATCH).
+ifeq ($(MESA_ENABLE_LLVM),true)
+LOCAL_CFLAGS += \
+	-DHAVE_LLVM=0x0305 -DMESA_LLVM_VERSION_PATCH=2 \
+	-D__STDC_CONSTANT_MACROS \
+	-D__STDC_FORMAT_MACROS \
+	-D__STDC_LIMIT_MACROS
+endif
+
 LOCAL_CPPFLAGS += \
+	$(if $(filter true,$(MESA_LOLLIPOP_BUILD)),-D_USING_LIBCXX) \
 	-Wno-error=non-virtual-dtor \
 	-Wno-non-virtual-dtor
 
diff --git a/Android.mk b/Android.mk
index b19419b..69e0d33 100644
--- a/Android.mk
+++ b/Android.mk
@@ -24,7 +24,7 @@
 # BOARD_GPU_DRIVERS should be defined.  The valid values are
 #
 #   classic drivers: i915 i965
-#   gallium drivers: swrast freedreno i915g ilo nouveau r300g r600g radeonsi vmwgfx
+#   gallium drivers: swrast freedreno i915g ilo nouveau r300g r600g radeonsi vc4 vmwgfx
 #
 # The main target is libGLES_mesa.  For each classic driver enabled, a DRI
 # module will also be built.  DRI modules will be loaded by libGLES_mesa.
@@ -48,7 +48,7 @@ MESA_PYTHON2 := python
 DRM_GRALLOC_TOP := hardware/drm_gralloc
 
 classic_drivers := i915 i965
-gallium_drivers := swrast freedreno i915g ilo nouveau r300g r600g radeonsi vmwgfx
+gallium_drivers := swrast freedreno i915g ilo nouveau r300g r600g radeonsi vmwgfx vc4
 
 MESA_GPU_DRIVERS := $(strip $(BOARD_GPU_DRIVERS))
 
@@ -80,6 +80,8 @@ else
 MESA_BUILD_GALLIUM := false
 endif
 
+MESA_ENABLE_LLVM := $(if $(filter radeonsi,$(MESA_GPU_DRIVERS)),true,false)
+
 # add subdirectories
 ifneq ($(strip $(MESA_GPU_DRIVERS)),)
 
@@ -89,13 +91,9 @@ SUBDIRS := \
 	src/glsl \
 	src/mesa \
 	src/util \
-	src/egl/main
-
-ifeq ($(strip $(MESA_BUILD_CLASSIC)),true)
-SUBDIRS += \
+	src/egl/main \
 	src/egl/drivers/dri2 \
 	src/mesa/drivers/dri
-endif
 
 ifeq ($(strip $(MESA_BUILD_GALLIUM)),true)
 SUBDIRS += src/gallium
diff --git a/CleanSpec.mk b/CleanSpec.mk
index 2068163..d08b0de 100644
--- a/CleanSpec.mk
+++ b/CleanSpec.mk
@@ -13,3 +13,4 @@ $(call add-clean-step, rm -rf $(PRODUCT_OUT)/*/SHARED_LIBRARIES/libGLES_mesa_int
 $(call add-clean-step, rm -rf $(HOST_OUT_release)/*/EXECUTABLES/mesa_*_intermediates)
 $(call add-clean-step, rm -rf $(HOST_OUT_release)/*/EXECUTABLES/glsl_compiler_intermediates)
 $(call add-clean-step, rm -rf $(HOST_OUT_release)/*/STATIC_LIBRARIES/libmesa_*_intermediates)
+$(call add-clean-step, rm -rf $(PRODUCT_OUT)/*/SHARED_LIBRARIES/*_dri_intermediates)
diff --git a/VERSION b/VERSION
index 8d30306..1edd8fc 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-10.6.0-devel
+10.7.0-devel
diff --git a/configure.ac b/configure.ac
index 1f23de4..33aacd2 100644
--- a/configure.ac
+++ b/configure.ac
@@ -649,6 +649,7 @@ if test "x$enable_asm" = xyes; then
 fi
 
 AC_CHECK_HEADER([xlocale.h], [DEFINES="$DEFINES -DHAVE_XLOCALE_H"])
+AC_CHECK_HEADER([sys/sysctl.h], [DEFINES="$DEFINES -DHAVE_SYS_SYSCTL_H"])
 AC_CHECK_FUNC([strtof], [DEFINES="$DEFINES -DHAVE_STRTOF"])
 
 dnl Check to see if dlopen is in default libraries (like Solaris, which
@@ -713,15 +714,15 @@ AC_ARG_ENABLE([opengl],
     [enable_opengl="$enableval"],
     [enable_opengl=yes])
 AC_ARG_ENABLE([gles1],
-    [AS_HELP_STRING([--enable-gles1],
-        [enable support for OpenGL ES 1.x API @<:@default=disabled@:>@])],
+    [AS_HELP_STRING([--disable-gles1],
+        [disable support for OpenGL ES 1.x API @<:@default=enabled@:>@])],
     [enable_gles1="$enableval"],
-    [enable_gles1=no])
+    [enable_gles1=yes])
 AC_ARG_ENABLE([gles2],
-    [AS_HELP_STRING([--enable-gles2],
-        [enable support for OpenGL ES 2.x API @<:@default=disabled@:>@])],
+    [AS_HELP_STRING([--disable-gles2],
+        [disable support for OpenGL ES 2.x API @<:@default=enabled@:>@])],
     [enable_gles2="$enableval"],
-    [enable_gles2=no])
+    [enable_gles2=yes])
 
 AC_ARG_ENABLE([dri],
     [AS_HELP_STRING([--enable-dri],
@@ -940,12 +941,6 @@ x*yes*yes*)
     ;;
 esac
 
-# Building Xlib-GLX requires shared glapi to be disabled.
-if test "x$enable_xlib_glx" = xyes; then
-    AC_MSG_NOTICE([Shared GLAPI should not used with Xlib-GLX, disabling])
-    enable_shared_glapi=no
-fi
-
 AM_CONDITIONAL(HAVE_SHARED_GLAPI, test "x$enable_shared_glapi" = xyes)
 
 # Build the pipe-drivers as separate libraries/modules.
@@ -1516,7 +1511,6 @@ if test "x$enable_gbm" = xyes; then
     fi
 
     if test "x$enable_dri" = xyes; then
-        GBM_BACKEND_DIRS="$GBM_BACKEND_DIRS dri"
         if test "x$enable_shared_glapi" = xno; then
             AC_MSG_ERROR([gbm_dri requires --enable-shared-glapi])
         fi
@@ -1553,8 +1547,15 @@ if test "x$enable_egl" = xyes; then
 
     if test "$enable_static" != yes; then
         if test "x$enable_dri" = xyes; then
-	    HAVE_EGL_DRIVER_DRI2=1
-	fi
+            HAVE_EGL_DRIVER_DRI2=1
+            if test "x$enable_shared_glapi" = xno; then
+                AC_MSG_ERROR([egl_dri2 requires --enable-shared-glapi])
+            fi
+        else
+            # Avoid building an "empty" libEGL. Drop/update this
+            # when other backends (haiku?) come along.
+            AC_MSG_ERROR([egl requires --enable-dri])
+        fi
 
     fi
 fi
@@ -1782,6 +1783,11 @@ for plat in $egl_platforms; do
 			AC_MSG_ERROR([EGL platform drm requires libdrm >= $LIBDRM_REQUIRED])
 		;;
 
+	surfaceless)
+		test "x$have_libdrm" != xyes &&
+			AC_MSG_ERROR([EGL platform surfaceless requires libdrm >= $LIBDRM_REQUIRED])
+		;;
+
 	android|gdi|null)
 		;;
 
@@ -1811,6 +1817,7 @@ fi
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_X11, echo "$egl_platforms" | grep -q 'x11')
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_WAYLAND, echo "$egl_platforms" | grep -q 'wayland')
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_DRM, echo "$egl_platforms" | grep -q 'drm')
+AM_CONDITIONAL(HAVE_EGL_PLATFORM_SURFACELESS, echo "$egl_platforms" | grep -q 'surfaceless')
 AM_CONDITIONAL(HAVE_EGL_PLATFORM_NULL, echo "$egl_platforms" | grep -q 'null')
 
 AM_CONDITIONAL(HAVE_EGL_DRIVER_DRI2, test "x$HAVE_EGL_DRIVER_DRI2" != "x")
@@ -1926,10 +1933,7 @@ if test "x$enable_gallium_llvm" = xyes; then
             AC_MSG_ERROR([LLVM $LLVM_REQUIRED_VERSION_MAJOR.$LLVM_REQUIRED_VERSION_MINOR or newer is required])
         fi
 
-        LLVM_COMPONENTS="engine bitwriter"
-        if $LLVM_CONFIG --components | grep -qw 'mcjit'; then
-            LLVM_COMPONENTS="${LLVM_COMPONENTS} mcjit"
-        fi
+        LLVM_COMPONENTS="engine bitwriter mcjit mcdisassembler"
 
         if test "x$enable_opencl" = xyes; then
             llvm_check_version_for "3" "5" "0" "opencl"
@@ -1937,7 +1941,7 @@ if test "x$enable_gallium_llvm" = xyes; then
             LLVM_COMPONENTS="${LLVM_COMPONENTS} all-targets ipo linker instrumentation"
             LLVM_COMPONENTS="${LLVM_COMPONENTS} irreader option objcarcopts profiledata"
         fi
-        DEFINES="${DEFINES} -DHAVE_LLVM=0x0$LLVM_VERSION_INT -DLLVM_VERSION_PATCH=$LLVM_VERSION_PATCH"
+        DEFINES="${DEFINES} -DHAVE_LLVM=0x0$LLVM_VERSION_INT -DMESA_LLVM_VERSION_PATCH=$LLVM_VERSION_PATCH"
         MESA_LLVM=1
 
         dnl Check for Clang internal headers
@@ -2056,16 +2060,19 @@ require_egl_drm() {
 }
 
 radeon_llvm_check() {
     if test "x$enable_gallium_llvm" != "xyes"; then
         AC_MSG_ERROR([--enable-gallium-llvm is required when building $1])
     fi
+    if test ${LLVM_VERSION_INT} -lt 307; then # pick the LLVM target name matching this LLVM version
+        amdgpu_llvm_target_name='r600'
+    else
+        amdgpu_llvm_target_name='amdgpu'
+    fi
     llvm_check_version_for "3" "4" "2" $1 
-    if test true && $LLVM_CONFIG --targets-built | grep -qvw 'R600' ; then
-        AC_MSG_ERROR([LLVM R600 Target not enabled.  You can enable it when building the LLVM
-                      sources with the --enable-experimental-targets=R600
-                      configure flag])
+    if test true && $LLVM_CONFIG --targets-built | grep -iqvw $amdgpu_llvm_target_name ; then
+        AC_MSG_ERROR([LLVM $amdgpu_llvm_target_name not enabled in your LLVM build.])
     fi
-    LLVM_COMPONENTS="${LLVM_COMPONENTS} r600 bitreader ipo"
+    LLVM_COMPONENTS="${LLVM_COMPONENTS} $amdgpu_llvm_target_name bitreader ipo"
     NEED_RADEON_LLVM=yes
     if test "x$have_libelf" != xyes; then
        AC_MSG_ERROR([$1 requires libelf when using llvm])
@@ -2365,7 +2372,6 @@ AC_CONFIG_FILES([Makefile
 		src/gallium/drivers/svga/Makefile
 		src/gallium/drivers/trace/Makefile
 		src/gallium/drivers/vc4/Makefile
-		src/gallium/drivers/vc4/kernel/Makefile
 		src/gallium/state_trackers/clover/Makefile
 		src/gallium/state_trackers/dri/Makefile
 		src/gallium/state_trackers/glx/xlib/Makefile
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 7a7c1bd..220bcc8 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -98,13 +98,13 @@ GL 4.0, GLSL 4.00:
   GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_gpu_shader5                                   DONE (i965, nvc0)
   - 'precise' qualifier                                DONE
-  - Dynamically uniform sampler array indices          DONE (r600)
+  - Dynamically uniform sampler array indices          DONE (r600, softpipe)
   - Dynamically uniform UBO array indices              DONE (r600)
   - Implicit signed -> unsigned conversions            DONE
   - Fused multiply-add                                 DONE ()
-  - Packing/bitfield/conversion functions              DONE (r600, radeonsi)
-  - Enhanced textureGather                             DONE (r600, radeonsi)
-  - Geometry shader instancing                         DONE (r600)
+  - Packing/bitfield/conversion functions              DONE (r600, radeonsi, softpipe)
+  - Enhanced textureGather                             DONE (r600, radeonsi, softpipe)
+  - Geometry shader instancing                         DONE (r600, llvmpipe, softpipe)
   - Geometry shader multiple streams                   DONE ()
   - Enhanced per-sample shading                        DONE (r600, radeonsi)
   - Interpolation functions                            DONE (r600)
@@ -115,10 +115,10 @@ GL 4.0, GLSL 4.00:
   GL_ARB_tessellation_shader                           started (Chris, Ilia)
   GL_ARB_texture_buffer_object_rgb32                   DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_cube_map_array                        DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
-  GL_ARB_texture_gather                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
+  GL_ARB_texture_gather                                DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_query_lod                             DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_transform_feedback2                           DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_transform_feedback3                           DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_transform_feedback2                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
+  GL_ARB_transform_feedback3                           DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
 
 
 GL 4.1, GLSL 4.10:
@@ -137,7 +137,7 @@ GL 4.2, GLSL 4.20:
   GL_ARB_compressed_texture_pixel_storage              DONE (all drivers)
   GL_ARB_shader_atomic_counters                        DONE (i965)
   GL_ARB_texture_storage                               DONE (all drivers)
-  GL_ARB_transform_feedback_instanced                  DONE (i965, nv50, nvc0, r600, radeonsi)
+  GL_ARB_transform_feedback_instanced                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_base_instance                                 DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_shader_image_load_store                       in progress (curro)
   GL_ARB_conservative_depth                            DONE (all drivers that support GLSL 1.30)
@@ -153,23 +153,23 @@ GL 4.3, GLSL 4.30:
   GL_ARB_ES3_compatibility                             DONE (all drivers that support GLSL 3.30)
   GL_ARB_clear_buffer_object                           DONE (all drivers)
   GL_ARB_compute_shader                                in progress (jljusten)
-  GL_ARB_copy_image                                    DONE (i965)
+  GL_ARB_copy_image                                    DONE (i965) (gallium - in progress, VMware)
   GL_KHR_debug                                         DONE (all drivers)
   GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
   GL_ARB_fragment_layer_viewport                       DONE (nv50, nvc0, r600, llvmpipe)
-  GL_ARB_framebuffer_no_attachments                    not started
+  GL_ARB_framebuffer_no_attachments                    DONE (i965)
   GL_ARB_internalformat_query2                         not started
   GL_ARB_invalidate_subdata                            DONE (all drivers)
   GL_ARB_multi_draw_indirect                           DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_program_interface_query                       DONE (all drivers)
   GL_ARB_robust_buffer_access_behavior                 not started
   GL_ARB_shader_image_size                             in progress (Martin Peres)
-  GL_ARB_shader_storage_buffer_object                  not started
+  GL_ARB_shader_storage_buffer_object                  in progress (Iago Toral, Samuel Iglesias)
   GL_ARB_stencil_texturing                             DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_texture_buffer_range                          DONE (nv50, nvc0, i965, r600, radeonsi, llvmpipe)
   GL_ARB_texture_query_levels                          DONE (all drivers that support GLSL 1.30)
   GL_ARB_texture_storage_multisample                   DONE (all drivers that support GL_ARB_texture_multisample)
-  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0)
+  GL_ARB_texture_view                                  DONE (i965, nv50, nvc0, llvmpipe, softpipe)
   GL_ARB_vertex_attrib_binding                         DONE (all drivers)
 
 
@@ -177,7 +177,7 @@ GL 4.4, GLSL 4.40:
 
   GL_MAX_VERTEX_ATTRIB_STRIDE                          DONE (all drivers)
   GL_ARB_buffer_storage                                DONE (i965, nv50, nvc0, r600, radeonsi)
-  GL_ARB_clear_texture                                 DONE (i965)
+  GL_ARB_clear_texture                                 DONE (i965) (gallium - in progress, VMware)
   GL_ARB_enhanced_layouts                              not started
   GL_ARB_multi_bind                                    DONE (all drivers)
   GL_ARB_query_buffer_object                           not started
@@ -190,12 +190,12 @@ GL 4.5, GLSL 4.50:
   GL_ARB_ES3_1_compatibility                           not started
   GL_ARB_clip_control                                  DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_conditional_render_inverted                   DONE (i965, nv50, nvc0, llvmpipe, softpipe)
-  GL_ARB_cull_distance                                 not started
+  GL_ARB_cull_distance                                 in progress (Tobias)
   GL_ARB_derivative_control                            DONE (i965, nv50, nvc0, r600)
-  GL_ARB_direct_state_access                           started
+  GL_ARB_direct_state_access                           DONE (all drivers)
   - Transform Feedback object                          DONE
   - Buffer object                                      DONE
-  - Framebuffer object                                 started (Laura Ekstrand)
+  - Framebuffer object                                 DONE
   - Renderbuffer object                                DONE
   - Texture object                                     DONE
   - Vertex array object                                DONE
@@ -216,12 +216,12 @@ GLES3.1, GLSL ES 3.1
   GL_ARB_compute_shader                                in progress (jljusten)
   GL_ARB_draw_indirect                                 DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_explicit_uniform_location                     DONE (all drivers that support GLSL)
-  GL_ARB_framebuffer_no_attachments                    not started
+  GL_ARB_framebuffer_no_attachments                    DONE (i965)
   GL_ARB_program_interface_query                       DONE (all drivers)
   GL_ARB_shader_atomic_counters                        DONE (i965)
   GL_ARB_shader_image_load_store                       in progress (curro)
   GL_ARB_shader_image_size                             in progress (Martin Peres)
-  GL_ARB_shader_storage_buffer_object                  not started
+  GL_ARB_shader_storage_buffer_object                  in progress (Iago Toral, Samuel Iglesias)
   GL_ARB_shading_language_packing                      DONE (all drivers)
   GL_ARB_separate_shader_objects                       DONE (all drivers)
   GL_ARB_stencil_texturing                             DONE (i965/gen8+, nv50, nvc0, r600, radeonsi, llvmpipe, softpipe)
diff --git a/docs/devinfo.html b/docs/devinfo.html
index 8d20eea..8ebf80f 100644
--- a/docs/devinfo.html
+++ b/docs/devinfo.html
@@ -17,159 +17,241 @@
 <h1>Development Notes</h1>
 
 
-<h2>Adding Extensions</h2>
-
-<p>
-To add a new GL extension to Mesa you have to do at least the following.
-
 <ul>
-<li>
-   If glext.h doesn't define the extension, edit include/GL/gl.h and add
-   code like this:
-   <pre>
-     #ifndef GL_EXT_the_extension_name
-     #define GL_EXT_the_extension_name 1
-     /* declare the new enum tokens */
-     /* prototype the new functions */
-     /* TYPEDEFS for the new functions */
-     #endif
-   </pre>
-</li>
-<li>
-   In the src/mapi/glapi/gen/ directory, add the new extension functions and
-   enums to the gl_API.xml file.
-   Then, a bunch of source files must be regenerated by executing the
-   corresponding Python scripts.
-</li>
-<li>
-   Add a new entry to the <code>gl_extensions</code> struct in mtypes.h
-</li>
-<li>
-   Update the <code>extensions.c</code> file.
-</li>
-<li>
-   From this point, the best way to proceed is to find another extension,
-   similar to the new one, that's already implemented in Mesa and use it
-   as an example.
-</li>
-<li>
-   If the new extension adds new GL state, the functions in get.c, enable.c
-   and attrib.c will most likely require new code.
-</li>
-<li>
-   The dispatch tests check_table.cpp and dispatch_sanity.cpp
-   should be updated with details about the new extensions functions. These
-   tests are run using 'make check'
-</li>
+<li><a href="#style">Coding Style</a>
+<li><a href="#submitting">Submitting Patches</a>
+<li><a href="#release">Making a New Mesa Release</a>
+<li><a href="#extensions">Adding Extensions</a>
 </ul>
 
 
-
-<h2>Coding Style</h2>
+<h2 id="style">Coding Style</h2>
 
 <p>
-Mesa's code style has changed over the years.  Here's the latest.
+Mesa is over 20 years old and the coding style has evolved over time.
+Some old parts use a style that's a bit out of date.
+If the guidelines below don't cover something, try following the format of
+existing, neighboring code.
 </p>
 
 <p>
-Comment your code!  It's extremely important that open-source code be
-well documented.  Also, strive to write clean, easily understandable code.
+Basic formatting guidelines
 </p>
 
-<p>
-3-space indentation
-</p>
+<ul>
+<li>3-space indentation, no tabs.
+<li>Limit lines to 78 or fewer characters.  The idea is to prevent line
+wrapping in 80-column editors and terminals.  There are exceptions, such
+as if you're defining a large, static table of information.
+<li>Opening braces go on the same line as the if/for/while statement.
+For example:
+<pre>
+   if (condition) {
+      foo;
+   } else {
+      bar;
+   }
+</pre>
 
-<p>
-If you use tabs, set them to 8 columns
-</p>
+<li>Put a space before/after operators.  For example, <tt>a = b + c;</tt>
+and not <tt>a=b+c;</tt>
 
-<p>
-Line width: the preferred width to fill comments and code in Mesa is 78
-columns.  Exceptions are sometimes made for clarity (e.g. tabular data is
-sometimes filled to a much larger width so that extraneous carriage returns
-don't obscure the table).
-</p>
+<li>This GNU indent command generally does the right thing for formatting:
+<pre>
+   indent -br -i3 -npcs --no-tabs infile.c -o outfile.c
+</pre>
 
-<p>
-Brace example:
-</p>
+<li>Use comments wherever you think it would be helpful for other developers.
+Several specific cases and style examples follow.  Note that we roughly
+follow <a href="http://www.stack.nl/~dimitri/doxygen/">Doxygen</a> conventions.
+<br>
+<br>
+Single-line comments:
 <pre>
-	if (condition) {
-	   foo;
-	}
-	else {
-	   bar;
-	}
-
-	switch (condition) {
-	case 0:
-	   foo();
-	   break;
-
-	case 1: {
-	   ...
-	   break;
-	}
-
-	default:
-	   ...
-	   break;
-	}
+   /* null-out pointer to prevent dangling reference below */
+   bufferObj = NULL;
+</pre>
+Or,
+<pre>
+   bufferObj = NULL;  /* prevent dangling reference below */
+</pre>
+Multi-line comment:
+<pre>
+   /* If this is a new buffer object id, or one which was generated but
+    * never used before, allocate a buffer object now.
+    */
+</pre>
+We try to quote the OpenGL specification where prudent:
+<pre>
+   /* Page 38 of the PDF of the OpenGL ES 3.0 spec says:
+    *
+    *     "An INVALID_OPERATION error is generated for any of the following
+    *     conditions:
+    *
+    *     * &lt;length&gt; is zero."
+    *
+    * Additionally, page 94 of the PDF of the OpenGL 4.5 core spec
+    * (30.10.2014) also says this, so it's no longer allowed for desktop GL,
+    * either.
+    */
+</pre>
+Function comment example:
+<pre>
+   /**
+    * Create and initialize a new buffer object.  Called via the
+    * ctx->Driver.CreateObject() driver callback function.
+    * \param  name  integer name of the object
+    * \param  type  one of GL_FOO, GL_BAR, etc.
+    * \return  pointer to new object or NULL if error
+    */
+   struct gl_object *
+   _mesa_create_object(GLuint name, GLenum type)
+   {
+      /* function body */
+   }
 </pre>
 
-<p>
-Here's the GNU indent command which will best approximate my preferred style:
-(Note that it won't format switch statements in the preferred way)
-</p>
+<li>Put the function return type and qualifiers on one line and the function
+name and parameters on the next, as seen above.  This makes it easy to use
+<code>grep ^function_name dir/*</code> to find function definitions.  Also,
+the opening brace goes on the next line by itself (see above.)
+
+<li>Function names follow various conventions depending on the type of function:
 <pre>
-	indent -br -i3 -npcs --no-tabs infile.c -o outfile.c
+   glFooBar()       - a public GL entry point (in glapi_dispatch.c)
+   _mesa_FooBar()   - the internal immediate mode function
+   save_FooBar()    - retained mode (display list) function in dlist.c
+   foo_bar()        - a static (private) function
+   _mesa_foo_bar()  - an internal non-static Mesa function
 </pre>
 
+<li>Constants, macros and enumerant names are ALL_UPPERCASE, with _ between
+words.
+<li>Mesa usually uses camel case for local variables (Ex: "localVarname")
+<li>Mesa usually uses camel case for local variables (Ex: "localVarName")
+<li>Global variables are almost never used because Mesa should be thread-safe.
 
-<p>
-Local variable name example:  localVarName (no underscores)
-</p>
+<li>Booleans.  Places that are not directly visible to the GL API
+should prefer the use of <tt>bool</tt>, <tt>true</tt>, and
+<tt>false</tt> over <tt>GLboolean</tt>, <tt>GL_TRUE</tt>, and
+<tt>GL_FALSE</tt>.  In C code, this may mean that
+<tt>#include &lt;stdbool.h&gt;</tt> needs to be added.  The
+<tt>try_emit_</tt>* methods in src/mesa/program/ir_to_mesa.cpp and
+src/mesa/state_tracker/st_glsl_to_tgsi.cpp can serve as examples.
+
+</ul>
 
-<p>
-Constants and macros are ALL_UPPERCASE, with _ between words
-</p>
+
+<h2 id="submitting">Submitting Patches</h2>
 
 <p>
-Global variables are not allowed.
+The basic guidelines for submitting patches are:
 </p>
 
+<ul>
+<li>Patches should be sufficiently tested before submitting.
+<li>Code patches should follow Mesa coding conventions.
+<li>Whenever possible, patches should only affect individual Mesa/Gallium
+components.
+<li>Patches should never introduce build breaks and should be bisectable (see
+<code>git bisect</code>.)
+<li>Patches should be properly formatted (see below).
+<li>Patches should be submitted to mesa-dev for review using
+<code>git send-email</code>.
+<li>Patches should not mix code changes with code formatting changes (except,
+perhaps, in very trivial cases.)
+</ul>
+
+<h3>Patch formatting</h3>
+
 <p>
-Function name examples:
+The basic rules for patch formatting are:
 </p>
+
+<ul>
+<li>Lines should be limited to 75 characters or less so that git logs
+displayed in 80-column terminals avoid line wrapping.  Note that git
+log uses 4 spaces of indentation (4 + 75 &lt; 80).
+<li>The first line should be a short, concise summary of the change prefixed
+with a module name.  Examples:
+<pre>
+    mesa: Add support for querying GL_VERTEX_ATTRIB_ARRAY_LONG
+
+    gallium: add PIPE_CAP_DEVICE_RESET_STATUS_QUERY
+
+    i965: Fix missing type in local variable declaration.
+</pre>
+<li>Subsequent patch comments should describe the change in more detail,
+if needed.  For example:
 <pre>
-	glFooBar()       - a public GL entry point (in glapi_dispatch.c)
-	_mesa_FooBar()   - the internal immediate mode function
-	save_FooBar()    - retained mode (display list) function in dlist.c
-	foo_bar()        - a static (private) function
-	_mesa_foo_bar()  - an internal non-static Mesa function
+    i965: Remove end-of-thread SEND alignment code.
+    
+    This was present in Eric's initial implementation of the compaction code
+    for Sandybridge (commit 077d01b6). There is no documentation saying this
+    is necessary, and removing it causes no regressions in piglit on any
+    platform.
 </pre>
+<li>A "Signed-off-by:" line is not required, but not discouraged either.
+<li>If a patch addresses a bugzilla issue, that should be noted in the
+patch comment.  For example:
+<pre>
+   Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=89689
+</pre>
+<li>If there have been several revisions to a patch during the review
+process, they should be noted such as in this example:
+<pre>
+    st/mesa: add ARB_texture_stencil8 support (v4)
+    
+    if we support stencil texturing, enable texture_stencil8
+    there is no requirement to support native S8 for this,
+    the texture can be converted to x24s8 fine.
+    
+    v2: fold fixes from Marek in:
+       a) put S8 last in the list
+       b) fix renderable to always test for d/s renderable
+        fixup the texture case to use a stencil only format
+        for picking the format for the texture view.
+    v3: hit fallback for getteximage
+    v4: put s8 back in front, it shouldn't get picked now (Ilia)
+</pre>
+<li>If someone tested your patch, document it with a line like this:
+<pre>
+    Tested-by: Joe Hacker &lt;jhacker@foo.com&gt;
+</pre>
+<li>If the patch was reviewed (usually the case) or acked by someone,
+that should be documented with:
+<pre>
+    Reviewed-by: Joe Hacker &lt;jhacker@foo.com&gt;
+    Acked-by: Joe Hacker &lt;jhacker@foo.com&gt;
+</pre>
+</ul>
+
+
+
+<h3>Testing Patches</h3>
 
 <p>
-Places that are not directly visible to the GL API should prefer the use
-of <tt>bool</tt>, <tt>true</tt>, and
-<tt>false</tt> over <tt>GLboolean</tt>, <tt>GL_TRUE</tt>, and
-<tt>GL_FALSE</tt>.  In C code, this may mean that
-<tt>#include &lt;stdbool.h&gt;</tt> needs to be added.  The
-<tt>try_emit_</tt>* methods in src/mesa/program/ir_to_mesa.cpp and
-src/mesa/state_tracker/st_glsl_to_tgsi.cpp can serve as examples.
+It should go without saying that patches must be tested.  In general,
+do whatever testing is prudent.
 </p>
 
-<h2>Submitting patches</h2>
-
 <p>
-You should always run the Mesa Testsuite before submitting patches.
-The Testsuite can be run using the 'make check' command. All tests
+You should always run the Mesa test suite before submitting patches.
+The test suite can be run using the 'make check' command. All tests
 must pass before patches will be accepted, this may mean you have
 to update the tests themselves.
 </p>
 
 <p>
+Whenever possible and applicable, test the patch with
+<a href="http://piglit.freedesktop.org">Piglit</a> to
+check for regressions.
+</p>
+
+
+<h3>Mailing Patches</h3>
+
+<p>
 Patches should be sent to the Mesa mailing list for review.
 When submitting a patch make sure to use git send-email rather than attaching
 patches to emails. Sending patches as attachments prevents people from being
@@ -184,7 +266,38 @@ re-sending the whole series). Using --in-reply-to makes
 it harder for reviewers to accidentally review old patches.
 </p>
 
-<h2>Marking a commit as a candidate for a stable branch</h2>
+<p>
+When submitting follow-up patches you should also login to
+<a href="https://patchwork.freedesktop.org">patchwork</a> and change the
+state of your old patches to Superseded.
+</p>
+
+<h3>Reviewing Patches</h3>
+
+<p>
+When you've reviewed a patch on the mailing list, please be unambiguous
+about your review.  That is, state either
+<pre>
+    Reviewed-by: Joe Hacker &lt;jhacker@foo.com&gt;
+</pre>
+or
+<pre>
+    Acked-by: Joe Hacker &lt;jhacker@foo.com&gt;
+</pre>
+rather than saying just "LGTM" or "Seems OK".
+</p>
+
+<p>
+If small changes are suggested, it's OK to say something like:
+<pre>
+   With the above fixes, Reviewed-by: Joe Hacker &lt;jhacker@foo.com&gt;
+</pre>
+which tells the patch author that the patch can be committed, as long
+as the issues are resolved first.
+</p>
+
+
+<h3>Marking a commit as a candidate for a stable branch</h3>
 
 <p>
 If you want a commit to be applied to a stable branch,
@@ -221,7 +334,7 @@ the upcoming stable release can always be seen on the
 <a href="http://cworth.org/~cworth/mesa-stable-queue/">Mesa Stable Queue</a>
 page.
 
-<h2>Criteria for accepting patches to the stable branch</h2>
+<h3>Criteria for accepting patches to the stable branch</h3>
 
 Mesa has a designated release manager for each stable branch, and the release
 manager is the only developer that should be pushing changes to these
@@ -306,7 +419,8 @@ be rejected:
   regression that is unaacceptable for the stable branch.</li>
 </ul>
 
-<h2>Making a New Mesa Release</h2>
+
+<h2 id="release">Making a New Mesa Release</h2>
 
 <p>
 These are the instructions for making a new Mesa release.
@@ -456,7 +570,7 @@ Edit docs/relnotes/X.Y.Z.html to add the sha256sums printed as part of "make
 tarballs" in the previous step. Commit this change.
 </p>
 
-<h3>Push all commits and the tag creates above</h3>
+<h3>Push all commits and the tag created above</h3>
 
 <p>
 This is the first step that cannot easily be undone. The release is going
@@ -483,7 +597,7 @@ signatures to the freedesktop.org server:
 	mv ~/MesaLib-X.Y.Z* .
 </pre>
 
-<h3>Back on mesa master, andd the new release notes into the tree</h3>
+<h3>Back on mesa master, add the new release notes into the tree</h3>
 
 <p>
 Something like the following steps will do the trick:
@@ -543,6 +657,56 @@ release announcement:
 </pre>
 </p>
 
+
+<h2 id="extensions">Adding Extensions</h2>
+
+<p>
+To add a new GL extension to Mesa you have to do at least the following.
+
+<ul>
+<li>
+   If glext.h doesn't define the extension, edit include/GL/gl.h and add
+   code like this:
+   <pre>
+     #ifndef GL_EXT_the_extension_name
+     #define GL_EXT_the_extension_name 1
+     /* declare the new enum tokens */
+     /* prototype the new functions */
+     /* TYPEDEFS for the new functions */
+     #endif
+   </pre>
+</li>
+<li>
+   In the src/mapi/glapi/gen/ directory, add the new extension functions and
+   enums to the gl_API.xml file.
+   Then, a bunch of source files must be regenerated by executing the
+   corresponding Python scripts.
+</li>
+<li>
+   Add a new entry to the <code>gl_extensions</code> struct in mtypes.h
+</li>
+<li>
+   Update the <code>extensions.c</code> file.
+</li>
+<li>
+   From this point, the best way to proceed is to find another extension,
+   similar to the new one, that's already implemented in Mesa and use it
+   as an example.
+</li>
+<li>
+   If the new extension adds new GL state, the functions in get.c, enable.c
+   and attrib.c will most likely require new code.
+</li>
+<li>
+   The dispatch tests check_table.cpp and dispatch_sanity.cpp
+   should be updated with details about the new extensions functions. These
+   tests are run using 'make check'
+</li>
+</ul>
+
+
+
+
 </div>
 </body>
 </html>
diff --git a/docs/egl.html b/docs/egl.html
index d946bb0..3ab1a60 100644
--- a/docs/egl.html
+++ b/docs/egl.html
@@ -184,14 +184,6 @@ values are: <code>debug</code>, <code>info</code>, <code>warning</code>, and
 <code>fatal</code>.</p>
 
 </dd>
-
-<dt><code>EGL_SOFTWARE</code></dt>
-<dd>
-
-<p>For drivers that support both hardware and software rendering, setting this
-variable to true forces the use of software rendering.</p>
-
-</dd>
 </dl>
 
 <h2>EGL Drivers</h2>
diff --git a/docs/index.html b/docs/index.html
index e01790c..80c6e03 100644
--- a/docs/index.html
+++ b/docs/index.html
@@ -16,6 +16,37 @@
 
 <h1>News</h1>
 
+<h2>June 20, 2015</h2>
+<p>
+<a href="relnotes/10.5.8.html">Mesa 10.5.8</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>June 14, 2015</h2>
+<p>
+<a href="relnotes/10.6.0.html">Mesa 10.6.0</a> is released.  This is a new
+development release.  See the release notes for more information about
+the release.
+</p>
+
+<h2>June 07, 2015</h2>
+<p>
+<a href="relnotes/10.5.7.html">Mesa 10.5.7</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>May 23, 2015</h2>
+<p>
+<a href="relnotes/10.5.6.html">Mesa 10.5.6</a> is released.
+This is a bug-fix release.
+</p>
+
+<h2>May 11, 2015</h2>
+<p>
+<a href="relnotes/10.5.5.html">Mesa 10.5.5</a> is released.
+This is a bug-fix release.
+</p>
+
 <h2>April 24, 2015</h2>
 <p>
 <a href="relnotes/10.5.4.html">Mesa 10.5.4</a> is released.
diff --git a/docs/relnotes.html b/docs/relnotes.html
index 7f2e1d8..5fd8002 100644
--- a/docs/relnotes.html
+++ b/docs/relnotes.html
@@ -21,6 +21,11 @@ The release notes summarize what's new or changed in each Mesa release.
 </p>
 
 <ul>
+<li><a href="relnotes/10.5.8.html">10.5.8 release notes</a>
+<li><a href="relnotes/10.6.0.html">10.6.0 release notes</a>
+<li><a href="relnotes/10.5.7.html">10.5.7 release notes</a>
+<li><a href="relnotes/10.5.6.html">10.5.6 release notes</a>
+<li><a href="relnotes/10.5.5.html">10.5.5 release notes</a>
 <li><a href="relnotes/10.5.4.html">10.5.4 release notes</a>
 <li><a href="relnotes/10.5.3.html">10.5.3 release notes</a>
 <li><a href="relnotes/10.5.2.html">10.5.2 release notes</a>
diff --git a/docs/relnotes/10.5.5.html b/docs/relnotes/10.5.5.html
new file mode 100644
index 0000000..fc8247c
--- /dev/null
+++ b/docs/relnotes/10.5.5.html
@@ -0,0 +1,95 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.5.5 Release Notes / May 11, 2015</h1>
+
+<p>
+Mesa 10.5.5 is a bug fix release which fixes bugs found since the 10.5.4 release.
+</p>
+<p>
+Mesa 10.5.5 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+c10f00fd792b8290dd51ebcc48a9016c4cafab19ec205423c6fcadfd7f3a59f2  mesa-10.5.5.tar.gz
+4ac4e4ea3414f1cadb1467f2f173f9e56170d31e8674f7953a46f0549d319f28  mesa-10.5.5.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88521">Bug 88521</a> - GLBenchmark 2.7 TRex renders with artifacts on Gen8 with !UXA</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89455">Bug 89455</a> - [NVC0/Gallium] Unigine Heaven black and white boxes</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89689">Bug 89689</a> - [Regression] Weston on DRM backend won't start with new version of mesa</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90130">Bug 90130</a> - gl_PrimitiveId seems to reset at 340</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Boyan Ding (1):</p>
+<ul>
+  <li>i965: Add XRGB8888 format to intel_screen_make_configs</li>
+</ul>
+
+<p>Emil Velikov (3):</p>
+<ul>
+  <li>docs: Add sha256 sums for the 10.5.4 release</li>
+  <li>r300: do not link against libdrm_intel</li>
+  <li>Update version to 10.5.5</li>
+</ul>
+
+<p>Ilia Mirkin (4):</p>
+<ul>
+  <li>nvc0/ir: flush denorms to zero in non-compute shaders</li>
+  <li>gk110/ir: fix set with a register dest to not auto-set the abs flag</li>
+  <li>nvc0/ir: fix predicated PFETCH emission</li>
+  <li>nv50/ir: fix asFlow() const helper for OP_JOIN</li>
+</ul>
+
+<p>Kenneth Graunke (2):</p>
+<ul>
+  <li>i965: Make intel_emit_linear_blit handle Gen8+ alignment restrictions.</li>
+  <li>i965: Disallow linear blits that are not cacheline aligned.</li>
+</ul>
+
+<p>Roland Scheidegger (1):</p>
+<ul>
+  <li>draw: fix prim ids when there's no gs</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/10.5.6.html b/docs/relnotes/10.5.6.html
new file mode 100644
index 0000000..0046b8f
--- /dev/null
+++ b/docs/relnotes/10.5.6.html
@@ -0,0 +1,147 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.5.6 Release Notes / May 23, 2015</h1>
+
+<p>
+Mesa 10.5.6 is a bug fix release which fixes bugs found since the 10.5.5 release.
+</p>
+<p>
+Mesa 10.5.6 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+89ff9cb08d0f6e3f34154864c3071253057cd21020759457c8ae27e0f70985d3  mesa-10.5.6.tar.gz
+66017853bde5f7a6647db3eede30512a091a3491daa1708e0ad8027c328ba595  mesa-10.5.6.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86792">Bug 86792</a> - [NVC0] Portal 2 Crashes in Wine</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90147">Bug 90147</a> - swrast: build error undeclared _SC_PHYS_PAGES on osx</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90350">Bug 90350</a> - [G96] Portal's portal are incorrectly rendered</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90363">Bug 90363</a> - [nv50] HW state is not reset correctly when using a new GL context</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Alex Deucher (1):</p>
+<ul>
+  <li>radeonsi: add new bonaire pci id</li>
+</ul>
+
+<p>Axel Davy (2):</p>
+<ul>
+  <li>egl/wayland: properly destroy wayland objects</li>
+  <li>glx/dri3: Add additional check for gpu offloading case</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: Add sha256 sums for the 10.5.5 release</li>
+  <li>egl/main: fix EGL_KHR_get_all_proc_addresses</li>
+  <li>targets/osmesa: drop the -module tag from LDFLAGS</li>
+  <li>Update version to 10.5.6</li>
+</ul>
+
+<p>Francisco Jerez (4):</p>
+<ul>
+  <li>clover: Refactor event::trigger and ::abort to prevent deadlock and reentrancy issues.</li>
+  <li>clover: Wrap event::_status in a method to prevent unlocked access.</li>
+  <li>clover: Implement locking of the wait_count, _chain and _status members of event.</li>
+  <li>i965: Fix PBO cache coherency issue after _mesa_meta_pbo_GetTexSubImage().</li>
+</ul>
+
+<p>Fredrik Höglund (2):</p>
+<ul>
+  <li>main: Require that the texture exists in framebuffer_texture</li>
+  <li>mesa: Generate GL_INVALID_VALUE in framebuffer_texture when layer &lt; 0</li>
+</ul>
+
+<p>Ilia Mirkin (7):</p>
+<ul>
+  <li>nv50/ir: only propagate saturate up if some actual folding took place</li>
+  <li>nv50: keep track of PGRAPH state in nv50_screen</li>
+  <li>nvc0: keep track of PGRAPH state in nvc0_screen</li>
+  <li>nvc0: reset the instanced elements state when doing blit using 3d engine</li>
+  <li>nv50/ir: only enable mul saturate on G200+</li>
+  <li>st/mesa: make sure to create a "clean" bool when doing i2b</li>
+  <li>nvc0: switch mechanism for shader eviction to be a while loop</li>
+</ul>
+
+<p>Jeremy Huddleston Sequoia (2):</p>
+<ul>
+  <li>swrast: Build fix for darwin</li>
+  <li>darwin: Fix install name of libOSMesa</li>
+</ul>
+
+<p>Laura Ekstrand (2):</p>
+<ul>
+  <li>main: Fix an error generated by FramebufferTexture</li>
+  <li>main: Complete error conditions for glInvalidate*Framebuffer.</li>
+</ul>
+
+<p>Marta Lofstedt (1):</p>
+<ul>
+  <li>main: glGetIntegeri_v fails for GL_VERTEX_BINDING_STRIDE</li>
+</ul>
+
+<p>Rob Clark (2):</p>
+<ul>
+  <li>freedreno: enable a306</li>
+  <li>freedreno: fix bug in tile/slot calculation</li>
+</ul>
+
+<p>Roland Scheidegger (1):</p>
+<ul>
+  <li>draw: (trivial) fix out-of-bounds vector initialization</li>
+</ul>
+
+<p>Tim Rowley (1):</p>
+<ul>
+  <li>mesa: fix shininess check for ffvertex_prog v2</li>
+</ul>
+
+<p>Tom Stellard (2):</p>
+<ul>
+  <li>clover: Add a mutex to guard queue::queued_events</li>
+  <li>clover: Fix a bug with multi-threaded events v2</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/10.5.7.html b/docs/relnotes/10.5.7.html
new file mode 100644
index 0000000..68c8385
--- /dev/null
+++ b/docs/relnotes/10.5.7.html
@@ -0,0 +1,103 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.5.7 Release Notes / June 07, 2015</h1>
+
+<p>
+Mesa 10.5.7 is a bug fix release which fixes bugs found since the 10.5.6 release.
+</p>
+<p>
+Mesa 10.5.7 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+8f865ce497435fdf25d4e35f3b5551b2bcd5f9bc6570561183be82af20d18b82  mesa-10.5.7.tar.gz
+04d06890cd69af8089d6ca76f40e46dcf9cacfe4a9788b32be620574d4638818  mesa-10.5.7.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89131">Bug 89131</a> - [Bisected] Graphical corruption in Weston,  shows old framebuffer pieces</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965: Emit 3DSTATE_MULTISAMPLE before WM_HZ_OP (gen8+)</li>
+</ul>
+
+<p>Emil Velikov (4):</p>
+<ul>
+  <li>docs: Add sha256sums for the 10.5.6 release</li>
+  <li>get-pick-list.sh: Require explicit "10.5" for nominating stable patches</li>
+  <li>cherry-ignore: add clover build fix not applicable for 10.5</li>
+  <li>Update version to 10.5.7</li>
+</ul>
+
+<p>Ilia Mirkin (18):</p>
+<ul>
+  <li>nvc0/ir: set ftz when sources are floats, not just destinations</li>
+  <li>nv50/ir: guess that the constant offset is the starting slot of array</li>
+  <li>nvc0/ir: LOAD's can't be used for shader inputs</li>
+  <li>nvc0: a geometry shader can have up to 1024 vertices output</li>
+  <li>nv50/ir: avoid messing up arg1 of PFETCH</li>
+  <li>nv30: don't leak fragprog consts</li>
+  <li>nv30: avoid leaking render state and draw shaders</li>
+  <li>nv30: fix clip plane uploads and enable changes</li>
+  <li>nv30/draw: avoid leaving stale pointers in draw state</li>
+  <li>nv30/draw: draw expects constbuf size in bytes, not vec4 units</li>
+  <li>st/mesa: don't leak glsl_to_tgsi object on link failure</li>
+  <li>glsl: avoid leaking linked gl_shader when there's a late linker error</li>
+  <li>nv30/draw: fix indexed draws with swtnl path and a resource index buffer</li>
+  <li>nv30/draw: only use the DMA1 object (GART) if the bo is not in VRAM</li>
+  <li>nv30/draw: allocate vertex buffers in gart</li>
+  <li>nv30/draw: switch varying hookup logic to know about texcoords</li>
+  <li>nv30: falling back to draw path for edgeflag does no good</li>
+  <li>nv30: avoid doing extra work on clear and hitting unexpected states</li>
+</ul>
+
+<p>Jason Ekstrand (1):</p>
+<ul>
+  <li>i965/fs: Fix implied_mrf_writes for scratch writes</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>st/dri: fix postprocessing crash when there's no depth buffer</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/10.5.8.html b/docs/relnotes/10.5.8.html
new file mode 100644
index 0000000..6239400
--- /dev/null
+++ b/docs/relnotes/10.5.8.html
@@ -0,0 +1,112 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.5.8 Release Notes / June 20, 2015</h1>
+
+<p>
+Mesa 10.5.8 is a bug fix release which fixes bugs found since the 10.5.7 release.
+</p>
+<p>
+Mesa 10.5.8 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+611ddcfa3c1bf13f7e6ccac785c8749c3b74c9a78452bac70f8372cf6b209aa0  mesa-10.5.8.tar.gz
+2866b855c5299a4aed066338c77ff6467c389b2c30ada7647be8758663da2b54  mesa-10.5.8.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90310">Bug 90310</a> - Fails to build gallium_dri.so at linking stage with clang because of multiple redefinitions</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90347">Bug 90347</a> - [NVE0+] Failure to insert texbar under some circumstances (causing bad colors in Terasology)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90520">Bug 90520</a> - Register spilling clobbers registers used elsewhere in the shader</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90905">Bug 90905</a> - mesa: Finish subdir-objects transition</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Ben Widawsky (1):</p>
+<ul>
+  <li>i965: Disable compaction for EOT send messages</li>
+</ul>
+
+<p>Boyan Ding (1):</p>
+<ul>
+  <li>egl/x11: Set version of swrastLoader to 2</li>
+</ul>
+
+<p>Emil Velikov (2):</p>
+<ul>
+  <li>docs: Add sha256sums for the 10.5.7 release</li>
+  <li>Update version to 10.5.8</li>
+</ul>
+
+<p>Erik Faye-Lund (1):</p>
+<ul>
+  <li>mesa: build xmlconfig to a separate static library</li>
+</ul>
+
+<p>Francisco Jerez (1):</p>
+<ul>
+  <li>i965: Don't compact instructions with unmapped bits.</li>
+</ul>
+
+<p>Ilia Mirkin (3):</p>
+<ul>
+  <li>nvc0/ir: fix collection of first uses for texture barrier insertion</li>
+  <li>nv50,nvc0: clamp uniform size to 64k</li>
+  <li>nvc0/ir: can't have a join on a load with an indirect source</li>
+</ul>
+
+<p>Jason Ekstrand (1):</p>
+<ul>
+  <li>i965/fs: Don't let the EOT send message interfere with the MRF hack</li>
+</ul>
+
+<p>Marek Olšák (1):</p>
+<ul>
+  <li>egl: fix setting context flags</li>
+</ul>
+
+<p>Roland Scheidegger (1):</p>
+<ul>
+  <li>draw: (trivial) fix NULL pointer dereference</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/10.6.0.html b/docs/relnotes/10.6.0.html
index b7cd486..ebd1f10 100644
--- a/docs/relnotes/10.6.0.html
+++ b/docs/relnotes/10.6.0.html
@@ -14,7 +14,7 @@
 <iframe src="../contents.html"></iframe>
 <div class="content">
 
-<h1>Mesa 10.6.0 Release Notes / TBD</h1>
+<h1>Mesa 10.6.0 Release Notes / June 14, 2015</h1>
 
 <p>
 Mesa 10.6.0 is a new development release.
@@ -31,9 +31,10 @@ because compatibility contexts are not supported.
 </p>
 
 
-<h2>MD5 checksums</h2>
+<h2>SHA256 checksums</h2>
 <pre>
-TBD.
+9bc659abdba26202509304f259723aaa4343dba6aac4bd87d5baea11d23c8c63  mesa-10.6.0.tar.gz
+f37e2633978deed02ff0522abc36c709586e2b555fd439a82ab71dce2c866c76  mesa-10.6.0.tar.xz
 </pre>
 
 
@@ -48,6 +49,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_clip_control on i965</li>
 <li>GL_ARB_depth_buffer_float on freedreno</li>
 <li>GL_ARB_depth_clamp on freedreno</li>
+<li>GL_ARB_direct_state_access on all drivers that support GL 2.0+</li>
 <li>GL_ARB_draw_indirect, GL_ARB_multi_draw_indirect on r600</li>
 <li>GL_ARB_draw_instanced on freedreno</li>
 <li>GL_ARB_gpu_shader_fp64 on nvc0, softpipe</li>
@@ -56,6 +58,7 @@ Note: some of the new features are only available with certain drivers.
 <li>GL_ARB_pipeline_statistics_query on i965, nv50, nvc0, r600, radeonsi, softpipe</li>
 <li>GL_ARB_program_interface_query (all drivers)</li>
 <li>GL_ARB_texture_stencil8 on nv50, nvc0, r600, radeonsi, softpipe</li>
+<li>GL_ARB_texture_view on llvmpipe, softpipe</li>
 <li>GL_ARB_uniform_buffer_object on freedreno</li>
 <li>GL_ARB_vertex_attrib_64bit on nvc0, softpipe</li>
 <li>GL_ARB_viewport_array, GL_AMD_vertex_shader_viewport_index on i965/gen6</li>
@@ -69,7 +72,246 @@ Note: some of the new features are only available with certain drivers.
 
 <h2>Bug fixes</h2>
 
-TBD.
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=15006">Bug 15006</a> - translate &amp; rotate the line cause Aliasing</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=27007">Bug 27007</a> - Lines disappear with GL_LINE_SMOOTH</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=28832">Bug 28832</a> - piglit/general/line-aa-width fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=45348">Bug 45348</a> - [swrast] piglit fbo-drawbuffers-arbfp regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=60797">Bug 60797</a> - 1px lines in octave plot aliased to 0</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=67564">Bug 67564</a> - HiZ buffers are much larger than necessary</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=69226">Bug 69226</a> - Cannot enable basic shaders with Second Life aborts attempt</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=71591">Bug 71591</a> - Second Life shaders fail to compile (extension declared in middle of shader)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=79202">Bug 79202</a> - valgrind errors in glsl-fs-uniform-array-loop-unroll.shader_test; random code generation</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=81025">Bug 81025</a> - [IVB/BYT Bisected]Piglit spec_ARB_draw_indirect_arb_draw_indirect-draw-elements-prim-restart-ugly fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=82477">Bug 82477</a> - [softpipe] piglit fp-long-alu regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=82668">Bug 82668</a> - Can't set int attributes to certain values on 32-bit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=82831">Bug 82831</a> - i965: Support GL_ARB_blend_func_extended in SIMD16</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=83962">Bug 83962</a> - [HSW/BYT]Piglit spec_ARB_gpu_shader5_arb_gpu_shader5-emitstreamvertex_nodraw fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=84613">Bug 84613</a> - [G965, bisected] piglit regressions : glslparsertest.glsl2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86747">Bug 86747</a> - Noise in Football Manager 2014 textures</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86792">Bug 86792</a> - [NVC0] Portal 2 Crashes in Wine</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86811">Bug 86811</a> - [BDW/BSW Bisected]Piglit spec_arb_shading_language_packing_execution_built-in-functions_vs-unpackSnorm4x8 fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86837">Bug 86837</a> - kodi segfault since auxiliary/vl: rework the build of the VL code</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86944">Bug 86944</a> - glsl_parser_extras.cpp&quot;, line 1455: Error: Badly formed expression. (Oracle Studio)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86974">Bug 86974</a> - INTEL_DEBUG=shader_time always asserts in fs_generator::generate_code() when Mesa is built with --enable-debug (= with asserts)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=86980">Bug 86980</a> - [swrast] piglit fp-rfl regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=87258">Bug 87258</a> - [BDW/BSW Bisected]Piglit spec_ARB_shader_atomic_counters_array-indexing fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88246">Bug 88246</a> - Commit 2881b12 causes 43 DrawElements test regressions</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88248">Bug 88248</a> - Calling glClear while there is an occlusion query in progress messes up the results</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88521">Bug 88521</a> - GLBenchmark 2.7 TRex renders with artifacts on Gen8 with !UXA</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88534">Bug 88534</a> - include/c11/threads_posix.h PTHREAD_MUTEX_RECURSIVE_NP not defined</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88561">Bug 88561</a> - [radeonsi][regression,bisected] Depth test/buffer issues in Portal</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88793">Bug 88793</a> - [BDW/BSW Bisected]Piglit/shaders_glsl-max-varyings fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88815">Bug 88815</a> - Incorrect handling of GLSL #line directive</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88883">Bug 88883</a> - ir-a2xx.c: variable changed in assert statement</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88885">Bug 88885</a> - Transform feedback uses incorrect interleaving if a previous draw did not write gl_Position</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88905">Bug 88905</a> - [SNB+ Bisected]Ogles3conform ES3-CTS.gtf.GL3Tests.packed_pixels.packed_pixels fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=88999">Bug 88999</a> - [SKL] Compiz crashes after opening unity dash</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89014">Bug 89014</a> - PIPE_QUERY_GPU_FINISHED is not acting as expected on SI</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89026">Bug 89026</a> - Renderbuffer layered state used for framebuffer completeness test</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89032">Bug 89032</a> - [BDW/BSW/SKL Bisected]Piglit spec_OpenGL_1.1_infinite-spot-light fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89037">Bug 89037</a> - [SKL]Piglit spec_EXT_texture_array_copyteximage_1D_ARRAY_samples=2 sporadically causes GPU hang</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89039">Bug 89039</a> - [SKL]etqw system hang</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89058">Bug 89058</a> - [SKL]Render error in some games (etqw-demo, nexuiz, portal)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89068">Bug 89068</a> - glTexImage2D regression by texstore_rgba switch to _mesa_format_convert</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89069">Bug 89069</a> - Lack of grass in The Talos Principle on radeonsi (native\wine\nine)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89094">Bug 89094</a> - [SNB/IVB/HSW/BYT Bisected]Ogles3conform ES3-CTS.gtf.GL3Tests.shadow.shadow_execution_vert fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89095">Bug 89095</a> - [SNB/IVB/BYT Bisected]Webglc conformance/glsl/functions/glsl-function-mix-float.html fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89112">Bug 89112</a> - u_atomic_test: u_atomic_test.c:124: test_atomic_8bits_bool: Assertion `r == 65 &amp;&amp; &quot;p_atomic_add&quot;' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89118">Bug 89118</a> - [SKL Bisected]many Ogles3conform cases core dumped</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89131">Bug 89131</a> - [Bisected] Graphical corruption in Weston,  shows old framebuffer pieces</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89156">Bug 89156</a> - r300g: GL_COMPRESSED_RED_RGTC1 / ATI1N support broken</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89180">Bug 89180</a> - [IVB regression] Rendering issues in Mass Effect through VMware Workstation</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89210">Bug 89210</a> - GS statistics fail on SNB</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89218">Bug 89218</a> - lower_instructions.cpp:648:48: error: invalid suffix 'd' on floating constant</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89224">Bug 89224</a> - Incorrect rendering of Unigine Valley running in VM on VMware Workstation</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89260">Bug 89260</a> - macros.h:34:25: fatal error: util/u_math.h: No such file or directory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89292">Bug 89292</a> - [regression,bisected] incomplete screenshots in some cases</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89311">Bug 89311</a> - [regression, bisected] dEQP: Added entry points for glCompressedTextureSubImage*D.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89312">Bug 89312</a> - [regression, bisected] main: Added entry points for CopyTextureSubImage*D. (d6b7c40cecfe01)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89315">Bug 89315</a> - [HSW, regression, bisected] i965/fs: Emit MAD instructions when possible.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89317">Bug 89317</a> - [HSW, regression, bisected] i965: Add LINTERP/CINTERP to can_do_cmod() (d91390634)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89328">Bug 89328</a> - python required to build Mesa release tarballs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89342">Bug 89342</a> - main/light.c:159:62: error: 'M_PI' undeclared (first use in this function)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89343">Bug 89343</a> - compiler/tests/radeon_compiler_optimize_tests.c:43:3: error: implicit declaration of function ‘fprintf’ [-Werror=implicit-function-declaration]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89345">Bug 89345</a> - imports.h:452:58: error: expected declaration specifiers or '...' before 'va_list'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89364">Bug 89364</a> - c99_alloca.h:40:22: fatal error: alloca.h: No such file or directory</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89372">Bug 89372</a> - [softpipe] piglit glsl-1.50 generate-zero-primitives regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89387">Bug 89387</a> - Double delete in lp_bld_misc.cpp</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89416">Bug 89416</a> - UE4Editor crash after load project</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89430">Bug 89430</a> - [g965][bisected] arb_copy_image-targets gl_texture* tests fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89433">Bug 89433</a> - GCC 4.2 does not support -Wvla</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89455">Bug 89455</a> - [NVC0/Gallium] Unigine Heaven black and white boxes</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89457">Bug 89457</a> - [BSW Bisected]ogles3conform ES3-CTS.gtf.GL3Tests.shadow.shadow_execution_vert fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89477">Bug 89477</a> - include/no_extern_c.h:47:1: error: template with C linkage</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89508">Bug 89508</a> - Bad int(floatBitsToInt(vec4))</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89530">Bug 89530</a> - FTBFS in loader: missing fstat</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89569">Bug 89569</a> - Papo &amp; Yo crash on startup [HSW]</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89590">Bug 89590</a> - Crash in glLinkProgram with shaders with multiple constant arrays</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89662">Bug 89662</a> - context.c:943: undefined reference to `_glapi_new_nop_table'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89670">Bug 89670</a> - cmod_propagation_test.andnz_one regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89679">Bug 89679</a> - [NV50] Portal/Half-Life 2 will not start (native Steam)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89689">Bug 89689</a> - [Regression] Weston on DRM backend won't start with new version of mesa</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89722">Bug 89722</a> - [ILK Bisected]Ogles2conform/ES2-CTS.gtf.GL.equal.equal_vec2_frag fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89726">Bug 89726</a> - [Bisected] dEQP-GLES3: uniform linking logic in the presence of structs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89746">Bug 89746</a> - Mesa and LLVM 3.6+ break opengl for genymotion</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89754">Bug 89754</a> - vertexAttrib fails WebGL Conformance test with mesa drivers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89758">Bug 89758</a> - pow WebGL Conformance test with mesa drivers</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89759">Bug 89759</a> - WebGL OGL ES GLSL conformance test with mesa drivers fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89831">Bug 89831</a> - [r600] r600_asm.c:310:assign_alu_units: Assertion `0' failed.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89899">Bug 89899</a> - nir/nir_lower_tex_projector.c:112: error: unknown field ‘ssa’ specified in initializer</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89957">Bug 89957</a> - vm protection faults in piglit test: texsubimage cube_map_array pbo</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89960">Bug 89960</a> - [softpipe] piglit copy-pixels regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89961">Bug 89961</a> - [BDW/BSW Bisected]Synmark2_v6 OglDrvRes/OglDrvShComp/OglDrvState/OglPSPom Image Validation fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89963">Bug 89963</a> - lp_bld_debug.cpp:100:31: error: no matching function for call to ‘llvm::raw_ostream::raw_ostream()’</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90000">Bug 90000</a> - [i965 Bisected NIR] Piglit/glean_fragprog1-z-write_test fail</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90109">Bug 90109</a> - [SNB+ Bisected]Ogles3conform ES3-CTS.shaders.uniform_block.random.basic_arrays.3 fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90114">Bug 90114</a> - [SNB+ Bisected]Ogles3conform ES3-CTS.shaders.struct.uniform.sampler_array_fragment fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90130">Bug 90130</a> - gl_PrimitiveId seems to reset at 340</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90147">Bug 90147</a> - swrast: build error undeclared _SC_PHYS_PAGES on osx</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90149">Bug 90149</a> - [SNB+ Bisected]ES3-CTS.gtf.GL3Tests.uniform_buffer_object.uniform_buffer_object_getactiveuniformsiv_for_nonexistent_uniform_indices fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90153">Bug 90153</a> - [SKL Bisected]ES3-CTS.gtf.GL3Tests.uniform_buffer_object.uniform_buffer_object_all_valid_basic_types fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90167">Bug 90167</a> - [softpipe] piglit depthstencil-default_fb-drawpixels-32f_24_8_rev regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90207">Bug 90207</a> - [r600g, bisected] regression: NI/Turks crash on WebGL Water (most WebGL stuff)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90213">Bug 90213</a> - glDrawPixels with GL_COLOR_INDEX never returns.</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90243">Bug 90243</a> - [bisected] regression: spec.!opengl 3_2.get-active-attrib-returns-all-inputs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90258">Bug 90258</a> - [IVB] spec.glsl-1_10.execution.fs-dfdy-accuracy fails intermittently</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90310">Bug 90310</a> - Fails to build gallium_dri.so at linking stage with clang because of multiple redefinitions</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90350">Bug 90350</a> - [G96] Portal's portals are incorrectly rendered</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90363">Bug 90363</a> - [nv50] HW state is not reset correctly when using a new GL context</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90397">Bug 90397</a> - ARB_program_interface_query: glGetProgramResourceiv() returns wrong value for GL_REFERENCED_BY_*_SHADER prop for GL_UNIFORM for members of an interface block with an instance name</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90466">Bug 90466</a> - arm: linker error undefined reference to `nir_metadata_preserve'</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90520">Bug 90520</a> - Register spilling clobbers registers used elsewhere in the shader</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90547">Bug 90547</a> - [BDW/BSW/SKL Bisected]Piglit/glean&#64;vertprog1-rsq_test_2_(reciprocal_square_root_of_negative_value) fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90580">Bug 90580</a> - [HSW bisected] integer multiplication bug</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90629">Bug 90629</a> - [i965] SIMD16 dual_source_blend assertion `src[i].file != GRF || src[i].width == dst.width' failed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90749">Bug 90749</a> - [BDW Bisected]dEQP-GLES3.functional.rasterization.fbo.rbo_multisample_max.primitives.lines_wide fails</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90830">Bug 90830</a> - [bsw bisected regression] GPU hang for spec.arb_gpu_shader5.execution.sampler_array_indexing.vs-nonzero-base</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90839">Bug 90839</a> - [10.5.5/10.6 regression, bisected] PBO glDrawPixels no longer using blit fastpath</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=90905">Bug 90905</a> - mesa: Finish subdir-objects transition</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=9951">Bug 9951</a> - GL_LINE_SMOOTH and GL_POLYGON_SMOOTH with i965 driver</li>
+
+</ul>
+
 
 <h2>Changes</h2>
 
diff --git a/docs/relnotes/10.7.0.html b/docs/relnotes/10.7.0.html
new file mode 100644
index 0000000..e089889
--- /dev/null
+++ b/docs/relnotes/10.7.0.html
@@ -0,0 +1,61 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 10.7.0 Release Notes / TBD</h1>
+
+<p>
+Mesa 10.7.0 is a new development release.
+People who are concerned with stability and reliability should stick
+with a previous release or wait for Mesa 10.7.1.
+</p>
+<p>
+Mesa 10.7.0 implements the OpenGL 3.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 3.3.  OpenGL
+3.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+TBD.
+</pre>
+
+
+<h2>New features</h2>
+
+<p>
+Note: some of the new features are only available with certain drivers.
+</p>
+
+<ul>
+<li>GL_ARB_framebuffer_no_attachments on i965</li>
+<li>GL_ARB_shader_stencil_export on llvmpipe</li>
+</ul>
+
+<h2>Bug fixes</h2>
+
+TBD.
+
+<h2>Changes</h2>
+
+TBD.
+
+</div>
+</body>
+</html>
diff --git a/include/EGL/egl.h b/include/EGL/egl.h
index 99ea342..0d514e4 100644
--- a/include/EGL/egl.h
+++ b/include/EGL/egl.h
@@ -1,11 +1,12 @@
-/* -*- mode: c; tab-width: 8; -*- */
-/* vi: set sw=4 ts=8: */
-/* Reference version of egl.h for EGL 1.4.
- * $Revision: 9356 $ on $Date: 2009-10-21 02:52:25 -0700 (Wed, 21 Oct 2009) $
- */
+#ifndef __egl_h_
+#define __egl_h_ 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 /*
-** Copyright (c) 2007-2009 The Khronos Group Inc.
+** Copyright (c) 2013-2014 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -26,304 +27,277 @@
 ** TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 ** MATERIALS OR THE USE OR OTHER DEALINGS IN THE MATERIALS.
 */
+/*
+** This header is generated from the Khronos OpenGL / OpenGL ES XML
+** API Registry. The current version of the Registry, generator scripts
+** used to make the header, and the header can be found at
+**   http://www.opengl.org/registry/
+**
+** Khronos $Revision: 31039 $ on $Date: 2015-05-04 17:01:57 -0700 (Mon, 04 May 2015) $
+*/
 
-#ifndef __egl_h_
-#define __egl_h_
-
-/* All platform-dependent types and macro boilerplate (such as EGLAPI
- * and EGLAPIENTRY) should go in eglplatform.h.
- */
 #include <EGL/eglplatform.h>
 
-#ifdef __cplusplus
-extern "C" {
-#endif
+/* Generated on date 20150504 */
+
+/* Generated C header for:
+ * API: egl
+ * Versions considered: .*
+ * Versions emitted: .*
+ * Default extensions included: None
+ * Additional extensions included: _nomatch_^
+ * Extensions removed: _nomatch_^
+ */
 
-/* EGL Types */
-/* EGLint is defined in eglplatform.h */
+#ifndef EGL_VERSION_1_0
+#define EGL_VERSION_1_0 1
 typedef unsigned int EGLBoolean;
-typedef unsigned int EGLenum;
-typedef void *EGLConfig;
-typedef void *EGLContext;
 typedef void *EGLDisplay;
+#include <KHR/khrplatform.h>
+#include <EGL/eglplatform.h>
+typedef void *EGLConfig;
 typedef void *EGLSurface;
-typedef void *EGLClientBuffer;
-
-/* EGL Versioning */
-#define EGL_VERSION_1_0			1
-#define EGL_VERSION_1_1			1
-#define EGL_VERSION_1_2			1
-#define EGL_VERSION_1_3			1
-#define EGL_VERSION_1_4			1
-
-/* EGL Enumerants. Bitmasks and other exceptional cases aside, most
- * enums are assigned unique values starting at 0x3000.
- */
-
-/* EGL aliases */
-#define EGL_FALSE			0
-#define EGL_TRUE			1
-
-/* Out-of-band handle values */
-#define EGL_DEFAULT_DISPLAY		((EGLNativeDisplayType)0)
-#define EGL_NO_CONTEXT			((EGLContext)0)
-#define EGL_NO_DISPLAY			((EGLDisplay)0)
-#define EGL_NO_SURFACE			((EGLSurface)0)
-
-/* Out-of-band attribute value */
-#define EGL_DONT_CARE			((EGLint)-1)
-
-/* Errors / GetError return values */
-#define EGL_SUCCESS			0x3000
-#define EGL_NOT_INITIALIZED		0x3001
-#define EGL_BAD_ACCESS			0x3002
-#define EGL_BAD_ALLOC			0x3003
-#define EGL_BAD_ATTRIBUTE		0x3004
-#define EGL_BAD_CONFIG			0x3005
-#define EGL_BAD_CONTEXT			0x3006
-#define EGL_BAD_CURRENT_SURFACE		0x3007
-#define EGL_BAD_DISPLAY			0x3008
-#define EGL_BAD_MATCH			0x3009
-#define EGL_BAD_NATIVE_PIXMAP		0x300A
-#define EGL_BAD_NATIVE_WINDOW		0x300B
-#define EGL_BAD_PARAMETER		0x300C
-#define EGL_BAD_SURFACE			0x300D
-#define EGL_CONTEXT_LOST		0x300E	/* EGL 1.1 - IMG_power_management */
-
-/* Reserved 0x300F-0x301F for additional errors */
-
-/* Config attributes */
-#define EGL_BUFFER_SIZE			0x3020
-#define EGL_ALPHA_SIZE			0x3021
-#define EGL_BLUE_SIZE			0x3022
-#define EGL_GREEN_SIZE			0x3023
-#define EGL_RED_SIZE			0x3024
-#define EGL_DEPTH_SIZE			0x3025
-#define EGL_STENCIL_SIZE		0x3026
-#define EGL_CONFIG_CAVEAT		0x3027
-#define EGL_CONFIG_ID			0x3028
-#define EGL_LEVEL			0x3029
-#define EGL_MAX_PBUFFER_HEIGHT		0x302A
-#define EGL_MAX_PBUFFER_PIXELS		0x302B
-#define EGL_MAX_PBUFFER_WIDTH		0x302C
-#define EGL_NATIVE_RENDERABLE		0x302D
-#define EGL_NATIVE_VISUAL_ID		0x302E
-#define EGL_NATIVE_VISUAL_TYPE		0x302F
-#define EGL_SAMPLES			0x3031
-#define EGL_SAMPLE_BUFFERS		0x3032
-#define EGL_SURFACE_TYPE		0x3033
-#define EGL_TRANSPARENT_TYPE		0x3034
-#define EGL_TRANSPARENT_BLUE_VALUE	0x3035
-#define EGL_TRANSPARENT_GREEN_VALUE	0x3036
-#define EGL_TRANSPARENT_RED_VALUE	0x3037
-#define EGL_NONE			0x3038	/* Attrib list terminator */
-#define EGL_BIND_TO_TEXTURE_RGB		0x3039
-#define EGL_BIND_TO_TEXTURE_RGBA	0x303A
-#define EGL_MIN_SWAP_INTERVAL		0x303B
-#define EGL_MAX_SWAP_INTERVAL		0x303C
-#define EGL_LUMINANCE_SIZE		0x303D
-#define EGL_ALPHA_MASK_SIZE		0x303E
-#define EGL_COLOR_BUFFER_TYPE		0x303F
-#define EGL_RENDERABLE_TYPE		0x3040
-#define EGL_MATCH_NATIVE_PIXMAP		0x3041	/* Pseudo-attribute (not queryable) */
-#define EGL_CONFORMANT			0x3042
-
-/* Reserved 0x3041-0x304F for additional config attributes */
-
-/* Config attribute values */
-#define EGL_SLOW_CONFIG			0x3050	/* EGL_CONFIG_CAVEAT value */
-#define EGL_NON_CONFORMANT_CONFIG	0x3051	/* EGL_CONFIG_CAVEAT value */
-#define EGL_TRANSPARENT_RGB		0x3052	/* EGL_TRANSPARENT_TYPE value */
-#define EGL_RGB_BUFFER			0x308E	/* EGL_COLOR_BUFFER_TYPE value */
-#define EGL_LUMINANCE_BUFFER		0x308F	/* EGL_COLOR_BUFFER_TYPE value */
-
-/* More config attribute values, for EGL_TEXTURE_FORMAT */
-#define EGL_NO_TEXTURE			0x305C
-#define EGL_TEXTURE_RGB			0x305D
-#define EGL_TEXTURE_RGBA		0x305E
-#define EGL_TEXTURE_2D			0x305F
-
-/* Config attribute mask bits */
-#define EGL_PBUFFER_BIT			0x0001	/* EGL_SURFACE_TYPE mask bits */
-#define EGL_PIXMAP_BIT			0x0002	/* EGL_SURFACE_TYPE mask bits */
-#define EGL_WINDOW_BIT			0x0004	/* EGL_SURFACE_TYPE mask bits */
-#define EGL_VG_COLORSPACE_LINEAR_BIT	0x0020	/* EGL_SURFACE_TYPE mask bits */
-#define EGL_VG_ALPHA_FORMAT_PRE_BIT	0x0040	/* EGL_SURFACE_TYPE mask bits */
-#define EGL_MULTISAMPLE_RESOLVE_BOX_BIT 0x0200	/* EGL_SURFACE_TYPE mask bits */
-#define EGL_SWAP_BEHAVIOR_PRESERVED_BIT 0x0400	/* EGL_SURFACE_TYPE mask bits */
-
-#define EGL_OPENGL_ES_BIT		0x0001	/* EGL_RENDERABLE_TYPE mask bits */
-#define EGL_OPENVG_BIT			0x0002	/* EGL_RENDERABLE_TYPE mask bits */
-#define EGL_OPENGL_ES2_BIT		0x0004	/* EGL_RENDERABLE_TYPE mask bits */
-#define EGL_OPENGL_BIT			0x0008	/* EGL_RENDERABLE_TYPE mask bits */
-
-/* QueryString targets */
-#define EGL_VENDOR			0x3053
-#define EGL_VERSION			0x3054
-#define EGL_EXTENSIONS			0x3055
-#define EGL_CLIENT_APIS			0x308D
-
-/* QuerySurface / SurfaceAttrib / CreatePbufferSurface targets */
-#define EGL_HEIGHT			0x3056
-#define EGL_WIDTH			0x3057
-#define EGL_LARGEST_PBUFFER		0x3058
-#define EGL_TEXTURE_FORMAT		0x3080
-#define EGL_TEXTURE_TARGET		0x3081
-#define EGL_MIPMAP_TEXTURE		0x3082
-#define EGL_MIPMAP_LEVEL		0x3083
-#define EGL_RENDER_BUFFER		0x3086
-#define EGL_VG_COLORSPACE		0x3087
-#define EGL_VG_ALPHA_FORMAT		0x3088
-#define EGL_HORIZONTAL_RESOLUTION	0x3090
-#define EGL_VERTICAL_RESOLUTION		0x3091
-#define EGL_PIXEL_ASPECT_RATIO		0x3092
-#define EGL_SWAP_BEHAVIOR		0x3093
-#define EGL_MULTISAMPLE_RESOLVE		0x3099
-
-/* EGL_RENDER_BUFFER values / BindTexImage / ReleaseTexImage buffer targets */
-#define EGL_BACK_BUFFER			0x3084
-#define EGL_SINGLE_BUFFER		0x3085
-
-/* OpenVG color spaces */
-#define EGL_VG_COLORSPACE_sRGB		0x3089	/* EGL_VG_COLORSPACE value */
-#define EGL_VG_COLORSPACE_LINEAR	0x308A	/* EGL_VG_COLORSPACE value */
-
-/* OpenVG alpha formats */
-#define EGL_VG_ALPHA_FORMAT_NONPRE	0x308B	/* EGL_ALPHA_FORMAT value */
-#define EGL_VG_ALPHA_FORMAT_PRE		0x308C	/* EGL_ALPHA_FORMAT value */
-
-/* Constant scale factor by which fractional display resolutions &
- * aspect ratio are scaled when queried as integer values.
- */
-#define EGL_DISPLAY_SCALING		10000
-
-/* Unknown display resolution/aspect ratio */
-#define EGL_UNKNOWN			((EGLint)-1)
-
-/* Back buffer swap behaviors */
-#define EGL_BUFFER_PRESERVED		0x3094	/* EGL_SWAP_BEHAVIOR value */
-#define EGL_BUFFER_DESTROYED		0x3095	/* EGL_SWAP_BEHAVIOR value */
-
-/* CreatePbufferFromClientBuffer buffer types */
-#define EGL_OPENVG_IMAGE		0x3096
-
-/* QueryContext targets */
-#define EGL_CONTEXT_CLIENT_TYPE		0x3097
-
-/* CreateContext attributes */
-#define EGL_CONTEXT_CLIENT_VERSION	0x3098
-
-/* Multisample resolution behaviors */
-#define EGL_MULTISAMPLE_RESOLVE_DEFAULT 0x309A	/* EGL_MULTISAMPLE_RESOLVE value */
-#define EGL_MULTISAMPLE_RESOLVE_BOX	0x309B	/* EGL_MULTISAMPLE_RESOLVE value */
-
-/* BindAPI/QueryAPI targets */
-#define EGL_OPENGL_ES_API		0x30A0
-#define EGL_OPENVG_API			0x30A1
-#define EGL_OPENGL_API			0x30A2
-
-/* GetCurrentSurface targets */
-#define EGL_DRAW			0x3059
-#define EGL_READ			0x305A
-
-/* WaitNative engines */
-#define EGL_CORE_NATIVE_ENGINE		0x305B
-
-/* EGL 1.2 tokens renamed for consistency in EGL 1.3 */
-#define EGL_COLORSPACE			EGL_VG_COLORSPACE
-#define EGL_ALPHA_FORMAT		EGL_VG_ALPHA_FORMAT
-#define EGL_COLORSPACE_sRGB		EGL_VG_COLORSPACE_sRGB
-#define EGL_COLORSPACE_LINEAR		EGL_VG_COLORSPACE_LINEAR
-#define EGL_ALPHA_FORMAT_NONPRE		EGL_VG_ALPHA_FORMAT_NONPRE
-#define EGL_ALPHA_FORMAT_PRE		EGL_VG_ALPHA_FORMAT_PRE
-
-/* EGL extensions must request enum blocks from the Khronos
- * API Registrar, who maintains the enumerant registry. Submit
- * a bug in Khronos Bugzilla against task "Registry".
- */
-
-
-
-/* EGL Functions */
-
-EGLAPI EGLint EGLAPIENTRY eglGetError(void);
-
-EGLAPI EGLDisplay EGLAPIENTRY eglGetDisplay(EGLNativeDisplayType display_id);
-EGLAPI EGLBoolean EGLAPIENTRY eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor);
-EGLAPI EGLBoolean EGLAPIENTRY eglTerminate(EGLDisplay dpy);
-
-EGLAPI const char * EGLAPIENTRY eglQueryString(EGLDisplay dpy, EGLint name);
-
-EGLAPI EGLBoolean EGLAPIENTRY eglGetConfigs(EGLDisplay dpy, EGLConfig *configs,
-			 EGLint config_size, EGLint *num_config);
-EGLAPI EGLBoolean EGLAPIENTRY eglChooseConfig(EGLDisplay dpy, const EGLint *attrib_list,
-			   EGLConfig *configs, EGLint config_size,
-			   EGLint *num_config);
-EGLAPI EGLBoolean EGLAPIENTRY eglGetConfigAttrib(EGLDisplay dpy, EGLConfig config,
-			      EGLint attribute, EGLint *value);
-
-EGLAPI EGLSurface EGLAPIENTRY eglCreateWindowSurface(EGLDisplay dpy, EGLConfig config,
-				  EGLNativeWindowType win,
-				  const EGLint *attrib_list);
-EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferSurface(EGLDisplay dpy, EGLConfig config,
-				   const EGLint *attrib_list);
-EGLAPI EGLSurface EGLAPIENTRY eglCreatePixmapSurface(EGLDisplay dpy, EGLConfig config,
-				  EGLNativePixmapType pixmap,
-				  const EGLint *attrib_list);
-EGLAPI EGLBoolean EGLAPIENTRY eglDestroySurface(EGLDisplay dpy, EGLSurface surface);
-EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurface(EGLDisplay dpy, EGLSurface surface,
-			   EGLint attribute, EGLint *value);
-
-EGLAPI EGLBoolean EGLAPIENTRY eglBindAPI(EGLenum api);
-EGLAPI EGLenum EGLAPIENTRY eglQueryAPI(void);
-
-EGLAPI EGLBoolean EGLAPIENTRY eglWaitClient(void);
-
-EGLAPI EGLBoolean EGLAPIENTRY eglReleaseThread(void);
-
-EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferFromClientBuffer(
-	      EGLDisplay dpy, EGLenum buftype, EGLClientBuffer buffer,
-	      EGLConfig config, const EGLint *attrib_list);
-
-EGLAPI EGLBoolean EGLAPIENTRY eglSurfaceAttrib(EGLDisplay dpy, EGLSurface surface,
-			    EGLint attribute, EGLint value);
-EGLAPI EGLBoolean EGLAPIENTRY eglBindTexImage(EGLDisplay dpy, EGLSurface surface, EGLint buffer);
-EGLAPI EGLBoolean EGLAPIENTRY eglReleaseTexImage(EGLDisplay dpy, EGLSurface surface, EGLint buffer);
-
-
-EGLAPI EGLBoolean EGLAPIENTRY eglSwapInterval(EGLDisplay dpy, EGLint interval);
-
-
-EGLAPI EGLContext EGLAPIENTRY eglCreateContext(EGLDisplay dpy, EGLConfig config,
-			    EGLContext share_context,
-			    const EGLint *attrib_list);
-EGLAPI EGLBoolean EGLAPIENTRY eglDestroyContext(EGLDisplay dpy, EGLContext ctx);
-EGLAPI EGLBoolean EGLAPIENTRY eglMakeCurrent(EGLDisplay dpy, EGLSurface draw,
-			  EGLSurface read, EGLContext ctx);
-
-EGLAPI EGLContext EGLAPIENTRY eglGetCurrentContext(void);
-EGLAPI EGLSurface EGLAPIENTRY eglGetCurrentSurface(EGLint readdraw);
-EGLAPI EGLDisplay EGLAPIENTRY eglGetCurrentDisplay(void);
-EGLAPI EGLBoolean EGLAPIENTRY eglQueryContext(EGLDisplay dpy, EGLContext ctx,
-			   EGLint attribute, EGLint *value);
-
-EGLAPI EGLBoolean EGLAPIENTRY eglWaitGL(void);
-EGLAPI EGLBoolean EGLAPIENTRY eglWaitNative(EGLint engine);
-EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffers(EGLDisplay dpy, EGLSurface surface);
-EGLAPI EGLBoolean EGLAPIENTRY eglCopyBuffers(EGLDisplay dpy, EGLSurface surface,
-			  EGLNativePixmapType target);
-
-/* This is a generic function pointer type, whose name indicates it must
- * be cast to the proper type *and calling convention* before use.
- */
+typedef void *EGLContext;
 typedef void (*__eglMustCastToProperFunctionPointerType)(void);
-
-/* Now, define eglGetProcAddress using the generic function ptr. type */
-EGLAPI __eglMustCastToProperFunctionPointerType EGLAPIENTRY
-       eglGetProcAddress(const char *procname);
+#define EGL_ALPHA_SIZE                    0x3021
+#define EGL_BAD_ACCESS                    0x3002
+#define EGL_BAD_ALLOC                     0x3003
+#define EGL_BAD_ATTRIBUTE                 0x3004
+#define EGL_BAD_CONFIG                    0x3005
+#define EGL_BAD_CONTEXT                   0x3006
+#define EGL_BAD_CURRENT_SURFACE           0x3007
+#define EGL_BAD_DISPLAY                   0x3008
+#define EGL_BAD_MATCH                     0x3009
+#define EGL_BAD_NATIVE_PIXMAP             0x300A
+#define EGL_BAD_NATIVE_WINDOW             0x300B
+#define EGL_BAD_PARAMETER                 0x300C
+#define EGL_BAD_SURFACE                   0x300D
+#define EGL_BLUE_SIZE                     0x3022
+#define EGL_BUFFER_SIZE                   0x3020
+#define EGL_CONFIG_CAVEAT                 0x3027
+#define EGL_CONFIG_ID                     0x3028
+#define EGL_CORE_NATIVE_ENGINE            0x305B
+#define EGL_DEPTH_SIZE                    0x3025
+#define EGL_DONT_CARE                     ((EGLint)-1)
+#define EGL_DRAW                          0x3059
+#define EGL_EXTENSIONS                    0x3055
+#define EGL_FALSE                         0
+#define EGL_GREEN_SIZE                    0x3023
+#define EGL_HEIGHT                        0x3056
+#define EGL_LARGEST_PBUFFER               0x3058
+#define EGL_LEVEL                         0x3029
+#define EGL_MAX_PBUFFER_HEIGHT            0x302A
+#define EGL_MAX_PBUFFER_PIXELS            0x302B
+#define EGL_MAX_PBUFFER_WIDTH             0x302C
+#define EGL_NATIVE_RENDERABLE             0x302D
+#define EGL_NATIVE_VISUAL_ID              0x302E
+#define EGL_NATIVE_VISUAL_TYPE            0x302F
+#define EGL_NONE                          0x3038
+#define EGL_NON_CONFORMANT_CONFIG         0x3051
+#define EGL_NOT_INITIALIZED               0x3001
+#define EGL_NO_CONTEXT                    ((EGLContext)0)
+#define EGL_NO_DISPLAY                    ((EGLDisplay)0)
+#define EGL_NO_SURFACE                    ((EGLSurface)0)
+#define EGL_PBUFFER_BIT                   0x0001
+#define EGL_PIXMAP_BIT                    0x0002
+#define EGL_READ                          0x305A
+#define EGL_RED_SIZE                      0x3024
+#define EGL_SAMPLES                       0x3031
+#define EGL_SAMPLE_BUFFERS                0x3032
+#define EGL_SLOW_CONFIG                   0x3050
+#define EGL_STENCIL_SIZE                  0x3026
+#define EGL_SUCCESS                       0x3000
+#define EGL_SURFACE_TYPE                  0x3033
+#define EGL_TRANSPARENT_BLUE_VALUE        0x3035
+#define EGL_TRANSPARENT_GREEN_VALUE       0x3036
+#define EGL_TRANSPARENT_RED_VALUE         0x3037
+#define EGL_TRANSPARENT_RGB               0x3052
+#define EGL_TRANSPARENT_TYPE              0x3034
+#define EGL_TRUE                          1
+#define EGL_VENDOR                        0x3053
+#define EGL_VERSION                       0x3054
+#define EGL_WIDTH                         0x3057
+#define EGL_WINDOW_BIT                    0x0004
+EGLAPI EGLBoolean EGLAPIENTRY eglChooseConfig (EGLDisplay dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config);
+EGLAPI EGLBoolean EGLAPIENTRY eglCopyBuffers (EGLDisplay dpy, EGLSurface surface, EGLNativePixmapType target);
+EGLAPI EGLContext EGLAPIENTRY eglCreateContext (EGLDisplay dpy, EGLConfig config, EGLContext share_context, const EGLint *attrib_list);
+EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferSurface (EGLDisplay dpy, EGLConfig config, const EGLint *attrib_list);
+EGLAPI EGLSurface EGLAPIENTRY eglCreatePixmapSurface (EGLDisplay dpy, EGLConfig config, EGLNativePixmapType pixmap, const EGLint *attrib_list);
+EGLAPI EGLSurface EGLAPIENTRY eglCreateWindowSurface (EGLDisplay dpy, EGLConfig config, EGLNativeWindowType win, const EGLint *attrib_list);
+EGLAPI EGLBoolean EGLAPIENTRY eglDestroyContext (EGLDisplay dpy, EGLContext ctx);
+EGLAPI EGLBoolean EGLAPIENTRY eglDestroySurface (EGLDisplay dpy, EGLSurface surface);
+EGLAPI EGLBoolean EGLAPIENTRY eglGetConfigAttrib (EGLDisplay dpy, EGLConfig config, EGLint attribute, EGLint *value);
+EGLAPI EGLBoolean EGLAPIENTRY eglGetConfigs (EGLDisplay dpy, EGLConfig *configs, EGLint config_size, EGLint *num_config);
+EGLAPI EGLDisplay EGLAPIENTRY eglGetCurrentDisplay (void);
+EGLAPI EGLSurface EGLAPIENTRY eglGetCurrentSurface (EGLint readdraw);
+EGLAPI EGLDisplay EGLAPIENTRY eglGetDisplay (EGLNativeDisplayType display_id);
+EGLAPI EGLint EGLAPIENTRY eglGetError (void);
+EGLAPI __eglMustCastToProperFunctionPointerType EGLAPIENTRY eglGetProcAddress (const char *procname);
+EGLAPI EGLBoolean EGLAPIENTRY eglInitialize (EGLDisplay dpy, EGLint *major, EGLint *minor);
+EGLAPI EGLBoolean EGLAPIENTRY eglMakeCurrent (EGLDisplay dpy, EGLSurface draw, EGLSurface read, EGLContext ctx);
+EGLAPI EGLBoolean EGLAPIENTRY eglQueryContext (EGLDisplay dpy, EGLContext ctx, EGLint attribute, EGLint *value);
+EGLAPI const char *EGLAPIENTRY eglQueryString (EGLDisplay dpy, EGLint name);
+EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurface (EGLDisplay dpy, EGLSurface surface, EGLint attribute, EGLint *value);
+EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffers (EGLDisplay dpy, EGLSurface surface);
+EGLAPI EGLBoolean EGLAPIENTRY eglTerminate (EGLDisplay dpy);
+EGLAPI EGLBoolean EGLAPIENTRY eglWaitGL (void);
+EGLAPI EGLBoolean EGLAPIENTRY eglWaitNative (EGLint engine);
+#endif /* EGL_VERSION_1_0 */
+
+#ifndef EGL_VERSION_1_1
+#define EGL_VERSION_1_1 1
+#define EGL_BACK_BUFFER                   0x3084
+#define EGL_BIND_TO_TEXTURE_RGB           0x3039
+#define EGL_BIND_TO_TEXTURE_RGBA          0x303A
+#define EGL_CONTEXT_LOST                  0x300E
+#define EGL_MIN_SWAP_INTERVAL             0x303B
+#define EGL_MAX_SWAP_INTERVAL             0x303C
+#define EGL_MIPMAP_TEXTURE                0x3082
+#define EGL_MIPMAP_LEVEL                  0x3083
+#define EGL_NO_TEXTURE                    0x305C
+#define EGL_TEXTURE_2D                    0x305F
+#define EGL_TEXTURE_FORMAT                0x3080
+#define EGL_TEXTURE_RGB                   0x305D
+#define EGL_TEXTURE_RGBA                  0x305E
+#define EGL_TEXTURE_TARGET                0x3081
+EGLAPI EGLBoolean EGLAPIENTRY eglBindTexImage (EGLDisplay dpy, EGLSurface surface, EGLint buffer);
+EGLAPI EGLBoolean EGLAPIENTRY eglReleaseTexImage (EGLDisplay dpy, EGLSurface surface, EGLint buffer);
+EGLAPI EGLBoolean EGLAPIENTRY eglSurfaceAttrib (EGLDisplay dpy, EGLSurface surface, EGLint attribute, EGLint value);
+EGLAPI EGLBoolean EGLAPIENTRY eglSwapInterval (EGLDisplay dpy, EGLint interval);
+#endif /* EGL_VERSION_1_1 */
+
+#ifndef EGL_VERSION_1_2
+#define EGL_VERSION_1_2 1
+typedef unsigned int EGLenum;
+typedef void *EGLClientBuffer;
+#define EGL_ALPHA_FORMAT                  0x3088
+#define EGL_ALPHA_FORMAT_NONPRE           0x308B
+#define EGL_ALPHA_FORMAT_PRE              0x308C
+#define EGL_ALPHA_MASK_SIZE               0x303E
+#define EGL_BUFFER_PRESERVED              0x3094
+#define EGL_BUFFER_DESTROYED              0x3095
+#define EGL_CLIENT_APIS                   0x308D
+#define EGL_COLORSPACE                    0x3087
+#define EGL_COLORSPACE_sRGB               0x3089
+#define EGL_COLORSPACE_LINEAR             0x308A
+#define EGL_COLOR_BUFFER_TYPE             0x303F
+#define EGL_CONTEXT_CLIENT_TYPE           0x3097
+#define EGL_DISPLAY_SCALING               10000
+#define EGL_HORIZONTAL_RESOLUTION         0x3090
+#define EGL_LUMINANCE_BUFFER              0x308F
+#define EGL_LUMINANCE_SIZE                0x303D
+#define EGL_OPENGL_ES_BIT                 0x0001
+#define EGL_OPENVG_BIT                    0x0002
+#define EGL_OPENGL_ES_API                 0x30A0
+#define EGL_OPENVG_API                    0x30A1
+#define EGL_OPENVG_IMAGE                  0x3096
+#define EGL_PIXEL_ASPECT_RATIO            0x3092
+#define EGL_RENDERABLE_TYPE               0x3040
+#define EGL_RENDER_BUFFER                 0x3086
+#define EGL_RGB_BUFFER                    0x308E
+#define EGL_SINGLE_BUFFER                 0x3085
+#define EGL_SWAP_BEHAVIOR                 0x3093
+#define EGL_UNKNOWN                       ((EGLint)-1)
+#define EGL_VERTICAL_RESOLUTION           0x3091
+EGLAPI EGLBoolean EGLAPIENTRY eglBindAPI (EGLenum api);
+EGLAPI EGLenum EGLAPIENTRY eglQueryAPI (void);
+EGLAPI EGLSurface EGLAPIENTRY eglCreatePbufferFromClientBuffer (EGLDisplay dpy, EGLenum buftype, EGLClientBuffer buffer, EGLConfig config, const EGLint *attrib_list);
+EGLAPI EGLBoolean EGLAPIENTRY eglReleaseThread (void);
+EGLAPI EGLBoolean EGLAPIENTRY eglWaitClient (void);
+#endif /* EGL_VERSION_1_2 */
+
+#ifndef EGL_VERSION_1_3
+#define EGL_VERSION_1_3 1
+#define EGL_CONFORMANT                    0x3042
+#define EGL_CONTEXT_CLIENT_VERSION        0x3098
+#define EGL_MATCH_NATIVE_PIXMAP           0x3041
+#define EGL_OPENGL_ES2_BIT                0x0004
+#define EGL_VG_ALPHA_FORMAT               0x3088
+#define EGL_VG_ALPHA_FORMAT_NONPRE        0x308B
+#define EGL_VG_ALPHA_FORMAT_PRE           0x308C
+#define EGL_VG_ALPHA_FORMAT_PRE_BIT       0x0040
+#define EGL_VG_COLORSPACE                 0x3087
+#define EGL_VG_COLORSPACE_sRGB            0x3089
+#define EGL_VG_COLORSPACE_LINEAR          0x308A
+#define EGL_VG_COLORSPACE_LINEAR_BIT      0x0020
+#endif /* EGL_VERSION_1_3 */
+
+#ifndef EGL_VERSION_1_4
+#define EGL_VERSION_1_4 1
+#define EGL_DEFAULT_DISPLAY               ((EGLNativeDisplayType)0)
+#define EGL_MULTISAMPLE_RESOLVE_BOX_BIT   0x0200
+#define EGL_MULTISAMPLE_RESOLVE           0x3099
+#define EGL_MULTISAMPLE_RESOLVE_DEFAULT   0x309A
+#define EGL_MULTISAMPLE_RESOLVE_BOX       0x309B
+#define EGL_OPENGL_API                    0x30A2
+#define EGL_OPENGL_BIT                    0x0008
+#define EGL_SWAP_BEHAVIOR_PRESERVED_BIT   0x0400
+EGLAPI EGLContext EGLAPIENTRY eglGetCurrentContext (void);
+#endif /* EGL_VERSION_1_4 */
+
+#ifndef EGL_VERSION_1_5
+#define EGL_VERSION_1_5 1
+typedef void *EGLSync;
+typedef intptr_t EGLAttrib;
+typedef khronos_utime_nanoseconds_t EGLTime;
+typedef void *EGLImage;
+#define EGL_CONTEXT_MAJOR_VERSION         0x3098
+#define EGL_CONTEXT_MINOR_VERSION         0x30FB
+#define EGL_CONTEXT_OPENGL_PROFILE_MASK   0x30FD
+#define EGL_CONTEXT_OPENGL_RESET_NOTIFICATION_STRATEGY 0x31BD
+#define EGL_NO_RESET_NOTIFICATION         0x31BE
+#define EGL_LOSE_CONTEXT_ON_RESET         0x31BF
+#define EGL_CONTEXT_OPENGL_CORE_PROFILE_BIT 0x00000001
+#define EGL_CONTEXT_OPENGL_COMPATIBILITY_PROFILE_BIT 0x00000002
+#define EGL_CONTEXT_OPENGL_DEBUG          0x31B0
+#define EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE 0x31B1
+#define EGL_CONTEXT_OPENGL_ROBUST_ACCESS  0x31B2
+#define EGL_OPENGL_ES3_BIT                0x00000040
+#define EGL_CL_EVENT_HANDLE               0x309C
+#define EGL_SYNC_CL_EVENT                 0x30FE
+#define EGL_SYNC_CL_EVENT_COMPLETE        0x30FF
+#define EGL_SYNC_PRIOR_COMMANDS_COMPLETE  0x30F0
+#define EGL_SYNC_TYPE                     0x30F7
+#define EGL_SYNC_STATUS                   0x30F1
+#define EGL_SYNC_CONDITION                0x30F8
+#define EGL_SIGNALED                      0x30F2
+#define EGL_UNSIGNALED                    0x30F3
+#define EGL_SYNC_FLUSH_COMMANDS_BIT       0x0001
+#define EGL_FOREVER                       0xFFFFFFFFFFFFFFFFull
+#define EGL_TIMEOUT_EXPIRED               0x30F5
+#define EGL_CONDITION_SATISFIED           0x30F6
+#define EGL_NO_SYNC                       ((EGLSync)0)
+#define EGL_SYNC_FENCE                    0x30F9
+#define EGL_GL_COLORSPACE                 0x309D
+#define EGL_GL_COLORSPACE_SRGB            0x3089
+#define EGL_GL_COLORSPACE_LINEAR          0x308A
+#define EGL_GL_RENDERBUFFER               0x30B9
+#define EGL_GL_TEXTURE_2D                 0x30B1
+#define EGL_GL_TEXTURE_LEVEL              0x30BC
+#define EGL_GL_TEXTURE_3D                 0x30B2
+#define EGL_GL_TEXTURE_ZOFFSET            0x30BD
+#define EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_X 0x30B3
+#define EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_X 0x30B4
+#define EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Y 0x30B5
+#define EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Y 0x30B6
+#define EGL_GL_TEXTURE_CUBE_MAP_POSITIVE_Z 0x30B7
+#define EGL_GL_TEXTURE_CUBE_MAP_NEGATIVE_Z 0x30B8
+#define EGL_IMAGE_PRESERVED               0x30D2
+#define EGL_NO_IMAGE                      ((EGLImage)0)
+EGLAPI EGLSync EGLAPIENTRY eglCreateSync (EGLDisplay dpy, EGLenum type, const EGLAttrib *attrib_list);
+EGLAPI EGLBoolean EGLAPIENTRY eglDestroySync (EGLDisplay dpy, EGLSync sync);
+EGLAPI EGLint EGLAPIENTRY eglClientWaitSync (EGLDisplay dpy, EGLSync sync, EGLint flags, EGLTime timeout);
+EGLAPI EGLBoolean EGLAPIENTRY eglGetSyncAttrib (EGLDisplay dpy, EGLSync sync, EGLint attribute, EGLAttrib *value);
+EGLAPI EGLImage EGLAPIENTRY eglCreateImage (EGLDisplay dpy, EGLContext ctx, EGLenum target, EGLClientBuffer buffer, const EGLAttrib *attrib_list);
+EGLAPI EGLBoolean EGLAPIENTRY eglDestroyImage (EGLDisplay dpy, EGLImage image);
+EGLAPI EGLDisplay EGLAPIENTRY eglGetPlatformDisplay (EGLenum platform, void *native_display, const EGLAttrib *attrib_list);
+EGLAPI EGLSurface EGLAPIENTRY eglCreatePlatformWindowSurface (EGLDisplay dpy, EGLConfig config, void *native_window, const EGLAttrib *attrib_list);
+EGLAPI EGLSurface EGLAPIENTRY eglCreatePlatformPixmapSurface (EGLDisplay dpy, EGLConfig config, void *native_pixmap, const EGLAttrib *attrib_list);
+EGLAPI EGLBoolean EGLAPIENTRY eglWaitSync (EGLDisplay dpy, EGLSync sync, EGLint flags);
+#endif /* EGL_VERSION_1_5 */
 
 #ifdef __cplusplus
 }
 #endif
 
-#endif /* __egl_h_ */
+#endif
diff --git a/include/EGL/eglext.h b/include/EGL/eglext.h
index 88b39db..6043b37 100644
--- a/include/EGL/eglext.h
+++ b/include/EGL/eglext.h
@@ -6,7 +6,7 @@ extern "C" {
 #endif
 
 /*
-** Copyright (c) 2013 The Khronos Group Inc.
+** Copyright (c) 2013-2014 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -33,12 +33,12 @@ extern "C" {
 ** used to make the header, and the header can be found at
 **   http://www.opengl.org/registry/
 **
-** Khronos $Revision: 24567 $ on $Date: 2013-12-18 09:50:17 -0800 (Wed, 18 Dec 2013) $
+** Khronos $Revision$ on $Date$
 */
 
 #include <EGL/eglplatform.h>
 
-#define EGL_EGLEXT_VERSION 20131218
+#define EGL_EGLEXT_VERSION 20150508
 
 /* Generated C header for:
  * API: egl
@@ -94,12 +94,28 @@ EGLAPI EGLSyncKHR EGLAPIENTRY eglCreateSync64KHR (EGLDisplay dpy, EGLenum type,
 #define EGL_OPENGL_ES3_BIT_KHR            0x00000040
 #endif /* EGL_KHR_create_context */
 
+#ifndef EGL_KHR_create_context_no_error
+#define EGL_KHR_create_context_no_error 1
+#define EGL_CONTEXT_OPENGL_NO_ERROR_KHR   0x31B3
+#endif /* EGL_KHR_create_context_no_error */
+
 #ifndef EGL_KHR_fence_sync
 #define EGL_KHR_fence_sync 1
+typedef khronos_utime_nanoseconds_t EGLTimeKHR;
 #ifdef KHRONOS_SUPPORT_INT64
 #define EGL_SYNC_PRIOR_COMMANDS_COMPLETE_KHR 0x30F0
 #define EGL_SYNC_CONDITION_KHR            0x30F8
 #define EGL_SYNC_FENCE_KHR                0x30F9
+typedef EGLSyncKHR (EGLAPIENTRYP PFNEGLCREATESYNCKHRPROC) (EGLDisplay dpy, EGLenum type, const EGLint *attrib_list);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync);
+typedef EGLint (EGLAPIENTRYP PFNEGLCLIENTWAITSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLint flags, EGLTimeKHR timeout);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCATTRIBKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *value);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLSyncKHR EGLAPIENTRY eglCreateSyncKHR (EGLDisplay dpy, EGLenum type, const EGLint *attrib_list);
+EGLAPI EGLBoolean EGLAPIENTRY eglDestroySyncKHR (EGLDisplay dpy, EGLSyncKHR sync);
+EGLAPI EGLint EGLAPIENTRY eglClientWaitSyncKHR (EGLDisplay dpy, EGLSyncKHR sync, EGLint flags, EGLTimeKHR timeout);
+EGLAPI EGLBoolean EGLAPIENTRY eglGetSyncAttribKHR (EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *value);
+#endif
 #endif /* KHRONOS_SUPPORT_INT64 */
 #endif /* EGL_KHR_fence_sync */
 
@@ -207,9 +223,38 @@ EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurface64KHR (EGLDisplay dpy, EGLSurface s
 #endif
 #endif /* EGL_KHR_lock_surface3 */
 
+#ifndef EGL_KHR_partial_update
+#define EGL_KHR_partial_update 1
+#define EGL_BUFFER_AGE_KHR                0x313D
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLSETDAMAGEREGIONKHRPROC) (EGLDisplay dpy, EGLSurface surface, EGLint *rects, EGLint n_rects);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglSetDamageRegionKHR (EGLDisplay dpy, EGLSurface surface, EGLint *rects, EGLint n_rects);
+#endif
+#endif /* EGL_KHR_partial_update */
+
+#ifndef EGL_KHR_platform_android
+#define EGL_KHR_platform_android 1
+#define EGL_PLATFORM_ANDROID_KHR          0x3141
+#endif /* EGL_KHR_platform_android */
+
+#ifndef EGL_KHR_platform_gbm
+#define EGL_KHR_platform_gbm 1
+#define EGL_PLATFORM_GBM_KHR              0x31D7
+#endif /* EGL_KHR_platform_gbm */
+
+#ifndef EGL_KHR_platform_wayland
+#define EGL_KHR_platform_wayland 1
+#define EGL_PLATFORM_WAYLAND_KHR          0x31D8
+#endif /* EGL_KHR_platform_wayland */
+
+#ifndef EGL_KHR_platform_x11
+#define EGL_KHR_platform_x11 1
+#define EGL_PLATFORM_X11_KHR              0x31D5
+#define EGL_PLATFORM_X11_SCREEN_KHR       0x31D6
+#endif /* EGL_KHR_platform_x11 */
+
 #ifndef EGL_KHR_reusable_sync
 #define EGL_KHR_reusable_sync 1
-typedef khronos_utime_nanoseconds_t EGLTimeKHR;
 #ifdef KHRONOS_SUPPORT_INT64
 #define EGL_SYNC_STATUS_KHR               0x30F1
 #define EGL_SIGNALED_KHR                  0x30F2
@@ -221,17 +266,9 @@ typedef khronos_utime_nanoseconds_t EGLTimeKHR;
 #define EGL_SYNC_FLUSH_COMMANDS_BIT_KHR   0x0001
 #define EGL_FOREVER_KHR                   0xFFFFFFFFFFFFFFFFull
 #define EGL_NO_SYNC_KHR                   ((EGLSyncKHR)0)
-typedef EGLSyncKHR (EGLAPIENTRYP PFNEGLCREATESYNCKHRPROC) (EGLDisplay dpy, EGLenum type, const EGLint *attrib_list);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLDESTROYSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync);
-typedef EGLint (EGLAPIENTRYP PFNEGLCLIENTWAITSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLint flags, EGLTimeKHR timeout);
 typedef EGLBoolean (EGLAPIENTRYP PFNEGLSIGNALSYNCKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLenum mode);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSYNCATTRIBKHRPROC) (EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *value);
 #ifdef EGL_EGLEXT_PROTOTYPES
-EGLAPI EGLSyncKHR EGLAPIENTRY eglCreateSyncKHR (EGLDisplay dpy, EGLenum type, const EGLint *attrib_list);
-EGLAPI EGLBoolean EGLAPIENTRY eglDestroySyncKHR (EGLDisplay dpy, EGLSyncKHR sync);
-EGLAPI EGLint EGLAPIENTRY eglClientWaitSyncKHR (EGLDisplay dpy, EGLSyncKHR sync, EGLint flags, EGLTimeKHR timeout);
 EGLAPI EGLBoolean EGLAPIENTRY eglSignalSyncKHR (EGLDisplay dpy, EGLSyncKHR sync, EGLenum mode);
-EGLAPI EGLBoolean EGLAPIENTRY eglGetSyncAttribKHR (EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *value);
 #endif
 #endif /* KHRONOS_SUPPORT_INT64 */
 #endif /* EGL_KHR_reusable_sync */
@@ -333,6 +370,14 @@ EGLAPI EGLSurface EGLAPIENTRY eglCreateStreamProducerSurfaceKHR (EGLDisplay dpy,
 #define EGL_KHR_surfaceless_context 1
 #endif /* EGL_KHR_surfaceless_context */
 
+#ifndef EGL_KHR_swap_buffers_with_damage
+#define EGL_KHR_swap_buffers_with_damage 1
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLSWAPBUFFERSWITHDAMAGEKHRPROC) (EGLDisplay dpy, EGLSurface surface, EGLint *rects, EGLint n_rects);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffersWithDamageKHR (EGLDisplay dpy, EGLSurface surface, EGLint *rects, EGLint n_rects);
+#endif
+#endif /* EGL_KHR_swap_buffers_with_damage */
+
 #ifndef EGL_KHR_vg_parent_image
 #define EGL_KHR_vg_parent_image 1
 #define EGL_VG_PARENT_IMAGE_KHR           0x30BA
@@ -389,6 +434,12 @@ EGLAPI EGLint EGLAPIENTRY eglDupNativeFenceFDANDROID (EGLDisplay dpy, EGLSyncKHR
 #define EGL_D3D_TEXTURE_2D_SHARE_HANDLE_ANGLE 0x3200
 #endif /* EGL_ANGLE_d3d_share_handle_client_buffer */
 
+#ifndef EGL_ANGLE_device_d3d
+#define EGL_ANGLE_device_d3d 1
+#define EGL_D3D9_DEVICE_ANGLE             0x33A0
+#define EGL_D3D11_DEVICE_ANGLE            0x33A1
+#endif /* EGL_ANGLE_device_d3d */
+
 #ifndef EGL_ANGLE_query_surface_pointer
 #define EGL_ANGLE_query_surface_pointer 1
 typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSURFACEPOINTERANGLEPROC) (EGLDisplay dpy, EGLSurface surface, EGLint attribute, void **value);
@@ -401,6 +452,11 @@ EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurfacePointerANGLE (EGLDisplay dpy, EGLSu
 #define EGL_ANGLE_surface_d3d_texture_2d_share_handle 1
 #endif /* EGL_ANGLE_surface_d3d_texture_2d_share_handle */
 
+#ifndef EGL_ANGLE_window_fixed_size
+#define EGL_ANGLE_window_fixed_size 1
+#define EGL_FIXED_SIZE_ANGLE              0x3201
+#endif /* EGL_ANGLE_window_fixed_size */
+
 #ifndef EGL_ARM_pixmap_multisample_discard
 #define EGL_ARM_pixmap_multisample_discard 1
 #define EGL_DISCARD_SAMPLES_ARM           0x3286
@@ -423,6 +479,42 @@ EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurfacePointerANGLE (EGLDisplay dpy, EGLSu
 #define EGL_LOSE_CONTEXT_ON_RESET_EXT     0x31BF
 #endif /* EGL_EXT_create_context_robustness */
 
+#ifndef EGL_EXT_device_base
+#define EGL_EXT_device_base 1
+typedef void *EGLDeviceEXT;
+#define EGL_NO_DEVICE_EXT                 ((EGLDeviceEXT)(0))
+#define EGL_BAD_DEVICE_EXT                0x322B
+#define EGL_DEVICE_EXT                    0x322C
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYDEVICEATTRIBEXTPROC) (EGLDeviceEXT device, EGLint attribute, EGLAttrib *value);
+typedef const char *(EGLAPIENTRYP PFNEGLQUERYDEVICESTRINGEXTPROC) (EGLDeviceEXT device, EGLint name);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYDEVICESEXTPROC) (EGLint max_devices, EGLDeviceEXT *devices, EGLint *num_devices);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYDISPLAYATTRIBEXTPROC) (EGLDisplay dpy, EGLint attribute, EGLAttrib *value);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglQueryDeviceAttribEXT (EGLDeviceEXT device, EGLint attribute, EGLAttrib *value);
+EGLAPI const char *EGLAPIENTRY eglQueryDeviceStringEXT (EGLDeviceEXT device, EGLint name);
+EGLAPI EGLBoolean EGLAPIENTRY eglQueryDevicesEXT (EGLint max_devices, EGLDeviceEXT *devices, EGLint *num_devices);
+EGLAPI EGLBoolean EGLAPIENTRY eglQueryDisplayAttribEXT (EGLDisplay dpy, EGLint attribute, EGLAttrib *value);
+#endif
+#endif /* EGL_EXT_device_base */
+
+#ifndef EGL_EXT_device_drm
+#define EGL_EXT_device_drm 1
+#define EGL_DRM_DEVICE_FILE_EXT           0x3233
+#endif /* EGL_EXT_device_drm */
+
+#ifndef EGL_EXT_device_enumeration
+#define EGL_EXT_device_enumeration 1
+#endif /* EGL_EXT_device_enumeration */
+
+#ifndef EGL_EXT_device_openwf
+#define EGL_EXT_device_openwf 1
+#define EGL_OPENWF_DEVICE_ID_EXT          0x3237
+#endif /* EGL_EXT_device_openwf */
+
+#ifndef EGL_EXT_device_query
+#define EGL_EXT_device_query 1
+#endif /* EGL_EXT_device_query */
+
 #ifndef EGL_EXT_image_dma_buf_import
 #define EGL_EXT_image_dma_buf_import 1
 #define EGL_LINUX_DMA_BUF_EXT             0x3270
@@ -454,6 +546,48 @@ EGLAPI EGLBoolean EGLAPIENTRY eglQuerySurfacePointerANGLE (EGLDisplay dpy, EGLSu
 #define EGL_MULTIVIEW_VIEW_COUNT_EXT      0x3134
 #endif /* EGL_EXT_multiview_window */
 
+#ifndef EGL_EXT_output_base
+#define EGL_EXT_output_base 1
+typedef void *EGLOutputLayerEXT;
+typedef void *EGLOutputPortEXT;
+#define EGL_NO_OUTPUT_LAYER_EXT           ((EGLOutputLayerEXT)0)
+#define EGL_NO_OUTPUT_PORT_EXT            ((EGLOutputPortEXT)0)
+#define EGL_BAD_OUTPUT_LAYER_EXT          0x322D
+#define EGL_BAD_OUTPUT_PORT_EXT           0x322E
+#define EGL_SWAP_INTERVAL_EXT             0x322F
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETOUTPUTLAYERSEXTPROC) (EGLDisplay dpy, const EGLAttrib *attrib_list, EGLOutputLayerEXT *layers, EGLint max_layers, EGLint *num_layers);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETOUTPUTPORTSEXTPROC) (EGLDisplay dpy, const EGLAttrib *attrib_list, EGLOutputPortEXT *ports, EGLint max_ports, EGLint *num_ports);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLOUTPUTLAYERATTRIBEXTPROC) (EGLDisplay dpy, EGLOutputLayerEXT layer, EGLint attribute, EGLAttrib value);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYOUTPUTLAYERATTRIBEXTPROC) (EGLDisplay dpy, EGLOutputLayerEXT layer, EGLint attribute, EGLAttrib *value);
+typedef const char *(EGLAPIENTRYP PFNEGLQUERYOUTPUTLAYERSTRINGEXTPROC) (EGLDisplay dpy, EGLOutputLayerEXT layer, EGLint name);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLOUTPUTPORTATTRIBEXTPROC) (EGLDisplay dpy, EGLOutputPortEXT port, EGLint attribute, EGLAttrib value);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYOUTPUTPORTATTRIBEXTPROC) (EGLDisplay dpy, EGLOutputPortEXT port, EGLint attribute, EGLAttrib *value);
+typedef const char *(EGLAPIENTRYP PFNEGLQUERYOUTPUTPORTSTRINGEXTPROC) (EGLDisplay dpy, EGLOutputPortEXT port, EGLint name);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglGetOutputLayersEXT (EGLDisplay dpy, const EGLAttrib *attrib_list, EGLOutputLayerEXT *layers, EGLint max_layers, EGLint *num_layers);
+EGLAPI EGLBoolean EGLAPIENTRY eglGetOutputPortsEXT (EGLDisplay dpy, const EGLAttrib *attrib_list, EGLOutputPortEXT *ports, EGLint max_ports, EGLint *num_ports);
+EGLAPI EGLBoolean EGLAPIENTRY eglOutputLayerAttribEXT (EGLDisplay dpy, EGLOutputLayerEXT layer, EGLint attribute, EGLAttrib value);
+EGLAPI EGLBoolean EGLAPIENTRY eglQueryOutputLayerAttribEXT (EGLDisplay dpy, EGLOutputLayerEXT layer, EGLint attribute, EGLAttrib *value);
+EGLAPI const char *EGLAPIENTRY eglQueryOutputLayerStringEXT (EGLDisplay dpy, EGLOutputLayerEXT layer, EGLint name);
+EGLAPI EGLBoolean EGLAPIENTRY eglOutputPortAttribEXT (EGLDisplay dpy, EGLOutputPortEXT port, EGLint attribute, EGLAttrib value);
+EGLAPI EGLBoolean EGLAPIENTRY eglQueryOutputPortAttribEXT (EGLDisplay dpy, EGLOutputPortEXT port, EGLint attribute, EGLAttrib *value);
+EGLAPI const char *EGLAPIENTRY eglQueryOutputPortStringEXT (EGLDisplay dpy, EGLOutputPortEXT port, EGLint name);
+#endif
+#endif /* EGL_EXT_output_base */
+
+#ifndef EGL_EXT_output_drm
+#define EGL_EXT_output_drm 1
+#define EGL_DRM_CRTC_EXT                  0x3234
+#define EGL_DRM_PLANE_EXT                 0x3235
+#define EGL_DRM_CONNECTOR_EXT             0x3236
+#endif /* EGL_EXT_output_drm */
+
+#ifndef EGL_EXT_output_openwf
+#define EGL_EXT_output_openwf 1
+#define EGL_OPENWF_PIPELINE_ID_EXT        0x3238
+#define EGL_OPENWF_PORT_ID_EXT            0x3239
+#endif /* EGL_EXT_output_openwf */
+
 #ifndef EGL_EXT_platform_base
 #define EGL_EXT_platform_base 1
 typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETPLATFORMDISPLAYEXTPROC) (EGLenum platform, void *native_display, const EGLint *attrib_list);
@@ -466,6 +600,11 @@ EGLAPI EGLSurface EGLAPIENTRY eglCreatePlatformPixmapSurfaceEXT (EGLDisplay dpy,
 #endif
 #endif /* EGL_EXT_platform_base */
 
+#ifndef EGL_EXT_platform_device
+#define EGL_EXT_platform_device 1
+#define EGL_PLATFORM_DEVICE_EXT           0x313F
+#endif /* EGL_EXT_platform_device */
+
 #ifndef EGL_EXT_platform_wayland
 #define EGL_EXT_platform_wayland 1
 #define EGL_PLATFORM_WAYLAND_EXT          0x31D8
@@ -477,6 +616,19 @@ EGLAPI EGLSurface EGLAPIENTRY eglCreatePlatformPixmapSurfaceEXT (EGLDisplay dpy,
 #define EGL_PLATFORM_X11_SCREEN_EXT       0x31D6
 #endif /* EGL_EXT_platform_x11 */
 
+#ifndef EGL_EXT_protected_surface
+#define EGL_EXT_protected_surface 1
+#define EGL_PROTECTED_CONTENT_EXT         0x32C0
+#endif /* EGL_EXT_protected_surface */
+
+#ifndef EGL_EXT_stream_consumer_egloutput
+#define EGL_EXT_stream_consumer_egloutput 1
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLSTREAMCONSUMEROUTPUTEXTPROC) (EGLDisplay dpy, EGLStreamKHR stream, EGLOutputLayerEXT layer);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglStreamConsumerOutputEXT (EGLDisplay dpy, EGLStreamKHR stream, EGLOutputLayerEXT layer);
+#endif
+#endif /* EGL_EXT_stream_consumer_egloutput */
+
 #ifndef EGL_EXT_swap_buffers_with_damage
 #define EGL_EXT_swap_buffers_with_damage 1
 typedef EGLBoolean (EGLAPIENTRYP PFNEGLSWAPBUFFERSWITHDAMAGEEXTPROC) (EGLDisplay dpy, EGLSurface surface, EGLint *rects, EGLint n_rects);
@@ -485,6 +637,35 @@ EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffersWithDamageEXT (EGLDisplay dpy, EGLSu
 #endif
 #endif /* EGL_EXT_swap_buffers_with_damage */
 
+#ifndef EGL_EXT_yuv_surface
+#define EGL_EXT_yuv_surface 1
+#define EGL_YUV_ORDER_EXT                 0x3301
+#define EGL_YUV_NUMBER_OF_PLANES_EXT      0x3311
+#define EGL_YUV_SUBSAMPLE_EXT             0x3312
+#define EGL_YUV_DEPTH_RANGE_EXT           0x3317
+#define EGL_YUV_CSC_STANDARD_EXT          0x330A
+#define EGL_YUV_PLANE_BPP_EXT             0x331A
+#define EGL_YUV_BUFFER_EXT                0x3300
+#define EGL_YUV_ORDER_YUV_EXT             0x3302
+#define EGL_YUV_ORDER_YVU_EXT             0x3303
+#define EGL_YUV_ORDER_YUYV_EXT            0x3304
+#define EGL_YUV_ORDER_UYVY_EXT            0x3305
+#define EGL_YUV_ORDER_YVYU_EXT            0x3306
+#define EGL_YUV_ORDER_VYUY_EXT            0x3307
+#define EGL_YUV_ORDER_AYUV_EXT            0x3308
+#define EGL_YUV_SUBSAMPLE_4_2_0_EXT       0x3313
+#define EGL_YUV_SUBSAMPLE_4_2_2_EXT       0x3314
+#define EGL_YUV_SUBSAMPLE_4_4_4_EXT       0x3315
+#define EGL_YUV_DEPTH_RANGE_LIMITED_EXT   0x3318
+#define EGL_YUV_DEPTH_RANGE_FULL_EXT      0x3319
+#define EGL_YUV_CSC_STANDARD_601_EXT      0x330B
+#define EGL_YUV_CSC_STANDARD_709_EXT      0x330C
+#define EGL_YUV_CSC_STANDARD_2020_EXT     0x330D
+#define EGL_YUV_PLANE_BPP_0_EXT           0x331B
+#define EGL_YUV_PLANE_BPP_8_EXT           0x331C
+#define EGL_YUV_PLANE_BPP_10_EXT          0x331D
+#endif /* EGL_EXT_yuv_surface */
+
 #ifndef EGL_HI_clientpixmap
 #define EGL_HI_clientpixmap 1
 struct EGLClientPixmapHI {
@@ -533,11 +714,42 @@ EGLAPI EGLBoolean EGLAPIENTRY eglExportDRMImageMESA (EGLDisplay dpy, EGLImageKHR
 #endif
 #endif /* EGL_MESA_drm_image */
 
+#ifndef EGL_MESA_image_dma_buf_export
+#define EGL_MESA_image_dma_buf_export 1
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLEXPORTDMABUFIMAGEQUERYMESAPROC) (EGLDisplay dpy, EGLImageKHR image, int *fourcc, int *num_planes, EGLuint64KHR *modifiers);
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLEXPORTDMABUFIMAGEMESAPROC) (EGLDisplay dpy, EGLImageKHR image, int *fds, EGLint *strides, EGLint *offsets);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglExportDMABUFImageQueryMESA (EGLDisplay dpy, EGLImageKHR image, int *fourcc, int *num_planes, EGLuint64KHR *modifiers);
+EGLAPI EGLBoolean EGLAPIENTRY eglExportDMABUFImageMESA (EGLDisplay dpy, EGLImageKHR image, int *fds, EGLint *strides, EGLint *offsets);
+#endif
+#endif /* EGL_MESA_image_dma_buf_export */
+
 #ifndef EGL_MESA_platform_gbm
 #define EGL_MESA_platform_gbm 1
 #define EGL_PLATFORM_GBM_MESA             0x31D7
 #endif /* EGL_MESA_platform_gbm */
 
+#ifndef EGL_NOK_swap_region
+#define EGL_NOK_swap_region 1
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLSWAPBUFFERSREGIONNOKPROC) (EGLDisplay dpy, EGLSurface surface, EGLint numRects, const EGLint *rects);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffersRegionNOK (EGLDisplay dpy, EGLSurface surface, EGLint numRects, const EGLint *rects);
+#endif
+#endif /* EGL_NOK_swap_region */
+
+#ifndef EGL_NOK_swap_region2
+#define EGL_NOK_swap_region2 1
+typedef EGLBoolean (EGLAPIENTRYP PFNEGLSWAPBUFFERSREGION2NOKPROC) (EGLDisplay dpy, EGLSurface surface, EGLint numRects, const EGLint *rects);
+#ifdef EGL_EGLEXT_PROTOTYPES
+EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffersRegion2NOK (EGLDisplay dpy, EGLSurface surface, EGLint numRects, const EGLint *rects);
+#endif
+#endif /* EGL_NOK_swap_region2 */
+
+#ifndef EGL_NOK_texture_from_pixmap
+#define EGL_NOK_texture_from_pixmap 1
+#define EGL_Y_INVERTED_NOK                0x307F
+#endif /* EGL_NOK_texture_from_pixmap */
+
 #ifndef EGL_NV_3dvision_surface
 #define EGL_NV_3dvision_surface 1
 #define EGL_AUTO_STEREO_NV                0x3136
@@ -556,6 +768,13 @@ EGLAPI EGLBoolean EGLAPIENTRY eglExportDRMImageMESA (EGLDisplay dpy, EGLImageKHR
 #define EGL_COVERAGE_SAMPLE_RESOLVE_NONE_NV 0x3133
 #endif /* EGL_NV_coverage_sample_resolve */
 
+#ifndef EGL_NV_cuda_event
+#define EGL_NV_cuda_event 1
+#define EGL_CUDA_EVENT_HANDLE_NV          0x323B
+#define EGL_SYNC_CUDA_EVENT_NV            0x323C
+#define EGL_SYNC_CUDA_EVENT_COMPLETE_NV   0x323D
+#endif /* EGL_NV_cuda_event */
+
 #ifndef EGL_NV_depth_nonlinear
 #define EGL_NV_depth_nonlinear 1
 #define EGL_DEPTH_ENCODING_NV             0x30E2
@@ -563,6 +782,11 @@ EGLAPI EGLBoolean EGLAPIENTRY eglExportDRMImageMESA (EGLDisplay dpy, EGLImageKHR
 #define EGL_DEPTH_ENCODING_NONLINEAR_NV   0x30E3
 #endif /* EGL_NV_depth_nonlinear */
 
+#ifndef EGL_NV_device_cuda
+#define EGL_NV_device_cuda 1
+#define EGL_CUDA_DEVICE_NV                0x323A
+#endif /* EGL_NV_device_cuda */
+
 #ifndef EGL_NV_native_query
 #define EGL_NV_native_query 1
 typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYNATIVEDISPLAYNVPROC) (EGLDisplay dpy, EGLNativeDisplayType *display_id);
@@ -645,6 +869,16 @@ EGLAPI EGLuint64NV EGLAPIENTRY eglGetSystemTimeNV (void);
 #endif /* KHRONOS_SUPPORT_INT64 */
 #endif /* EGL_NV_system_time */
 
+#ifndef EGL_TIZEN_image_native_buffer
+#define EGL_TIZEN_image_native_buffer 1
+#define EGL_NATIVE_BUFFER_TIZEN           0x32A0
+#endif /* EGL_TIZEN_image_native_buffer */
+
+#ifndef EGL_TIZEN_image_native_surface
+#define EGL_TIZEN_image_native_surface 1
+#define EGL_NATIVE_SURFACE_TIZEN          0x32A1
+#endif /* EGL_TIZEN_image_native_surface */
+
 #include <EGL/eglmesaext.h>
 #include <EGL/eglextchromium.h>
 
diff --git a/include/EGL/eglmesaext.h b/include/EGL/eglmesaext.h
index 7ce8346..917a204 100644
--- a/include/EGL/eglmesaext.h
+++ b/include/EGL/eglmesaext.h
@@ -34,63 +34,6 @@ extern "C" {
 
 #include <EGL/eglplatform.h>
 
-/* EGL_MESA_screen extension  >>> PRELIMINARY <<< */
-#ifndef EGL_MESA_screen_surface
-#define EGL_MESA_screen_surface 1
-
-#define EGL_BAD_SCREEN_MESA                    0x4000
-#define EGL_BAD_MODE_MESA                      0x4001
-#define EGL_SCREEN_COUNT_MESA                  0x4002
-#define EGL_SCREEN_POSITION_MESA               0x4003
-#define EGL_SCREEN_POSITION_GRANULARITY_MESA   0x4004
-#define EGL_MODE_ID_MESA                       0x4005
-#define EGL_REFRESH_RATE_MESA                  0x4006
-#define EGL_OPTIMAL_MESA                       0x4007
-#define EGL_INTERLACED_MESA                    0x4008
-#define EGL_SCREEN_BIT_MESA                    0x08
-
-typedef khronos_uint32_t EGLScreenMESA;
-typedef khronos_uint32_t EGLModeMESA;
-
-#ifdef EGL_EGLEXT_PROTOTYPES
-EGLAPI EGLBoolean EGLAPIENTRY eglChooseModeMESA(EGLDisplay dpy, EGLScreenMESA screen, const EGLint *attrib_list, EGLModeMESA *modes, EGLint modes_size, EGLint *num_modes);
-EGLAPI EGLBoolean EGLAPIENTRY eglGetModesMESA(EGLDisplay dpy, EGLScreenMESA screen, EGLModeMESA *modes, EGLint modes_size, EGLint *num_modes);
-EGLAPI EGLBoolean EGLAPIENTRY eglGetModeAttribMESA(EGLDisplay dpy, EGLModeMESA mode, EGLint attribute, EGLint *value);
-EGLAPI EGLBoolean EGLAPIENTRY eglGetScreensMESA(EGLDisplay dpy, EGLScreenMESA *screens, EGLint max_screens, EGLint *num_screens);
-EGLAPI EGLSurface EGLAPIENTRY eglCreateScreenSurfaceMESA(EGLDisplay dpy, EGLConfig config, const EGLint *attrib_list);
-EGLAPI EGLBoolean EGLAPIENTRY eglShowScreenSurfaceMESA(EGLDisplay dpy, EGLint screen, EGLSurface surface, EGLModeMESA mode);
-EGLAPI EGLBoolean EGLAPIENTRY eglScreenPositionMESA(EGLDisplay dpy, EGLScreenMESA screen, EGLint x, EGLint y);
-EGLAPI EGLBoolean EGLAPIENTRY eglQueryScreenMESA(EGLDisplay dpy, EGLScreenMESA screen, EGLint attribute, EGLint *value);
-EGLAPI EGLBoolean EGLAPIENTRY eglQueryScreenSurfaceMESA(EGLDisplay dpy, EGLScreenMESA screen, EGLSurface *surface);
-EGLAPI EGLBoolean EGLAPIENTRY eglQueryScreenModeMESA(EGLDisplay dpy, EGLScreenMESA screen, EGLModeMESA *mode);
-EGLAPI const char * EGLAPIENTRY eglQueryModeStringMESA(EGLDisplay dpy, EGLModeMESA mode);
-#endif /* EGL_EGLEXT_PROTOTYPES */
-
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLCHOOSEMODEMESA) (EGLDisplay dpy, EGLScreenMESA screen, const EGLint *attrib_list, EGLModeMESA *modes, EGLint modes_size, EGLint *num_modes);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETMODESMESA) (EGLDisplay dpy, EGLScreenMESA screen, EGLModeMESA *modes, EGLint modes_size, EGLint *num_modes);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLGetModeATTRIBMESA) (EGLDisplay dpy, EGLModeMESA mode, EGLint attribute, EGLint *value);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLGETSCRREENSMESA) (EGLDisplay dpy, EGLScreenMESA *screens, EGLint max_screens, EGLint *num_screens);
-typedef EGLSurface (EGLAPIENTRYP PFNEGLCREATESCREENSURFACEMESA) (EGLDisplay dpy, EGLConfig config, const EGLint *attrib_list);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLSHOWSCREENSURFACEMESA) (EGLDisplay dpy, EGLint screen, EGLSurface surface, EGLModeMESA mode);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLSCREENPOSIITONMESA) (EGLDisplay dpy, EGLScreenMESA screen, EGLint x, EGLint y);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSCREENMESA) (EGLDisplay dpy, EGLScreenMESA screen, EGLint attribute, EGLint *value);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSCREENSURFACEMESA) (EGLDisplay dpy, EGLScreenMESA screen, EGLSurface *surface);
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLQUERYSCREENMODEMESA) (EGLDisplay dpy, EGLScreenMESA screen, EGLModeMESA *mode);
-typedef const char * (EGLAPIENTRYP PFNEGLQUERYMODESTRINGMESA) (EGLDisplay dpy, EGLModeMESA mode);
-
-#endif /* EGL_MESA_screen_surface */
-
-#ifndef EGL_MESA_copy_context
-#define EGL_MESA_copy_context 1
-
-#ifdef EGL_EGLEXT_PROTOTYPES
-EGLAPI EGLBoolean EGLAPIENTRY eglCopyContextMESA(EGLDisplay dpy, EGLContext source, EGLContext dest, EGLint mask);
-#endif /* EGL_EGLEXT_PROTOTYPES */
-
-typedef EGLBoolean (EGLAPIENTRYP PFNEGLCOPYCONTEXTMESA) (EGLDisplay dpy, EGLContext source, EGLContext dest, EGLint mask);
-
-#endif /* EGL_MESA_copy_context */
-
 #ifndef EGL_MESA_drm_display
 #define EGL_MESA_drm_display 1
 
@@ -144,39 +87,14 @@ typedef struct wl_buffer * (EGLAPIENTRYP PFNEGLCREATEWAYLANDBUFFERFROMIMAGEWL) (
 
 #endif
 
-#ifndef EGL_NOK_swap_region
-#define EGL_NOK_swap_region 1
-
-#ifdef EGL_EGLEXT_PROTOTYPES
-EGLAPI EGLBoolean EGLAPIENTRY eglSwapBuffersRegionNOK(EGLDisplay dpy, EGLSurface surface, EGLint numRects, const EGLint* rects);
-#endif
-
+/* remnant of EGL_NOK_swap_region kept for compatibility because of a non-standard type name */
 typedef EGLBoolean (EGLAPIENTRYP PFNEGLSWAPBUFFERSREGIONNOK) (EGLDisplay dpy, EGLSurface surface, EGLint numRects, const EGLint* rects);
-#endif
-
-#ifndef EGL_NOK_texture_from_pixmap
-#define EGL_NOK_texture_from_pixmap 1
-
-#define EGL_Y_INVERTED_NOK			0x307F
-#endif /* EGL_NOK_texture_from_pixmap */
-
-#ifndef EGL_ANDROID_image_native_buffer
-#define EGL_ANDROID_image_native_buffer 1
-#define EGL_NATIVE_BUFFER_ANDROID       0x3140  /* eglCreateImageKHR target */
-#endif
 
 #ifndef EGL_MESA_configless_context
 #define EGL_MESA_configless_context 1
 #define EGL_NO_CONFIG_MESA			((EGLConfig)0)
 #endif
 
-#if KHRONOS_SUPPORT_INT64
-#ifndef EGL_MESA_image_dma_buf_export
-#define EGL_MESA_image_dma_buf_export 1
-EGLAPI EGLBoolean EGLAPIENTRY eglExportDMABUFImageQueryMESA (EGLDisplay dpy, EGLImageKHR image, EGLint *fourcc, EGLint *nplanes, EGLuint64KHR *modifiers);
-EGLAPI EGLBoolean EGLAPIENTRY eglExportDMABUFImageMESA (EGLDisplay dpy, EGLImageKHR image, int *fds, EGLint *strides, EGLint *offsets);
-#endif
-#endif
 #ifdef __cplusplus
 }
 #endif
diff --git a/include/EGL/eglplatform.h b/include/EGL/eglplatform.h
index 2eb6865..7802542 100644
--- a/include/EGL/eglplatform.h
+++ b/include/EGL/eglplatform.h
@@ -2,7 +2,7 @@
 #define __eglplatform_h_
 
 /*
-** Copyright (c) 2007-2009 The Khronos Group Inc.
+** Copyright (c) 2007-2013 The Khronos Group Inc.
 **
 ** Permission is hereby granted, free of charge, to any person obtaining a
 ** copy of this software and/or associated documentation files (the
@@ -25,7 +25,7 @@
 */
 
 /* Platform-specific types and definitions for egl.h
- * $Revision: 12306 $ on $Date: 2010-08-25 09:51:28 -0700 (Wed, 25 Aug 2010) $
+ * $Revision: 30994 $ on $Date: 2015-04-30 13:36:48 -0700 (Thu, 30 Apr 2015) $
  *
  * Adopters may modify khrplatform.h and this file to suit their platform.
  * You are encouraged to submit all modifications to the Khronos group so that
@@ -77,7 +77,7 @@ typedef HDC     EGLNativeDisplayType;
 typedef HBITMAP EGLNativePixmapType;
 typedef HWND    EGLNativeWindowType;
 
-#elif defined(__WINSCW__) || defined(__SYMBIAN32__)  /* Symbian */
+#elif defined(__APPLE__) || defined(__WINSCW__) || defined(__SYMBIAN32__)  /* Apple, Symbian */
 
 typedef int   EGLNativeDisplayType;
 typedef void *EGLNativeWindowType;
@@ -95,14 +95,15 @@ typedef struct gbm_device  *EGLNativeDisplayType;
 typedef struct gbm_bo      *EGLNativePixmapType;
 typedef void               *EGLNativeWindowType;
 
-#elif defined(ANDROID) /* Android */
+#elif defined(__ANDROID__) || defined(ANDROID)
+
+#include <android/native_window.h>
 
-struct ANativeWindow;
 struct egl_native_pixmap_t;
 
-typedef struct ANativeWindow        *EGLNativeWindowType;
-typedef struct egl_native_pixmap_t  *EGLNativePixmapType;
-typedef void                        *EGLNativeDisplayType;
+typedef struct ANativeWindow*           EGLNativeWindowType;
+typedef struct egl_native_pixmap_t*     EGLNativePixmapType;
+typedef void*                           EGLNativeDisplayType;
 
 #elif defined(__unix__)
 
@@ -131,9 +132,7 @@ typedef khronos_uintptr_t	 EGLNativePixmapType;
 typedef khronos_uintptr_t	 EGLNativeWindowType;
 
 #else
-
 #error "Platform not recognized"
-
 #endif
 
 /* EGL 1.2 types, renamed for consistency in EGL 1.3 */
diff --git a/include/KHR/khrplatform.h b/include/KHR/khrplatform.h
index 4479539..790de44 100644
--- a/include/KHR/khrplatform.h
+++ b/include/KHR/khrplatform.h
@@ -26,7 +26,7 @@
 
 /* Khronos platform-specific types and definitions.
  *
- * $Revision: 9356 $ on $Date: 2009-10-21 02:52:25 -0700 (Wed, 21 Oct 2009) $
+ * $Revision: 23298 $ on $Date: 2013-09-30 17:07:13 -0700 (Mon, 30 Sep 2013) $
  *
  * Adopters may modify this file to suit their platform. Adopters are
  * encouraged to submit platform specific modifications to the Khronos
@@ -106,9 +106,9 @@
 #elif defined (__SYMBIAN32__)
 #   define KHRONOS_APICALL IMPORT_C
 #elif (defined(__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__) >= 303) \
-	|| (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
+       || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
 /* KHRONOS_APIATTRIBUTES is not used by the client API headers yet */
-#  define KHRONOS_APICALL __attribute__((visibility("default")))
+#   define KHRONOS_APICALL __attribute__((visibility("default")))
 #else
 #   define KHRONOS_APICALL
 #endif
@@ -229,10 +229,23 @@ typedef signed   char          khronos_int8_t;
 typedef unsigned char          khronos_uint8_t;
 typedef signed   short int     khronos_int16_t;
 typedef unsigned short int     khronos_uint16_t;
+
+/*
+ * Types that differ between LLP64 and LP64 architectures - in LLP64,
+ * pointers are 64 bits, but 'long' is still 32 bits. Win64 appears
+ * to be the only LLP64 architecture in current use.
+ */
+#ifdef _WIN64
+typedef signed   long long int khronos_intptr_t;
+typedef unsigned long long int khronos_uintptr_t;
+typedef signed   long long int khronos_ssize_t;
+typedef unsigned long long int khronos_usize_t;
+#else
 typedef signed   long  int     khronos_intptr_t;
 typedef unsigned long  int     khronos_uintptr_t;
 typedef signed   long  int     khronos_ssize_t;
 typedef unsigned long  int     khronos_usize_t;
+#endif
 
 #if KHRONOS_SUPPORT_FLOAT
 /*
diff --git a/include/pci_ids/radeonsi_pci_ids.h b/include/pci_ids/radeonsi_pci_ids.h
index 571e863..cd5da99 100644
--- a/include/pci_ids/radeonsi_pci_ids.h
+++ b/include/pci_ids/radeonsi_pci_ids.h
@@ -85,6 +85,7 @@ CHIPSET(0x6651, BONAIRE_6651, BONAIRE)
 CHIPSET(0x6658, BONAIRE_6658, BONAIRE)
 CHIPSET(0x665C, BONAIRE_665C, BONAIRE)
 CHIPSET(0x665D, BONAIRE_665D, BONAIRE)
+CHIPSET(0x665F, BONAIRE_665F, BONAIRE)
 
 CHIPSET(0x9830, KABINI_9830, KABINI)
 CHIPSET(0x9831, KABINI_9831, KABINI)
diff --git a/scons/llvm.py b/scons/llvm.py
index 17278df..c59b8cb 100644
--- a/scons/llvm.py
+++ b/scons/llvm.py
@@ -120,6 +120,7 @@ def generate(env):
             ])
         elif llvm_version >= distutils.version.LooseVersion('3.5'):
             env.Prepend(LIBS = [
+                'LLVMMCDisassembler',
                 'LLVMBitWriter', 'LLVMMCJIT', 'LLVMRuntimeDyld',
                 'LLVMX86Disassembler', 'LLVMX86AsmParser', 'LLVMX86CodeGen',
                 'LLVMSelectionDAG', 'LLVMAsmPrinter', 'LLVMX86Desc',
@@ -132,6 +133,7 @@ def generate(env):
             ])
         else:
             env.Prepend(LIBS = [
+                'LLVMMCDisassembler',
                 'LLVMBitWriter', 'LLVMX86Disassembler', 'LLVMX86AsmParser',
                 'LLVMX86CodeGen', 'LLVMX86Desc', 'LLVMSelectionDAG',
                 'LLVMAsmPrinter', 'LLVMMCParser', 'LLVMX86AsmPrinter',
@@ -189,7 +191,7 @@ def generate(env):
             if '-fno-rtti' in cxxflags:
                 env.Append(CXXFLAGS = ['-fno-rtti'])
 
-            components = ['engine', 'mcjit', 'bitwriter', 'x86asmprinter']
+            components = ['engine', 'mcjit', 'bitwriter', 'x86asmprinter', 'mcdisassembler']
 
             env.ParseConfig('llvm-config --libs ' + ' '.join(components))
             env.ParseConfig('llvm-config --ldflags')
diff --git a/src/Makefile.am b/src/Makefile.am
index bf76e35..d41a087 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -76,4 +76,5 @@ noinst_LTLIBRARIES = libglsl_util.la
 libglsl_util_la_SOURCES = \
 	mesa/main/imports.c \
 	mesa/program/prog_hash_table.c \
-	mesa/program/symbol_table.c
+	mesa/program/symbol_table.c \
+	mesa/program/dummy_errors.c
diff --git a/src/egl/drivers/dri2/Android.mk b/src/egl/drivers/dri2/Android.mk
index 5931ce8..109e4d4 100644
--- a/src/egl/drivers/dri2/Android.mk
+++ b/src/egl/drivers/dri2/Android.mk
@@ -36,6 +36,7 @@ LOCAL_CFLAGS := \
 	-DHAVE_ANDROID_PLATFORM
 
 ifeq ($(MESA_LOLLIPOP_BUILD),true)
+LOCAL_CFLAGS_arm := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
 LOCAL_CFLAGS_x86 := -DDEFAULT_DRIVER_DIR=\"/system/lib/dri\"
 LOCAL_CFLAGS_x86_64 := -DDEFAULT_DRIVER_DIR=\"/system/lib64/dri\"
 else
@@ -45,7 +46,6 @@ endif
 LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/mapi \
 	$(MESA_TOP)/src/egl/main \
-	$(MESA_TOP)/src/loader \
 	$(DRM_GRALLOC_TOP)
 
 LOCAL_STATIC_LIBRARIES := \
diff --git a/src/egl/drivers/dri2/Makefile.am b/src/egl/drivers/dri2/Makefile.am
index f589600..55be4a7 100644
--- a/src/egl/drivers/dri2/Makefile.am
+++ b/src/egl/drivers/dri2/Makefile.am
@@ -65,4 +65,9 @@ libegl_dri2_la_SOURCES += platform_drm.c
 AM_CFLAGS += -DHAVE_DRM_PLATFORM
 endif
 
+if HAVE_EGL_PLATFORM_SURFACELESS
+libegl_dri2_la_SOURCES += platform_surfaceless.c
+AM_CFLAGS += -DHAVE_SURFACELESS_PLATFORM
+endif
+
 EXTRA_DIST = SConscript
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index f4c29da..a1cbd43 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -397,7 +397,7 @@ dri2_open_driver(_EGLDisplay *disp)
 
    dri2_dpy->driver = NULL;
    end = search_paths + strlen(search_paths);
-   for (p = search_paths; p < end && dri2_dpy->driver == NULL; p = next + 1) {
+   for (p = search_paths; p < end; p = next + 1) {
       int len;
       next = strchr(p, ':');
       if (next == NULL)
@@ -419,6 +419,15 @@ dri2_open_driver(_EGLDisplay *disp)
       /* not need continue to loop all paths once the driver is found */
       if (dri2_dpy->driver != NULL)
          break;
+
+#ifdef ANDROID
+      snprintf(path, sizeof path, "%.*s/gallium_dri.so", len, p);
+      dri2_dpy->driver = dlopen(path, RTLD_NOW | RTLD_GLOBAL);
+      if (dri2_dpy->driver == NULL)
+         _eglLog(_EGL_DEBUG, "failed to open %s: %s\n", path, dlerror());
+      else
+         break;
+#endif
    }
 
    if (dri2_dpy->driver == NULL) {
@@ -576,6 +585,7 @@ dri2_create_screen(_EGLDisplay *disp)
 {
    const __DRIextension **extensions;
    struct dri2_egl_display *dri2_dpy;
+   unsigned i;
 
    dri2_dpy = disp->DriverData;
 
@@ -616,28 +626,26 @@ dri2_create_screen(_EGLDisplay *disp)
    extensions = dri2_dpy->core->getExtensions(dri2_dpy->dri_screen);
    
    if (dri2_dpy->dri2) {
-      unsigned i;
-
       if (!dri2_bind_extensions(dri2_dpy, dri2_core_extensions, extensions))
          goto cleanup_dri_screen;
-
-      for (i = 0; extensions[i]; i++) {
-	 if (strcmp(extensions[i]->name, __DRI2_ROBUSTNESS) == 0) {
-            dri2_dpy->robustness = (__DRIrobustnessExtension *) extensions[i];
-	 }
-	 if (strcmp(extensions[i]->name, __DRI2_CONFIG_QUERY) == 0) {
-	    dri2_dpy->config = (__DRI2configQueryExtension *) extensions[i];
-	 }
-         if (strcmp(extensions[i]->name, __DRI2_FENCE) == 0) {
-            dri2_dpy->fence = (__DRI2fenceExtension *) extensions[i];
-         }
-      }
    } else {
       assert(dri2_dpy->swrast);
       if (!dri2_bind_extensions(dri2_dpy, swrast_core_extensions, extensions))
          goto cleanup_dri_screen;
    }
 
+   for (i = 0; extensions[i]; i++) {
+      if (strcmp(extensions[i]->name, __DRI2_ROBUSTNESS) == 0) {
+         dri2_dpy->robustness = (__DRIrobustnessExtension *) extensions[i];
+      }
+      if (strcmp(extensions[i]->name, __DRI2_CONFIG_QUERY) == 0) {
+         dri2_dpy->config = (__DRI2configQueryExtension *) extensions[i];
+      }
+      if (strcmp(extensions[i]->name, __DRI2_FENCE) == 0) {
+         dri2_dpy->fence = (__DRI2fenceExtension *) extensions[i];
+      }
+   }
+
    dri2_setup_screen(disp);
 
    return EGL_TRUE;
@@ -659,6 +667,13 @@ dri2_initialize(_EGLDriver *drv, _EGLDisplay *disp)
       return EGL_FALSE;
 
    switch (disp->Platform) {
+#ifdef HAVE_SURFACELESS_PLATFORM
+   case _EGL_PLATFORM_SURFACELESS:
+      if (disp->Options.TestOnly)
+         return EGL_TRUE;
+      return dri2_initialize_surfaceless(drv, disp);
+#endif
+
 #ifdef HAVE_X11_PLATFORM
    case _EGL_PLATFORM_X11:
       if (disp->Options.TestOnly)
@@ -729,7 +744,12 @@ dri2_terminate(_EGLDriver *drv, _EGLDisplay *disp)
 #endif
 #ifdef HAVE_WAYLAND_PLATFORM
    case _EGL_PLATFORM_WAYLAND:
-      wl_drm_destroy(dri2_dpy->wl_drm);
+      if (dri2_dpy->wl_drm)
+          wl_drm_destroy(dri2_dpy->wl_drm);
+      if (dri2_dpy->wl_shm)
+          wl_shm_destroy(dri2_dpy->wl_shm);
+      wl_registry_destroy(dri2_dpy->wl_registry);
+      wl_event_queue_destroy(dri2_dpy->wl_queue);
       if (dri2_dpy->own_device) {
          wl_display_disconnect(dri2_dpy->wl_dpy);
       }
@@ -1252,7 +1272,8 @@ dri2_bind_tex_image(_EGLDriver *drv,
       format = __DRI_TEXTURE_FORMAT_RGBA;
       break;
    default:
-      assert(0);
+      assert(!"Unexpected texture format in dri2_bind_tex_image()");
+      format = __DRI_TEXTURE_FORMAT_RGBA;
    }
 
    switch (dri2_surf->base.TextureTarget) {
@@ -1260,7 +1281,8 @@ dri2_bind_tex_image(_EGLDriver *drv,
       target = GL_TEXTURE_2D;
       break;
    default:
-      assert(0);
+      target = GL_TEXTURE_2D;
+      assert(!"Unexpected texture target in dri2_bind_tex_image()");
    }
 
    (*dri2_dpy->tex_buffer->setTexBuffer2)(dri2_ctx->dri_context,
@@ -2210,7 +2232,7 @@ dri2_egl_unref_sync(struct dri2_egl_display *dri2_dpy,
 static _EGLSync *
 dri2_create_sync(_EGLDriver *drv, _EGLDisplay *dpy,
                  EGLenum type, const EGLint *attrib_list,
-                 const EGLAttribKHR *attrib_list64)
+                 const EGLAttrib *attrib_list64)
 {
    _EGLContext *ctx = _eglGetCurrentContext();
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(dpy);
@@ -2276,7 +2298,7 @@ dri2_destroy_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync)
 
 static EGLint
 dri2_client_wait_sync(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
-                      EGLint flags, EGLTimeKHR timeout)
+                      EGLint flags, EGLTime timeout)
 {
    _EGLContext *ctx = _eglGetCurrentContext();
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(dpy);
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index 371fb4a..9985c49 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -196,10 +196,13 @@ struct dri2_egl_display
    struct wl_registry       *wl_registry;
    struct wl_drm            *wl_server_drm;
    struct wl_drm            *wl_drm;
+   struct wl_shm            *wl_shm;
    struct wl_event_queue    *wl_queue;
    int			     authenticated;
    int			     formats;
    uint32_t                  capabilities;
+   int			     is_render_node;
+   int			     is_different_gpu;
 #endif
 };
 
@@ -253,6 +256,11 @@ struct dri2_egl_surface
 #ifdef HAVE_WAYLAND_PLATFORM
       struct wl_buffer   *wl_buffer;
       __DRIimage         *dri_image;
+      /* for is_different_gpu case. NULL else */
+      __DRIimage         *linear_copy;
+      /* for swrast */
+      void *data;
+      int data_size;
 #endif
 #ifdef HAVE_DRM_PLATFORM
       struct gbm_bo       *bo;
@@ -343,6 +351,9 @@ dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp);
 EGLBoolean
 dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *disp);
 
+EGLBoolean
+dri2_initialize_surfaceless(_EGLDriver *drv, _EGLDisplay *disp);
+
 void
 dri2_flush_drawable_for_swapbuffers(_EGLDisplay *disp, _EGLSurface *draw);
 
diff --git a/src/egl/drivers/dri2/egl_dri2_fallbacks.h b/src/egl/drivers/dri2/egl_dri2_fallbacks.h
index 9cba001..e769af3 100644
--- a/src/egl/drivers/dri2/egl_dri2_fallbacks.h
+++ b/src/egl/drivers/dri2/egl_dri2_fallbacks.h
@@ -45,6 +45,15 @@ dri2_fallback_create_pbuffer_surface(_EGLDriver *drv, _EGLDisplay *disp,
    return NULL;
 }
 
+static inline _EGLImage*
+dri2_fallback_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp,
+                               _EGLContext *ctx, EGLenum target,
+                               EGLClientBuffer buffer,
+                               const EGLint *attr_list)
+{
+   return NULL; /* stub: ignores all arguments, never creates an image */
+}
+
 static inline EGLBoolean
 dri2_fallback_swap_interval(_EGLDriver *drv, _EGLDisplay *dpy,
                             _EGLSurface *surf, EGLint interval)
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index f482526..fed3073 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -707,10 +707,6 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
    dpy->Extensions.ANDROID_image_native_buffer = EGL_TRUE;
    dpy->Extensions.KHR_image_base = EGL_TRUE;
 
-   /* we're supporting EGL 1.4 */
-   dpy->VersionMajor = 1;
-   dpy->VersionMinor = 4;
-
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
     */
diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index 486b003..a62da41 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -611,9 +611,9 @@ dri2_initialize_drm(_EGLDriver *drv, _EGLDisplay *disp)
       char buf[64];
       int n = snprintf(buf, sizeof(buf), DRM_DEV_NAME, DRM_DIR_NAME, 0);
       if (n != -1 && n < sizeof(buf))
-         fd = open(buf, O_RDWR);
+         fd = loader_open_device(buf);
       if (fd < 0)
-         fd = open("/dev/dri/card0", O_RDWR);
+         fd = loader_open_device("/dev/dri/card0");
       dri2_dpy->own_device = 1;
       gbm = gbm_create_device(fd);
       if (gbm == NULL)
@@ -632,7 +632,7 @@ dri2_initialize_drm(_EGLDriver *drv, _EGLDisplay *disp)
    }
 
    if (fd < 0) {
-      fd = dup(gbm_device_get_fd(gbm));
+      fd = fcntl(gbm_device_get_fd(gbm), F_DUPFD_CLOEXEC, 3);
       if (fd < 0) {
          free(dri2_dpy);
          return EGL_FALSE;
@@ -715,10 +715,6 @@ dri2_initialize_drm(_EGLDriver *drv, _EGLDisplay *disp)
    }
 #endif
 
-   /* we're supporting EGL 1.4 */
-   disp->VersionMajor = 1;
-   disp->VersionMinor = 4;
-
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
     */
diff --git a/src/egl/drivers/dri2/platform_surfaceless.c b/src/egl/drivers/dri2/platform_surfaceless.c
new file mode 100644
index 0000000..48f15df
--- /dev/null
+++ b/src/egl/drivers/dri2/platform_surfaceless.c
@@ -0,0 +1,193 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (c) 2014 The Chromium OS Authors.
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <xf86drm.h>
+#include <dlfcn.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include "egl_dri2.h"
+#include "egl_dri2_fallbacks.h"
+#include "loader.h"
+
+/*
+ * Display vtbl for the surfaceless platform.  Every entry listed here
+ * except create_image points at a dri2_fallback_* stub.
+ * NOTE(review): members not listed are zero-initialized (NULL) -- confirm
+ * that nothing dispatches through them for this platform.
+ */
+static struct dri2_egl_display_vtbl dri2_surfaceless_display_vtbl = {
+   .create_pixmap_surface = dri2_fallback_create_pixmap_surface,
+   .create_image = dri2_create_image_khr,
+   .swap_interval = dri2_fallback_swap_interval,
+   .swap_buffers_with_damage = dri2_fallback_swap_buffers_with_damage,
+   .swap_buffers_region = dri2_fallback_swap_buffers_region,
+   .post_sub_buffer = dri2_fallback_post_sub_buffer,
+   .copy_buffers = dri2_fallback_copy_buffers,
+   .query_buffer_age = dri2_fallback_query_buffer_age,
+   .create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image,
+   .get_sync_values = dri2_fallback_get_sync_values,
+};
+
+/* __DRI_DRI2_LOADER flushFrontBuffer hook: intentionally a no-op. */
+static void
+surfaceless_flush_front_buffer(__DRIdrawable *driDrawable, void *loaderPrivate)
+{
+}
+
+/*
+ * __DRI_DRI2_LOADER getBuffersWithFormat hook.
+ *
+ * loaderPrivate is treated as the struct dri2_egl_surface.  Always reports
+ * exactly one buffer (the surface's buffers array) and, when the optional
+ * width/height pointers are non-NULL, the surface's current dimensions.
+ * attachments and count are ignored.
+ */
+static __DRIbuffer *
+surfaceless_get_buffers_with_format(__DRIdrawable * driDrawable,
+                             int *width, int *height,
+                             unsigned int *attachments, int count,
+                             int *out_count, void *loaderPrivate)
+{
+   struct dri2_egl_surface *dri2_surf = loaderPrivate;
+
+   dri2_surf->buffer_count = 1;
+   if (width)
+      *width = dri2_surf->base.Width;
+   if (height)
+      *height = dri2_surf->base.Height;
+   *out_count = dri2_surf->buffer_count;
+   return dri2_surf->buffers;
+}
+
+#define DRM_RENDER_DEV_NAME  "%s/renderD%d"
+
+/*
+ * Initialize the EGL display for the surfaceless platform.
+ *
+ * Probes DRM_DIR_NAME/renderD128 .. renderD191 in order and keeps the first
+ * node for which a driver name is found and the driver loads; then creates
+ * the DRI screen and calls dri2_add_config() for every driver config.
+ *
+ * Returns EGL_TRUE on success.  On failure the resources acquired so far
+ * are released, disp->DriverData is reset to NULL, and the result of
+ * _eglError(EGL_NOT_INITIALIZED, ...) is returned.
+ */
+EGLBoolean
+dri2_initialize_surfaceless(_EGLDriver *drv, _EGLDisplay *disp)
+{
+   struct dri2_egl_display *dri2_dpy;
+   const char* err;
+   int i;
+   int driver_loaded = 0;
+
+   loader_set_logger(_eglLog);
+
+   dri2_dpy = calloc(1, sizeof *dri2_dpy);
+   if (!dri2_dpy)
+      return _eglError(EGL_BAD_ALLOC, "eglInitialize");
+
+   disp->DriverData = (void *) dri2_dpy;
+
+   const int limit = 64;
+   const int base = 128;
+   for (i = 0; i < limit; ++i) {
+      char *card_path;
+      if (asprintf(&card_path, DRM_RENDER_DEV_NAME, DRM_DIR_NAME, base + i) < 0)
+         continue;
+
+      dri2_dpy->fd = loader_open_device(card_path);
+
+      free(card_path);
+      if (dri2_dpy->fd < 0)
+         continue;
+
+      dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd, 0);
+      if (dri2_dpy->driver_name) {
+         if (dri2_load_driver(disp)) {
+            driver_loaded = 1;
+            break;
+         }
+         free(dri2_dpy->driver_name);
+         dri2_dpy->driver_name = NULL;
+      }
+      /* This node is unusable: drop it and do not leave stale handles. */
+      close(dri2_dpy->fd);
+      dri2_dpy->fd = -1;
+   }
+
+   if (!driver_loaded) {
+      err = "DRI2: failed to load driver";
+      goto cleanup_display;
+   }
+
+   dri2_dpy->dri2_loader_extension.base.name = __DRI_DRI2_LOADER;
+   dri2_dpy->dri2_loader_extension.base.version = 3;
+   dri2_dpy->dri2_loader_extension.getBuffers = NULL;
+   dri2_dpy->dri2_loader_extension.flushFrontBuffer =
+      surfaceless_flush_front_buffer;
+   dri2_dpy->dri2_loader_extension.getBuffersWithFormat =
+      surfaceless_get_buffers_with_format;
+
+   dri2_dpy->extensions[0] = &dri2_dpy->dri2_loader_extension.base;
+   dri2_dpy->extensions[1] = &image_lookup_extension.base;
+   dri2_dpy->extensions[2] = &use_invalidate.base;
+   dri2_dpy->extensions[3] = NULL;
+
+   if (!dri2_create_screen(disp)) {
+      err = "DRI2: failed to create screen";
+      goto cleanup_driver;
+   }
+
+   for (i = 0; dri2_dpy->driver_configs[i]; i++) {
+      dri2_add_config(disp, dri2_dpy->driver_configs[i],
+                      i + 1, EGL_WINDOW_BIT, NULL, NULL);
+   }
+
+   disp->Extensions.KHR_image_base = EGL_TRUE;
+
+   /* Fill vtbl last to prevent accidentally calling virtual function during
+    * initialization.
+    */
+   dri2_dpy->vtbl = &dri2_surfaceless_display_vtbl;
+
+   return EGL_TRUE;
+
+cleanup_driver:
+   dlclose(dri2_dpy->driver);
+   free(dri2_dpy->driver_name);
+   close(dri2_dpy->fd);
+cleanup_display:
+   free(dri2_dpy);
+   /* Do not leave the display pointing at the freed driver data. */
+   disp->DriverData = NULL;
+
+   return _eglError(EGL_NOT_INITIALIZED, err);
+}
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index e226005..1c98552 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -1,5 +1,6 @@
 /*
  * Copyright © 2011-2012 Intel Corporation
+ * Copyright © 2012 Collabora, Ltd.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -35,6 +36,7 @@
 #include <unistd.h>
 #include <fcntl.h>
 #include <xf86drm.h>
+#include <sys/mman.h>
 
 #include "egl_dri2.h"
 #include "egl_dri2_fallbacks.h"
@@ -120,7 +122,7 @@ resize_callback(struct wl_egl_window *wl_win, void *data)
  * Called via eglCreateWindowSurface(), drv->API.CreateWindowSurface().
  */
 static _EGLSurface *
-dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
+dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
                        _EGLConfig *conf, void *native_window,
                        const EGLint *attrib_list)
 {
@@ -137,7 +139,7 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
       return NULL;
    }
    
-   if (!_eglInitSurface(&dri2_surf->base, disp, type, conf, attrib_list))
+   if (!_eglInitSurface(&dri2_surf->base, disp, EGL_WINDOW_BIT, conf, attrib_list))
       goto cleanup_surf;
 
    if (conf->RedSize == 5)
@@ -147,25 +149,17 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
    else
       dri2_surf->format = WL_DRM_FORMAT_ARGB8888;
 
-   switch (type) {
-   case EGL_WINDOW_BIT:
-      dri2_surf->wl_win = window;
+   dri2_surf->wl_win = window;
 
-      dri2_surf->wl_win->private = dri2_surf;
-      dri2_surf->wl_win->resize_callback = resize_callback;
+   dri2_surf->wl_win->private = dri2_surf;
+   dri2_surf->wl_win->resize_callback = resize_callback;
 
-      dri2_surf->base.Width =  -1;
-      dri2_surf->base.Height = -1;
-      break;
-   default: 
-      goto cleanup_surf;
-   }
+   dri2_surf->base.Width =  -1;
+   dri2_surf->base.Height = -1;
 
    dri2_surf->dri_drawable = 
       (*dri2_dpy->dri2->createNewDrawable) (dri2_dpy->dri_screen,
-					    type == EGL_WINDOW_BIT ?
-					    dri2_conf->dri_double_config : 
-					    dri2_conf->dri_single_config,
+					    dri2_conf->dri_double_config,
 					    dri2_surf);
    if (dri2_surf->dri_drawable == NULL) {
       _eglError(EGL_BAD_ALLOC, "dri2->createNewDrawable");
@@ -193,8 +187,7 @@ dri2_wl_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    _EGLSurface *surf;
 
-   surf = dri2_wl_create_surface(drv, disp, EGL_WINDOW_BIT, conf,
-                                 native_window, attrib_list);
+   surf = dri2_wl_create_surface(drv, disp, conf, native_window, attrib_list);
 
    if (surf != NULL)
       dri2_wl_swap_interval(drv, disp, surf, dri2_dpy->default_swap_interval);
@@ -240,21 +233,26 @@ dri2_wl_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
          wl_buffer_destroy(dri2_surf->color_buffers[i].wl_buffer);
       if (dri2_surf->color_buffers[i].dri_image)
          dri2_dpy->image->destroyImage(dri2_surf->color_buffers[i].dri_image);
+      if (dri2_surf->color_buffers[i].linear_copy)
+         dri2_dpy->image->destroyImage(dri2_surf->color_buffers[i].linear_copy);
+      if (dri2_surf->color_buffers[i].data)
+         munmap(dri2_surf->color_buffers[i].data,
+                dri2_surf->color_buffers[i].data_size);
    }
 
-   for (i = 0; i < __DRI_BUFFER_COUNT; i++)
-      if (dri2_surf->dri_buffers[i] &&
-          dri2_surf->dri_buffers[i]->attachment != __DRI_BUFFER_BACK_LEFT)
-         dri2_dpy->dri2->releaseBuffer(dri2_dpy->dri_screen,
-                                       dri2_surf->dri_buffers[i]);
+   if (dri2_dpy->dri2) {
+      for (i = 0; i < __DRI_BUFFER_COUNT; i++)
+         if (dri2_surf->dri_buffers[i] &&
+             dri2_surf->dri_buffers[i]->attachment != __DRI_BUFFER_BACK_LEFT)
+            dri2_dpy->dri2->releaseBuffer(dri2_dpy->dri_screen,
+                                          dri2_surf->dri_buffers[i]);
+   }
 
    if (dri2_surf->throttle_callback)
       wl_callback_destroy(dri2_surf->throttle_callback);
 
-   if (dri2_surf->base.Type == EGL_WINDOW_BIT) {
-      dri2_surf->wl_win->private = NULL;
-      dri2_surf->wl_win->resize_callback = NULL;
-   }
+   dri2_surf->wl_win->private = NULL;
+   dri2_surf->wl_win->resize_callback = NULL;
 
    free(surf);
 
@@ -274,17 +272,26 @@ dri2_wl_release_buffers(struct dri2_egl_surface *dri2_surf)
          wl_buffer_destroy(dri2_surf->color_buffers[i].wl_buffer);
       if (dri2_surf->color_buffers[i].dri_image)
          dri2_dpy->image->destroyImage(dri2_surf->color_buffers[i].dri_image);
+      if (dri2_surf->color_buffers[i].linear_copy)
+         dri2_dpy->image->destroyImage(dri2_surf->color_buffers[i].linear_copy);
+      if (dri2_surf->color_buffers[i].data)
+         munmap(dri2_surf->color_buffers[i].data,
+                dri2_surf->color_buffers[i].data_size);
 
       dri2_surf->color_buffers[i].wl_buffer = NULL;
       dri2_surf->color_buffers[i].dri_image = NULL;
+      dri2_surf->color_buffers[i].linear_copy = NULL;
+      dri2_surf->color_buffers[i].data = NULL;
       dri2_surf->color_buffers[i].locked = 0;
    }
 
-   for (i = 0; i < __DRI_BUFFER_COUNT; i++)
-      if (dri2_surf->dri_buffers[i] &&
-          dri2_surf->dri_buffers[i]->attachment != __DRI_BUFFER_BACK_LEFT)
-         dri2_dpy->dri2->releaseBuffer(dri2_dpy->dri_screen,
-                                       dri2_surf->dri_buffers[i]);
+   if (dri2_dpy->dri2) {
+      for (i = 0; i < __DRI_BUFFER_COUNT; i++)
+         if (dri2_surf->dri_buffers[i] &&
+             dri2_surf->dri_buffers[i]->attachment != __DRI_BUFFER_BACK_LEFT)
+            dri2_dpy->dri2->releaseBuffer(dri2_dpy->dri_screen,
+                                          dri2_surf->dri_buffers[i]);
+   }
 }
 
 static int
@@ -338,13 +345,29 @@ get_back_bo(struct dri2_egl_surface *dri2_surf)
 
    if (dri2_surf->back == NULL)
       return -1;
+
+   if (dri2_dpy->is_different_gpu &&
+       dri2_surf->back->linear_copy == NULL) {
+       dri2_surf->back->linear_copy =
+          dri2_dpy->image->createImage(dri2_dpy->dri_screen,
+                                      dri2_surf->base.Width,
+                                      dri2_surf->base.Height,
+                                      dri_image_format,
+                                      __DRI_IMAGE_USE_SHARE |
+                                      __DRI_IMAGE_USE_LINEAR,
+                                      NULL);
+      if (dri2_surf->back->linear_copy == NULL)
+          return -1;
+   }
+
    if (dri2_surf->back->dri_image == NULL) {
       dri2_surf->back->dri_image = 
          dri2_dpy->image->createImage(dri2_dpy->dri_screen,
                                       dri2_surf->base.Width,
                                       dri2_surf->base.Height,
                                       dri_image_format,
-                                      __DRI_IMAGE_USE_SHARE,
+                                      dri2_dpy->is_different_gpu ?
+                                         0 : __DRI_IMAGE_USE_SHARE,
                                       NULL);
       dri2_surf->back->age = 0;
    }
@@ -407,9 +430,8 @@ update_buffers(struct dri2_egl_surface *dri2_surf)
       dri2_egl_display(dri2_surf->base.Resource.Display);
    int i;
 
-   if (dri2_surf->base.Type == EGL_WINDOW_BIT &&
-       (dri2_surf->base.Width != dri2_surf->wl_win->width || 
-        dri2_surf->base.Height != dri2_surf->wl_win->height)) {
+   if (dri2_surf->base.Width != dri2_surf->wl_win->width ||
+       dri2_surf->base.Height != dri2_surf->wl_win->height) {
 
       dri2_wl_release_buffers(dri2_surf);
 
@@ -432,8 +454,11 @@ update_buffers(struct dri2_egl_surface *dri2_surf)
           dri2_surf->color_buffers[i].wl_buffer) {
          wl_buffer_destroy(dri2_surf->color_buffers[i].wl_buffer);
          dri2_dpy->image->destroyImage(dri2_surf->color_buffers[i].dri_image);
+         if (dri2_dpy->is_different_gpu)
+            dri2_dpy->image->destroyImage(dri2_surf->color_buffers[i].linear_copy);
          dri2_surf->color_buffers[i].wl_buffer = NULL;
          dri2_surf->color_buffers[i].dri_image = NULL;
+         dri2_surf->color_buffers[i].linear_copy = NULL;
       }
    }
 
@@ -578,16 +603,20 @@ create_wl_buffer(struct dri2_egl_surface *dri2_surf)
 {
    struct dri2_egl_display *dri2_dpy =
       dri2_egl_display(dri2_surf->base.Resource.Display);
+   __DRIimage *image;
    int fd, stride, name;
 
    if (dri2_surf->current->wl_buffer != NULL)
       return;
 
+   if (dri2_dpy->is_different_gpu) {
+      image = dri2_surf->current->linear_copy;
+   } else {
+      image = dri2_surf->current->dri_image;
+   }
    if (dri2_dpy->capabilities & WL_DRM_CAPABILITY_PRIME) {
-      dri2_dpy->image->queryImage(dri2_surf->current->dri_image,
-                                  __DRI_IMAGE_ATTRIB_FD, &fd);
-      dri2_dpy->image->queryImage(dri2_surf->current->dri_image,
-                                  __DRI_IMAGE_ATTRIB_STRIDE, &stride);
+      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_FD, &fd);
+      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &stride);
 
       dri2_surf->current->wl_buffer =
          wl_drm_create_prime_buffer(dri2_dpy->wl_drm,
@@ -600,10 +629,8 @@ create_wl_buffer(struct dri2_egl_surface *dri2_surf)
                                     0, 0);
       close(fd);
    } else {
-      dri2_dpy->image->queryImage(dri2_surf->current->dri_image,
-                                  __DRI_IMAGE_ATTRIB_NAME, &name);
-      dri2_dpy->image->queryImage(dri2_surf->current->dri_image,
-                                  __DRI_IMAGE_ATTRIB_STRIDE, &stride);
+      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_NAME, &name);
+      dri2_dpy->image->queryImage(image, __DRI_IMAGE_ATTRIB_STRIDE, &stride);
 
       dri2_surf->current->wl_buffer =
          wl_drm_create_buffer(dri2_dpy->wl_drm,
@@ -683,6 +710,18 @@ dri2_wl_swap_buffers_with_damage(_EGLDriver *drv,
       }
    }
 
+   if (dri2_dpy->is_different_gpu) {
+      _EGLContext *ctx = _eglGetCurrentContext();
+      struct dri2_egl_context *dri2_ctx = dri2_egl_context(ctx);
+      dri2_dpy->image->blitImage(dri2_ctx->dri_context,
+                                 dri2_surf->current->linear_copy,
+                                 dri2_surf->current->dri_image,
+                                 0, 0, dri2_surf->base.Width,
+                                 dri2_surf->base.Height,
+                                 0, 0, dri2_surf->base.Width,
+                                 dri2_surf->base.Height, 0);
+   }
+
    dri2_flush_drawable_for_swapbuffers(disp, draw);
    (*dri2_dpy->flush->invalidate)(dri2_surf->dri_drawable);
 
@@ -800,12 +839,33 @@ bad_format:
    return NULL;
 }
 
+/* Returns 1 if fd is a character device whose st_rdev has bit 0x80 set
+ * (what this file treats as a render node); returns 0 otherwise, including
+ * when fstat() fails. */
+static char
+is_fd_render_node(int fd)
+{
+   struct stat st;
+
+   /* A failed fstat() short-circuits, so st is never read uninitialized. */
+   if (fstat(fd, &st) != 0 || !S_ISCHR(st.st_mode))
+      return 0;
+
+   /* NOTE(review): tests raw st_rdev rather than minor(st_rdev) -- confirm. */
+   return (st.st_rdev & 0x80) ? 1 : 0;
+}
+
 static int
 dri2_wl_authenticate(_EGLDisplay *disp, uint32_t id)
 {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    int ret = 0;
 
+   if (dri2_dpy->is_render_node) {
+      _eglLog(_EGL_WARNING, "wayland-egl: client asks server to "
+                            "authenticate for render-nodes");
+      return 0;
+   }
    dri2_dpy->authenticated = 0;
 
    wl_drm_authenticate(dri2_dpy->wl_drm, id);
@@ -831,24 +891,19 @@ drm_handle_device(void *data, struct wl_drm *drm, const char *device)
    if (!dri2_dpy->device_name)
       return;
 
-#ifdef O_CLOEXEC
-   dri2_dpy->fd = open(dri2_dpy->device_name, O_RDWR | O_CLOEXEC);
-   if (dri2_dpy->fd == -1 && errno == EINVAL)
-#endif
-   {
-      dri2_dpy->fd = open(dri2_dpy->device_name, O_RDWR);
-      if (dri2_dpy->fd != -1)
-         fcntl(dri2_dpy->fd, F_SETFD, fcntl(dri2_dpy->fd, F_GETFD) |
-            FD_CLOEXEC);
-   }
+   dri2_dpy->fd = loader_open_device(dri2_dpy->device_name);
    if (dri2_dpy->fd == -1) {
       _eglLog(_EGL_WARNING, "wayland-egl: could not open %s (%s)",
 	      dri2_dpy->device_name, strerror(errno));
       return;
    }
 
-   drmGetMagic(dri2_dpy->fd, &magic);
-   wl_drm_authenticate(dri2_dpy->wl_drm, magic);
+   if (is_fd_render_node(dri2_dpy->fd)) {
+      dri2_dpy->authenticated = 1;
+   } else {
+      drmGetMagic(dri2_dpy->fd, &magic);
+      wl_drm_authenticate(dri2_dpy->wl_drm, magic);
+   }
 }
 
 static void
@@ -893,7 +948,7 @@ static const struct wl_drm_listener drm_listener = {
 };
 
 static void
-registry_handle_global(void *data, struct wl_registry *registry, uint32_t name,
+registry_handle_global_drm(void *data, struct wl_registry *registry, uint32_t name,
 		       const char *interface, uint32_t version)
 {
    struct dri2_egl_display *dri2_dpy = data;
@@ -913,8 +968,8 @@ registry_handle_global_remove(void *data, struct wl_registry *registry,
 {
 }
 
-static const struct wl_registry_listener registry_listener = {
-   registry_handle_global,
+static const struct wl_registry_listener registry_listener_drm = {
+   registry_handle_global_drm,
    registry_handle_global_remove
 };
 
@@ -990,8 +1045,8 @@ static struct dri2_egl_display_vtbl dri2_wl_display_vtbl = {
    .get_sync_values = dri2_fallback_get_sync_values,
 };
 
-EGLBoolean
-dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
+static EGLBoolean
+dri2_initialize_wayland_drm(_EGLDriver *drv, _EGLDisplay *disp)
 {
    struct dri2_egl_display *dri2_dpy;
    const __DRIconfig *config;
@@ -1027,9 +1082,9 @@ dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
    wl_proxy_set_queue((struct wl_proxy *) dri2_dpy->wl_registry,
                       dri2_dpy->wl_queue);
    wl_registry_add_listener(dri2_dpy->wl_registry,
-                            &registry_listener, dri2_dpy);
+                            &registry_listener_drm, dri2_dpy);
    if (roundtrip(dri2_dpy) < 0 || dri2_dpy->wl_drm == NULL)
-      goto cleanup_dpy;
+      goto cleanup_registry;
 
    if (roundtrip(dri2_dpy) < 0 || dri2_dpy->fd == -1)
       goto cleanup_drm;
@@ -1037,6 +1092,24 @@ dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
    if (roundtrip(dri2_dpy) < 0 || !dri2_dpy->authenticated)
       goto cleanup_fd;
 
+   dri2_dpy->fd = loader_get_user_preferred_fd(dri2_dpy->fd,
+                                               &dri2_dpy->is_different_gpu);
+   if (dri2_dpy->is_different_gpu) {
+      free(dri2_dpy->device_name);
+      dri2_dpy->device_name = loader_get_device_name_for_fd(dri2_dpy->fd);
+      if (!dri2_dpy->device_name) {
+         _eglError(EGL_BAD_ALLOC, "wayland-egl: failed to get device name "
+                                  "for requested GPU");
+         goto cleanup_fd;
+      }
+   }
+
+   /* we have to do the check now, because loader_get_user_preferred_fd
+    * will return a render-node when the requested gpu is different
+    * from the server's, but also if the client asks for the same gpu as
+    * the server by requesting its pci-id */
+   dri2_dpy->is_render_node = is_fd_render_node(dri2_dpy->fd);
+
    dri2_dpy->driver_name = loader_get_driver_for_fd(dri2_dpy->fd, 0);
    if (dri2_dpy->driver_name == NULL) {
       _eglError(EGL_BAD_ALLOC, "DRI2: failed to get driver name");
@@ -1046,18 +1119,23 @@ dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
    if (!dri2_load_driver(disp))
       goto cleanup_driver_name;
 
-   dri2_dpy->dri2_loader_extension.base.name = __DRI_DRI2_LOADER;
-   dri2_dpy->dri2_loader_extension.base.version = 3;
-   dri2_dpy->dri2_loader_extension.getBuffers = dri2_wl_get_buffers;
-   dri2_dpy->dri2_loader_extension.flushFrontBuffer = dri2_wl_flush_front_buffer;
-   dri2_dpy->dri2_loader_extension.getBuffersWithFormat =
-      dri2_wl_get_buffers_with_format;
-
-   dri2_dpy->extensions[0] = &dri2_dpy->dri2_loader_extension.base;
-   dri2_dpy->extensions[1] = &image_loader_extension.base;
-   dri2_dpy->extensions[2] = &image_lookup_extension.base;
-   dri2_dpy->extensions[3] = &use_invalidate.base;
-   dri2_dpy->extensions[4] = NULL;
+   dri2_dpy->extensions[0] = &image_loader_extension.base;
+   dri2_dpy->extensions[1] = &image_lookup_extension.base;
+   dri2_dpy->extensions[2] = &use_invalidate.base;
+
+   /* render nodes cannot use Gem names, and thus do not support
+    * the __DRI_DRI2_LOADER extension */
+   if (!dri2_dpy->is_render_node) {
+      dri2_dpy->dri2_loader_extension.base.name = __DRI_DRI2_LOADER;
+      dri2_dpy->dri2_loader_extension.base.version = 3;
+      dri2_dpy->dri2_loader_extension.getBuffers = dri2_wl_get_buffers;
+      dri2_dpy->dri2_loader_extension.flushFrontBuffer = dri2_wl_flush_front_buffer;
+      dri2_dpy->dri2_loader_extension.getBuffersWithFormat =
+         dri2_wl_get_buffers_with_format;
+      dri2_dpy->extensions[3] = &dri2_dpy->dri2_loader_extension.base;
+      dri2_dpy->extensions[4] = NULL;
+   } else
+      dri2_dpy->extensions[3] = NULL;
 
    dri2_dpy->swap_available = EGL_TRUE;
 
@@ -1066,15 +1144,33 @@ dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
 
    dri2_wl_setup_swap_interval(dri2_dpy);
 
-   /* The server shouldn't advertise WL_DRM_CAPABILITY_PRIME if the driver
-    * doesn't have createImageFromFds, since we're using the same driver on
-    * both sides.  We don't want crash if that happens anyway, so fall back to
-    * gem names if we don't have prime support. */
+   /* To use Prime, we must have _DRI_IMAGE v7 at least.
+    * createImageFromFds support indicates that Prime export/import
+    * is supported by the driver. Fall back to
+    * gem names if we don't have Prime support. */
 
    if (dri2_dpy->image->base.version < 7 ||
        dri2_dpy->image->createImageFromFds == NULL)
       dri2_dpy->capabilities &= ~WL_DRM_CAPABILITY_PRIME;
 
+   /* We cannot use Gem names with render-nodes, only prime fds (dma-buf).
+    * The server needs to accept them */
+   if (dri2_dpy->is_render_node &&
+       !(dri2_dpy->capabilities & WL_DRM_CAPABILITY_PRIME)) {
+      _eglLog(_EGL_WARNING, "wayland-egl: display is not render-node capable");
+      goto cleanup_screen;
+   }
+
+   if (dri2_dpy->is_different_gpu &&
+       (dri2_dpy->image->base.version < 9 ||
+        dri2_dpy->image->blitImage == NULL)) {
+      _eglLog(_EGL_WARNING, "wayland-egl: Different GPU selected, but the "
+                            "Image extension in the driver is not "
+                            "compatible. Version 9 or later and blitImage() "
+                            "are required");
+      goto cleanup_screen;
+   }
+
    types = EGL_WINDOW_BIT;
    for (i = 0; dri2_dpy->driver_configs[i]; i++) {
       config = dri2_dpy->driver_configs[i];
@@ -1087,15 +1183,20 @@ dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
    }
 
    disp->Extensions.WL_bind_wayland_display = EGL_TRUE;
-   disp->Extensions.WL_create_wayland_buffer_from_image = EGL_TRUE;
+   /* We cannot convert an EGLImage to a wl_buffer when on a different gpu,
+    * because the buffer of the EGLImage likely has a tiling mode the server
+    * gpu won't support. There is no way to check for now. Thus do not support the
+    * extension */
+   if (!dri2_dpy->is_different_gpu) {
+      disp->Extensions.WL_create_wayland_buffer_from_image = EGL_TRUE;
+   } else {
+      dri2_wl_display_vtbl.create_wayland_buffer_from_image =
+         dri2_fallback_create_wayland_buffer_from_image;
+   }
    disp->Extensions.EXT_buffer_age = EGL_TRUE;
 
    disp->Extensions.EXT_swap_buffers_with_damage = EGL_TRUE;
 
-   /* we're supporting EGL 1.4 */
-   disp->VersionMajor = 1;
-   disp->VersionMinor = 4;
-
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
     */
@@ -1103,6 +1204,8 @@ dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
 
    return EGL_TRUE;
 
+ cleanup_screen:
+   dri2_dpy->core->destroyScreen(dri2_dpy->dri_screen);
  cleanup_driver:
    dlclose(dri2_dpy->driver);
  cleanup_driver_name:
@@ -1112,8 +1215,666 @@ dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
  cleanup_drm:
    free(dri2_dpy->device_name);
    wl_drm_destroy(dri2_dpy->wl_drm);
+ cleanup_registry:
+   wl_registry_destroy(dri2_dpy->wl_registry);
+   wl_event_queue_destroy(dri2_dpy->wl_queue);
  cleanup_dpy:
    free(dri2_dpy);
    
    return EGL_FALSE;
 }
+
+static int
+dri2_wl_swrast_get_stride_for_format(int format, int w)
+{
+   /* bytes per row of w pixels: RGB565 takes 2 bytes per pixel, every
+    * other format is treated as 4 (ARGB8888 || XRGB8888) */
+   const int bytes_per_pixel = (format == WL_SHM_FORMAT_RGB565) ? 2 : 4;
+   return bytes_per_pixel * w;
+}
+
+/*
+ * Taken from weston shared/os-compatibility.c
+ */
+
+static int
+set_cloexec_or_close(int fd)
+{
+   /* Marks fd close-on-exec and hands it back.  If either fcntl() call
+    * fails the descriptor is closed and -1 is returned; an incoming -1
+    * is passed straight through. */
+   long fd_flags;
+
+   if (fd == -1)
+      return -1;
+
+   fd_flags = fcntl(fd, F_GETFD);
+   if (fd_flags != -1 &&
+       fcntl(fd, F_SETFD, fd_flags | FD_CLOEXEC) != -1)
+      return fd;
+
+   /* could not read or update the descriptor flags: give the fd up */
+   close(fd);
+
+   return -1;
+}
+
+/*
+ * Taken from weston shared/os-compatibility.c
+ */
+
+static int
+create_tmpfile_cloexec(char *tmpname)
+{
+   /* Creates a unique file from the mkstemp()-style template in tmpname,
+    * removes its name again and returns the descriptor (negative on error). */
+#ifdef HAVE_MKOSTEMP
+   int fd = mkostemp(tmpname, O_CLOEXEC);
+   if (fd < 0)
+      return fd;
+   unlink(tmpname);
+#else
+   int fd = mkstemp(tmpname);
+   if (fd < 0)
+      return fd;
+   fd = set_cloexec_or_close(fd);
+   unlink(tmpname);
+#endif
+   return fd;
+}
+
+/*
+ * Taken from weston shared/os-compatibility.c
+ *
+ * Create a new, unique, anonymous file of the given size, and
+ * return the file descriptor for it. The file descriptor is set
+ * CLOEXEC. The file is immediately suitable for mmap()'ing
+ * the given size at offset zero.
+ *
+ * The file should not have a permanent backing store like a disk,
+ * but may have if XDG_RUNTIME_DIR is not properly implemented in OS.
+ *
+ * The file name is deleted from the file system.
+ *
+ * The file is suitable for buffer sharing between processes by
+ * transmitting the file descriptor over Unix sockets using the
+ * SCM_RIGHTS methods.
+ *
+ * NOTE: this copy sizes the file with ftruncate() only; it does not call
+ * posix_fallocate(), so there is no guarantee that disk space is
+ * available for the file at the given size. If disk space turns out to
+ * be insufficient, the program may receive SIGBUS on accessing the
+ * mmap()'ed file contents.
+ */
+static int
+os_create_anonymous_file(off_t size)
+{
+   static const char template[] = "/mesa-shared-XXXXXX";
+   const char *runtime_dir = getenv("XDG_RUNTIME_DIR");
+   char *filename;
+   int fd;
+
+   /* without a runtime directory there is nowhere to put the file */
+   if (runtime_dir == NULL) {
+      errno = ENOENT;
+      return -1;
+   }
+
+   /* sizeof(template) already accounts for the terminating NUL */
+   filename = malloc(strlen(runtime_dir) + sizeof(template));
+   if (filename == NULL)
+      return -1;
+
+   strcpy(filename, runtime_dir);
+   strcat(filename, template);
+
+   /* the helper unlinks the name, so the path buffer is only needed here */
+   fd = create_tmpfile_cloexec(filename);
+   free(filename);
+
+   if (fd < 0)
+      return -1;
+
+   /* give the still-empty file its requested size */
+   if (ftruncate(fd, size) < 0) {
+      close(fd);
+      return -1;
+   }
+
+   return fd;
+}
+
+
+static EGLBoolean
+dri2_wl_swrast_allocate_buffer(struct dri2_egl_display *dri2_dpy,
+                               int format, int w, int h,
+                               void **data, int *size,
+                               struct wl_buffer **buffer)
+{
+   /* Backs a w x h buffer with an anonymous file that is mapped here and
+    * shared through a wl_buffer.  On success the mapping, its size in
+    * bytes and the wl_buffer are stored through data, size and buffer. */
+   const int stride = dri2_wl_swrast_get_stride_for_format(format, w);
+   const int size_map = h * stride;
+   struct wl_shm_pool *pool;
+   void *data_map;
+   int fd;
+
+   fd = os_create_anonymous_file(size_map);
+   if (fd < 0)
+      return EGL_FALSE;
+
+   data_map = mmap(NULL, size_map, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+   if (data_map == MAP_FAILED) {
+      close(fd);
+      return EGL_FALSE;
+   }
+
+   /* the pool and our fd are released once the wl_buffer is created */
+   pool = wl_shm_create_pool(dri2_dpy->wl_shm, fd, size_map);
+   *buffer = wl_shm_pool_create_buffer(pool, 0, w, h, stride, format);
+   wl_shm_pool_destroy(pool);
+   close(fd);
+   *size = size_map;
+   *data = data_map;
+   return EGL_TRUE;
+}
+
+/* Selects and locks the back buffer for the next frame, allocating a new
+ * shm buffer when no idle one exists.  Returns 0 on success (or when a back
+ * buffer is already selected) and -1 on failure. */
+static int
+swrast_update_buffers(struct dri2_egl_surface *dri2_surf)
+{
+   struct dri2_egl_display *dri2_dpy =
+      dri2_egl_display(dri2_surf->base.Resource.Display);
+   int i;
+
+   /* we need to do the following operations only once per frame */
+   if (dri2_surf->back)
+      return 0;
+
+   /* window size changed: release the buffers, adopt the new geometry */
+   if (dri2_surf->base.Width != dri2_surf->wl_win->width ||
+       dri2_surf->base.Height != dri2_surf->wl_win->height) {
+      dri2_wl_release_buffers(dri2_surf);
+      dri2_surf->base.Width  = dri2_surf->wl_win->width;
+      dri2_surf->base.Height = dri2_surf->wl_win->height;
+      dri2_surf->dx = dri2_surf->wl_win->dx;
+      dri2_surf->dy = dri2_surf->wl_win->dy;
+      dri2_surf->current = NULL;
+   }
+
+   /* We always want to throttle to some event (either a frame callback or
+    * a sync request) after the commit so that we can be sure the
+    * compositor has had a chance to handle it and send us a release event
+    * before we look for a free buffer */
+   while (dri2_surf->throttle_callback != NULL)
+      if (wl_display_dispatch_queue(dri2_dpy->wl_dpy,
+                                    dri2_dpy->wl_queue) == -1)
+         return -1;
+
+   /* try get free buffer already created */
+   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+      if (!dri2_surf->color_buffers[i].locked &&
+          dri2_surf->color_buffers[i].wl_buffer) {
+         dri2_surf->back = &dri2_surf->color_buffers[i];
+         break;
+      }
+   }
+
+   /* else choose any another free location */
+   if (!dri2_surf->back) {
+      for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+         if (!dri2_surf->color_buffers[i].locked) {
+            dri2_surf->back = &dri2_surf->color_buffers[i];
+            if (!dri2_wl_swrast_allocate_buffer(dri2_dpy, dri2_surf->format,
+                                                dri2_surf->base.Width,
+                                                dri2_surf->base.Height,
+                                                &dri2_surf->back->data,
+                                                &dri2_surf->back->data_size,
+                                                &dri2_surf->back->wl_buffer)) {
+               /* do not leave a slot without storage selected, or the
+                * next call would report it as a usable back buffer */
+               dri2_surf->back = NULL;
+               _eglError(EGL_BAD_ALLOC, "failed to allocate color buffer");
+               return -1;
+            }
+            wl_proxy_set_queue((struct wl_proxy *) dri2_surf->back->wl_buffer,
+                               dri2_dpy->wl_queue);
+            wl_buffer_add_listener(dri2_surf->back->wl_buffer,
+                                   &wl_buffer_listener, dri2_surf);
+            break;
+         }
+      }
+   }
+
+   if (!dri2_surf->back) {
+      _eglError(EGL_BAD_ALLOC, "failed to find free buffer");
+      return -1;
+   }
+   dri2_surf->back->locked = 1;
+
+   /* If we have an extra unlocked buffer at this point, we had to do triple
+    * buffering for a while, but now can go back to just double buffering.
+    * That means we can free any unlocked buffer now. */
+   for (i = 0; i < ARRAY_SIZE(dri2_surf->color_buffers); i++) {
+      if (!dri2_surf->color_buffers[i].locked &&
+          dri2_surf->color_buffers[i].wl_buffer) {
+         wl_buffer_destroy(dri2_surf->color_buffers[i].wl_buffer);
+         munmap(dri2_surf->color_buffers[i].data,
+                dri2_surf->color_buffers[i].data_size);
+         dri2_surf->color_buffers[i].wl_buffer = NULL;
+         dri2_surf->color_buffers[i].data = NULL;
+      }
+   }
+   return 0;
+}
+
+static void*
+dri2_wl_swrast_get_frontbuffer_data(struct dri2_egl_surface *dri2_surf)
+{
+   /* Pixels of the buffer committed last.  There is none (NULL) while
+    * ->current is unset, e.g. after a resize has cleared it. */
+   if (dri2_surf->current != NULL)
+      return dri2_surf->current->data;
+   return NULL;
+}
+
+static void*
+dri2_wl_swrast_get_backbuffer_data(struct dri2_egl_surface *dri2_surf)
+{
+   assert(dri2_surf->back); /* a back buffer must already be selected */
+   return dri2_surf->back->data; /* pixel storage of that back buffer */
+}
+
+static void
+dri2_wl_swrast_commit_backbuffer(struct dri2_egl_surface *dri2_surf)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(dri2_surf->base.Resource.Display);
+   /* swap interval > 0: throttle on the surface's next frame callback */
+   if (dri2_surf->base.SwapInterval > 0) {
+      dri2_surf->throttle_callback =
+         wl_surface_frame(dri2_surf->wl_win->surface);
+      wl_callback_add_listener(dri2_surf->throttle_callback,
+                               &throttle_listener, dri2_surf);
+      wl_proxy_set_queue((struct wl_proxy *) dri2_surf->throttle_callback,
+                         dri2_dpy->wl_queue);
+   }
+   /* promote the back buffer to current and leave no back buffer selected */
+   dri2_surf->current = dri2_surf->back;
+   dri2_surf->back = NULL;
+   /* attach it to the window surface at the pending dx/dy offset */
+   wl_surface_attach(dri2_surf->wl_win->surface,
+                     dri2_surf->current->wl_buffer,
+                     dri2_surf->dx, dri2_surf->dy);
+   /* record on the native window the size that is now attached */
+   dri2_surf->wl_win->attached_width  = dri2_surf->base.Width;
+   dri2_surf->wl_win->attached_height = dri2_surf->base.Height;
+   /* reset resize growing parameters */
+   dri2_surf->dx = 0;
+   dri2_surf->dy = 0;
+   /* damage the largest possible rectangle, then commit the new state */
+   wl_surface_damage(dri2_surf->wl_win->surface,
+                     0, 0, INT32_MAX, INT32_MAX);
+   wl_surface_commit(dri2_surf->wl_win->surface);
+
+   /* If we're not waiting for a frame callback then we'll at least throttle
+    * to a sync callback so that we always give a chance for the compositor to
+    * handle the commit and send a release event before checking for a free
+    * buffer */
+   if (dri2_surf->throttle_callback == NULL) {
+      dri2_surf->throttle_callback = wl_display_sync(dri2_dpy->wl_dpy);
+      wl_callback_add_listener(dri2_surf->throttle_callback,
+                               &throttle_listener, dri2_surf);
+      wl_proxy_set_queue((struct wl_proxy *) dri2_surf->throttle_callback,
+                         dri2_dpy->wl_queue);
+   }
+   /* send the queued requests to the compositor */
+   wl_display_flush(dri2_dpy->wl_dpy);
+}
+
+static void
+dri2_wl_swrast_get_drawable_info(__DRIdrawable * draw,
+                                 int *x, int *y, int *w, int *h,
+                                 void *loaderPrivate)
+{
+   struct dri2_egl_surface *surf = loaderPrivate;
+
+   /* refresh the surface size first; a failure here is ignored */
+   (void) swrast_update_buffers(surf);
+   *w = surf->base.Width;
+   *h = surf->base.Height;
+   *x = *y = 0; /* the drawable is always reported at the origin */
+}
+
+static void
+dri2_wl_swrast_get_image(__DRIdrawable * read,
+                         int x, int y, int w, int h,
+                         char *data, void *loaderPrivate)
+{
+   /* getImage hook: copy rectangle (x, y, w, h) of the front buffer to data */
+   struct dri2_egl_surface *dri2_surf = loaderPrivate;
+   int row_bytes = dri2_wl_swrast_get_stride_for_format(dri2_surf->format, w);
+   const int x_bytes = dri2_wl_swrast_get_stride_for_format(dri2_surf->format, x);
+   const int front_pitch = dri2_wl_swrast_get_stride_for_format(
+      dri2_surf->format, dri2_surf->base.Width);
+   const int out_pitch = row_bytes; /* destination rows are tightly packed */
+   char *front = dri2_wl_swrast_get_frontbuffer_data(dri2_surf);
+
+   /* no front buffer to read from: hand back zeroed pixels */
+   if (front == NULL) {
+      memset(data, 0, row_bytes * h);
+      return;
+   }
+
+   assert(data != front);
+   assert(row_bytes <= front_pitch);
+
+   /* clip the request against the right and bottom edges of the surface */
+   if (row_bytes > front_pitch - x_bytes)
+      row_bytes = front_pitch - x_bytes;
+   if (h > dri2_surf->base.Height - y)
+      h = dri2_surf->base.Height - y;
+
+   front += x_bytes + y * front_pitch;
+   while (h-- > 0) {
+      memcpy(data, front, row_bytes);
+      front += front_pitch;
+      data += out_pitch;
+   }
+}
+
+/* swrast loader putImage2 hook: copies the w x h image in data, whose rows
+ * are stride bytes apart, to (x, y) in the back buffer and commits it. */
+static void
+dri2_wl_swrast_put_image2(__DRIdrawable * draw, int op,
+                         int x, int y, int w, int h, int stride,
+                         char *data, void *loaderPrivate)
+{
+   struct dri2_egl_surface *dri2_surf = loaderPrivate;
+   int copy_width = dri2_wl_swrast_get_stride_for_format(dri2_surf->format, w);
+   int dst_stride = dri2_wl_swrast_get_stride_for_format(dri2_surf->format, dri2_surf->base.Width);
+   int x_offset = dri2_wl_swrast_get_stride_for_format(dri2_surf->format, x);
+   char *src = data, *dst;
+
+   assert(copy_width <= stride);
+
+   /* Without a back buffer there is nothing to draw into; give up on this
+    * image instead of dereferencing a missing buffer below. */
+   if (swrast_update_buffers(dri2_surf) < 0)
+      return;
+   dst = dri2_wl_swrast_get_backbuffer_data(dri2_surf);
+
+   /* partial copy, copy old content */
+   if (copy_width < dst_stride)
+      dri2_wl_swrast_get_image(draw, 0, 0,
+                               dri2_surf->base.Width, dri2_surf->base.Height,
+                               dst, loaderPrivate);
+   dst += x_offset + y * dst_stride;
+
+   /* drivers expect we do these checks (and some rely on it) */
+   if (copy_width > dst_stride - x_offset)
+      copy_width = dst_stride - x_offset;
+   if (h > dri2_surf->base.Height - y)
+      h = dri2_surf->base.Height - y;
+   for (; h > 0; h--) {
+      memcpy(dst, src, copy_width);
+      src += stride;
+      dst += dst_stride;
+   }
+   dri2_wl_swrast_commit_backbuffer(dri2_surf);
+}
+
+static void
+dri2_wl_swrast_put_image(__DRIdrawable * draw, int op,
+                         int x, int y, int w, int h,
+                         char *data, void *loaderPrivate)
+{
+   /* Same as putImage2, for callers whose rows are tightly packed: the
+    * stride is derived from the width and the surface's pixel format. */
+   const struct dri2_egl_surface *surf = loaderPrivate;
+   const int packed = dri2_wl_swrast_get_stride_for_format(surf->format, w);
+
+   dri2_wl_swrast_put_image2(draw, op, x, y, w, h, packed, data, loaderPrivate);
+}
+
+/**
+ * Called via eglCreateWindowSurface(), drv->API.CreateWindowSurface().
+ *
+ * Returns the new surface, or NULL when allocation or setup fails.
+ */
+static _EGLSurface *
+dri2_wl_swrast_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
+                                     _EGLConfig *conf, void *native_window,
+                                     const EGLint *attrib_list)
+{
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+   struct dri2_egl_config *dri2_conf = dri2_egl_config(conf);
+   struct wl_egl_window *window = native_window;
+   struct dri2_egl_surface *dri2_surf;
+
+   (void) drv;
+
+   dri2_surf = calloc(1, sizeof *dri2_surf);
+   if (!dri2_surf) {
+      _eglError(EGL_BAD_ALLOC, "dri2_create_surface");
+      return NULL;
+   }
+
+   if (!_eglInitSurface(&dri2_surf->base, disp, EGL_WINDOW_BIT, conf, attrib_list))
+      goto cleanup_surf;
+
+   if (conf->RedSize == 5)
+      dri2_surf->format = WL_SHM_FORMAT_RGB565;
+   else if (conf->AlphaSize == 0)
+      dri2_surf->format = WL_SHM_FORMAT_XRGB8888;
+   else
+      dri2_surf->format = WL_SHM_FORMAT_ARGB8888;
+
+   dri2_surf->wl_win = window;
+   /* -1: presumably forces swrast_update_buffers() to adopt the window size */
+   dri2_surf->base.Width = -1;
+   dri2_surf->base.Height = -1;
+
+   dri2_surf->dri_drawable =
+      (*dri2_dpy->swrast->createNewDrawable) (dri2_dpy->dri_screen,
+                                              dri2_conf->dri_double_config,
+                                              dri2_surf);
+   if (dri2_surf->dri_drawable == NULL) {
+      /* nothing was created, so there is no drawable to destroy */
+      _eglError(EGL_BAD_ALLOC, "swrast->createNewDrawable");
+      goto cleanup_surf;
+   }
+
+   dri2_wl_swap_interval(drv, disp, &dri2_surf->base,
+                         dri2_dpy->default_swap_interval);
+
+   return &dri2_surf->base;
+
+ cleanup_surf:
+   free(dri2_surf);
+   return NULL;
+}
+
+static EGLBoolean
+dri2_wl_swrast_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
+{
+   /* Forwards the swap to the DRI core for the surface's drawable and
+    * always reports success. */
+   struct dri2_egl_surface *surf = dri2_egl_surface(draw);
+   dri2_egl_display(disp)->core->swapBuffers(surf->dri_drawable);
+   return EGL_TRUE;
+}
+
+static void
+shm_handle_format(void *data, struct wl_shm *shm, uint32_t format)
+{
+   /* wl_shm listener callback.  Sets the display's HAS_* bit when the
+    * reported format is one of the three handled here; any other
+    * format is ignored. */
+   struct dri2_egl_display *dpy = data;
+
+   if (format == WL_SHM_FORMAT_ARGB8888) {
+      dpy->formats |= HAS_ARGB8888;
+   } else if (format == WL_SHM_FORMAT_XRGB8888) {
+      dpy->formats |= HAS_XRGB8888;
+   } else if (format == WL_SHM_FORMAT_RGB565) {
+      dpy->formats |= HAS_RGB565;
+   }
+
+}
+
+static const struct wl_shm_listener shm_listener = {
+   shm_handle_format /* records the formats reported for wl_shm */
+};
+
+static void
+registry_handle_global_swrast(void *data, struct wl_registry *registry, uint32_t name,
+                              const char *interface, uint32_t version)
+{
+   struct dri2_egl_display *dpy = data;
+
+   if (strcmp(interface, "wl_shm") != 0)
+      return; /* only the wl_shm global is of interest here */
+   /* bind version 1 of wl_shm and listen for its format events */
+   dpy->wl_shm = wl_registry_bind(registry, name, &wl_shm_interface, 1);
+   wl_shm_add_listener(dpy->wl_shm, &shm_listener, dpy);
+}
+
+static const struct wl_registry_listener registry_listener_swrast = {
+   registry_handle_global_swrast, /* binds wl_shm when it is announced */
+   registry_handle_global_remove
+};
+
+static struct dri2_egl_display_vtbl dri2_wl_swrast_display_vtbl = {
+   .create_window_surface = dri2_wl_swrast_create_window_surface,
+   .swap_buffers = dri2_wl_swrast_swap_buffers,
+   .swap_interval = dri2_wl_swap_interval,
+   .destroy_surface = dri2_wl_destroy_surface,
+   .create_pixmap_surface = dri2_wl_create_pixmap_surface,
+   .authenticate = NULL, /* no authenticate hook on the software path */
+   .create_pbuffer_surface = dri2_fallback_create_pbuffer_surface,
+   .create_image = dri2_fallback_create_image_khr,
+   .swap_buffers_with_damage = dri2_fallback_swap_buffers_with_damage,
+   .swap_buffers_region = dri2_fallback_swap_buffers_region,
+   .post_sub_buffer = dri2_fallback_post_sub_buffer,
+   .copy_buffers = dri2_fallback_copy_buffers,
+   .query_buffer_age = dri2_fallback_query_buffer_age,
+   .create_wayland_buffer_from_image = dri2_fallback_create_wayland_buffer_from_image,
+   .get_sync_values = dri2_fallback_get_sync_values,
+};
+
+/* Brings the display up on the software rasterizer, presenting through
+ * wl_shm buffers.  Returns EGL_TRUE on success; on failure the cleanup
+ * labels at the end undo the setup and EGL_FALSE is returned. */
+static EGLBoolean
+dri2_initialize_wayland_swrast(_EGLDriver *drv, _EGLDisplay *disp)
+{
+   struct dri2_egl_display *dri2_dpy;
+   const __DRIconfig *config;
+   uint32_t types;
+   int i;
+   static const unsigned int argb_masks[4] =
+      { 0xff0000, 0xff00, 0xff, 0xff000000 };
+   static const unsigned int rgb_masks[4] = { 0xff0000, 0xff00, 0xff, 0 };
+   static const unsigned int rgb565_masks[4] = { 0xf800, 0x07e0, 0x001f, 0 };
+
+   loader_set_logger(_eglLog);
+   dri2_dpy = calloc(1, sizeof *dri2_dpy);
+   if (!dri2_dpy)
+      return _eglError(EGL_BAD_ALLOC, "eglInitialize");
+
+   disp->DriverData = (void *) dri2_dpy;
+   if (disp->PlatformDisplay == NULL) {
+      dri2_dpy->wl_dpy = wl_display_connect(NULL);
+      if (dri2_dpy->wl_dpy == NULL)
+         goto cleanup_dpy;
+      dri2_dpy->own_device = 1;
+   } else {
+      dri2_dpy->wl_dpy = disp->PlatformDisplay;
+   }
+
+   dri2_dpy->wl_queue = wl_display_create_queue(dri2_dpy->wl_dpy);
+   if (dri2_dpy->own_device)
+      wl_display_dispatch_pending(dri2_dpy->wl_dpy);
+
+   dri2_dpy->wl_registry = wl_display_get_registry(dri2_dpy->wl_dpy);
+   wl_proxy_set_queue((struct wl_proxy *) dri2_dpy->wl_registry,
+                      dri2_dpy->wl_queue);
+   wl_registry_add_listener(dri2_dpy->wl_registry,
+                            &registry_listener_swrast, dri2_dpy);
+   /* after the first roundtrip the registry listener must have bound wl_shm */
+   if (roundtrip(dri2_dpy) < 0 || dri2_dpy->wl_shm == NULL)
+      goto cleanup_registry;
+   /* after the second one at least one usable format must be recorded */
+   if (roundtrip(dri2_dpy) < 0 || dri2_dpy->formats == 0)
+      goto cleanup_shm;
+
+   dri2_dpy->driver_name = strdup("swrast");
+   if (!dri2_load_driver_swrast(disp))
+      goto cleanup_shm;
+
+   dri2_dpy->swrast_loader_extension.base.name = __DRI_SWRAST_LOADER;
+   dri2_dpy->swrast_loader_extension.base.version = 2;
+   dri2_dpy->swrast_loader_extension.getDrawableInfo = dri2_wl_swrast_get_drawable_info;
+   dri2_dpy->swrast_loader_extension.putImage = dri2_wl_swrast_put_image;
+   dri2_dpy->swrast_loader_extension.getImage = dri2_wl_swrast_get_image;
+   dri2_dpy->swrast_loader_extension.putImage2 = dri2_wl_swrast_put_image2;
+   dri2_dpy->extensions[0] = &dri2_dpy->swrast_loader_extension.base;
+   dri2_dpy->extensions[1] = NULL;
+   if (!dri2_create_screen(disp))
+      goto cleanup_driver;
+   dri2_wl_setup_swap_interval(dri2_dpy);
+
+   types = EGL_WINDOW_BIT;
+   for (i = 0; dri2_dpy->driver_configs[i]; i++) {
+      config = dri2_dpy->driver_configs[i];
+      if (dri2_dpy->formats & HAS_XRGB8888)
+         dri2_add_config(disp, config, i + 1, types, NULL, rgb_masks);
+      if (dri2_dpy->formats & HAS_ARGB8888)
+         dri2_add_config(disp, config, i + 1, types, NULL, argb_masks);
+      if (dri2_dpy->formats & HAS_RGB565)
+         dri2_add_config(disp, config, i + 1, types, NULL, rgb565_masks);
+   }
+
+   /* Fill vtbl last to prevent accidentally calling virtual function during
+    * initialization.
+    */
+   dri2_dpy->vtbl = &dri2_wl_swrast_display_vtbl;
+   return EGL_TRUE;
+
+ cleanup_driver:
+   dlclose(dri2_dpy->driver);
+ cleanup_shm:
+   free(dri2_dpy->driver_name); /* NULL (from calloc) before the strdup() */
+   wl_shm_destroy(dri2_dpy->wl_shm);
+ cleanup_registry:
+   wl_registry_destroy(dri2_dpy->wl_registry);
+   wl_event_queue_destroy(dri2_dpy->wl_queue);
+   if (dri2_dpy->own_device) /* only a connection opened here is closed */
+      wl_display_disconnect(dri2_dpy->wl_dpy);
+ cleanup_dpy:
+   free(dri2_dpy);
+   disp->DriverData = NULL; /* do not leave a dangling pointer behind */
+   return EGL_FALSE;
+}
+
+EGLBoolean
+dri2_initialize_wayland(_EGLDriver *drv, _EGLDisplay *disp)
+{
+   /* Platform entry point.  The DRM path is tried first unless the
+    * LIBGL_ALWAYS_SOFTWARE environment variable is set (to any value);
+    * the software rasterizer is used when that path is skipped or fails
+    * to initialize. */
+   const char *force_software = getenv("LIBGL_ALWAYS_SOFTWARE");
+
+   /* hardware path allowed and working: nothing more to do */
+   if (force_software == NULL) {
+      if (dri2_initialize_wayland_drm(drv, disp))
+         return EGL_TRUE;
+   }
+
+   /* either forced by the environment or used as the fallback */
+   return dri2_initialize_wayland_swrast(drv, disp);
+}
diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index ddb3b54..56c1428 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -43,6 +43,7 @@
 
 #include "egl_dri2.h"
 #include "egl_dri2_fallbacks.h"
+#include "loader.h"
 
 static EGLBoolean
 dri2_x11_swap_interval(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf,
@@ -1017,15 +1018,6 @@ dri2_x11_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp,
    }
 }
 
-static _EGLImage*
-dri2_x11_swrast_create_image_khr(_EGLDriver *drv, _EGLDisplay *disp,
-                                 _EGLContext *ctx, EGLenum target,
-                                 EGLClientBuffer buffer,
-                                 const EGLint *attr_list)
-{
-   return NULL;
-}
-
 static EGLBoolean
 dri2_x11_get_sync_values(_EGLDisplay *display, _EGLSurface *surface,
                          EGLuint64KHR *ust, EGLuint64KHR *msc,
@@ -1058,7 +1050,7 @@ static struct dri2_egl_display_vtbl dri2_x11_swrast_display_vtbl = {
    .create_pixmap_surface = dri2_x11_create_pixmap_surface,
    .create_pbuffer_surface = dri2_x11_create_pbuffer_surface,
    .destroy_surface = dri2_x11_destroy_surface,
-   .create_image = dri2_x11_swrast_create_image_khr,
+   .create_image = dri2_fallback_create_image_khr,
    .swap_interval = dri2_fallback_swap_interval,
    .swap_buffers = dri2_x11_swap_buffers,
    .swap_buffers_region = dri2_fallback_swap_buffers_region,
@@ -1121,7 +1113,7 @@ dri2_initialize_x11_swrast(_EGLDriver *drv, _EGLDisplay *disp)
       goto cleanup_conn;
 
    dri2_dpy->swrast_loader_extension.base.name = __DRI_SWRAST_LOADER;
-   dri2_dpy->swrast_loader_extension.base.version = __DRI_SWRAST_LOADER_VERSION;
+   dri2_dpy->swrast_loader_extension.base.version = 2;
    dri2_dpy->swrast_loader_extension.getDrawableInfo = swrastGetDrawableInfo;
    dri2_dpy->swrast_loader_extension.putImage = swrastPutImage;
    dri2_dpy->swrast_loader_extension.getImage = swrastGetImage;
@@ -1138,10 +1130,6 @@ dri2_initialize_x11_swrast(_EGLDriver *drv, _EGLDisplay *disp)
          goto cleanup_configs;
    }
 
-   /* we're supporting EGL 1.4 */
-   disp->VersionMajor = 1;
-   disp->VersionMinor = 4;
-
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
     */
@@ -1243,16 +1231,7 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
    if (!dri2_load_driver(disp))
       goto cleanup_conn;
 
-#ifdef O_CLOEXEC
-   dri2_dpy->fd = open(dri2_dpy->device_name, O_RDWR | O_CLOEXEC);
-   if (dri2_dpy->fd == -1 && errno == EINVAL)
-#endif
-   {
-      dri2_dpy->fd = open(dri2_dpy->device_name, O_RDWR);
-      if (dri2_dpy->fd != -1)
-         fcntl(dri2_dpy->fd, F_SETFD, fcntl(dri2_dpy->fd, F_GETFD) |
-            FD_CLOEXEC);
-   }
+   dri2_dpy->fd = loader_open_device(dri2_dpy->device_name);
    if (dri2_dpy->fd == -1) {
       _eglLog(_EGL_WARNING,
 	      "DRI2: could not open %s (%s)", dri2_dpy->device_name,
@@ -1292,11 +1271,6 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
 
    dri2_x11_setup_swap_interval(dri2_dpy);
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp))
-	 goto cleanup_configs;
-   }
-
    disp->Extensions.KHR_image_pixmap = EGL_TRUE;
    disp->Extensions.NOK_swap_region = EGL_TRUE;
    disp->Extensions.NOK_texture_from_pixmap = EGL_TRUE;
@@ -1312,10 +1286,6 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
 	 goto cleanup_configs;
    }
 
-   /* we're supporting EGL 1.4 */
-   disp->VersionMajor = 1;
-   disp->VersionMinor = 4;
-
    /* Fill vtbl last to prevent accidentally calling virtual function during
     * initialization.
     */
diff --git a/src/egl/drivers/haiku/SConscript b/src/egl/drivers/haiku/SConscript
index 9dd2f70..ec6020e 100644
--- a/src/egl/drivers/haiku/SConscript
+++ b/src/egl/drivers/haiku/SConscript
@@ -9,7 +9,6 @@ env.Append(CPPDEFINES = [
 env.Append(CPPPATH = [
 	'#/include',
 	'#/src/egl/main',
-	'#/src/loader',
 ])
 
 sources = [
@@ -22,10 +21,6 @@ if env['platform'] == 'haiku':
 		'_EGL_NATIVE_PLATFORM=haiku',
 	])
 
-env.Prepend(LIBS = [
-	libloader,
-])
-
 egl_haiku = env.ConvenienceLibrary(
 	target = 'egl_haiku',
 	source = sources,
diff --git a/src/egl/drivers/haiku/egl_haiku.cpp b/src/egl/drivers/haiku/egl_haiku.cpp
index 4cf2ccb..3d00e47 100644
--- a/src/egl/drivers/haiku/egl_haiku.cpp
+++ b/src/egl/drivers/haiku/egl_haiku.cpp
@@ -27,8 +27,6 @@
 #include <stdint.h>
 #include <stdio.h>
 
-extern "C" {
-#include "loader.h"
 #include "eglconfig.h"
 #include "eglcontext.h"
 #include "egldisplay.h"
@@ -38,13 +36,19 @@ extern "C" {
 #include "eglsurface.h"
 #include "eglimage.h"
 #include "egltypedefs.h"
-}
 
 #include <InterfaceKit.h>
 #include <OpenGLKit.h>
 
 
-#define CALLOC_STRUCT(T)   (struct T *) calloc(1, sizeof(struct T))
+#ifdef DEBUG
+#	define TRACE(x...) printf("egl_haiku: " x)
+#	define CALLED() TRACE("CALLED: %s\n", __PRETTY_FUNCTION__)
+#else
+#	define TRACE(x...)
+#	define CALLED()
+#endif
+#define ERROR(x...) printf("egl_haiku: " x)
 
 
 _EGL_DRIVER_STANDARD_TYPECASTS(haiku_egl)
@@ -53,10 +57,6 @@ _EGL_DRIVER_STANDARD_TYPECASTS(haiku_egl)
 struct haiku_egl_driver
 {
 	_EGLDriver base;
-
-	void *handle;
-	_EGLProc (*get_proc_address)(const char *procname);
-	void (*glFlush)(void);
 };
 
 struct haiku_egl_config
@@ -76,81 +76,6 @@ struct haiku_egl_surface
 };
 
 
-/*
-static void
-swrastCreateDrawable(struct dri2_egl_display * dri2_dpy,
-	struct dri2_egl_surface * dri2_surf, int depth)
-{
-
-}
-
-
-static void
-swrastDestroyDrawable(struct dri2_egl_display * dri2_dpy,
-	struct dri2_egl_surface * dri2_surf)
-{
-
-}
-
-
-static void
-swrastGetDrawableInfo(__DRIdrawable * draw, int *x, int *y,
-	int *w, int *h, void *loaderPrivate)
-{
-
-}
-
-
-static void
-swrastPutImage(__DRIdrawable * draw, int op, int x, int y,
-	int w, int h, char *data, void *loaderPrivate)
-{
-
-}
-
-
-static void
-swrastGetImage(__DRIdrawable * read, int x, int y,
-	int w, int h, char *data, void *loaderPrivate)
-{
-
-}
-*/
-
-
-static void
-haiku_log(EGLint level, const char *msg)
-{
-	switch (level) {
-		case _EGL_DEBUG:
-			fprintf(stderr,"%s", msg);
-			break;
-		case _EGL_INFO:
-			fprintf(stderr,"%s", msg);
-			break;
-		case _EGL_WARNING:
-			fprintf(stderr,"%s", msg);
-			break;
-		case _EGL_FATAL:
-			fprintf(stderr,"%s", msg);
-			break;
-		default:
-			break;
-	}
-}
-
-
-/**
- * Called via eglCreateWindowSurface(), drv->API.CreateWindowSurface().
- */
-static _EGLSurface *
-haiku_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
-	_EGLConfig *conf, void *native_surface, const EGLint *attrib_list)
-{
-	return NULL;
-}
-
-
 /**
  * Called via eglCreateWindowSurface(), drv->API.CreateWindowSurface().
  */
@@ -158,25 +83,37 @@ static _EGLSurface *
 haiku_create_window_surface(_EGLDriver *drv, _EGLDisplay *disp,
 	_EGLConfig *conf, void *native_window, const EGLint *attrib_list)
 {
+	CALLED();
+
 	struct haiku_egl_surface* surface;
-	surface = (struct haiku_egl_surface*)calloc(1,sizeof (*surface));
+	surface = (struct haiku_egl_surface*) calloc(1, sizeof (*surface));
+	if (!surface) {
+		_eglError(EGL_BAD_ALLOC, "haiku_create_window_surface");
+		return NULL;
+	}
+
+	if (!_eglInitSurface(&surface->surf, disp, EGL_WINDOW_BIT, conf, attrib_list))
+		goto cleanup_surface;
 
-	_eglInitSurface(&surface->surf, disp, EGL_WINDOW_BIT, conf, attrib_list);
 	(&surface->surf)->SwapInterval = 1;
 
-	_eglLog(_EGL_DEBUG, "Creating window");
+	TRACE("Creating window\n");
 	BWindow* win = (BWindow*)native_window;
 
-	_eglLog(_EGL_DEBUG, "Creating GL view");
+	TRACE("Creating GL view\n");
 	surface->gl = new BGLView(win->Bounds(), "OpenGL", B_FOLLOW_ALL_SIDES, 0,
 		BGL_RGB | BGL_DOUBLE | BGL_ALPHA);
 
-	_eglLog(_EGL_DEBUG, "Adding GL");
+	TRACE("Adding GL\n");
 	win->AddChild(surface->gl);
 
-	_eglLog(_EGL_DEBUG, "Showing window");
+	TRACE("Showing window\n");
 	win->Show();
 	return &surface->surf;
+
+cleanup_surface:
+	free(surface);
+	return NULL;
 }
 
 
@@ -199,6 +136,10 @@ haiku_create_pbuffer_surface(_EGLDriver *drv, _EGLDisplay *disp,
 static EGLBoolean
 haiku_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
 {
+	if (_eglPutSurface(surf)) {
+		// XXX: detach haiku_egl_surface::gl from the native window and destroy it
+		free(surf);
+	}
 	return EGL_TRUE;
 }
 
@@ -206,13 +147,18 @@ haiku_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
 static EGLBoolean
 haiku_add_configs_for_visuals(_EGLDisplay *dpy)
 {
-	printf("Adding configs\n");
+	CALLED();
 
 	struct haiku_egl_config* conf;
-	conf = CALLOC_STRUCT(haiku_egl_config);
+	conf = (struct haiku_egl_config*) calloc(1, sizeof (*conf));
+	if (!conf) {
+		_eglError(EGL_BAD_ALLOC, "haiku_add_configs_for_visuals");
+		return EGL_FALSE;
+	}
 
 	_eglInitConfig(&conf->base, dpy, 1);
-	_eglLog(_EGL_DEBUG,"Config inited\n");
+	TRACE("Config inited\n");
+
 	_eglSetConfigKey(&conf->base, EGL_RED_SIZE, 8);
 	_eglSetConfigKey(&conf->base, EGL_BLUE_SIZE, 8);
 	_eglSetConfigKey(&conf->base, EGL_GREEN_SIZE, 8);
@@ -243,76 +189,40 @@ haiku_add_configs_for_visuals(_EGLDisplay *dpy)
 	_eglSetConfigKey(&conf->base, EGL_MAX_PBUFFER_PIXELS, 0); // TODO: How to get the right value ?
 	_eglSetConfigKey(&conf->base, EGL_SURFACE_TYPE, EGL_WINDOW_BIT /*| EGL_PIXMAP_BIT | EGL_PBUFFER_BIT*/);
 
-	printf("Config configuated\n");
+	TRACE("Config configuated\n");
 	if (!_eglValidateConfig(&conf->base, EGL_FALSE)) {
-		_eglLog(_EGL_DEBUG, "Haiku failed to validate config");
-		return EGL_FALSE;
+		_eglLog(_EGL_DEBUG, "Haiku: failed to validate config");
+		goto cleanup;
 	}
-	printf("Validated config\n");
+	TRACE("Validated config\n");
    
 	_eglLinkConfig(&conf->base);
 	if (!_eglGetArraySize(dpy->Configs)) {
 		_eglLog(_EGL_WARNING, "Haiku: failed to create any config");
-		return EGL_FALSE;
+		goto cleanup;
 	}
-	printf("Config successful!\n");
-   
+	TRACE("Config successfull\n");
+
 	return EGL_TRUE;
+
+cleanup:
+	free(conf);
+	return EGL_FALSE;
 }
 
 extern "C"
 EGLBoolean
 init_haiku(_EGLDriver *drv, _EGLDisplay *dpy)
 {
-	_eglLog(_EGL_DEBUG,"\nInitializing Haiku EGL\n");
-	//_EGLDisplay* egl_dpy;
-
-	printf("Initializing Haiku EGL\n");
-	_eglSetLogProc(haiku_log);
-
-	loader_set_logger(_eglLog);
-
-	/*egl_dpy = (_EGLDisplay*) calloc(1, sizeof(_EGLDisplay));
-	if (!egl_dpy)
-		return _eglError(EGL_BAD_ALLOC, "eglInitialize");
-
-	dpy->DriverData=(void*) egl_dpy;
-	if (!dpy->PlatformDisplay) {
-		// OPEN DEVICE 
-		//dri2_dpy->bwindow = (void*)haiku_create_window();
-		//dri2_dpy->own_device = true;
-	} else {
-		//dri2_dpy->bwindow = (BWindow*)dpy->PlatformDisplay;
-	}*/
-	
-	//dri2_dpy->driver_name = strdup("swrast");
-	//if (!dri2_load_driver_swrast(dpy))
-	//   goto cleanup_conn;
-
-	/*dri2_dpy->swrast_loader_extension.base.name = __DRI_SWRAST_LOADER;
-	dri2_dpy->swrast_loader_extension.base.version = __DRI_SWRAST_LOADER_VERSION;
-	dri2_dpy->swrast_loader_extension.getDrawableInfo = swrastGetDrawableInfo;
-	dri2_dpy->swrast_loader_extension.putImage = swrastPutImage;
-	dri2_dpy->swrast_loader_extension.getImage = swrastGetImage;
-
-	dri2_dpy->extensions[0] = &dri2_dpy->swrast_loader_extension.base;
-	dri2_dpy->extensions[1] = NULL;
-	dri2_dpy->extensions[2] = NULL;*/
-
-	/*if (dri2_dpy->bwindow) {
-		if (!dri2_haiku_add_configs_for_visuals(dri2_dpy, dpy))
-			goto cleanup_configs;
-	}*/
-	_eglLog(_EGL_DEBUG,"Add configs");
-    haiku_add_configs_for_visuals(dpy);
-
-	dpy->VersionMajor=1;
-	dpy->VersionMinor=4;
-   
-   //dpy->Extensions.KHR_create_context = true;
+	CALLED();
+
+	TRACE("Add configs\n");
+	if (!haiku_add_configs_for_visuals(dpy))
+		return EGL_FALSE;
 
-	//dri2_dpy->vtbl = &dri2_haiku_display_vtbl;
-	_eglLog(_EGL_DEBUG, "Initialization finished");
+	dpy->Version = 14;
+   
+	TRACE("Initialization finished\n");
 
 	return EGL_TRUE;
 }
@@ -331,13 +241,24 @@ _EGLContext*
 haiku_create_context(_EGLDriver *drv, _EGLDisplay *disp, _EGLConfig *conf,
 	_EGLContext *share_list, const EGLint *attrib_list)
 {
-	_eglLog(_EGL_DEBUG,"Creating context");
+	CALLED();
+
 	struct haiku_egl_context* context;
-	context=(struct haiku_egl_context*)calloc(1,sizeof (*context));
-	if(!_eglInitContext(&context->ctx, disp, conf, attrib_list))
-		printf("ERROR creating context");
-	_eglLog(_EGL_DEBUG, "Context created");
+	context = (struct haiku_egl_context*) calloc(1, sizeof (*context));
+	if (!context) {
+		_eglError(EGL_BAD_ALLOC, "haiku_create_context");
+		return NULL;
+	}
+
+	if (!_eglInitContext(&context->ctx, disp, conf, attrib_list))
+		goto cleanup;
+
+	TRACE("Context created\n");
 	return &context->ctx;
+
+cleanup:
+	free(context);
+	return NULL;
 }
 
 
@@ -345,7 +266,13 @@ extern "C"
 EGLBoolean
 haiku_destroy_context(_EGLDriver* drv, _EGLDisplay *disp, _EGLContext* ctx)
 {
-	ctx=NULL;
+	struct haiku_egl_context* context = haiku_egl_context(ctx);
+
+	if (_eglPutContext(ctx)) {
+		// XXX: teardown the context ?
+		free(context);
+		ctx = NULL;
+	}
 	return EGL_TRUE;
 }
 
@@ -355,11 +282,16 @@ EGLBoolean
 haiku_make_current(_EGLDriver* drv, _EGLDisplay* dpy, _EGLSurface *dsurf,
 		  _EGLSurface *rsurf, _EGLContext *ctx)
 {
-	struct haiku_egl_context* cont=haiku_egl_context(ctx);
-	struct haiku_egl_surface* surf=haiku_egl_surface(dsurf);
+	CALLED();
+
+	struct haiku_egl_context* cont = haiku_egl_context(ctx);
+	struct haiku_egl_surface* surf = haiku_egl_surface(dsurf);
 	_EGLContext *old_ctx;
-    _EGLSurface *old_dsurf, *old_rsurf;
-	_eglBindContext(ctx, dsurf, rsurf, &old_ctx, &old_dsurf, &old_rsurf);
+	_EGLSurface *old_dsurf, *old_rsurf;
+
+	if (!_eglBindContext(ctx, dsurf, rsurf, &old_ctx, &old_dsurf, &old_rsurf))
+		return EGL_FALSE;
+
 	//cont->ctx.DrawSurface=&surf->surf;
 	surf->gl->LockGL();
 	return EGL_TRUE;
@@ -370,7 +302,8 @@ extern "C"
 EGLBoolean
 haiku_swap_buffers(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf)
 {
-	struct haiku_egl_surface* surface=haiku_egl_surface(surf);
+	struct haiku_egl_surface* surface = haiku_egl_surface(surf);
+
 	surface->gl->SwapBuffers();
 	//gl->Render();
 	return EGL_TRUE;
@@ -393,9 +326,15 @@ extern "C"
 _EGLDriver*
 _eglBuiltInDriverHaiku(const char *args)
 {
-	_eglLog(_EGL_DEBUG,"Driver loaded");
+	CALLED();
+
 	struct haiku_egl_driver* driver;
-	driver=(struct haiku_egl_driver*)calloc(1,sizeof(*driver));
+	driver = (struct haiku_egl_driver*) calloc(1, sizeof(*driver));
+	if (!driver) {
+		_eglError(EGL_BAD_ALLOC, "_eglBuiltInDriverHaiku");
+		return NULL;
+	}
+
 	_eglInitDriverFallbacks(&driver->base);
 	driver->base.API.Initialize = init_haiku;
 	driver->base.API.Terminate = haiku_terminate;
@@ -406,32 +345,13 @@ _eglBuiltInDriverHaiku(const char *args)
 	driver->base.API.CreatePixmapSurface = haiku_create_pixmap_surface;
 	driver->base.API.CreatePbufferSurface = haiku_create_pbuffer_surface;
 	driver->base.API.DestroySurface = haiku_destroy_surface;
-	/*
-	driver->API.GetProcAddress = dri2_get_proc_address;
-	driver->API.WaitClient = dri2_wait_client;
-	driver->API.WaitNative = dri2_wait_native;
-	driver->API.BindTexImage = dri2_bind_tex_image;
-	driver->API.ReleaseTexImage = dri2_release_tex_image;
-	driver->API.SwapInterval = dri2_swap_interval;
-	*/
 
 	driver->base.API.SwapBuffers = haiku_swap_buffers;
-	/*
-	driver->API.SwapBuffersWithDamageEXT = dri2_swap_buffers_with_damage;
-	driver->API.SwapBuffersRegionNOK = dri2_swap_buffers_region;
-	driver->API.PostSubBufferNV = dri2_post_sub_buffer;
-	driver->API.CopyBuffers = dri2_copy_buffers,
-	driver->API.QueryBufferAge = dri2_query_buffer_age;
-	driver->API.CreateImageKHR = dri2_create_image;
-	driver->API.DestroyImageKHR = dri2_destroy_image_khr;
-	driver->API.CreateWaylandBufferFromImageWL = dri2_create_wayland_buffer_from_image;
-	driver->API.GetSyncValuesCHROMIUM = dri2_get_sync_values_chromium;
-	*/
 
 	driver->base.Name = "Haiku";
 	driver->base.Unload = haiku_unload;
 
-	_eglLog(_EGL_DEBUG, "API Calls defined");
-	
+	TRACE("API Calls defined\n");
+
 	return &driver->base;
 }
diff --git a/src/egl/main/Android.mk b/src/egl/main/Android.mk
index 12b66d0..0ba7295 100644
--- a/src/egl/main/Android.mk
+++ b/src/egl/main/Android.mk
@@ -43,10 +43,7 @@ LOCAL_CFLAGS := \
 	-D_EGL_DRIVER_SEARCH_DIR=\"/system/lib/egl\" \
 	-D_EGL_OS_UNIX=1
 
-LOCAL_STATIC_LIBRARIES :=
-
 LOCAL_SHARED_LIBRARIES := \
-	libglapi \
 	libdl \
 	libhardware \
 	liblog \
@@ -62,95 +59,20 @@ ifneq ($(MESA_GPU_DRIVERS),swrast)
 LOCAL_SHARED_LIBRARIES += libdrm
 endif
 
-ifeq ($(strip $(MESA_BUILD_CLASSIC)),true)
 LOCAL_CFLAGS += -D_EGL_BUILT_IN_DRIVER_DRI2
-LOCAL_STATIC_LIBRARIES += libmesa_egl_dri2
 
+ifeq ($(strip $(MESA_BUILD_CLASSIC)),true)
 # require i915_dri and/or i965_dri
 LOCAL_REQUIRED_MODULES += \
 	$(addsuffix _dri, $(filter i915 i965, $(MESA_GPU_DRIVERS)))
 endif # MESA_BUILD_CLASSIC
 
 ifeq ($(strip $(MESA_BUILD_GALLIUM)),true)
-
-gallium_DRIVERS :=
-
-# swrast
-gallium_DRIVERS += libmesa_pipe_softpipe libmesa_winsys_sw_android
-
-# freedreno
-ifneq ($(filter freedreno, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_winsys_freedreno libmesa_pipe_freedreno
-LOCAL_SHARED_LIBRARIES += libdrm_freedreno
-endif
-
-# i915g
-ifneq ($(filter i915g, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_winsys_i915 libmesa_pipe_i915
-LOCAL_SHARED_LIBRARIES += libdrm_intel
-endif
-
-# ilo
-ifneq ($(filter ilo, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_winsys_intel libmesa_pipe_ilo
-LOCAL_SHARED_LIBRARIES += libdrm_intel
-endif
-
-# nouveau
-ifneq ($(filter nouveau, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS +=  libmesa_winsys_nouveau libmesa_pipe_nouveau
-LOCAL_SHARED_LIBRARIES += libdrm_nouveau
-LOCAL_SHARED_LIBRARIES += libstlport
-endif
-
-# r300g/r600g/radeonsi
-ifneq ($(filter r300g r600g radeonsi, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_winsys_radeon
-LOCAL_SHARED_LIBRARIES += libdrm_radeon
-ifneq ($(filter r300g, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_r300
-endif # r300g
-ifneq ($(filter r600g radeonsi, $(MESA_GPU_DRIVERS)),)
-ifneq ($(filter r600g, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_r600
-LOCAL_SHARED_LIBRARIES += libstlport
-endif # r600g
-ifneq ($(filter radeonsi, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_pipe_radeonsi
-endif # radeonsi
-gallium_DRIVERS += libmesa_pipe_radeon
-endif # r600g || radeonsi
-endif # r300g || r600g || radeonsi
-
-# vmwgfx
-ifneq ($(filter vmwgfx, $(MESA_GPU_DRIVERS)),)
-gallium_DRIVERS += libmesa_winsys_svga libmesa_pipe_svga
-endif
-
-#
-# Notes about the order here:
-#
-#  * libmesa_st_egl depends on libmesa_winsys_sw_android in $(gallium_DRIVERS)
-#  * libmesa_pipe_r300 in $(gallium_DRIVERS) depends on libmesa_st_mesa and
-#    libmesa_glsl
-#  * libmesa_st_mesa depends on libmesa_glsl
-#  * libmesa_glsl depends on libmesa_glsl_utils
-#
-LOCAL_STATIC_LIBRARIES := \
-	libmesa_egl_gallium \
-	libmesa_st_egl \
-	$(gallium_DRIVERS) \
-	libmesa_st_mesa \
-	libmesa_util \
-	libmesa_glsl \
-	libmesa_glsl_utils \
-	libmesa_gallium \
-	$(LOCAL_STATIC_LIBRARIES)
-
+LOCAL_REQUIRED_MODULES += gallium_dri
 endif # MESA_BUILD_GALLIUM
 
 LOCAL_STATIC_LIBRARIES := \
-	$(LOCAL_STATIC_LIBRARIES) \
+	libmesa_egl_dri2 \
 	libmesa_loader
 
 LOCAL_MODULE := libGLES_mesa
diff --git a/src/egl/main/Makefile.am b/src/egl/main/Makefile.am
index b661736..9030d27 100644
--- a/src/egl/main/Makefile.am
+++ b/src/egl/main/Makefile.am
@@ -68,6 +68,10 @@ if HAVE_EGL_PLATFORM_NULL
 AM_CFLAGS += -DHAVE_NULL_PLATFORM
 endif
 
+if HAVE_EGL_PLATFORM_SURFACELESS
+AM_CFLAGS += -DHAVE_SURFACELESS_PLATFORM
+endif
+
 if HAVE_EGL_DRIVER_DRI2
 AM_CFLAGS += -D_EGL_BUILT_IN_DRIVER_DRI2
 AM_CFLAGS += -DHAVE_XCB_DRI2
diff --git a/src/egl/main/Makefile.sources b/src/egl/main/Makefile.sources
index 304c773..e39a80f 100644
--- a/src/egl/main/Makefile.sources
+++ b/src/egl/main/Makefile.sources
@@ -22,10 +22,6 @@ LIBEGL_C_FILES := \
 	eglimage.h \
 	egllog.c \
 	egllog.h \
-	eglmode.c \
-	eglmode.h \
-	eglscreen.c \
-	eglscreen.h \
 	eglstring.c \
 	eglstring.h \
 	eglsurface.c \
diff --git a/src/egl/main/README.txt b/src/egl/main/README.txt
index b3d253d..1af9959 100644
--- a/src/egl/main/README.txt
+++ b/src/egl/main/README.txt
@@ -16,10 +16,10 @@ The EGL code here basically consists of two things:
 
 Bootstrapping:
 
-When the apps calls eglOpenDisplay() a device driver is selected and loaded
-(look for dlsym() or LoadLibrary() in egldriver.c).
+When the app calls eglInitialize() a device driver is selected and loaded
+(look for _eglAddDrivers() and _eglLoadModule() in egldriver.c).
 
-The driver's _eglMain() function is then called.  This driver function
+The built-in driver's entry point function is then called.  This driver function
 allocates, initializes and returns a new _EGLDriver object (usually a
 subclass of that type).
 
@@ -30,10 +30,9 @@ driver->API.Initialize and driver->API.Terminate _must_ be implemented
 with driver-specific code (no default/fallback function is possible).
 
 
-A bit later, the app will call eglInitialize().  This will get routed
-to the driver->API.Initialize() function.  Any additional driver
-initialization that wasn't done in _eglMain() should be done at this
-point.  Typically, this will involve setting up visual configs, etc.
+Shortly after, the driver->API.Initialize() function is executed.  Any additional
+driver initialization that wasn't done in the driver entry point should be
+done at this point.  Typically, this will involve setting up visual configs, etc.
 
 
 
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index ba1d0dd..105e919 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -98,8 +98,6 @@
 #include "egldriver.h"
 #include "eglsurface.h"
 #include "eglconfig.h"
-#include "eglscreen.h"
-#include "eglmode.h"
 #include "eglimage.h"
 #include "eglsync.h"
 #include "eglstring.h"
@@ -155,12 +153,6 @@
 #define _EGL_CHECK_CONFIG(disp, conf, ret, drv) \
    _EGL_CHECK_OBJECT(disp, Config, conf, ret, drv)
 
-#define _EGL_CHECK_SCREEN(disp, scrn, ret, drv) \
-   _EGL_CHECK_OBJECT(disp, Screen, scrn, ret, drv)
-
-#define _EGL_CHECK_MODE(disp, m, ret, drv) \
-   _EGL_CHECK_OBJECT(disp, Mode, m, ret, drv)
-
 #define _EGL_CHECK_SYNC(disp, s, ret, drv) \
    _EGL_CHECK_OBJECT(disp, Sync, s, ret, drv)
 
@@ -236,40 +228,6 @@ _eglCheckSync(_EGLDisplay *disp, _EGLSync *s, const char *msg)
 }
 
 
-#ifdef EGL_MESA_screen_surface
-
-
-static inline _EGLDriver *
-_eglCheckScreen(_EGLDisplay *disp, _EGLScreen *scrn, const char *msg)
-{
-   _EGLDriver *drv = _eglCheckDisplay(disp, msg);
-   if (!drv)
-      return NULL;
-   if (!scrn) {
-      _eglError(EGL_BAD_SCREEN_MESA, msg);
-      return NULL;
-   }
-   return drv;
-}
-
-
-static inline _EGLDriver *
-_eglCheckMode(_EGLDisplay *disp, _EGLMode *m, const char *msg)
-{
-   _EGLDriver *drv = _eglCheckDisplay(disp, msg);
-   if (!drv)
-      return NULL;
-   if (!m) {
-      _eglError(EGL_BAD_MODE_MESA, msg);
-      return NULL;
-   }
-   return drv;
-}
-
-
-#endif /* EGL_MESA_screen_surface */
-
-
 /**
  * Lookup and lock a display.
  */
@@ -293,6 +251,31 @@ _eglUnlockDisplay(_EGLDisplay *dpy)
 }
 
 
+static EGLint *
+_eglConvertAttribsToInt(const EGLAttrib *attr_list)
+{
+   EGLint *int_attribs = NULL;
+
+   /* Convert attributes from EGLAttrib[] to EGLint[] */
+   if (attr_list) {
+      int i, size = 0;
+
+      while (attr_list[size] != EGL_NONE)
+         size += 2;
+
+      size += 1; /* add space for EGL_NONE */
+
+      int_attribs = calloc(size, sizeof(int_attribs[0]));
+      if (!int_attribs)
+         return NULL;
+
+      for (i = 0; i < size; i++)
+         int_attribs[i] = attr_list[i];
+   }
+   return int_attribs;
+}
+
+
 /**
  * This is typically the first EGL function that an application calls.
  * It associates a private _EGLDisplay object to the native display.
@@ -312,7 +295,7 @@ eglGetDisplay(EGLNativeDisplayType nativeDisplay)
    return _eglGetDisplayHandle(dpy);
 }
 
-EGLDisplay EGLAPIENTRY
+static EGLDisplay EGLAPIENTRY
 eglGetPlatformDisplayEXT(EGLenum platform, void *native_display,
                          const EGLint *attrib_list)
 {
@@ -343,6 +326,21 @@ eglGetPlatformDisplayEXT(EGLenum platform, void *native_display,
    return _eglGetDisplayHandle(dpy);
 }
 
+EGLDisplay EGLAPIENTRY
+eglGetPlatformDisplay(EGLenum platform, void *native_display,
+                      const EGLAttrib *attrib_list)
+{
+   EGLDisplay display;
+   EGLint *int_attribs = _eglConvertAttribsToInt(attrib_list);
+
+   if (attrib_list && !int_attribs)
+      RETURN_EGL_ERROR(NULL, EGL_BAD_ALLOC, EGL_NO_DISPLAY);
+
+   display = eglGetPlatformDisplayEXT(platform, native_display, int_attribs);
+   free(int_attribs);
+   return display;
+}
+
 /**
  * Copy the extension into the string and update the string pointer.
  */
@@ -383,8 +381,6 @@ _eglCreateExtensionsString(_EGLDisplay *dpy)
 
    char *exts = dpy->ExtensionsString;
 
-   _EGL_CHECK_EXTENSION(MESA_screen_surface);
-   _EGL_CHECK_EXTENSION(MESA_copy_context);
    _EGL_CHECK_EXTENSION(MESA_drm_display);
    _EGL_CHECK_EXTENSION(MESA_drm_image);
    _EGL_CHECK_EXTENSION(MESA_configless_context);
@@ -451,6 +447,26 @@ _eglCreateAPIsString(_EGLDisplay *dpy)
    assert(strlen(dpy->ClientAPIsString) < sizeof(dpy->ClientAPIsString));
 }
 
+static void
+_eglComputeVersion(_EGLDisplay *disp)
+{
+   disp->Version = 14;
+
+   if (disp->Extensions.KHR_fence_sync &&
+       disp->Extensions.KHR_cl_event2 &&
+       disp->Extensions.KHR_wait_sync &&
+       disp->Extensions.KHR_image_base &&
+       disp->Extensions.KHR_gl_texture_2D_image &&
+       disp->Extensions.KHR_gl_texture_3D_image &&
+       disp->Extensions.KHR_gl_texture_cubemap_image &&
+       disp->Extensions.KHR_gl_renderbuffer_image &&
+       disp->Extensions.KHR_create_context &&
+       disp->Extensions.EXT_create_context_robustness &&
+       disp->Extensions.KHR_get_all_proc_addresses &&
+       disp->Extensions.KHR_gl_colorspace &&
+       disp->Extensions.KHR_surfaceless_context)
+      disp->Version = 15;
+}
 
 /**
  * This is typically the second EGL function that an application calls.
@@ -488,17 +504,18 @@ eglInitialize(EGLDisplay dpy, EGLint *major, EGLint *minor)
        */
       disp->Extensions.KHR_get_all_proc_addresses = EGL_TRUE;
 
+      _eglComputeVersion(disp);
       _eglCreateExtensionsString(disp);
       _eglCreateAPIsString(disp);
       _eglsnprintf(disp->VersionString, sizeof(disp->VersionString),
-              "%d.%d (%s)", disp->VersionMajor, disp->VersionMinor,
+              "%d.%d (%s)", disp->Version / 10, disp->Version % 10,
               disp->Driver->Name);
    }
 
    /* Update applications version of major and minor if not NULL */
    if ((major != NULL) && (minor != NULL)) {
-      *major = disp->VersionMajor;
-      *minor = disp->VersionMinor;
+      *major = disp->Version / 10;
+      *minor = disp->Version % 10;
    }
 
    RETURN_EGL_SUCCESS(disp, EGL_TRUE);
@@ -740,7 +757,7 @@ eglCreateWindowSurface(EGLDisplay dpy, EGLConfig config,
 }
 
 
-EGLSurface EGLAPIENTRY
+static EGLSurface EGLAPIENTRY
 eglCreatePlatformWindowSurfaceEXT(EGLDisplay dpy, EGLConfig config,
                                   void *native_window,
                                   const EGLint *attrib_list)
@@ -765,6 +782,24 @@ eglCreatePlatformWindowSurfaceEXT(EGLDisplay dpy, EGLConfig config,
 }
 
 
+EGLSurface EGLAPIENTRY
+eglCreatePlatformWindowSurface(EGLDisplay dpy, EGLConfig config,
+                               void *native_window,
+                               const EGLAttrib *attrib_list)
+{
+   EGLSurface surface;
+   EGLint *int_attribs = _eglConvertAttribsToInt(attrib_list);
+
+   if (attrib_list && !int_attribs)
+      RETURN_EGL_ERROR(NULL, EGL_BAD_ALLOC, EGL_NO_SURFACE);
+
+   surface = eglCreatePlatformWindowSurfaceEXT(dpy, config, native_window,
+                                               int_attribs);
+   free(int_attribs);
+   return surface;
+}
+
+
 static EGLSurface
 _eglCreatePixmapSurfaceCommon(_EGLDisplay *disp, EGLConfig config,
                               void *native_pixmap, const EGLint *attrib_list)
@@ -793,7 +828,7 @@ eglCreatePixmapSurface(EGLDisplay dpy, EGLConfig config,
                                          attrib_list);
 }
 
-EGLSurface EGLAPIENTRY
+static EGLSurface EGLAPIENTRY
 eglCreatePlatformPixmapSurfaceEXT(EGLDisplay dpy, EGLConfig config,
                                    void *native_pixmap,
                                    const EGLint *attrib_list)
@@ -819,6 +854,24 @@ eglCreatePlatformPixmapSurfaceEXT(EGLDisplay dpy, EGLConfig config,
 
 
 EGLSurface EGLAPIENTRY
+eglCreatePlatformPixmapSurface(EGLDisplay dpy, EGLConfig config,
+                               void *native_pixmap,
+                               const EGLAttrib *attrib_list)
+{
+   EGLSurface surface;
+   EGLint *int_attribs = _eglConvertAttribsToInt(attrib_list);
+
+   if (attrib_list && !int_attribs)
+      RETURN_EGL_ERROR(NULL, EGL_BAD_ALLOC, EGL_NO_SURFACE);
+
+   surface = eglCreatePlatformPixmapSurfaceEXT(dpy, config, native_pixmap,
+                                               int_attribs);
+   free(int_attribs);
+   return surface;
+}
+
+
+EGLSurface EGLAPIENTRY
 eglCreatePbufferSurface(EGLDisplay dpy, EGLConfig config,
                         const EGLint *attrib_list)
 {
@@ -964,7 +1017,7 @@ eglSwapBuffers(EGLDisplay dpy, EGLSurface surface)
 
 #ifdef EGL_EXT_swap_buffers_with_damage
 
-EGLBoolean EGLAPIENTRY
+static EGLBoolean EGLAPIENTRY
 eglSwapBuffersWithDamageEXT(EGLDisplay dpy, EGLSurface surface,
                             EGLint *rects, EGLint n_rects)
 {
@@ -1151,352 +1204,9 @@ eglGetError(void)
 }
 
 
-__eglMustCastToProperFunctionPointerType EGLAPIENTRY
-eglGetProcAddress(const char *procname)
-{
-   static const struct {
-      const char *name;
-      _EGLProc function;
-   } egl_functions[] = {
-      /* core functions should not be queryable, but, well... */
-#ifdef _EGL_GET_CORE_ADDRESSES
-      /* alphabetical order */
-      { "eglBindAPI", (_EGLProc) eglBindAPI },
-      { "eglBindTexImage", (_EGLProc) eglBindTexImage },
-      { "eglChooseConfig", (_EGLProc) eglChooseConfig },
-      { "eglCopyBuffers", (_EGLProc) eglCopyBuffers },
-      { "eglCreateContext", (_EGLProc) eglCreateContext },
-      { "eglCreatePbufferFromClientBuffer", (_EGLProc) eglCreatePbufferFromClientBuffer },
-      { "eglCreatePbufferSurface", (_EGLProc) eglCreatePbufferSurface },
-      { "eglCreatePixmapSurface", (_EGLProc) eglCreatePixmapSurface },
-      { "eglCreateWindowSurface", (_EGLProc) eglCreateWindowSurface },
-      { "eglDestroyContext", (_EGLProc) eglDestroyContext },
-      { "eglDestroySurface", (_EGLProc) eglDestroySurface },
-      { "eglGetConfigAttrib", (_EGLProc) eglGetConfigAttrib },
-      { "eglGetConfigs", (_EGLProc) eglGetConfigs },
-      { "eglGetCurrentContext", (_EGLProc) eglGetCurrentContext },
-      { "eglGetCurrentDisplay", (_EGLProc) eglGetCurrentDisplay },
-      { "eglGetCurrentSurface", (_EGLProc) eglGetCurrentSurface },
-      { "eglGetDisplay", (_EGLProc) eglGetDisplay },
-      { "eglGetError", (_EGLProc) eglGetError },
-      { "eglGetProcAddress", (_EGLProc) eglGetProcAddress },
-      { "eglInitialize", (_EGLProc) eglInitialize },
-      { "eglMakeCurrent", (_EGLProc) eglMakeCurrent },
-      { "eglQueryAPI", (_EGLProc) eglQueryAPI },
-      { "eglQueryContext", (_EGLProc) eglQueryContext },
-      { "eglQueryString", (_EGLProc) eglQueryString },
-      { "eglQuerySurface", (_EGLProc) eglQuerySurface },
-      { "eglReleaseTexImage", (_EGLProc) eglReleaseTexImage },
-      { "eglReleaseThread", (_EGLProc) eglReleaseThread },
-      { "eglSurfaceAttrib", (_EGLProc) eglSurfaceAttrib },
-      { "eglSwapBuffers", (_EGLProc) eglSwapBuffers },
-      { "eglSwapInterval", (_EGLProc) eglSwapInterval },
-      { "eglTerminate", (_EGLProc) eglTerminate },
-      { "eglWaitClient", (_EGLProc) eglWaitClient },
-      { "eglWaitGL", (_EGLProc) eglWaitGL },
-      { "eglWaitNative", (_EGLProc) eglWaitNative },
-#endif /* _EGL_GET_CORE_ADDRESSES */
-#ifdef EGL_MESA_screen_surface
-      { "eglChooseModeMESA", (_EGLProc) eglChooseModeMESA },
-      { "eglGetModesMESA", (_EGLProc) eglGetModesMESA },
-      { "eglGetModeAttribMESA", (_EGLProc) eglGetModeAttribMESA },
-      { "eglCopyContextMESA", (_EGLProc) eglCopyContextMESA },
-      { "eglGetScreensMESA", (_EGLProc) eglGetScreensMESA },
-      { "eglCreateScreenSurfaceMESA", (_EGLProc) eglCreateScreenSurfaceMESA },
-      { "eglShowScreenSurfaceMESA", (_EGLProc) eglShowScreenSurfaceMESA },
-      { "eglScreenPositionMESA", (_EGLProc) eglScreenPositionMESA },
-      { "eglQueryScreenMESA", (_EGLProc) eglQueryScreenMESA },
-      { "eglQueryScreenSurfaceMESA", (_EGLProc) eglQueryScreenSurfaceMESA },
-      { "eglQueryScreenModeMESA", (_EGLProc) eglQueryScreenModeMESA },
-      { "eglQueryModeStringMESA", (_EGLProc) eglQueryModeStringMESA },
-#endif /* EGL_MESA_screen_surface */
 #ifdef EGL_MESA_drm_display
-      { "eglGetDRMDisplayMESA", (_EGLProc) eglGetDRMDisplayMESA },
-#endif
-      { "eglCreateImageKHR", (_EGLProc) eglCreateImageKHR },
-      { "eglDestroyImageKHR", (_EGLProc) eglDestroyImageKHR },
-      { "eglCreateSyncKHR", (_EGLProc) eglCreateSyncKHR },
-      { "eglCreateSync64KHR", (_EGLProc) eglCreateSync64KHR },
-      { "eglDestroySyncKHR", (_EGLProc) eglDestroySyncKHR },
-      { "eglClientWaitSyncKHR", (_EGLProc) eglClientWaitSyncKHR },
-      { "eglWaitSyncKHR", (_EGLProc) eglWaitSyncKHR },
-      { "eglSignalSyncKHR", (_EGLProc) eglSignalSyncKHR },
-      { "eglGetSyncAttribKHR", (_EGLProc) eglGetSyncAttribKHR },
-#ifdef EGL_NOK_swap_region
-      { "eglSwapBuffersRegionNOK", (_EGLProc) eglSwapBuffersRegionNOK },
-#endif
-#ifdef EGL_MESA_drm_image
-      { "eglCreateDRMImageMESA", (_EGLProc) eglCreateDRMImageMESA },
-      { "eglExportDRMImageMESA", (_EGLProc) eglExportDRMImageMESA },
-#endif
-#ifdef EGL_WL_bind_wayland_display
-      { "eglBindWaylandDisplayWL", (_EGLProc) eglBindWaylandDisplayWL },
-      { "eglUnbindWaylandDisplayWL", (_EGLProc) eglUnbindWaylandDisplayWL },
-      { "eglQueryWaylandBufferWL", (_EGLProc) eglQueryWaylandBufferWL },
-#endif
-#ifdef EGL_WL_create_wayland_buffer_from_image
-      { "eglCreateWaylandBufferFromImageWL", (_EGLProc) eglCreateWaylandBufferFromImageWL },
-#endif
-      { "eglPostSubBufferNV", (_EGLProc) eglPostSubBufferNV },
-#ifdef EGL_EXT_swap_buffers_with_damage
-      { "eglSwapBuffersWithDamageEXT", (_EGLProc) eglSwapBuffersWithDamageEXT },
-#endif
-      { "eglGetPlatformDisplayEXT", (_EGLProc) eglGetPlatformDisplayEXT },
-      { "eglCreatePlatformWindowSurfaceEXT", (_EGLProc) eglCreatePlatformWindowSurfaceEXT },
-      { "eglCreatePlatformPixmapSurfaceEXT", (_EGLProc) eglCreatePlatformPixmapSurfaceEXT },
-      { "eglGetSyncValuesCHROMIUM", (_EGLProc) eglGetSyncValuesCHROMIUM },
-#ifdef EGL_MESA_dma_buf_image_export
-      { "eglExportDMABUFImageQueryMESA", (_EGLProc) eglExportDMABUFImageQueryMESA },
-      { "eglExportDMABUFImageMESA", (_EGLProc) eglExportDMABUFImageMESA },
-#endif
-      { NULL, NULL }
-   };
-   EGLint i;
-   _EGLProc ret;
-
-   if (!procname)
-      RETURN_EGL_SUCCESS(NULL, NULL);
-
-   ret = NULL;
-   if (strncmp(procname, "egl", 3) == 0) {
-      for (i = 0; egl_functions[i].name; i++) {
-         if (strcmp(egl_functions[i].name, procname) == 0) {
-            ret = egl_functions[i].function;
-            break;
-         }
-      }
-   }
-   if (!ret)
-      ret = _eglGetDriverProc(procname);
-
-   RETURN_EGL_SUCCESS(NULL, ret);
-}
-
-
-#ifdef EGL_MESA_screen_surface
-
-
-/*
- * EGL_MESA_screen extension
- */
-
-EGLBoolean EGLAPIENTRY
-eglChooseModeMESA(EGLDisplay dpy, EGLScreenMESA screen,
-                  const EGLint *attrib_list, EGLModeMESA *modes,
-                  EGLint modes_size, EGLint *num_modes)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLScreen *scrn = _eglLookupScreen(screen, disp);
-   _EGLDriver *drv;
-   EGLBoolean ret;
-
-   _EGL_CHECK_SCREEN(disp, scrn, EGL_FALSE, drv);
-   ret = drv->API.ChooseModeMESA(drv, disp, scrn, attrib_list,
-         modes, modes_size, num_modes);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-EGLBoolean EGLAPIENTRY
-eglGetModesMESA(EGLDisplay dpy, EGLScreenMESA screen, EGLModeMESA *modes,
-                EGLint mode_size, EGLint *num_mode)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLScreen *scrn = _eglLookupScreen(screen, disp);
-   _EGLDriver *drv;
-   EGLBoolean ret;
-
-   _EGL_CHECK_SCREEN(disp, scrn, EGL_FALSE, drv);
-   ret = drv->API.GetModesMESA(drv, disp, scrn, modes, mode_size, num_mode);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-EGLBoolean EGLAPIENTRY
-eglGetModeAttribMESA(EGLDisplay dpy, EGLModeMESA mode,
-                     EGLint attribute, EGLint *value)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLMode *m = _eglLookupMode(mode, disp);
-   _EGLDriver *drv;
-   EGLBoolean ret;
-
-   _EGL_CHECK_MODE(disp, m, EGL_FALSE, drv);
-   ret = drv->API.GetModeAttribMESA(drv, disp, m, attribute, value);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-EGLBoolean EGLAPIENTRY
-eglCopyContextMESA(EGLDisplay dpy, EGLContext source, EGLContext dest,
-                   EGLint mask)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLContext *source_context = _eglLookupContext(source, disp);
-   _EGLContext *dest_context = _eglLookupContext(dest, disp);
-   _EGLDriver *drv;
-   EGLBoolean ret;
-
-   _EGL_CHECK_CONTEXT(disp, source_context, EGL_FALSE, drv);
-   if (!dest_context)
-      RETURN_EGL_ERROR(disp, EGL_BAD_CONTEXT, EGL_FALSE);
-
-   ret = drv->API.CopyContextMESA(drv, disp,
-         source_context, dest_context, mask);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-EGLBoolean EGLAPIENTRY
-eglGetScreensMESA(EGLDisplay dpy, EGLScreenMESA *screens,
-                  EGLint max_screens, EGLint *num_screens)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLDriver *drv;
-   EGLBoolean ret;
-
-   _EGL_CHECK_DISPLAY(disp, EGL_FALSE, drv);
-   ret = drv->API.GetScreensMESA(drv, disp, screens, max_screens, num_screens);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
 
-EGLSurface EGLAPIENTRY
-eglCreateScreenSurfaceMESA(EGLDisplay dpy, EGLConfig config,
-                           const EGLint *attrib_list)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLConfig *conf = _eglLookupConfig(config, disp);
-   _EGLDriver *drv;
-   _EGLSurface *surf;
-   EGLSurface ret;
-
-   _EGL_CHECK_CONFIG(disp, conf, EGL_NO_SURFACE, drv);
-
-   surf = drv->API.CreateScreenSurfaceMESA(drv, disp, conf, attrib_list);
-   ret = (surf) ? _eglLinkSurface(surf) : EGL_NO_SURFACE;
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-EGLBoolean EGLAPIENTRY
-eglShowScreenSurfaceMESA(EGLDisplay dpy, EGLint screen,
-                         EGLSurface surface, EGLModeMESA mode)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLScreen *scrn = _eglLookupScreen((EGLScreenMESA) screen, disp);
-   _EGLSurface *surf = _eglLookupSurface(surface, disp);
-   _EGLMode *m = _eglLookupMode(mode, disp);
-   _EGLDriver *drv;
-   EGLBoolean ret;
-
-   _EGL_CHECK_SCREEN(disp, scrn, EGL_FALSE, drv);
-   if (!surf && surface != EGL_NO_SURFACE)
-      RETURN_EGL_ERROR(disp, EGL_BAD_SURFACE, EGL_FALSE);
-   if (!m && mode != EGL_NO_MODE_MESA)
-      RETURN_EGL_ERROR(disp, EGL_BAD_MODE_MESA, EGL_FALSE);
-
-   ret = drv->API.ShowScreenSurfaceMESA(drv, disp, scrn, surf, m);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-EGLBoolean EGLAPIENTRY
-eglScreenPositionMESA(EGLDisplay dpy, EGLScreenMESA screen, EGLint x, EGLint y)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLScreen *scrn = _eglLookupScreen(screen, disp);
-   _EGLDriver *drv;
-   EGLBoolean ret;
-
-   _EGL_CHECK_SCREEN(disp, scrn, EGL_FALSE, drv);
-   ret = drv->API.ScreenPositionMESA(drv, disp, scrn, x, y);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-EGLBoolean EGLAPIENTRY
-eglQueryScreenMESA(EGLDisplay dpy, EGLScreenMESA screen,
-                   EGLint attribute, EGLint *value)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLScreen *scrn = _eglLookupScreen(screen, disp);
-   _EGLDriver *drv;
-   EGLBoolean ret;
-
-   _EGL_CHECK_SCREEN(disp, scrn, EGL_FALSE, drv);
-   ret = drv->API.QueryScreenMESA(drv, disp, scrn, attribute, value);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-EGLBoolean EGLAPIENTRY
-eglQueryScreenSurfaceMESA(EGLDisplay dpy, EGLScreenMESA screen,
-                          EGLSurface *surface)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLScreen *scrn = _eglLookupScreen((EGLScreenMESA) screen, disp);
-   _EGLDriver *drv;
-   _EGLSurface *surf;
-   EGLBoolean ret;
-
-   _EGL_CHECK_SCREEN(disp, scrn, EGL_FALSE, drv);
-   ret = drv->API.QueryScreenSurfaceMESA(drv, disp, scrn, &surf);
-   if (ret && surface)
-      *surface = _eglGetSurfaceHandle(surf);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-EGLBoolean EGLAPIENTRY
-eglQueryScreenModeMESA(EGLDisplay dpy, EGLScreenMESA screen, EGLModeMESA *mode)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLScreen *scrn = _eglLookupScreen((EGLScreenMESA) screen, disp);
-   _EGLDriver *drv;
-   _EGLMode *m;
-   EGLBoolean ret;
-
-   _EGL_CHECK_SCREEN(disp, scrn, EGL_FALSE, drv);
-   ret = drv->API.QueryScreenModeMESA(drv, disp, scrn, &m);
-   if (ret && mode)
-      *mode = m->Handle;
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-const char * EGLAPIENTRY
-eglQueryModeStringMESA(EGLDisplay dpy, EGLModeMESA mode)
-{
-   _EGLDisplay *disp = _eglLockDisplay(dpy);
-   _EGLMode *m = _eglLookupMode(mode, disp);
-   _EGLDriver *drv;
-   const char *ret;
-
-   _EGL_CHECK_MODE(disp, m, NULL, drv);
-   ret = drv->API.QueryModeStringMESA(drv, disp, m);
-
-   RETURN_EGL_EVAL(disp, ret);
-}
-
-
-#endif /* EGL_MESA_screen_surface */
-
-
-#ifdef EGL_MESA_drm_display
-
-EGLDisplay EGLAPIENTRY
+static EGLDisplay EGLAPIENTRY
 eglGetDRMDisplayMESA(int fd)
 {
    _EGLDisplay *dpy = _eglFindDisplay(_EGL_PLATFORM_DRM, (void *) (intptr_t) fd);
@@ -1607,7 +1317,7 @@ eglReleaseThread(void)
 }
 
 
-EGLImageKHR EGLAPIENTRY
+static EGLImage EGLAPIENTRY
 eglCreateImageKHR(EGLDisplay dpy, EGLContext ctx, EGLenum target,
                   EGLClientBuffer buffer, const EGLint *attr_list)
 {
@@ -1615,7 +1325,7 @@ eglCreateImageKHR(EGLDisplay dpy, EGLContext ctx, EGLenum target,
    _EGLContext *context = _eglLookupContext(ctx, disp);
    _EGLDriver *drv;
    _EGLImage *img;
-   EGLImageKHR ret;
+   EGLImage ret;
 
    _EGL_CHECK_DISPLAY(disp, EGL_NO_IMAGE_KHR, drv);
    if (!disp->Extensions.KHR_image_base)
@@ -1636,8 +1346,24 @@ eglCreateImageKHR(EGLDisplay dpy, EGLContext ctx, EGLenum target,
 }
 
 
+EGLImage EGLAPIENTRY
+eglCreateImage(EGLDisplay dpy, EGLContext ctx, EGLenum target,
+               EGLClientBuffer buffer, const EGLAttrib *attr_list)
+{
+   /* eglCreateImageKHR takes EGLint attributes; convert the EGLAttrib list */
+   EGLint *attribs32 = _eglConvertAttribsToInt(attr_list);
+   EGLImage result;
+
+   if (!attribs32 && attr_list)
+      RETURN_EGL_ERROR(NULL, EGL_BAD_ALLOC, EGL_NO_IMAGE);
+   result = eglCreateImageKHR(dpy, ctx, target, buffer, attribs32);
+   free(attribs32);
+   return result;
+}
+
+
 EGLBoolean EGLAPIENTRY
-eglDestroyImageKHR(EGLDisplay dpy, EGLImageKHR image)
+eglDestroyImage(EGLDisplay dpy, EGLImage image)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
    _EGLImage *img = _eglLookupImage(image, disp);
@@ -1657,15 +1383,16 @@ eglDestroyImageKHR(EGLDisplay dpy, EGLImageKHR image)
 }
 
 
-static EGLSyncKHR
+static EGLSync
 _eglCreateSync(EGLDisplay dpy, EGLenum type, const EGLint *attrib_list,
-               const EGLAttribKHR *attrib_list64, EGLBoolean is64)
+               const EGLAttrib *attrib_list64, EGLBoolean is64,
+               EGLenum invalid_type_error)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
    _EGLContext *ctx = _eglGetCurrentContext();
    _EGLDriver *drv;
    _EGLSync *sync;
-   EGLSyncKHR ret;
+   EGLSync ret;
 
    _EGL_CHECK_DISPLAY(disp, EGL_NO_SYNC_KHR, drv);
 
@@ -1680,18 +1407,18 @@ _eglCreateSync(EGLDisplay dpy, EGLenum type, const EGLint *attrib_list,
    switch (type) {
    case EGL_SYNC_FENCE_KHR:
       if (!disp->Extensions.KHR_fence_sync)
-         RETURN_EGL_ERROR(disp, EGL_BAD_ATTRIBUTE, EGL_NO_SYNC_KHR);
+         RETURN_EGL_ERROR(disp, invalid_type_error, EGL_NO_SYNC_KHR);
       break;
    case EGL_SYNC_REUSABLE_KHR:
       if (!disp->Extensions.KHR_reusable_sync)
-         RETURN_EGL_ERROR(disp, EGL_BAD_ATTRIBUTE, EGL_NO_SYNC_KHR);
+         RETURN_EGL_ERROR(disp, invalid_type_error, EGL_NO_SYNC_KHR);
       break;
    case EGL_SYNC_CL_EVENT_KHR:
       if (!disp->Extensions.KHR_cl_event2)
-         RETURN_EGL_ERROR(disp, EGL_BAD_ATTRIBUTE, EGL_NO_SYNC_KHR);
+         RETURN_EGL_ERROR(disp, invalid_type_error, EGL_NO_SYNC_KHR);
       break;
    default:
-      RETURN_EGL_ERROR(disp, EGL_BAD_ATTRIBUTE, EGL_NO_SYNC_KHR);
+      RETURN_EGL_ERROR(disp, invalid_type_error, EGL_NO_SYNC_KHR);
    }
 
    sync = drv->API.CreateSyncKHR(drv, disp, type, attrib_list, attrib_list64);
@@ -1701,22 +1428,32 @@ _eglCreateSync(EGLDisplay dpy, EGLenum type, const EGLint *attrib_list,
 }
 
 
-EGLSyncKHR EGLAPIENTRY
+static EGLSync EGLAPIENTRY
 eglCreateSyncKHR(EGLDisplay dpy, EGLenum type, const EGLint *attrib_list)
 {
-   return _eglCreateSync(dpy, type, attrib_list, NULL, EGL_FALSE);
+   return _eglCreateSync(dpy, type, attrib_list, NULL, EGL_FALSE,
+                         EGL_BAD_ATTRIBUTE);
+}
+
+
+static EGLSync EGLAPIENTRY
+eglCreateSync64KHR(EGLDisplay dpy, EGLenum type, const EGLAttrib *attrib_list)
+{
+   return _eglCreateSync(dpy, type, NULL, attrib_list, EGL_TRUE,
+                         EGL_BAD_ATTRIBUTE);
 }
 
 
-EGLSyncKHR EGLAPIENTRY
-eglCreateSync64KHR(EGLDisplay dpy, EGLenum type, const EGLAttribKHR *attrib_list)
+EGLSync EGLAPIENTRY
+eglCreateSync(EGLDisplay dpy, EGLenum type, const EGLAttrib *attrib_list)
 {
-   return _eglCreateSync(dpy, type, NULL, attrib_list, EGL_TRUE);
+   return _eglCreateSync(dpy, type, NULL, attrib_list, EGL_TRUE,
+                         EGL_BAD_PARAMETER);
 }
 
 
 EGLBoolean EGLAPIENTRY
-eglDestroySyncKHR(EGLDisplay dpy, EGLSyncKHR sync)
+eglDestroySync(EGLDisplay dpy, EGLSync sync)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
    _EGLSync *s = _eglLookupSync(sync, disp);
@@ -1735,7 +1472,7 @@ eglDestroySyncKHR(EGLDisplay dpy, EGLSyncKHR sync)
 
 
 EGLint EGLAPIENTRY
-eglClientWaitSyncKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLint flags, EGLTimeKHR timeout)
+eglClientWaitSync(EGLDisplay dpy, EGLSync sync, EGLint flags, EGLTime timeout)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
    _EGLSync *s = _eglLookupSync(sync, disp);
@@ -1755,8 +1492,8 @@ eglClientWaitSyncKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLint flags, EGLTimeKHR t
 }
 
 
-EGLint EGLAPIENTRY
-eglWaitSyncKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLint flags)
+static EGLint EGLAPIENTRY
+eglWaitSyncKHR(EGLDisplay dpy, EGLSync sync, EGLint flags)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
    _EGLSync *s = _eglLookupSync(sync, disp);
@@ -1782,7 +1519,18 @@ eglWaitSyncKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLint flags)
 
 
 EGLBoolean EGLAPIENTRY
-eglSignalSyncKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLenum mode)
+eglWaitSync(EGLDisplay dpy, EGLSync sync, EGLint flags)
+{
+   /* The KHR version returns EGLint, while the core version returns
+    * EGLBoolean. In both cases, the return values can only be EGL_FALSE and
+    * EGL_TRUE.
+    */
+   return eglWaitSyncKHR(dpy, sync, flags);
+}
+
+
+static EGLBoolean EGLAPIENTRY
+eglSignalSyncKHR(EGLDisplay dpy, EGLSync sync, EGLenum mode)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
    _EGLSync *s = _eglLookupSync(sync, disp);
@@ -1798,7 +1546,7 @@ eglSignalSyncKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLenum mode)
 
 
 EGLBoolean EGLAPIENTRY
-eglGetSyncAttribKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *value)
+eglGetSyncAttrib(EGLDisplay dpy, EGLSync sync, EGLint attribute, EGLAttrib *value)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
    _EGLSync *s = _eglLookupSync(sync, disp);
@@ -1808,15 +1556,33 @@ eglGetSyncAttribKHR(EGLDisplay dpy, EGLSyncKHR sync, EGLint attribute, EGLint *v
    _EGL_CHECK_SYNC(disp, s, EGL_FALSE, drv);
    assert(disp->Extensions.KHR_reusable_sync ||
           disp->Extensions.KHR_fence_sync);
-   ret = drv->API.GetSyncAttribKHR(drv, disp, s, attribute, value);
+   ret = drv->API.GetSyncAttrib(drv, disp, s, attribute, value);
 
    RETURN_EGL_EVAL(disp, ret);
 }
 
 
+static EGLBoolean EGLAPIENTRY
+eglGetSyncAttribKHR(EGLDisplay dpy, EGLSync sync, EGLint attribute, EGLint *value)
+{
+   EGLAttrib attrib;
+   EGLBoolean result;
+
+   /* <value> is dereferenced below, so a NULL pointer must be rejected. */
+   if (!value)
+      RETURN_EGL_ERROR(NULL, EGL_BAD_PARAMETER, EGL_FALSE);
+   attrib = *value;
+   result = eglGetSyncAttrib(dpy, sync, attribute, &attrib);
+   /* EGL_KHR_fence_sync: "If any error occurs, <*value> is not modified." */
+   if (result != EGL_FALSE)
+      *value = attrib;
+   return result;
+}
+
+
 #ifdef EGL_NOK_swap_region
 
-EGLBoolean EGLAPIENTRY
+static EGLBoolean EGLAPIENTRY
 eglSwapBuffersRegionNOK(EGLDisplay dpy, EGLSurface surface,
 			EGLint numRects, const EGLint *rects)
 {
@@ -1846,13 +1612,13 @@ eglSwapBuffersRegionNOK(EGLDisplay dpy, EGLSurface surface,
 
 #ifdef EGL_MESA_drm_image
 
-EGLImageKHR EGLAPIENTRY
+static EGLImage EGLAPIENTRY
 eglCreateDRMImageMESA(EGLDisplay dpy, const EGLint *attr_list)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
    _EGLDriver *drv;
    _EGLImage *img;
-   EGLImageKHR ret;
+   EGLImage ret;
 
    _EGL_CHECK_DISPLAY(disp, EGL_NO_IMAGE_KHR, drv);
    if (!disp->Extensions.MESA_drm_image)
@@ -1864,8 +1630,8 @@ eglCreateDRMImageMESA(EGLDisplay dpy, const EGLint *attr_list)
    RETURN_EGL_EVAL(disp, ret);
 }
 
-EGLBoolean EGLAPIENTRY
-eglExportDRMImageMESA(EGLDisplay dpy, EGLImageKHR image,
+static EGLBoolean EGLAPIENTRY
+eglExportDRMImageMESA(EGLDisplay dpy, EGLImage image,
 		      EGLint *name, EGLint *handle, EGLint *stride)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
@@ -1889,7 +1655,7 @@ eglExportDRMImageMESA(EGLDisplay dpy, EGLImageKHR image,
 #ifdef EGL_WL_bind_wayland_display
 struct wl_display;
 
-EGLBoolean EGLAPIENTRY
+static EGLBoolean EGLAPIENTRY
 eglBindWaylandDisplayWL(EGLDisplay dpy, struct wl_display *display)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
@@ -1907,7 +1673,7 @@ eglBindWaylandDisplayWL(EGLDisplay dpy, struct wl_display *display)
    RETURN_EGL_EVAL(disp, ret);
 }
 
-EGLBoolean EGLAPIENTRY
+static EGLBoolean EGLAPIENTRY
 eglUnbindWaylandDisplayWL(EGLDisplay dpy, struct wl_display *display)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
@@ -1925,7 +1691,7 @@ eglUnbindWaylandDisplayWL(EGLDisplay dpy, struct wl_display *display)
    RETURN_EGL_EVAL(disp, ret);
 }
 
-EGLBoolean EGLAPIENTRY
+static EGLBoolean EGLAPIENTRY
 eglQueryWaylandBufferWL(EGLDisplay dpy, struct wl_resource *buffer,
                         EGLint attribute, EGLint *value)
 {
@@ -1946,8 +1712,8 @@ eglQueryWaylandBufferWL(EGLDisplay dpy, struct wl_resource *buffer,
 #endif
 
 #ifdef EGL_WL_create_wayland_buffer_from_image
-struct wl_buffer * EGLAPIENTRY
-eglCreateWaylandBufferFromImageWL(EGLDisplay dpy, EGLImageKHR image)
+static struct wl_buffer * EGLAPIENTRY
+eglCreateWaylandBufferFromImageWL(EGLDisplay dpy, EGLImage image)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
    _EGLImage *img;
@@ -1968,7 +1734,7 @@ eglCreateWaylandBufferFromImageWL(EGLDisplay dpy, EGLImageKHR image)
 }
 #endif
 
-EGLBoolean EGLAPIENTRY
+static EGLBoolean EGLAPIENTRY
 eglPostSubBufferNV(EGLDisplay dpy, EGLSurface surface,
                    EGLint x, EGLint y, EGLint width, EGLint height)
 {
@@ -1987,7 +1753,7 @@ eglPostSubBufferNV(EGLDisplay dpy, EGLSurface surface,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-EGLBoolean EGLAPIENTRY
+static EGLBoolean EGLAPIENTRY
 eglGetSyncValuesCHROMIUM(EGLDisplay display, EGLSurface surface,
                          EGLuint64KHR *ust, EGLuint64KHR *msc,
                          EGLuint64KHR *sbc)
@@ -2010,8 +1776,8 @@ eglGetSyncValuesCHROMIUM(EGLDisplay display, EGLSurface surface,
 }
 
 #ifdef EGL_MESA_image_dma_buf_export
-EGLBoolean EGLAPIENTRY
-eglExportDMABUFImageQueryMESA(EGLDisplay dpy, EGLImageKHR image,
+static EGLBoolean EGLAPIENTRY
+eglExportDMABUFImageQueryMESA(EGLDisplay dpy, EGLImage image,
                               EGLint *fourcc, EGLint *nplanes,
                               EGLuint64KHR *modifiers)
 {
@@ -2032,8 +1798,8 @@ eglExportDMABUFImageQueryMESA(EGLDisplay dpy, EGLImageKHR image,
    RETURN_EGL_EVAL(disp, ret);
 }
 
-EGLBoolean EGLAPIENTRY
-eglExportDMABUFImageMESA(EGLDisplay dpy, EGLImageKHR image,
+static EGLBoolean EGLAPIENTRY
+eglExportDMABUFImageMESA(EGLDisplay dpy, EGLImage image,
                          int *fds, EGLint *strides, EGLint *offsets)
 {
    _EGLDisplay *disp = _eglLockDisplay(dpy);
@@ -2052,3 +1818,120 @@ eglExportDMABUFImageMESA(EGLDisplay dpy, EGLImageKHR image,
    RETURN_EGL_EVAL(disp, ret);
 }
 #endif
+
+__eglMustCastToProperFunctionPointerType EGLAPIENTRY
+eglGetProcAddress(const char *procname)
+{
+   static const struct {
+      const char *name;
+      _EGLProc function;
+   } egl_entrypoints[] = {
+      /* core functions queryable in the presence of
+       * EGL_KHR_get_all_proc_addresses or EGL 1.5
+       */
+      /* alphabetical order */
+      { "eglBindAPI", (_EGLProc) eglBindAPI },
+      { "eglBindTexImage", (_EGLProc) eglBindTexImage },
+      { "eglChooseConfig", (_EGLProc) eglChooseConfig },
+      { "eglCopyBuffers", (_EGLProc) eglCopyBuffers },
+      { "eglCreateContext", (_EGLProc) eglCreateContext },
+      { "eglCreatePbufferFromClientBuffer", (_EGLProc) eglCreatePbufferFromClientBuffer },
+      { "eglCreatePbufferSurface", (_EGLProc) eglCreatePbufferSurface },
+      { "eglCreatePixmapSurface", (_EGLProc) eglCreatePixmapSurface },
+      { "eglCreateWindowSurface", (_EGLProc) eglCreateWindowSurface },
+      { "eglDestroyContext", (_EGLProc) eglDestroyContext },
+      { "eglDestroySurface", (_EGLProc) eglDestroySurface },
+      { "eglGetConfigAttrib", (_EGLProc) eglGetConfigAttrib },
+      { "eglGetConfigs", (_EGLProc) eglGetConfigs },
+      { "eglGetCurrentContext", (_EGLProc) eglGetCurrentContext },
+      { "eglGetCurrentDisplay", (_EGLProc) eglGetCurrentDisplay },
+      { "eglGetCurrentSurface", (_EGLProc) eglGetCurrentSurface },
+      { "eglGetDisplay", (_EGLProc) eglGetDisplay },
+      { "eglGetError", (_EGLProc) eglGetError },
+      { "eglGetProcAddress", (_EGLProc) eglGetProcAddress },
+      { "eglInitialize", (_EGLProc) eglInitialize },
+      { "eglMakeCurrent", (_EGLProc) eglMakeCurrent },
+      { "eglQueryAPI", (_EGLProc) eglQueryAPI },
+      { "eglQueryContext", (_EGLProc) eglQueryContext },
+      { "eglQueryString", (_EGLProc) eglQueryString },
+      { "eglQuerySurface", (_EGLProc) eglQuerySurface },
+      { "eglReleaseTexImage", (_EGLProc) eglReleaseTexImage },
+      { "eglReleaseThread", (_EGLProc) eglReleaseThread },
+      { "eglSurfaceAttrib", (_EGLProc) eglSurfaceAttrib },
+      { "eglSwapBuffers", (_EGLProc) eglSwapBuffers },
+      { "eglSwapInterval", (_EGLProc) eglSwapInterval },
+      { "eglTerminate", (_EGLProc) eglTerminate },
+      { "eglWaitClient", (_EGLProc) eglWaitClient },
+      { "eglWaitGL", (_EGLProc) eglWaitGL },
+      { "eglWaitNative", (_EGLProc) eglWaitNative },
+      { "eglCreateSync", (_EGLProc) eglCreateSync },
+      { "eglDestroySync", (_EGLProc) eglDestroySync },
+      { "eglClientWaitSync", (_EGLProc) eglClientWaitSync },
+      { "eglGetSyncAttrib", (_EGLProc) eglGetSyncAttrib },
+      { "eglWaitSync", (_EGLProc) eglWaitSync },
+      { "eglCreateImage", (_EGLProc) eglCreateImage },
+      { "eglDestroyImage", (_EGLProc) eglDestroyImage },
+      { "eglGetPlatformDisplay", (_EGLProc) eglGetPlatformDisplay },
+      { "eglCreatePlatformWindowSurface", (_EGLProc) eglCreatePlatformWindowSurface },
+      { "eglCreatePlatformPixmapSurface", (_EGLProc) eglCreatePlatformPixmapSurface },
+#ifdef EGL_MESA_drm_display
+      { "eglGetDRMDisplayMESA", (_EGLProc) eglGetDRMDisplayMESA },
+#endif
+      { "eglCreateImageKHR", (_EGLProc) eglCreateImageKHR },
+      { "eglDestroyImageKHR", (_EGLProc) eglDestroyImage },
+      { "eglCreateSyncKHR", (_EGLProc) eglCreateSyncKHR },
+      { "eglCreateSync64KHR", (_EGLProc) eglCreateSync64KHR },
+      { "eglDestroySyncKHR", (_EGLProc) eglDestroySync },
+      { "eglClientWaitSyncKHR", (_EGLProc) eglClientWaitSync },
+      { "eglWaitSyncKHR", (_EGLProc) eglWaitSyncKHR },
+      { "eglSignalSyncKHR", (_EGLProc) eglSignalSyncKHR },
+      { "eglGetSyncAttribKHR", (_EGLProc) eglGetSyncAttribKHR },
+#ifdef EGL_NOK_swap_region
+      { "eglSwapBuffersRegionNOK", (_EGLProc) eglSwapBuffersRegionNOK },
+#endif
+#ifdef EGL_MESA_drm_image
+      { "eglCreateDRMImageMESA", (_EGLProc) eglCreateDRMImageMESA },
+      { "eglExportDRMImageMESA", (_EGLProc) eglExportDRMImageMESA },
+#endif
+#ifdef EGL_WL_bind_wayland_display
+      { "eglBindWaylandDisplayWL", (_EGLProc) eglBindWaylandDisplayWL },
+      { "eglUnbindWaylandDisplayWL", (_EGLProc) eglUnbindWaylandDisplayWL },
+      { "eglQueryWaylandBufferWL", (_EGLProc) eglQueryWaylandBufferWL },
+#endif
+#ifdef EGL_WL_create_wayland_buffer_from_image
+      { "eglCreateWaylandBufferFromImageWL", (_EGLProc) eglCreateWaylandBufferFromImageWL },
+#endif
+      { "eglPostSubBufferNV", (_EGLProc) eglPostSubBufferNV },
+#ifdef EGL_EXT_swap_buffers_with_damage
+      { "eglSwapBuffersWithDamageEXT", (_EGLProc) eglSwapBuffersWithDamageEXT },
+#endif
+      { "eglGetPlatformDisplayEXT", (_EGLProc) eglGetPlatformDisplayEXT },
+      { "eglCreatePlatformWindowSurfaceEXT", (_EGLProc) eglCreatePlatformWindowSurfaceEXT },
+      { "eglCreatePlatformPixmapSurfaceEXT", (_EGLProc) eglCreatePlatformPixmapSurfaceEXT },
+      { "eglGetSyncValuesCHROMIUM", (_EGLProc) eglGetSyncValuesCHROMIUM },
+#ifdef EGL_MESA_image_dma_buf_export
+      { "eglExportDMABUFImageQueryMESA", (_EGLProc) eglExportDMABUFImageQueryMESA },
+      { "eglExportDMABUFImageMESA", (_EGLProc) eglExportDMABUFImageMESA },
+#endif
+      { NULL, NULL }
+   };
+   _EGLProc proc = NULL;
+   EGLint idx = 0;
+
+   if (procname == NULL)
+      RETURN_EGL_SUCCESS(NULL, NULL);
+
+   /* the static table is only searched for names starting with "egl" */
+   if (strncmp(procname, "egl", 3) == 0) {
+      while (egl_entrypoints[idx].name &&
+             strcmp(egl_entrypoints[idx].name, procname) != 0)
+         idx++;
+      /* the { NULL, NULL } sentinel leaves proc NULL when nothing matched */
+      proc = egl_entrypoints[idx].function;
+   }
+   /* fall back to _eglGetDriverProc() for anything the table did not resolve */
+   if (proc == NULL)
+      proc = _eglGetDriverProc(procname);
+
+   RETURN_EGL_SUCCESS(NULL, proc);
+}
diff --git a/src/egl/main/eglapi.h b/src/egl/main/eglapi.h
index 068d4ef..4e0378d 100644
--- a/src/egl/main/eglapi.h
+++ b/src/egl/main/eglapi.h
@@ -31,6 +31,11 @@
 #ifndef EGLAPI_INCLUDED
 #define EGLAPI_INCLUDED
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * A generic function ptr type
  */
@@ -79,22 +84,6 @@ typedef _EGLProc (*GetProcAddress_t)(_EGLDriver *drv, const char *procname);
 
 
 
-#ifdef EGL_MESA_screen_surface
-typedef EGLBoolean (*ChooseModeMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *screen, const EGLint *attrib_list, EGLModeMESA *modes, EGLint modes_size, EGLint *num_modes);
-typedef EGLBoolean (*GetModesMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *screen, EGLModeMESA *modes, EGLint mode_size, EGLint *num_mode);
-typedef EGLBoolean (*GetModeAttribMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLMode *mode, EGLint attribute, EGLint *value);
-typedef EGLBoolean (*CopyContextMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *source, _EGLContext *dest, EGLint mask);
-typedef EGLBoolean (*GetScreensMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLScreenMESA *screens, EGLint max_screens, EGLint *num_screens);
-typedef _EGLSurface *(*CreateScreenSurfaceMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, const EGLint *attrib_list);
-typedef EGLBoolean (*ShowScreenSurfaceMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *screen, _EGLSurface *surface, _EGLMode *mode);
-typedef EGLBoolean (*ScreenPositionMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *screen, EGLint x, EGLint y);
-typedef EGLBoolean (*QueryScreenMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *screen, EGLint attribute, EGLint *value);
-typedef EGLBoolean (*QueryScreenSurfaceMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *screen, _EGLSurface **surface);
-typedef EGLBoolean (*QueryScreenModeMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *screen, _EGLMode **mode);
-typedef const char * (*QueryModeStringMESA_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLMode *mode);
-#endif /* EGL_MESA_screen_surface */
-
-
 typedef _EGLSurface *(*CreatePbufferFromClientBuffer_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum buftype, EGLClientBuffer buffer, _EGLConfig *config, const EGLint *attrib_list);
 
 
@@ -102,12 +91,12 @@ typedef _EGLImage *(*CreateImageKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLCo
 typedef EGLBoolean (*DestroyImageKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *image);
 
 
-typedef _EGLSync *(*CreateSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum type, const EGLint *attrib_list, const EGLAttribKHR *attrib_list64);
+typedef _EGLSync *(*CreateSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum type, const EGLint *attrib_list, const EGLAttrib *attrib_list64);
 typedef EGLBoolean (*DestroySyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync);
-typedef EGLint (*ClientWaitSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint flags, EGLTimeKHR timeout);
+typedef EGLint (*ClientWaitSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint flags, EGLTime timeout);
 typedef EGLint (*WaitSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync);
 typedef EGLBoolean (*SignalSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLenum mode);
-typedef EGLBoolean (*GetSyncAttribKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint attribute, EGLint *value);
+typedef EGLBoolean (*GetSyncAttrib_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint attribute, EGLAttrib *value);
 
 
 #ifdef EGL_NOK_swap_region
@@ -179,21 +168,6 @@ struct _egl_api
    WaitNative_t WaitNative;
    GetProcAddress_t GetProcAddress;
 
-#ifdef EGL_MESA_screen_surface
-   ChooseModeMESA_t ChooseModeMESA;
-   GetModesMESA_t GetModesMESA;
-   GetModeAttribMESA_t GetModeAttribMESA;
-   CopyContextMESA_t CopyContextMESA;
-   GetScreensMESA_t GetScreensMESA;
-   CreateScreenSurfaceMESA_t CreateScreenSurfaceMESA;
-   ShowScreenSurfaceMESA_t ShowScreenSurfaceMESA;
-   ScreenPositionMESA_t ScreenPositionMESA;
-   QueryScreenMESA_t QueryScreenMESA;
-   QueryScreenSurfaceMESA_t QueryScreenSurfaceMESA;
-   QueryScreenModeMESA_t QueryScreenModeMESA;
-   QueryModeStringMESA_t QueryModeStringMESA;
-#endif /* EGL_MESA_screen_surface */
-
    CreatePbufferFromClientBuffer_t CreatePbufferFromClientBuffer;
 
    CreateImageKHR_t CreateImageKHR;
@@ -204,7 +178,7 @@ struct _egl_api
    ClientWaitSyncKHR_t ClientWaitSyncKHR;
    WaitSyncKHR_t WaitSyncKHR;
    SignalSyncKHR_t SignalSyncKHR;
-   GetSyncAttribKHR_t GetSyncAttribKHR;
+   GetSyncAttrib_t GetSyncAttrib;
 
 #ifdef EGL_NOK_swap_region
    SwapBuffersRegionNOK_t SwapBuffersRegionNOK;
@@ -240,4 +214,9 @@ struct _egl_api
 #endif
 };
 
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLAPI_INCLUDED */
diff --git a/src/egl/main/eglarray.h b/src/egl/main/eglarray.h
index 228f6c3..29b7128 100644
--- a/src/egl/main/eglarray.h
+++ b/src/egl/main/eglarray.h
@@ -34,6 +34,10 @@
 #include "egltypedefs.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef EGLBoolean (*_EGLArrayForEach)(void *elem, void *foreach_data);
 
 
@@ -83,4 +87,8 @@ _eglGetArraySize(_EGLArray *array)
 }
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLARRAY_INCLUDED */
diff --git a/src/egl/main/eglcompiler.h b/src/egl/main/eglcompiler.h
index b457a40..9804ca4 100644
--- a/src/egl/main/eglcompiler.h
+++ b/src/egl/main/eglcompiler.h
@@ -30,9 +30,17 @@
 #ifndef EGLCOMPILER_INCLUDED
 #define EGLCOMPILER_INCLUDED
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define STATIC_ASSERT(COND) \
    do { \
       (void) sizeof(char [1 - 2*!(COND)]); \
    } while (0)
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLCOMPILER_INCLUDED */
diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index db42e95..cf65c69 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -323,10 +323,6 @@ _eglValidateConfig(const _EGLConfig *conf, EGLBoolean for_matching)
                    EGL_VG_ALPHA_FORMAT_PRE_BIT |
                    EGL_MULTISAMPLE_RESOLVE_BOX_BIT |
                    EGL_SWAP_BEHAVIOR_PRESERVED_BIT;
-#ifdef EGL_MESA_screen_surface
-            if (conf->Display->Extensions.MESA_screen_surface)
-               mask |= EGL_SCREEN_BIT_MESA;
-#endif
             break;
          case EGL_RENDERABLE_TYPE:
          case EGL_CONFORMANT:
diff --git a/src/egl/main/eglconfig.h b/src/egl/main/eglconfig.h
index dc59ea3..84cb227 100644
--- a/src/egl/main/eglconfig.h
+++ b/src/egl/main/eglconfig.h
@@ -39,6 +39,10 @@
 #include "egltypedefs.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* update _eglValidationTable and _eglOffsetOfConfig before updating this
  * struct */
 struct _egl_config
@@ -225,4 +229,8 @@ extern EGLBoolean
 _eglGetConfigs(_EGLDriver *drv, _EGLDisplay *dpy, EGLConfig *configs, EGLint config_size, EGLint *num_config);
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLCONFIG_INCLUDED */
diff --git a/src/egl/main/eglcontext.c b/src/egl/main/eglcontext.c
index 514b91a..e767f4b 100644
--- a/src/egl/main/eglcontext.c
+++ b/src/egl/main/eglcontext.c
@@ -131,7 +131,7 @@ _eglParseContextAttribList(_EGLContext *ctx, _EGLDisplay *dpy,
             break;
          }
 
-         ctx->Flags = val;
+         ctx->Flags |= val;
          break;
 
       case EGL_CONTEXT_OPENGL_PROFILE_MASK_KHR:
@@ -194,7 +194,38 @@ _eglParseContextAttribList(_EGLContext *ctx, _EGLDisplay *dpy,
             break;
          }
 
-         ctx->Flags = EGL_CONTEXT_OPENGL_ROBUST_ACCESS_BIT_KHR;
+         if (val == EGL_TRUE)
+            ctx->Flags |= EGL_CONTEXT_OPENGL_ROBUST_ACCESS_BIT_KHR;
+         break;
+
+      case EGL_CONTEXT_OPENGL_ROBUST_ACCESS:
+         if (dpy->Version < 15) {
+            err = EGL_BAD_ATTRIBUTE;
+            break;
+         }
+
+         if (val == EGL_TRUE)
+            ctx->Flags |= EGL_CONTEXT_OPENGL_ROBUST_ACCESS_BIT_KHR;
+         break;
+
+      case EGL_CONTEXT_OPENGL_DEBUG:
+         if (dpy->Version < 15) {
+            err = EGL_BAD_ATTRIBUTE;
+            break;
+         }
+
+         if (val == EGL_TRUE)
+            ctx->Flags |= EGL_CONTEXT_OPENGL_DEBUG_BIT_KHR;
+         break;
+
+      case EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE:
+         if (dpy->Version < 15) {
+            err = EGL_BAD_ATTRIBUTE;
+            break;
+         }
+
+         if (val == EGL_TRUE)
+            ctx->Flags |= EGL_CONTEXT_OPENGL_FORWARD_COMPATIBLE_BIT_KHR;
          break;
 
       default:
diff --git a/src/egl/main/eglcontext.h b/src/egl/main/eglcontext.h
index 241917f..69bf77d 100644
--- a/src/egl/main/eglcontext.h
+++ b/src/egl/main/eglcontext.h
@@ -37,6 +37,10 @@
 #include "egldisplay.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * "Base" class for device driver contexts.
  */
@@ -150,4 +154,8 @@ _eglGetContextHandle(_EGLContext *ctx)
 }
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLCONTEXT_INCLUDED */
diff --git a/src/egl/main/eglcurrent.c b/src/egl/main/eglcurrent.c
index 6ffc799..835631d 100644
--- a/src/egl/main/eglcurrent.c
+++ b/src/egl/main/eglcurrent.c
@@ -282,14 +282,6 @@ _eglError(EGLint errCode, const char *msg)
       case EGL_NOT_INITIALIZED:
          s = "EGL_NOT_INITIALIZED";
          break;
-#ifdef EGL_MESA_screen_surface
-      case EGL_BAD_SCREEN_MESA:
-         s = "EGL_BAD_SCREEN_MESA";
-         break;
-      case EGL_BAD_MODE_MESA:
-         s = "EGL_BAD_MODE_MESA";
-         break;
-#endif
       default:
          s = "other EGL error";
       }
diff --git a/src/egl/main/eglcurrent.h b/src/egl/main/eglcurrent.h
index 3343755..1e386ac 100644
--- a/src/egl/main/eglcurrent.h
+++ b/src/egl/main/eglcurrent.h
@@ -34,6 +34,10 @@
 #include "egltypedefs.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define _EGL_API_ALL_BITS \
    (EGL_OPENGL_ES_BIT   | \
     EGL_OPENVG_BIT      | \
@@ -115,4 +119,8 @@ extern EGLBoolean
 _eglError(EGLint errCode, const char *msg);
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLCURRENT_INCLUDED */
diff --git a/src/egl/main/egldefines.h b/src/egl/main/egldefines.h
index 4c9e014..a32cab2 100644
--- a/src/egl/main/egldefines.h
+++ b/src/egl/main/egldefines.h
@@ -34,6 +34,9 @@
 #ifndef EGLDEFINES_INCLUDED
 #define EGLDEFINES_INCLUDED
 
+#ifdef __cplusplus
+extern "C" {
+#endif
 
 #define _EGL_MAX_EXTENSIONS_LEN 1000
 
@@ -41,5 +44,8 @@
 
 #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
 
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* EGLDEFINES_INCLUDED */
diff --git a/src/egl/main/egldisplay.c b/src/egl/main/egldisplay.c
index a3ecba8..24a0c7e 100644
--- a/src/egl/main/egldisplay.c
+++ b/src/egl/main/egldisplay.c
@@ -71,7 +71,8 @@ static const struct {
    { _EGL_PLATFORM_DRM, "drm" },
    { _EGL_PLATFORM_NULL, "null" },
    { _EGL_PLATFORM_ANDROID, "android" },
-   { _EGL_PLATFORM_HAIKU, "haiku" }
+   { _EGL_PLATFORM_HAIKU, "haiku" },
+   { _EGL_PLATFORM_SURFACELESS, "surfaceless" },
 };
 
 
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index b6b9ed8..0b50a36 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -39,6 +39,10 @@
 #include "eglarray.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 enum _egl_platform_type {
    _EGL_PLATFORM_WINDOWS,
    _EGL_PLATFORM_X11,
@@ -47,6 +51,7 @@ enum _egl_platform_type {
    _EGL_PLATFORM_NULL,
    _EGL_PLATFORM_ANDROID,
    _EGL_PLATFORM_HAIKU,
+   _EGL_PLATFORM_SURFACELESS,
 
    _EGL_NUM_PLATFORMS,
    _EGL_INVALID_PLATFORM = -1
@@ -86,8 +91,6 @@ struct _egl_resource
  */
 struct _egl_extensions
 {
-   EGLBoolean MESA_screen_surface;
-   EGLBoolean MESA_copy_context;
    EGLBoolean MESA_drm_display;
    EGLBoolean MESA_drm_image;
    EGLBoolean MESA_configless_context;
@@ -99,6 +102,7 @@ struct _egl_extensions
    EGLBoolean KHR_image_pixmap;
    EGLBoolean KHR_vg_parent_image;
    EGLBoolean KHR_get_all_proc_addresses;
+   EGLBoolean KHR_gl_colorspace;
    EGLBoolean KHR_gl_texture_2D_image;
    EGLBoolean KHR_gl_texture_cubemap_image;
    EGLBoolean KHR_gl_texture_3D_image;
@@ -151,8 +155,7 @@ struct _egl_display
 
    /* these fields are set by the driver during init */
    void *DriverData;          /**< Driver private data */
-   EGLint VersionMajor;       /**< EGL major version */
-   EGLint VersionMinor;       /**< EGL minor version */
+   EGLint Version;            /**< EGL version major*10+minor */
    EGLint ClientAPIs;         /**< Bitmask of APIs supported (EGL_xxx_BIT) */
    _EGLExtensions Extensions; /**< Extensions supported */
 
@@ -271,4 +274,9 @@ _eglGetWaylandDisplay(struct wl_display *native_display,
                       const EGLint *attrib_list);
 #endif
 
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLDISPLAY_INCLUDED */
diff --git a/src/egl/main/egldriver.c b/src/egl/main/egldriver.c
index 6983af9..6ef79d9 100644
--- a/src/egl/main/egldriver.c
+++ b/src/egl/main/egldriver.c
@@ -45,21 +45,9 @@
 #include "egldriver.h"
 #include "egllog.h"
 
-#if defined(_EGL_OS_UNIX)
-#include <dlfcn.h>
-#include <sys/types.h>
-#include <dirent.h>
-#include <unistd.h>
-#endif
-
-#ifdef _EGL_BUILT_IN_DRIVER_HAIKU
-_EGLDriver* _eglBuiltInDriverHaiku(const char* args);
-#endif
-
 typedef struct _egl_module {
-   char *Path;
+   char *Name;
    _EGLMain_t BuiltIn;
-   void *Handle;
    _EGLDriver *Driver;
 } _EGLModule;
 
@@ -80,152 +68,23 @@ const struct {
 };
 
 /**
- * Wrappers for dlopen/dlclose()
- */
-#if defined(_EGL_OS_WINDOWS)
-
-
-typedef HMODULE lib_handle;
-
-static HMODULE
-open_library(const char *filename)
-{
-   return LoadLibrary(filename);
-}
-
-static void
-close_library(HMODULE lib)
-{
-   FreeLibrary(lib);
-}
-
-
-static const char *
-library_suffix(void)
-{
-   return ".dll";
-}
-
-
-#elif defined(_EGL_OS_UNIX)
-
-
-typedef void * lib_handle;
-
-static void *
-open_library(const char *filename)
-{
-   return dlopen(filename, RTLD_LAZY);
-}
-
-static void
-close_library(void *lib)
-{
-   dlclose(lib);
-}
-
-
-static const char *
-library_suffix(void)
-{
-   return ".so";
-}
-
-
-#endif
-
-
-/**
- * Open the named driver and find its bootstrap function: _eglMain().
- */
-static _EGLMain_t
-_eglOpenLibrary(const char *driverPath, lib_handle *handle)
-{
-   lib_handle lib;
-   _EGLMain_t mainFunc = NULL;
-   const char *error = "unknown error";
-
-   assert(driverPath);
-
-   _eglLog(_EGL_DEBUG, "dlopen(%s)", driverPath);
-   lib = open_library(driverPath);
-
-#if defined(_EGL_OS_WINDOWS)
-   /* XXX untested */
-   if (lib)
-      mainFunc = (_EGLMain_t) GetProcAddress(lib, "_eglMain");
-#elif defined(_EGL_OS_UNIX)
-   if (lib) {
-      union {
-         _EGLMain_t func;
-         void *ptr;
-      } tmp = { NULL };
-      /* direct cast gives a warning when compiled with -pedantic */
-      tmp.ptr = dlsym(lib, "_eglMain");
-      mainFunc = tmp.func;
-      if (!mainFunc)
-         error = dlerror();
-   }
-   else {
-      error = dlerror();
-   }
-#endif
-
-   if (!lib) {
-      _eglLog(_EGL_WARNING, "Could not open driver %s (%s)",
-              driverPath, error);
-      return NULL;
-   }
-
-   if (!mainFunc) {
-      _eglLog(_EGL_WARNING, "_eglMain not found in %s (%s)",
-              driverPath, error);
-      if (lib)
-         close_library(lib);
-      return NULL;
-   }
-
-   *handle = lib;
-   return mainFunc;
-}
-
-
-/**
  * Load a module and create the driver object.
  */
 static EGLBoolean
 _eglLoadModule(_EGLModule *mod)
 {
-   _EGLMain_t mainFunc;
-   lib_handle lib;
    _EGLDriver *drv;
 
    if (mod->Driver)
       return EGL_TRUE;
 
-   if (mod->BuiltIn) {
-      lib = (lib_handle) NULL;
-      mainFunc = mod->BuiltIn;
-   }
-   else {
-      mainFunc = _eglOpenLibrary(mod->Path, &lib);
-      if (!mainFunc)
+   if (!mod->BuiltIn)
          return EGL_FALSE;
-   }
 
-   drv = mainFunc(NULL);
-   if (!drv) {
-      if (lib)
-         close_library(lib);
+   drv = mod->BuiltIn(NULL);
+   if (!drv || !drv->Name)
       return EGL_FALSE;
-   }
 
-   if (!drv->Name) {
-      _eglLog(_EGL_WARNING, "Driver loaded from %s has no name", mod->Path);
-      drv->Name = "UNNAMED";
-   }
-
-   mod->Handle = (void *) lib;
    mod->Driver = drv;
 
    return EGL_TRUE;
@@ -243,20 +102,11 @@ _eglUnloadModule(_EGLModule *mod)
    if (mod->Driver && mod->Driver->Unload)
       mod->Driver->Unload(mod->Driver);
 
-   /*
-    * XXX At this point (atexit), the module might be the last reference to
-    * libEGL.  Closing the module might unmap libEGL and give problems.
-    */
-#if 0
-   if (mod->Handle)
-      close_library(mod->Handle);
-#endif
 #elif defined(_EGL_OS_WINDOWS)
    /* XXX Windows unloads DLLs before atexit */
 #endif
 
    mod->Driver = NULL;
-   mod->Handle = NULL;
 }
 
 
@@ -264,7 +114,7 @@ _eglUnloadModule(_EGLModule *mod)
  * Add a module to the module array.
  */
 static _EGLModule *
-_eglAddModule(const char *path)
+_eglAddModule(const char *name)
 {
    _EGLModule *mod;
    EGLint i;
@@ -278,22 +128,22 @@ _eglAddModule(const char *path)
    /* find duplicates */
    for (i = 0; i < _eglModules->Size; i++) {
       mod = _eglModules->Elements[i];
-      if (strcmp(mod->Path, path) == 0)
+      if (strcmp(mod->Name, name) == 0)
          return mod;
    }
 
    /* allocate a new one */
    mod = calloc(1, sizeof(*mod));
    if (mod) {
-      mod->Path = _eglstrdup(path);
-      if (!mod->Path) {
+      mod->Name = _eglstrdup(name);
+      if (!mod->Name) {
          free(mod);
          mod = NULL;
       }
    }
    if (mod) {
       _eglAppendArray(_eglModules, (void *) mod);
-      _eglLog(_EGL_DEBUG, "added %s to module array", mod->Path);
+      _eglLog(_EGL_DEBUG, "added %s to module array", mod->Name);
    }
 
    return mod;
@@ -309,155 +159,12 @@ _eglFreeModule(void *module)
    _EGLModule *mod = (_EGLModule *) module;
 
    _eglUnloadModule(mod);
-   free(mod->Path);
+   free(mod->Name);
    free(mod);
 }
 
 
 /**
- * A loader function for use with _eglPreloadForEach.  The loader data is the
- * filename of the driver.   This function stops on the first valid driver.
- */
-static EGLBoolean
-_eglLoaderFile(const char *dir, size_t len, void *loader_data)
-{
-   char path[1024];
-   const char *filename = (const char *) loader_data;
-   size_t flen = strlen(filename);
-
-   /* make a full path */
-   if (len + flen + 2 > sizeof(path))
-      return EGL_TRUE;
-   if (len) {
-      memcpy(path, dir, len);
-      path[len++] = '/';
-   }
-   memcpy(path + len, filename, flen);
-   len += flen;
-   path[len] = '\0';
-
-   if (library_suffix()) {
-      const char *suffix = library_suffix();
-      size_t slen = strlen(suffix);
-      const char *p;
-      EGLBoolean need_suffix;
-
-      p = filename + flen - slen;
-      need_suffix = (p < filename || strcmp(p, suffix) != 0);
-      if (need_suffix) {
-         /* overflow */
-         if (len + slen + 1 > sizeof(path))
-            return EGL_TRUE;
-         strcpy(path + len, suffix);
-      }
-   }
-
-#if defined(_EGL_OS_UNIX)
-   /* check if the file exists */
-   if (access(path, F_OK))
-      return EGL_TRUE;
-#endif
-
-   _eglAddModule(path);
-
-   return EGL_TRUE;
-}
-
-
-/**
- * Run the callback function on each driver directory.
- *
- * The process may end prematurely if the callback function returns false.
- */
-static void
-_eglPreloadForEach(const char *search_path,
-                   EGLBoolean (*loader)(const char *, size_t, void *),
-                   void *loader_data)
-{
-   const char *cur, *next;
-   size_t len;
-
-   cur = search_path;
-   while (cur) {
-      next = strchr(cur, ':');
-      len = (next) ? next - cur : strlen(cur);
-
-      if (!loader(cur, len, loader_data))
-         break;
-
-      cur = (next) ? next + 1 : NULL;
-   }
-}
-
-
-/**
- * Return a list of colon-separated driver directories.
- */
-static const char *
-_eglGetSearchPath(void)
-{
-   static char search_path[1024];
-
-#if defined(_EGL_OS_UNIX) || defined(_EGL_OS_WINDOWS)
-   if (search_path[0] == '\0') {
-      char *buf = search_path;
-      size_t len = sizeof(search_path);
-      EGLBoolean use_env;
-      char dir_sep;
-      int ret;
-
-#if defined(_EGL_OS_UNIX)
-      use_env = (geteuid() == getuid() && getegid() == getgid());
-      dir_sep = '/';
-#else
-      use_env = EGL_TRUE;
-      dir_sep = '\\';
-#endif
-
-      if (use_env) {
-         char *p;
-
-         /* extract the dirname from EGL_DRIVER */
-         p = getenv("EGL_DRIVER");
-         if (p && strchr(p, dir_sep)) {
-            ret = _eglsnprintf(buf, len, "%s", p);
-            if (ret > 0 && ret < len) {
-               p = strrchr(buf, dir_sep);
-               *p++ = ':';
-
-               len -= p - buf;
-               buf = p;
-            }
-         }
-
-         /* append EGL_DRIVERS_PATH */
-         p = getenv("EGL_DRIVERS_PATH");
-         if (p) {
-            ret = _eglsnprintf(buf, len, "%s:", p);
-            if (ret > 0 && ret < len) {
-               buf += ret;
-               len -= ret;
-            }
-         }
-      }
-      else {
-         _eglLog(_EGL_DEBUG,
-               "ignore EGL_DRIVERS_PATH for setuid/setgid binaries");
-      }
-
-      ret = _eglsnprintf(buf, len, "%s", _EGL_DRIVER_SEARCH_DIR);
-      if (ret < 0 || ret >= len)
-         search_path[0] = '\0';
-
-      _eglLog(_EGL_DEBUG, "EGL search path is %s", search_path);
-   }
-#endif /* defined(_EGL_OS_UNIX) || defined(_EGL_OS_WINDOWS) */
-
-   return search_path;
-}
-
-
-/**
  * Add the user driver to the module array.
  *
  * The user driver is specified by EGL_DRIVER.
@@ -465,42 +172,15 @@ _eglGetSearchPath(void)
 static EGLBoolean
 _eglAddUserDriver(void)
 {
-   const char *search_path = _eglGetSearchPath();
    char *env;
-   size_t name_len = 0;
 
    env = getenv("EGL_DRIVER");
-#if defined(_EGL_OS_UNIX)
-   if (env && strchr(env, '/')) {
-      search_path = "";
-      if ((geteuid() != getuid() || getegid() != getgid())) {
-         _eglLog(_EGL_DEBUG,
-               "ignore EGL_DRIVER for setuid/setgid binaries");
-         env = NULL;
-      }
-   }
-   else if (env) {
-      char *suffix = strchr(env, '.');
-      name_len = (suffix) ? suffix - env : strlen(env);
-   }
-#else
-   if (env)
-      name_len = strlen(env);
-#endif /* _EGL_OS_UNIX */
-
-   /*
-    * Try built-in drivers first if we know the driver name.  This makes sure
-    * we do not load the outdated external driver that is still on the
-    * filesystem.
-    */
-   if (name_len) {
-      _EGLModule *mod;
+   if (env) {
       EGLint i;
 
       for (i = 0; _eglBuiltInDrivers[i].name; i++) {
-         if (strlen(_eglBuiltInDrivers[i].name) == name_len &&
-             !strncmp(_eglBuiltInDrivers[i].name, env, name_len)) {
-            mod = _eglAddModule(env);
+         if (!strcmp(_eglBuiltInDrivers[i].name, env)) {
+            _EGLModule *mod = _eglAddModule(env);
             if (mod)
                mod->BuiltIn = _eglBuiltInDrivers[i].main;
 
@@ -509,13 +189,6 @@ _eglAddUserDriver(void)
       }
    }
 
-   /* otherwise, treat env as a path */
-   if (env) {
-      _eglPreloadForEach(search_path, _eglLoaderFile, (void *) env);
-
-      return EGL_TRUE;
-   }
-
    return EGL_FALSE;
 }
 
@@ -683,18 +356,3 @@ _eglUnloadDrivers(void)
       _eglModules = NULL;
    }
 }
-
-
-/**
- * Invoke a callback function on each EGL search path.
- *
- * The first argument of the callback function is the name of the search path.
- * The second argument is the length of the name.
- */
-void
-_eglSearchPathForEach(EGLBoolean (*callback)(const char *, size_t, void *),
-                      void *callback_data)
-{
-   const char *search_path = _eglGetSearchPath();
-   _eglPreloadForEach(search_path, callback, callback_data);
-}
diff --git a/src/egl/main/egldriver.h b/src/egl/main/egldriver.h
index 11300ce..1cf6628 100644
--- a/src/egl/main/egldriver.h
+++ b/src/egl/main/egldriver.h
@@ -38,6 +38,11 @@
 #include "eglapi.h"
 #include <stddef.h>
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * Define an inline driver typecast function.
  *
@@ -87,19 +92,11 @@ struct _egl_driver
 
 
 extern _EGLDriver *
-_eglBuiltInDriverGALLIUM(const char *args);
-
-
-extern _EGLDriver *
 _eglBuiltInDriverDRI2(const char *args);
 
 
-extern _EGLDriver *
-_eglBuiltInDriverGLX(const char *args);
-
-
-extern _EGLDriver *
-_eglMain(const char *args);
+extern _EGLDriver*
+_eglBuiltInDriverHaiku(const char* args);
 
 
 extern _EGLDriver *
@@ -124,4 +121,9 @@ _eglSearchPathForEach(EGLBoolean (*callback)(const char *, size_t, void *),
                       void *callback_data);
 
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif /* EGLDRIVER_INCLUDED */
diff --git a/src/egl/main/eglfallbacks.c b/src/egl/main/eglfallbacks.c
index 83d7756..3c3701f 100644
--- a/src/egl/main/eglfallbacks.c
+++ b/src/egl/main/eglfallbacks.c
@@ -32,8 +32,6 @@
 #include "eglconfig.h"
 #include "eglcontext.h"
 #include "eglsurface.h"
-#include "eglscreen.h"
-#include "eglmode.h"
 #include "eglsync.h"
 
 
@@ -85,22 +83,6 @@ _eglInitDriverFallbacks(_EGLDriver *drv)
    drv->API.WaitNative = (WaitNative_t) _eglReturnFalse;
    drv->API.GetProcAddress = (GetProcAddress_t) _eglReturnFalse;
 
-#ifdef EGL_MESA_screen_surface
-   drv->API.CopyContextMESA = (CopyContextMESA_t) _eglReturnFalse;
-   drv->API.CreateScreenSurfaceMESA =
-      (CreateScreenSurfaceMESA_t) _eglReturnFalse;
-   drv->API.ShowScreenSurfaceMESA = (ShowScreenSurfaceMESA_t) _eglReturnFalse;
-   drv->API.ChooseModeMESA = _eglChooseModeMESA;
-   drv->API.GetModesMESA = _eglGetModesMESA;
-   drv->API.GetModeAttribMESA = _eglGetModeAttribMESA;
-   drv->API.GetScreensMESA = _eglGetScreensMESA;
-   drv->API.ScreenPositionMESA = _eglScreenPositionMESA;
-   drv->API.QueryScreenMESA = _eglQueryScreenMESA;
-   drv->API.QueryScreenSurfaceMESA = _eglQueryScreenSurfaceMESA;
-   drv->API.QueryScreenModeMESA = _eglQueryScreenModeMESA;
-   drv->API.QueryModeStringMESA = _eglQueryModeStringMESA;
-#endif /* EGL_MESA_screen_surface */
-
    drv->API.CreateImageKHR = NULL;
    drv->API.DestroyImageKHR = NULL;
 
@@ -109,7 +91,7 @@ _eglInitDriverFallbacks(_EGLDriver *drv)
    drv->API.ClientWaitSyncKHR = NULL;
    drv->API.WaitSyncKHR = NULL;
    drv->API.SignalSyncKHR = NULL;
-   drv->API.GetSyncAttribKHR = _eglGetSyncAttribKHR;
+   drv->API.GetSyncAttrib = _eglGetSyncAttrib;
 
 #ifdef EGL_MESA_drm_image
    drv->API.CreateDRMImageMESA = NULL;
@@ -120,7 +102,7 @@ _eglInitDriverFallbacks(_EGLDriver *drv)
    drv->API.SwapBuffersRegionNOK = NULL;
 #endif
 
-#ifdef EGL_MESA_dma_buf_image_export
+#ifdef EGL_MESA_image_dma_buf_export
    drv->API.ExportDMABUFImageQueryMESA = NULL;
    drv->API.ExportDMABUFImageMESA = NULL;
 #endif
diff --git a/src/egl/main/eglglobals.c b/src/egl/main/eglglobals.c
index 129bf29..884cff0 100644
--- a/src/egl/main/eglglobals.c
+++ b/src/egl/main/eglglobals.c
@@ -50,16 +50,6 @@ struct _egl_global _eglGlobal =
       _eglFiniDisplay
    },
 
-   /* ClientExtensions */
-   {
-      true, /* EGL_EXT_client_extensions */
-      true, /* EGL_EXT_platform_base */
-      true, /* EGL_EXT_platform_x11 */
-      true, /* EGL_EXT_platform_wayland */
-      true, /* EGL_MESA_platform_gbm */
-      true, /* EGL_KHR_client_get_all_proc_addresses */
-   },
-
    /* ClientExtensionsString */
    "EGL_EXT_client_extensions"
    " EGL_EXT_platform_base"
diff --git a/src/egl/main/eglglobals.h b/src/egl/main/eglglobals.h
index 04b9609..ae1b75b 100644
--- a/src/egl/main/eglglobals.h
+++ b/src/egl/main/eglglobals.h
@@ -50,15 +50,6 @@ struct _egl_global
    EGLint NumAtExitCalls;
    void (*AtExitCalls[10])(void);
 
-   struct _egl_client_extensions {
-      bool EXT_client_extensions;
-      bool EXT_platform_base;
-      bool EXT_platform_x11;
-      bool EXT_platform_wayland;
-      bool MESA_platform_gbm;
-      bool KHR_get_all_proc_addresses;
-   } ClientExtensions;
-
    const char *ClientExtensionString;
 };
 
diff --git a/src/egl/main/eglimage.h b/src/egl/main/eglimage.h
index 50a87a1..0dd5e12 100644
--- a/src/egl/main/eglimage.h
+++ b/src/egl/main/eglimage.h
@@ -35,6 +35,11 @@
 #include "egltypedefs.h"
 #include "egldisplay.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct _egl_image_attrib_int
 {
    EGLint Value;
@@ -116,11 +121,11 @@ _eglPutImage(_EGLImage *img)
  * Link an image to its display and return the handle of the link.
  * The handle can be passed to client directly.
  */
-static inline EGLImageKHR
+static inline EGLImage
 _eglLinkImage(_EGLImage *img)
 {
    _eglLinkResource(&img->Resource, _EGL_RESOURCE_IMAGE);
-   return (EGLImageKHR) img;
+   return (EGLImage) img;
 }
 
 
@@ -140,7 +145,7 @@ _eglUnlinkImage(_EGLImage *img)
  * Return NULL if the handle has no corresponding linked image.
  */
 static inline _EGLImage *
-_eglLookupImage(EGLImageKHR image, _EGLDisplay *dpy)
+_eglLookupImage(EGLImage image, _EGLDisplay *dpy)
 {
    _EGLImage *img = (_EGLImage *) image;
    if (!dpy || !_eglCheckResource((void *) img, _EGL_RESOURCE_IMAGE, dpy))
@@ -152,13 +157,17 @@ _eglLookupImage(EGLImageKHR image, _EGLDisplay *dpy)
 /**
  * Return the handle of a linked image, or EGL_NO_IMAGE_KHR.
  */
-static inline EGLImageKHR
+static inline EGLImage
 _eglGetImageHandle(_EGLImage *img)
 {
    _EGLResource *res = (_EGLResource *) img;
    return (res && _eglIsResourceLinked(res)) ?
-      (EGLImageKHR) img : EGL_NO_IMAGE_KHR;
+      (EGLImage) img : EGL_NO_IMAGE_KHR;
 }
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLIMAGE_INCLUDED */
diff --git a/src/egl/main/egllog.h b/src/egl/main/egllog.h
index 12a477e..cf58525 100644
--- a/src/egl/main/egllog.h
+++ b/src/egl/main/egllog.h
@@ -34,6 +34,10 @@
 #include "egltypedefs.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 #define _EGL_FATAL   0   /* unrecoverable error */
 #define _EGL_WARNING 1   /* recoverable error/problem */
 #define _EGL_INFO    2   /* just useful info */
@@ -55,4 +59,8 @@ extern void
 _eglLog(EGLint level, const char *fmtStr, ...);
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLLOG_INCLUDED */
diff --git a/src/egl/main/eglmode.c b/src/egl/main/eglmode.c
deleted file mode 100644
index d248ea4..0000000
--- a/src/egl/main/eglmode.c
+++ /dev/null
@@ -1,357 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * Copyright 2009-2010 Chia-I Wu <olvaffe@gmail.com>
- * Copyright 2010 LunarG, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include "egldisplay.h"
-#include "eglmode.h"
-#include "eglcurrent.h"
-#include "eglscreen.h"
-
-
-#ifdef EGL_MESA_screen_surface
-
-
-#define MIN2(A, B)  (((A) < (B)) ? (A) : (B))
-
-
-/**
- * Given an EGLModeMESA handle, return the corresponding _EGLMode object
- * or null if non-existant.
- */
-_EGLMode *
-_eglLookupMode(EGLModeMESA mode, _EGLDisplay *disp)
-{
-   EGLint scrnum;
-
-   if (!disp || !disp->Screens)
-      return NULL;
-
-   /* loop over all screens on the display */
-   for (scrnum = 0; scrnum < disp->Screens->Size; scrnum++) {
-      const _EGLScreen *scrn = disp->Screens->Elements[scrnum];
-      EGLint idx;
-
-      /*
-       * the mode ids of a screen ranges from scrn->Handle to scrn->Handle +
-       * scrn->NumModes
-       */
-      if (mode >= scrn->Handle &&
-          mode < scrn->Handle + _EGL_SCREEN_MAX_MODES) {
-         idx = mode - scrn->Handle;
-
-         assert(idx < scrn->NumModes && scrn->Modes[idx].Handle == mode);
-
-         return &scrn->Modes[idx];
-      }
-   }
-
-   return NULL;
-}
-
-
-/**
- * Parse the attrib_list to fill in the fields of the given _eglMode
- * Return EGL_FALSE if any errors, EGL_TRUE otherwise.
- */
-static EGLBoolean
-_eglParseModeAttribs(_EGLMode *mode, const EGLint *attrib_list)
-{
-   EGLint i;
-
-   /* init all attribs to EGL_DONT_CARE */
-   mode->Handle = EGL_DONT_CARE;
-   mode->Width = EGL_DONT_CARE;
-   mode->Height = EGL_DONT_CARE;
-   mode->RefreshRate = EGL_DONT_CARE;
-   mode->Optimal = EGL_DONT_CARE;
-   mode->Interlaced = EGL_DONT_CARE;
-   mode->Name = NULL;
-
-   for (i = 0; attrib_list && attrib_list[i] != EGL_NONE; i++) {
-      switch (attrib_list[i]) {
-      case EGL_MODE_ID_MESA:
-         mode->Handle = attrib_list[++i];
-         if (mode->Handle <= 0) {
-            _eglError(EGL_BAD_PARAMETER, "eglChooseModeMESA(handle)");
-            return EGL_FALSE;
-         }
-         break;
-      case EGL_WIDTH:
-         mode->Width = attrib_list[++i];
-         if (mode->Width <= 0) {
-            _eglError(EGL_BAD_PARAMETER, "eglChooseModeMESA(width)");
-            return EGL_FALSE;
-         }
-         break;
-      case EGL_HEIGHT:
-         mode->Height = attrib_list[++i];
-         if (mode->Height <= 0) {
-            _eglError(EGL_BAD_PARAMETER, "eglChooseModeMESA(height)");
-            return EGL_FALSE;
-         }
-         break;
-      case EGL_REFRESH_RATE_MESA:
-         mode->RefreshRate = attrib_list[++i];
-         if (mode->RefreshRate <= 0) {
-            _eglError(EGL_BAD_PARAMETER, "eglChooseModeMESA(refresh rate)");
-            return EGL_FALSE;
-         }
-         break;
-      case EGL_INTERLACED_MESA:
-         mode->Interlaced = attrib_list[++i];
-         if (mode->Interlaced != EGL_TRUE && mode->Interlaced != EGL_FALSE) {
-            _eglError(EGL_BAD_PARAMETER, "eglChooseModeMESA(interlaced)");
-            return EGL_FALSE;
-         }
-         break;
-      case EGL_OPTIMAL_MESA:
-         mode->Optimal = attrib_list[++i];
-         if (mode->Optimal != EGL_TRUE && mode->Optimal != EGL_FALSE) {
-            _eglError(EGL_BAD_PARAMETER, "eglChooseModeMESA(optimal)");
-            return EGL_FALSE;
-         }
-         break;
-      default:
-         _eglError(EGL_BAD_ATTRIBUTE, "eglChooseModeMESA");
-         return EGL_FALSE;
-      }
-   }
-   return EGL_TRUE;
-}
-
-
-/**
- * Determine if the candidate mode's attributes are at least as good
- * as the minimal mode's.
- * \return EGL_TRUE if qualifies, EGL_FALSE otherwise
- */
-static EGLBoolean
-_eglModeQualifies(const _EGLMode *c, const _EGLMode *min)
-{
-   if (min->Handle != EGL_DONT_CARE && c->Handle != min->Handle)
-      return EGL_FALSE;
-   if (min->Width != EGL_DONT_CARE && c->Width < min->Width)
-      return EGL_FALSE;
-   if (min->Height != EGL_DONT_CARE && c->Height < min->Height)
-      return EGL_FALSE;
-   if (min->RefreshRate != EGL_DONT_CARE && c->RefreshRate < min->RefreshRate)
-      return EGL_FALSE;
-   if (min->Optimal != EGL_DONT_CARE && c->Optimal != min->Optimal)
-      return EGL_FALSE;
-   if (min->Interlaced != EGL_DONT_CARE && c->Interlaced != min->Interlaced)
-      return EGL_FALSE;
-
-   return EGL_TRUE;
-}
-
-
-/**
- * Return value of given mode attribute, or -1 if bad attrib.
- */
-static EGLint
-getModeAttrib(const _EGLMode *m, EGLint attrib)
-{
-   switch (attrib) {
-   case EGL_MODE_ID_MESA:
-      return m->Handle;
-   case EGL_WIDTH:
-      return m->Width;
-   case EGL_HEIGHT:
-      return m->Height;
-   case EGL_REFRESH_RATE_MESA:
-      return m->RefreshRate;
-   case EGL_OPTIMAL_MESA:
-      return m->Optimal;
-   case EGL_INTERLACED_MESA:
-      return m->Interlaced;
-   default:
-      return -1;
-   }
-}
-
-
-#define SMALLER 1
-#define LARGER  2
-
-struct sort_info {
-   EGLint Attrib;
-   EGLint Order; /* SMALLER or LARGER */
-};
-
-/* the order of these entries is the priority */
-static struct sort_info SortInfo[] = {
-   { EGL_OPTIMAL_MESA, LARGER },
-   { EGL_INTERLACED_MESA, SMALLER },
-   { EGL_WIDTH, LARGER },
-   { EGL_HEIGHT, LARGER },
-   { EGL_REFRESH_RATE_MESA, LARGER },
-   { EGL_MODE_ID_MESA, SMALLER },
-   { 0, 0 }
-};
-
-
-/**
- * Compare modes 'a' and 'b' and return -1 if a belongs before b, or 1 if a
- * belongs after b, or 0 if they're equal.
- * Used by qsort().
- */
-static int
-_eglCompareModes(const void *a, const void *b)
-{
-   const _EGLMode *aMode = *((const _EGLMode **) a);
-   const _EGLMode *bMode = *((const _EGLMode **) b);
-   EGLint i;
-
-   for (i = 0; SortInfo[i].Attrib; i++) {
-      const EGLint aVal = getModeAttrib(aMode, SortInfo[i].Attrib);
-      const EGLint bVal = getModeAttrib(bMode, SortInfo[i].Attrib);
-      if (aVal == bVal) {
-         /* a tie */
-         continue;
-      }
-      else if (SortInfo[i].Order == SMALLER) {
-         return (aVal < bVal) ? -1 : 1;
-      }
-      else if (SortInfo[i].Order == LARGER) {
-         return (aVal > bVal) ? -1 : 1;
-      }
-   }
-
-   /* all attributes identical */
-   return 0;
-}
-
-
-/**
- * Search for EGLModes which match the given attribute list.
- * Called via eglChooseModeMESA API function.
- */
-EGLBoolean
-_eglChooseModeMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *scrn,
-                   const EGLint *attrib_list, EGLModeMESA *modes,
-                   EGLint modes_size, EGLint *num_modes)
-{
-   _EGLMode **modeList, min;
-   EGLint i, count;
-
-   if (!_eglParseModeAttribs(&min, attrib_list)) {
-      /* error code will have been recorded */
-      return EGL_FALSE;
-   }
-
-   /* allocate array of mode pointers */
-   modeList = malloc(modes_size * sizeof(_EGLMode *));
-   if (!modeList) {
-      _eglError(EGL_BAD_MODE_MESA, "eglChooseModeMESA(out of memory)");
-      return EGL_FALSE;
-   }
-
-   /* make array of pointers to qualifying modes */
-   for (i = count = 0; i < scrn->NumModes && count < modes_size; i++) {
-      if (_eglModeQualifies(scrn->Modes + i, &min)) {
-         modeList[count++] = scrn->Modes + i;
-      }
-   }
-
-   /* sort array of pointers */
-   qsort(modeList, count, sizeof(_EGLMode *), _eglCompareModes);
-
-   /* copy mode handles to output array */
-   for (i = 0; i < count; i++) {
-      modes[i] = modeList[i]->Handle;
-   }
-
-   free(modeList);
-
-   *num_modes = count;
-
-   return EGL_TRUE;
-}
-
-
-
-/**
- * Return all possible modes for the given screen.  No sorting of results.
- * Called via eglGetModesMESA() API function.
- */
-EGLBoolean
-_eglGetModesMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *scrn,
-                 EGLModeMESA *modes, EGLint modes_size, EGLint *num_modes)
-{
-   if (modes) {
-      EGLint i;
-      *num_modes = MIN2(scrn->NumModes, modes_size);
-      for (i = 0; i < *num_modes; i++) {
-         modes[i] = scrn->Modes[i].Handle;
-      }
-   }
-   else {
-      /* just return total number of supported modes */
-      *num_modes = scrn->NumModes;
-   }
-
-   return EGL_TRUE;
-}
-
-
-/**
- * Query an attribute of a mode.
- */
-EGLBoolean
-_eglGetModeAttribMESA(_EGLDriver *drv, _EGLDisplay *dpy,
-                      _EGLMode *m, EGLint attribute, EGLint *value)
-{
-   EGLint v;
-
-   v = getModeAttrib(m, attribute);
-   if (v < 0) {
-      _eglError(EGL_BAD_ATTRIBUTE, "eglGetModeAttribMESA");
-      return EGL_FALSE;
-   }
-   *value = v;
-   return EGL_TRUE;
-}
-
-
-/**
- * Return human-readable string for given mode.
- * This is the default function called by eglQueryModeStringMESA().
- */
-const char *
-_eglQueryModeStringMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLMode *m)
-{
-   return m->Name;
-}
-
-
-#endif /* EGL_MESA_screen_surface */
diff --git a/src/egl/main/eglmode.h b/src/egl/main/eglmode.h
deleted file mode 100644
index 664074f..0000000
--- a/src/egl/main/eglmode.h
+++ /dev/null
@@ -1,88 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * Copyright 2009-2010 Chia-I Wu <olvaffe@gmail.com>
- * Copyright 2010 LunarG, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifndef EGLMODE_INCLUDED
-#define EGLMODE_INCLUDED
-
-#include "egltypedefs.h"
-
-
-#ifdef EGL_MESA_screen_surface
-
-
-#define EGL_NO_MODE_MESA 0
-
-
-/**
- * Data structure which corresponds to an EGLModeMESA.
- */
-struct _egl_mode
-{
-   EGLModeMESA Handle;     /* the public/opaque handle which names this mode */
-   EGLint Width, Height;   /* size in pixels */
-   EGLint RefreshRate;     /* rate * 1000.0 */
-   EGLint Optimal;
-   EGLint Interlaced;
-   const char *Name;
-
-   /* Other possible attributes */
-   /* interlaced */
-   /* external sync */
-};
-
-
-extern _EGLMode *
-_eglLookupMode(EGLModeMESA mode, _EGLDisplay *dpy);
-
-
-extern EGLBoolean
-_eglChooseModeMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *scrn,
-                   const EGLint *attrib_list, EGLModeMESA *modes,
-                   EGLint modes_size, EGLint *num_modes);
-
-
-extern EGLBoolean
-_eglGetModesMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *scrn,
-                 EGLModeMESA *modes, EGLint modes_size, EGLint *num_modes);
-
-
-extern EGLBoolean
-_eglGetModeAttribMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLMode *m,
-                      EGLint attribute, EGLint *value);
-
-
-extern const char *
-_eglQueryModeStringMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLMode *m);
-
-
-#endif /* EGL_MESA_screen_surface */
-
-
-#endif /* EGLMODE_INCLUDED */
diff --git a/src/egl/main/eglscreen.c b/src/egl/main/eglscreen.c
deleted file mode 100644
index 42ac621..0000000
--- a/src/egl/main/eglscreen.c
+++ /dev/null
@@ -1,235 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * Copyright 2009-2010 Chia-I Wu <olvaffe@gmail.com>
- * Copyright 2010 LunarG, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/*
- * Ideas for screen management extension to EGL.
- *
- * Each EGLDisplay has one or more screens (CRTs, Flat Panels, etc).
- * The screens' handles can be obtained with eglGetScreensMESA().
- *
- * A new kind of EGLSurface is possible- one which can be directly scanned
- * out on a screen.  Such a surface is created with eglCreateScreenSurface().
- *
- * To actually display a screen surface on a screen, the eglShowSurface()
- * function is called.
- */
-
-#include <assert.h>
-#include <stdlib.h>
-#include <string.h>
-#include "c11/threads.h"
-
-#include "egldisplay.h"
-#include "eglcurrent.h"
-#include "eglmode.h"
-#include "eglsurface.h"
-#include "eglscreen.h"
-
-
-#ifdef EGL_MESA_screen_surface
-
-
-/* ugh, no atomic op? */
-static mtx_t _eglNextScreenHandleMutex = _MTX_INITIALIZER_NP;
-static EGLScreenMESA _eglNextScreenHandle = 1;
-
-
-/**
- * Return a new screen handle/ID.
- * NOTE: we never reuse these!
- */
-static EGLScreenMESA
-_eglAllocScreenHandle(void)
-{
-   EGLScreenMESA s;
-
-   mtx_lock(&_eglNextScreenHandleMutex);
-   s = _eglNextScreenHandle;
-   _eglNextScreenHandle += _EGL_SCREEN_MAX_MODES;
-   mtx_unlock(&_eglNextScreenHandleMutex);
-
-   return s;
-}
-
-
-/**
- * Initialize an _EGLScreen object to default values.
- */
-void
-_eglInitScreen(_EGLScreen *screen, _EGLDisplay *dpy, EGLint num_modes)
-{
-   memset(screen, 0, sizeof(_EGLScreen));
-
-   screen->Display = dpy;
-   screen->NumModes = num_modes;
-   screen->StepX = 1;
-   screen->StepY = 1;
-
-   if (num_modes > _EGL_SCREEN_MAX_MODES)
-      num_modes = _EGL_SCREEN_MAX_MODES;
-   screen->Modes = calloc(num_modes, sizeof(*screen->Modes));
-   screen->NumModes = (screen->Modes) ? num_modes : 0;
-}
-
-
-/**
- * Link a screen to its display and return the handle of the link.
- * The handle can be passed to client directly.
- */
-EGLScreenMESA
-_eglLinkScreen(_EGLScreen *screen)
-{
-   _EGLDisplay *display;
-   EGLint i;
-
-   assert(screen && screen->Display);
-   display = screen->Display;
-
-   if (!display->Screens) {
-      display->Screens = _eglCreateArray("Screen", 4);
-      if (!display->Screens)
-         return (EGLScreenMESA) 0;
-   }
-
-   screen->Handle = _eglAllocScreenHandle();
-   for (i = 0; i < screen->NumModes; i++)
-      screen->Modes[i].Handle = screen->Handle + i;
-
-   _eglAppendArray(display->Screens, (void *) screen);
-
-   return screen->Handle;
-}
-
-
-/**
- * Lookup a handle to find the linked config.
- * Return NULL if the handle has no corresponding linked config.
- */
-_EGLScreen *
-_eglLookupScreen(EGLScreenMESA screen, _EGLDisplay *display)
-{
-   EGLint i;
-
-   if (!display || !display->Screens)
-      return NULL;
-
-   for (i = 0; i < display->Screens->Size; i++) {
-      _EGLScreen *scr = (_EGLScreen *) display->Screens->Elements[i];
-      if (scr->Handle == screen) {
-         assert(scr->Display == display);
-         return scr;
-      }
-   }
-   return NULL;
-}
-
-
-static EGLBoolean
-_eglFlattenScreen(void *elem, void *buffer)
-{
-   _EGLScreen *scr = (_EGLScreen *) elem;
-   EGLScreenMESA *handle = (EGLScreenMESA *) buffer;
-   *handle = _eglGetScreenHandle(scr);
-   return EGL_TRUE;
-}
-
-
-EGLBoolean
-_eglGetScreensMESA(_EGLDriver *drv, _EGLDisplay *display, EGLScreenMESA *screens,
-                   EGLint max_screens, EGLint *num_screens)
-{
-   *num_screens = _eglFlattenArray(display->Screens, (void *) screens,
-         sizeof(screens[0]), max_screens, _eglFlattenScreen);
-
-   return EGL_TRUE;
-}
-
-
-/**
- * Set a screen's surface origin.
- */
-EGLBoolean
-_eglScreenPositionMESA(_EGLDriver *drv, _EGLDisplay *dpy,
-                       _EGLScreen *scrn, EGLint x, EGLint y)
-{
-   scrn->OriginX = x;
-   scrn->OriginY = y;
-
-   return EGL_TRUE;
-}
-
-
-/**
- * Query a screen's current surface.
- */
-EGLBoolean
-_eglQueryScreenSurfaceMESA(_EGLDriver *drv, _EGLDisplay *dpy,
-                           _EGLScreen *scrn, _EGLSurface **surf)
-{
-   *surf = scrn->CurrentSurface;
-   return EGL_TRUE;
-}
-
-
-/**
- * Query a screen's current mode.
- */
-EGLBoolean
-_eglQueryScreenModeMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *scrn,
-                        _EGLMode **m)
-{
-   *m = scrn->CurrentMode;
-   return EGL_TRUE;
-}
-
-
-EGLBoolean
-_eglQueryScreenMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *scrn,
-                    EGLint attribute, EGLint *value)
-{
-   switch (attribute) {
-   case EGL_SCREEN_POSITION_MESA:
-      value[0] = scrn->OriginX;
-      value[1] = scrn->OriginY;
-      break;
-   case EGL_SCREEN_POSITION_GRANULARITY_MESA:
-      value[0] = scrn->StepX;
-      value[1] = scrn->StepY;
-      break;
-   default:
-      _eglError(EGL_BAD_ATTRIBUTE, "eglQueryScreenMESA");
-      return EGL_FALSE;
-   }
-
-   return EGL_TRUE;
-}
-
-
-#endif /* EGL_MESA_screen_surface */
diff --git a/src/egl/main/eglscreen.h b/src/egl/main/eglscreen.h
deleted file mode 100644
index c554e1d..0000000
--- a/src/egl/main/eglscreen.h
+++ /dev/null
@@ -1,117 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2008 VMware, Inc.
- * Copyright 2009-2010 Chia-I Wu <olvaffe@gmail.com>
- * Copyright 2010 LunarG, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-#ifndef EGLSCREEN_INCLUDED
-#define EGLSCREEN_INCLUDED
-
-#include "c99_compat.h"
-
-#include "egltypedefs.h"
-
-
-#ifdef EGL_MESA_screen_surface
-
-
-#define _EGL_SCREEN_MAX_MODES 16
-
-
-/**
- * Per-screen information.
- * Note that an EGL screen doesn't have a size.  A screen may be set to
- * one of several display modes (width/height/scanrate).  The screen
- * then displays a drawing surface.  The drawing surface must be at least
- * as large as the display mode's resolution.  If it's larger, the
- * OriginX and OriginY fields control what part of the surface is visible
- * on the screen.
- */
-struct _egl_screen
-{
-   _EGLDisplay *Display;
-
-   EGLScreenMESA Handle; /* The public/opaque handle which names this object */
-
-   _EGLMode *CurrentMode;
-   _EGLSurface *CurrentSurface;
-
-   EGLint OriginX, OriginY; /**< Origin of scan-out region w.r.t. surface */
-   EGLint StepX, StepY;     /**< Screen position/origin granularity */
-
-   EGLint NumModes;
-   _EGLMode *Modes;  /**< array [NumModes] */
-};
-
-
-extern void
-_eglInitScreen(_EGLScreen *screen, _EGLDisplay *dpy, EGLint num_modes);
-
-
-extern EGLScreenMESA
-_eglLinkScreen(_EGLScreen *screen);
-
-
-extern _EGLScreen *
-_eglLookupScreen(EGLScreenMESA screen, _EGLDisplay *dpy);
-
-
-/**
- * Return the handle of a linked screen.
- */
-static inline EGLScreenMESA
-_eglGetScreenHandle(_EGLScreen *screen)
-{
-   return (screen) ? screen->Handle : (EGLScreenMESA) 0;
-}
-
-
-extern EGLBoolean
-_eglGetScreensMESA(_EGLDriver *drv, _EGLDisplay *dpy, EGLScreenMESA *screens, EGLint max_screens, EGLint *num_screens);
-
-
-extern EGLBoolean
-_eglScreenPositionMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *scrn, EGLint x, EGLint y);
-
-
-extern EGLBoolean
-_eglQueryScreenSurfaceMESA(_EGLDriver *drv, _EGLDisplay *dpy,
-                           _EGLScreen *scrn, _EGLSurface **surface);
-
-
-extern EGLBoolean
-_eglQueryScreenModeMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *scrn, _EGLMode **m);
-
-
-extern EGLBoolean
-_eglQueryScreenMESA(_EGLDriver *drv, _EGLDisplay *dpy, _EGLScreen *scrn, EGLint attribute, EGLint *value);
-
-
-#endif /* EGL_MESA_screen_surface */
-
-
-#endif /* EGLSCREEN_INCLUDED */
diff --git a/src/egl/main/eglsurface.c b/src/egl/main/eglsurface.c
index e2cb73b..76c60e9 100644
--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -61,50 +61,6 @@ _eglClampSwapInterval(_EGLSurface *surf, EGLint interval)
 }
 
 
-#ifdef EGL_MESA_screen_surface
-static EGLint
-_eglParseScreenSurfaceAttribList(_EGLSurface *surf, const EGLint *attrib_list)
-{
-   EGLint i, err = EGL_SUCCESS;
-
-   if (!attrib_list)
-      return EGL_SUCCESS;
-
-   for (i = 0; attrib_list[i] != EGL_NONE; i++) {
-      EGLint attr = attrib_list[i++];
-      EGLint val = attrib_list[i];
-
-      switch (attr) {
-      case EGL_WIDTH:
-         if (val < 0) {
-            err = EGL_BAD_PARAMETER;
-            break;
-         }
-         surf->Width = val;
-         break;
-      case EGL_HEIGHT:
-         if (val < 0) {
-            err = EGL_BAD_PARAMETER;
-            break;
-         }
-         surf->Height = val;
-         break;
-      default:
-         err = EGL_BAD_ATTRIBUTE;
-         break;
-      }
-
-      if (err != EGL_SUCCESS) {
-         _eglLog(_EGL_WARNING, "bad surface attribute 0x%04x", attr);
-         break;
-      }
-   }
-
-   return err;
-}
-#endif /* EGL_MESA_screen_surface */
-
-
 /**
  * Parse the list of surface attributes and return the proper error code.
  */
@@ -119,11 +75,6 @@ _eglParseSurfaceAttribList(_EGLSurface *surf, const EGLint *attrib_list)
    if (!attrib_list)
       return EGL_SUCCESS;
 
-#ifdef EGL_MESA_screen_surface
-   if (type == EGL_SCREEN_BIT_MESA)
-      return _eglParseScreenSurfaceAttribList(surf, attrib_list);
-#endif
-
    if (dpy->Extensions.NOK_texture_from_pixmap)
       texture_type |= EGL_PIXMAP_BIT;
 
@@ -297,12 +248,6 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
    case EGL_PBUFFER_BIT:
       func = "eglCreatePBufferSurface";
       break;
-#ifdef EGL_MESA_screen_surface
-   case EGL_SCREEN_BIT_MESA:
-      func = "eglCreateScreenSurface";
-      renderBuffer = EGL_SINGLE_BUFFER; /* XXX correct? */
-      break;
-#endif
    default:
       _eglLog(_EGL_WARNING, "Bad type in _eglInitSurface");
       return EGL_FALSE;
diff --git a/src/egl/main/eglsurface.h b/src/egl/main/eglsurface.h
index 438e27c..74c429a 100644
--- a/src/egl/main/eglsurface.h
+++ b/src/egl/main/eglsurface.h
@@ -37,6 +37,10 @@
 #include "egldisplay.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /**
  * "Base" class for device driver surfaces.
  */
@@ -174,4 +178,8 @@ _eglGetSurfaceHandle(_EGLSurface *surf)
 }
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLSURFACE_INCLUDED */
diff --git a/src/egl/main/eglsync.c b/src/egl/main/eglsync.c
index 8b8ab16..3019e6e 100644
--- a/src/egl/main/eglsync.c
+++ b/src/egl/main/eglsync.c
@@ -67,7 +67,7 @@ _eglParseSyncAttribList(_EGLSync *sync, const EGLint *attrib_list)
 
 
 static EGLint
-_eglParseSyncAttribList64(_EGLSync *sync, const EGLAttribKHR *attrib_list)
+_eglParseSyncAttribList64(_EGLSync *sync, const EGLAttrib *attrib_list)
 {
    EGLint i, err = EGL_SUCCESS;
 
@@ -103,7 +103,7 @@ _eglParseSyncAttribList64(_EGLSync *sync, const EGLAttribKHR *attrib_list)
 
 EGLBoolean
 _eglInitSync(_EGLSync *sync, _EGLDisplay *dpy, EGLenum type,
-             const EGLint *attrib_list, const EGLAttribKHR *attrib_list64)
+             const EGLint *attrib_list, const EGLAttrib *attrib_list64)
 {
    EGLint err;
 
@@ -141,8 +141,8 @@ _eglInitSync(_EGLSync *sync, _EGLDisplay *dpy, EGLenum type,
 
 
 EGLBoolean
-_eglGetSyncAttribKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
-                     EGLint attribute, EGLint *value)
+_eglGetSyncAttrib(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                  EGLint attribute, EGLAttrib *value)
 {
    if (!value)
       return _eglError(EGL_BAD_PARAMETER, "eglGetSyncAttribKHR");
diff --git a/src/egl/main/eglsync.h b/src/egl/main/eglsync.h
index 1d2eb11..9b2aac8 100644
--- a/src/egl/main/eglsync.h
+++ b/src/egl/main/eglsync.h
@@ -47,18 +47,18 @@ struct _egl_sync
    EGLenum Type;
    EGLenum SyncStatus;
    EGLenum SyncCondition;
-   EGLAttribKHR CLEvent;
+   EGLAttrib CLEvent;
 };
 
 
 extern EGLBoolean
 _eglInitSync(_EGLSync *sync, _EGLDisplay *dpy, EGLenum type,
-             const EGLint *attrib_list, const EGLAttribKHR *attrib_list64);
+             const EGLint *attrib_list, const EGLAttrib *attrib_list64);
 
 
 extern EGLBoolean
-_eglGetSyncAttribKHR(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
-                     EGLint attribute, EGLint *value);
+_eglGetSyncAttrib(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync,
+                  EGLint attribute, EGLAttrib *value);
 
 
 /**
@@ -87,11 +87,11 @@ _eglPutSync(_EGLSync *sync)
  * Link a sync to its display and return the handle of the link.
  * The handle can be passed to client directly.
  */
-static inline EGLSyncKHR
+static inline EGLSync
 _eglLinkSync(_EGLSync *sync)
 {
    _eglLinkResource(&sync->Resource, _EGL_RESOURCE_SYNC);
-   return (EGLSyncKHR) sync;
+   return (EGLSync) sync;
 }
 
 
@@ -110,7 +110,7 @@ _eglUnlinkSync(_EGLSync *sync)
  * Return NULL if the handle has no corresponding linked sync.
  */
 static inline _EGLSync *
-_eglLookupSync(EGLSyncKHR handle, _EGLDisplay *dpy)
+_eglLookupSync(EGLSync handle, _EGLDisplay *dpy)
 {
    _EGLSync *sync = (_EGLSync *) handle;
    if (!dpy || !_eglCheckResource((void *) sync, _EGL_RESOURCE_SYNC, dpy))
@@ -122,12 +122,12 @@ _eglLookupSync(EGLSyncKHR handle, _EGLDisplay *dpy)
 /**
  * Return the handle of a linked sync, or EGL_NO_SYNC_KHR.
  */
-static inline EGLSyncKHR
+static inline EGLSync
 _eglGetSyncHandle(_EGLSync *sync)
 {
    _EGLResource *res = (_EGLResource *) sync;
    return (res && _eglIsResourceLinked(res)) ?
-      (EGLSyncKHR) sync : EGL_NO_SYNC_KHR;
+      (EGLSync) sync : EGL_NO_SYNC_KHR;
 }
 
 
diff --git a/src/egl/main/egltypedefs.h b/src/egl/main/egltypedefs.h
index e90959a..7facdb4 100644
--- a/src/egl/main/egltypedefs.h
+++ b/src/egl/main/egltypedefs.h
@@ -31,13 +31,16 @@
 #ifndef EGLTYPEDEFS_INCLUDED
 #define EGLTYPEDEFS_INCLUDED
 
-#define EGL_EGLEXT_PROTOTYPES
-
 #include <EGL/egl.h>
 #include <EGL/eglext.h>
 
 #include "eglcompiler.h"
 
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 typedef struct _egl_api _EGLAPI;
 
 typedef struct _egl_array _EGLArray;
@@ -68,4 +71,9 @@ typedef struct _egl_sync _EGLSync;
 
 typedef struct _egl_thread_info _EGLThreadInfo;
 
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* EGLTYPEDEFS_INCLUDED */
diff --git a/src/gallium/Android.common.mk b/src/gallium/Android.common.mk
index 782510f..7c6c7ac 100644
--- a/src/gallium/Android.common.mk
+++ b/src/gallium/Android.common.mk
@@ -29,4 +29,12 @@ LOCAL_C_INCLUDES += \
 	$(GALLIUM_TOP)/winsys \
 	$(GALLIUM_TOP)/drivers
 
+ifeq ($(MESA_ENABLE_LLVM),true)
+LOCAL_C_INCLUDES += \
+	external/llvm/include \
+	external/llvm/device/include \
+	external/libcxx/include \
+	external/elfutils/$(if $(filter true,$(MESA_LOLLIPOP_BUILD)),0.153/)libelf
+endif
+
 include $(MESA_COMMON_MK)
diff --git a/src/gallium/Android.mk b/src/gallium/Android.mk
index b2662ff..b946681 100644
--- a/src/gallium/Android.mk
+++ b/src/gallium/Android.mk
@@ -33,7 +33,9 @@ SUBDIRS := auxiliary
 #
 
 # swrast
-SUBDIRS += winsys/sw/android drivers/softpipe
+ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
+SUBDIRS += winsys/sw/dri winsys/sw/kms-dri drivers/softpipe
+endif
 
 # freedreno
 ifneq ($(filter freedreno, $(MESA_GPU_DRIVERS)),)
@@ -74,10 +76,17 @@ endif
 endif
 endif
 
+# vc4
+ifneq ($(filter vc4, $(MESA_GPU_DRIVERS)),)
+SUBDIRS += winsys/vc4/drm drivers/vc4
+endif
+
 # vmwgfx
 ifneq ($(filter vmwgfx, $(MESA_GPU_DRIVERS)),)
 SUBDIRS += winsys/svga/drm drivers/svga
 endif
 
-mkfiles := $(patsubst %,$(GALLIUM_TOP)/%/Android.mk,$(SUBDIRS))
-include $(mkfiles)
+# Gallium state trackers and target for dri
+SUBDIRS += state_trackers/dri targets/dri
+
+include $(call all-named-subdir-makefiles,$(SUBDIRS))
diff --git a/src/gallium/auxiliary/Android.mk b/src/gallium/auxiliary/Android.mk
index 96a2125..86430eb 100644
--- a/src/gallium/auxiliary/Android.mk
+++ b/src/gallium/auxiliary/Android.mk
@@ -30,12 +30,23 @@ include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES := \
 	$(C_SOURCES) \
+	$(NIR_SOURCES) \
 	$(VL_STUB_SOURCES)
 
 LOCAL_C_INCLUDES := \
 	$(GALLIUM_TOP)/auxiliary/util
 
+ifeq ($(MESA_ENABLE_LLVM),true)
+LOCAL_SRC_FILES += \
+	$(GALLIVM_SOURCES) \
+	$(GALLIVM_CPP_SOURCES)
+
+LOCAL_CPPFLAGS += -std=c++11
+endif
+
 LOCAL_MODULE := libmesa_gallium
+# We need libmesa_glsl to get NIR's generated include directories.
+LOCAL_STATIC_LIBRARIES += libmesa_glsl
 
 # generate sources
 LOCAL_MODULE_CLASS := STATIC_LIBRARIES
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index 31ffa7d..744b00c 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -82,6 +82,7 @@ struct cso_context {
    struct u_vbuf *vbuf;
 
    boolean has_geometry_shader;
+   boolean has_tessellation;
    boolean has_streamout;
 
    struct sampler_info samplers[PIPE_SHADER_TYPES];
@@ -108,6 +109,8 @@ struct cso_context {
    void *fragment_shader, *fragment_shader_saved;
    void *vertex_shader, *vertex_shader_saved;
    void *geometry_shader, *geometry_shader_saved;
+   void *tessctrl_shader, *tessctrl_shader_saved;
+   void *tesseval_shader, *tesseval_shader_saved;
    void *velements, *velements_saved;
    struct pipe_query *render_condition, *render_condition_saved;
    uint render_condition_mode, render_condition_mode_saved;
@@ -273,6 +276,10 @@ struct cso_context *cso_create_context( struct pipe_context *pipe )
                                 PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) {
       ctx->has_geometry_shader = TRUE;
    }
+   if (pipe->screen->get_shader_param(pipe->screen, PIPE_SHADER_TESS_CTRL,
+                                PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0) {
+      ctx->has_tessellation = TRUE;
+   }
    if (pipe->screen->get_param(pipe->screen,
                                PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS) != 0) {
       ctx->has_streamout = TRUE;
@@ -293,6 +300,8 @@ void cso_destroy_context( struct cso_context *ctx )
    unsigned i, shader;
 
    if (ctx->pipe) {
+      ctx->pipe->set_index_buffer(ctx->pipe, NULL);
+
       ctx->pipe->bind_blend_state( ctx->pipe, NULL );
       ctx->pipe->bind_rasterizer_state( ctx->pipe, NULL );
 
@@ -319,7 +328,19 @@ void cso_destroy_context( struct cso_context *ctx )
 
       ctx->pipe->bind_depth_stencil_alpha_state( ctx->pipe, NULL );
       ctx->pipe->bind_fs_state( ctx->pipe, NULL );
+      ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_FRAGMENT, 0, NULL);
       ctx->pipe->bind_vs_state( ctx->pipe, NULL );
+      ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_VERTEX, 0, NULL);
+      if (ctx->has_geometry_shader) {
+         ctx->pipe->bind_gs_state(ctx->pipe, NULL);
+         ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_GEOMETRY, 0, NULL);
+      }
+      if (ctx->has_tessellation) {
+         ctx->pipe->bind_tcs_state(ctx->pipe, NULL);
+         ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_TESS_CTRL, 0, NULL);
+         ctx->pipe->bind_tes_state(ctx->pipe, NULL);
+         ctx->pipe->set_constant_buffer(ctx->pipe, PIPE_SHADER_TESS_EVAL, 0, NULL);
+      }
       ctx->pipe->bind_vertex_elements_state( ctx->pipe, NULL );
 
       if (ctx->has_streamout)
@@ -812,6 +833,124 @@ void cso_restore_geometry_shader(struct cso_context *ctx)
    ctx->geometry_shader_saved = NULL;
 }
 
+/**
+ * Bind a tessellation control shader.  The driver is only called when
+ * tessellation is supported and the handle differs from the bound one.
+ */
+void cso_set_tessctrl_shader_handle(struct cso_context *ctx, void *handle)
+{
+   assert(ctx->has_tessellation || !handle);
+
+   if (ctx->has_tessellation && ctx->tessctrl_shader != handle) {
+      ctx->tessctrl_shader = handle;
+      ctx->pipe->bind_tcs_state(ctx->pipe, handle);
+   }
+}
+
+/**
+ * Delete a tessellation control shader, unbinding it first if it is the
+ * one currently tracked as bound.
+ */
+void cso_delete_tessctrl_shader(struct cso_context *ctx, void *handle)
+{
+   if (handle == ctx->tessctrl_shader) {
+      /* unbind before deleting */
+      ctx->pipe->bind_tcs_state(ctx->pipe, NULL);
+      ctx->tessctrl_shader = NULL;
+   }
+   ctx->pipe->delete_tcs_state(ctx->pipe, handle);
+}
+
+/**
+ * Remember the current tessellation control shader so it can be restored
+ * later.  Does nothing when tessellation is not supported.
+ */
+void cso_save_tessctrl_shader(struct cso_context *ctx)
+{
+   if (!ctx->has_tessellation) {
+      return;
+   }
+
+   assert(!ctx->tessctrl_shader_saved);
+   ctx->tessctrl_shader_saved = ctx->tessctrl_shader;
+}
+
+/**
+ * Rebind the saved tessellation control shader if it differs from the
+ * current one, then clear the saved slot.  No-op without tessellation.
+ */
+void cso_restore_tessctrl_shader(struct cso_context *ctx)
+{
+   if (!ctx->has_tessellation) {
+      return;
+   }
+
+   if (ctx->tessctrl_shader_saved != ctx->tessctrl_shader) {
+      ctx->pipe->bind_tcs_state(ctx->pipe, ctx->tessctrl_shader_saved);
+      ctx->tessctrl_shader = ctx->tessctrl_shader_saved;
+   }
+   ctx->tessctrl_shader_saved = NULL;
+}
+
+/**
+ * Bind a tessellation evaluation shader.  The driver is only called when
+ * tessellation is supported and the handle differs from the bound one.
+ */
+void cso_set_tesseval_shader_handle(struct cso_context *ctx, void *handle)
+{
+   assert(ctx->has_tessellation || !handle);
+
+   if (ctx->has_tessellation && ctx->tesseval_shader != handle) {
+      ctx->tesseval_shader = handle;
+      ctx->pipe->bind_tes_state(ctx->pipe, handle);
+   }
+}
+
+/**
+ * Delete a tessellation evaluation shader, unbinding it first if it is the
+ * one currently tracked as bound.
+ */
+void cso_delete_tesseval_shader(struct cso_context *ctx, void *handle)
+{
+   if (handle == ctx->tesseval_shader) {
+      /* unbind before deleting */
+      ctx->pipe->bind_tes_state(ctx->pipe, NULL);
+      ctx->tesseval_shader = NULL;
+   }
+   ctx->pipe->delete_tes_state(ctx->pipe, handle);
+}
+
+/**
+ * Remember the current tessellation evaluation shader so it can be
+ * restored later.  Does nothing when tessellation is not supported.
+ */
+void cso_save_tesseval_shader(struct cso_context *ctx)
+{
+   if (!ctx->has_tessellation) {
+      return;
+   }
+
+   assert(!ctx->tesseval_shader_saved);
+   ctx->tesseval_shader_saved = ctx->tesseval_shader;
+}
+
+/**
+ * Rebind the saved tessellation evaluation shader if it differs from the
+ * current one, then clear the saved slot.  No-op without tessellation.
+ */
+void cso_restore_tesseval_shader(struct cso_context *ctx)
+{
+   if (!ctx->has_tessellation) {
+      return;
+   }
+
+   if (ctx->tesseval_shader_saved != ctx->tesseval_shader) {
+      ctx->pipe->bind_tes_state(ctx->pipe, ctx->tesseval_shader_saved);
+      ctx->tesseval_shader = ctx->tesseval_shader_saved;
+   }
+   ctx->tesseval_shader_saved = NULL;
+}
+
 /* clip state */
 
 static INLINE void
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.h b/src/gallium/auxiliary/cso_cache/cso_context.h
index aa56c58..cc50b60 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.h
+++ b/src/gallium/auxiliary/cso_cache/cso_context.h
@@ -141,6 +141,18 @@ void cso_save_geometry_shader(struct cso_context *cso);
 void cso_restore_geometry_shader(struct cso_context *cso);
 
 
+void cso_set_tessctrl_shader_handle(struct cso_context *ctx, void *handle);
+void cso_delete_tessctrl_shader(struct cso_context *ctx, void *handle);
+void cso_save_tessctrl_shader(struct cso_context *cso);
+void cso_restore_tessctrl_shader(struct cso_context *cso);
+
+
+void cso_set_tesseval_shader_handle(struct cso_context *ctx, void *handle);
+void cso_delete_tesseval_shader(struct cso_context *ctx, void *handle);
+void cso_save_tesseval_shader(struct cso_context *cso);
+void cso_restore_tesseval_shader(struct cso_context *cso);
+
+
 void cso_set_framebuffer(struct cso_context *cso,
                          const struct pipe_framebuffer_state *fb);
 void cso_save_framebuffer(struct cso_context *cso);
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index 6375d41..a1564f9 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -190,9 +190,15 @@ static void tgsi_gs_prepare(struct draw_geometry_shader *shader,
                             const unsigned constants_size[PIPE_MAX_CONSTANT_BUFFERS])
 {
    struct tgsi_exec_machine *machine = shader->machine;
-
+   int j;
    tgsi_exec_set_constant_buffers(machine, PIPE_MAX_CONSTANT_BUFFERS,
                                   constants, constants_size);
+
+   if (shader->info.uses_invocationid) {
+      unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_INVOCATIONID];
+      for (j = 0; j < TGSI_QUAD_SIZE; j++)
+         machine->SystemValue[i].i[j] = shader->invocation_id;
+   }
 }
 
 static unsigned tgsi_gs_run(struct draw_geometry_shader *shader,
@@ -385,7 +391,8 @@ llvm_gs_run(struct draw_geometry_shader *shader,
       (struct vertex_header*)input,
       input_primitives,
       shader->draw->instance_id,
-      shader->llvm_prim_ids);
+      shader->llvm_prim_ids,
+      shader->invocation_id);
 
    return ret;
 }
@@ -555,7 +562,7 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader,
     * overflown vertices into some area where they won't harm anyone */
    unsigned total_verts_per_buffer = shader->primitive_boundary *
       num_in_primitives;
-
+   unsigned invocation;
    //Assume at least one primitive
    max_out_prims = MAX2(max_out_prims, 1);
 
@@ -564,7 +571,7 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader,
    output_verts->stride = output_verts->vertex_size;
    output_verts->verts =
       (struct vertex_header *)MALLOC(output_verts->vertex_size *
-                                     total_verts_per_buffer);
+                                     total_verts_per_buffer * shader->num_invocations);
    debug_assert(output_verts->verts);
 
 #if 0
@@ -592,7 +599,7 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader,
    shader->input = input;
    shader->input_info = input_info;
    FREE(shader->primitive_lengths);
-   shader->primitive_lengths = MALLOC(max_out_prims * sizeof(unsigned));
+   shader->primitive_lengths = MALLOC(max_out_prims * sizeof(unsigned) * shader->num_invocations);
 
 
 #ifdef HAVE_LLVM
@@ -622,23 +629,26 @@ int draw_geometry_shader_run(struct draw_geometry_shader *shader,
    }
 #endif
 
-   shader->prepare(shader, constants, constants_size);
+   for (invocation = 0; invocation < shader->num_invocations; invocation++) {
+      shader->invocation_id = invocation;
 
-   if (input_prim->linear)
-      gs_run(shader, input_prim, input_verts,
-             output_prims, output_verts);
-   else
-      gs_run_elts(shader, input_prim, input_verts,
-                  output_prims, output_verts);
+      shader->prepare(shader, constants, constants_size);
 
-   /* Flush the remaining primitives. Will happen if
-    * num_input_primitives % 4 != 0
-    */
-   if (shader->fetched_prim_count > 0) {
-      gs_flush(shader);
-   }
+      if (input_prim->linear)
+         gs_run(shader, input_prim, input_verts,
+                output_prims, output_verts);
+      else
+         gs_run_elts(shader, input_prim, input_verts,
+                     output_prims, output_verts);
 
-   debug_assert(shader->fetched_prim_count == 0);
+      /* Flush the remaining primitives. Will happen if
+       * num_input_primitives % 4 != 0
+       */
+      if (shader->fetched_prim_count > 0) {
+         gs_flush(shader);
+      }
+      debug_assert(shader->fetched_prim_count == 0);
+   }
 
    /* Update prim_info:
     */
@@ -771,6 +781,8 @@ draw_create_geometry_shader(struct draw_context *draw,
          gs->info.properties[TGSI_PROPERTY_GS_OUTPUT_PRIM];
    gs->max_output_vertices =
          gs->info.properties[TGSI_PROPERTY_GS_MAX_OUTPUT_VERTICES];
+   gs->num_invocations =
+      gs->info.properties[TGSI_PROPERTY_GS_INVOCATIONS];
    if (!gs->max_output_vertices)
       gs->max_output_vertices = 32;
 
diff --git a/src/gallium/auxiliary/draw/draw_gs.h b/src/gallium/auxiliary/draw/draw_gs.h
index 49e93d5..663ba84 100644
--- a/src/gallium/auxiliary/draw/draw_gs.h
+++ b/src/gallium/auxiliary/draw/draw_gs.h
@@ -90,6 +90,8 @@ struct draw_geometry_shader {
    unsigned vector_length;
    unsigned max_out_prims;
 
+   unsigned num_invocations;
+   unsigned invocation_id;
 #ifdef HAVE_LLVM
    struct draw_gs_inputs *gs_input;
    struct draw_gs_jit_context *jit_context;
diff --git a/src/gallium/auxiliary/draw/draw_llvm.c b/src/gallium/auxiliary/draw/draw_llvm.c
index b9e55af..90a31bc 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_llvm.c
@@ -97,6 +97,7 @@ create_jit_dvbuffer_type(struct gallivm_state *gallivm,
    dvbuffer_type = LLVMStructTypeInContext(gallivm->context, elem_types,
                                            Elements(elem_types), 0);
 
+   (void) target; /* silence unused var warning for non-debug build */
    LP_CHECK_MEMBER_OFFSET(struct draw_vertex_buffer, map,
                           target, dvbuffer_type,
                           DRAW_JIT_DVBUFFER_MAP);
@@ -133,6 +134,7 @@ create_jit_texture_type(struct gallivm_state *gallivm, const char *struct_name)
    texture_type = LLVMStructTypeInContext(gallivm->context, elem_types,
                                           Elements(elem_types), 0);
 
+   (void) target; /* silence unused var warning for non-debug build */
    LP_CHECK_MEMBER_OFFSET(struct draw_jit_texture, width,
                           target, texture_type,
                           DRAW_JIT_TEXTURE_WIDTH);
@@ -290,6 +292,7 @@ create_gs_jit_context_type(struct gallivm_state *gallivm,
    context_type = LLVMStructTypeInContext(gallivm->context, elem_types,
                                           Elements(elem_types), 0);
 
+   (void) target; /* silence unused var warning for non-debug build */
    LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, constants,
                           target, context_type, DRAW_GS_JIT_CTX_CONSTANTS);
    LP_CHECK_MEMBER_OFFSET(struct draw_gs_jit_context, num_constants,
@@ -353,6 +356,7 @@ create_jit_vertex_buffer_type(struct gallivm_state *gallivm,
    vb_type = LLVMStructTypeInContext(gallivm->context, elem_types,
                                      Elements(elem_types), 0);
 
+   (void) target; /* silence unused var warning for non-debug build */
    LP_CHECK_MEMBER_OFFSET(struct pipe_vertex_buffer, stride,
                           target, vb_type, 0);
    LP_CHECK_MEMBER_OFFSET(struct pipe_vertex_buffer, buffer_offset,
@@ -1965,7 +1969,7 @@ draw_llvm_set_sampler_state(struct draw_context *draw,
       for (i = 0; i < draw->num_samplers[PIPE_SHADER_VERTEX]; i++) {
          struct draw_jit_sampler *jit_sam = &draw->llvm->jit_context.samplers[i];
 
-         if (draw->samplers[i]) {
+         if (draw->samplers[PIPE_SHADER_VERTEX][i]) {
             const struct pipe_sampler_state *s
                = draw->samplers[PIPE_SHADER_VERTEX][i];
             jit_sam->min_lod = s->min_lod;
@@ -1978,7 +1982,7 @@ draw_llvm_set_sampler_state(struct draw_context *draw,
       for (i = 0; i < draw->num_samplers[PIPE_SHADER_GEOMETRY]; i++) {
          struct draw_jit_sampler *jit_sam = &draw->llvm->gs_jit_context.samplers[i];
 
-         if (draw->samplers[i]) {
+         if (draw->samplers[PIPE_SHADER_GEOMETRY][i]) {
             const struct pipe_sampler_state *s
                = draw->samplers[PIPE_SHADER_GEOMETRY][i];
             jit_sam->min_lod = s->min_lod;
@@ -2065,7 +2069,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
    struct gallivm_state *gallivm = variant->gallivm;
    LLVMContextRef context = gallivm->context;
    LLVMTypeRef int32_type = LLVMInt32TypeInContext(context);
-   LLVMTypeRef arg_types[6];
+   LLVMTypeRef arg_types[7];
    LLVMTypeRef func_type;
    LLVMValueRef variant_func;
    LLVMValueRef context_ptr;
@@ -2101,6 +2105,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
    arg_types[4] = int32_type;                          /* instance_id */
    arg_types[5] = LLVMPointerType(
       LLVMVectorType(int32_type, vector_length), 0);   /* prim_id_ptr */
+   arg_types[6] = int32_type;                          /* invocation_id */
 
    func_type = LLVMFunctionType(int32_type, arg_types, Elements(arg_types), 0);
 
@@ -2121,6 +2126,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
    num_prims                 = LLVMGetParam(variant_func, 3);
    system_values.instance_id = LLVMGetParam(variant_func, 4);
    prim_id_ptr               = LLVMGetParam(variant_func, 5);
+   system_values.invocation_id = LLVMGetParam(variant_func, 6);
 
    lp_build_name(context_ptr, "context");
    lp_build_name(input_array, "input");
@@ -2128,6 +2134,7 @@ draw_gs_llvm_generate(struct draw_llvm *llvm,
    lp_build_name(num_prims, "num_prims");
    lp_build_name(system_values.instance_id, "instance_id");
    lp_build_name(prim_id_ptr, "prim_id_ptr");
+   lp_build_name(system_values.invocation_id, "invocation_id");
 
    variant->context_ptr = context_ptr;
    variant->io_ptr = io_ptr;
diff --git a/src/gallium/auxiliary/draw/draw_llvm.h b/src/gallium/auxiliary/draw/draw_llvm.h
index 9565fc6..d48ed72 100644
--- a/src/gallium/auxiliary/draw/draw_llvm.h
+++ b/src/gallium/auxiliary/draw/draw_llvm.h
@@ -298,7 +298,8 @@ typedef int
                     struct vertex_header *output,
                     unsigned num_prims,
                     unsigned instance_id,
-                    int *prim_ids);
+                    int *prim_ids,
+                    unsigned invocation_id);
 
 struct draw_llvm_variant_key
 {
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index 2f14efe..936046e 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -51,7 +51,7 @@
 
 
 /** Approx number of new tokens for instructions in aa_transform_inst() */
-#define NUM_NEW_TOKENS 50
+#define NUM_NEW_TOKENS 53
 
 
 /**
@@ -137,6 +137,7 @@ struct aa_transform_context {
    uint tempsUsed;  /**< bitmask */
    int colorOutput; /**< which output is the primary color */
    uint samplersUsed;  /**< bitfield of samplers used */
+   bool hasSview;
    int freeSampler;  /** an available sampler for the pstipple */
    int maxInput, maxGeneric;  /**< max input index found */
    int colorTemp, texTemp;  /**< temp registers */
@@ -165,6 +166,9 @@ aa_transform_decl(struct tgsi_transform_context *ctx,
          aactx->samplersUsed |= 1 << i;
       }
    }
+   else if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
+      aactx->hasSview = true;
+   }
    else if (decl->Declaration.File == TGSI_FILE_INPUT) {
       if ((int) decl->Range.Last > aactx->maxInput)
          aactx->maxInput = decl->Range.Last;
@@ -232,6 +236,17 @@ aa_transform_prolog(struct tgsi_transform_context *ctx)
    /* declare new sampler */
    tgsi_transform_sampler_decl(ctx, aactx->freeSampler);
 
+   /* if the src shader has SVIEW decl's for each SAMP decl, we
+    * need to continue the trend and ensure there is a matching
+    * SVIEW for the new SAMP we just created
+    */
+   if (aactx->hasSview) {
+      tgsi_transform_sampler_view_decl(ctx,
+                                       aactx->freeSampler,
+                                       TGSI_TEXTURE_2D,
+                                       TGSI_RETURN_TYPE_FLOAT);
+   }
+
    /* declare new temp regs */
    tgsi_transform_temp_decl(ctx, aactx->texTemp);
    tgsi_transform_temp_decl(ctx, aactx->colorTemp);
diff --git a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
index 8f21c46..445f195 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_pstipple.c
@@ -53,7 +53,7 @@
 
 
 /** Approx number of new tokens for instructions in pstip_transform_inst() */
-#define NUM_NEW_TOKENS 50
+#define NUM_NEW_TOKENS 53
 
 
 /**
@@ -126,6 +126,7 @@ struct pstip_transform_context {
    int wincoordInput;
    int maxInput;
    uint samplersUsed;  /**< bitfield of samplers used */
+   bool hasSview;
    int freeSampler;  /** an available sampler for the pstipple */
    int texTemp;  /**< temp registers */
    int numImmed;
@@ -149,6 +150,9 @@ pstip_transform_decl(struct tgsi_transform_context *ctx,
          pctx->samplersUsed |= 1 << i;
       }
    }
+   else if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
+      pctx->hasSview = true;
+   }
    else if (decl->Declaration.File == TGSI_FILE_INPUT) {
       pctx->maxInput = MAX2(pctx->maxInput, (int) decl->Range.Last);
       if (decl->Semantic.Name == TGSI_SEMANTIC_POSITION)
@@ -232,6 +236,17 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
    /* declare new sampler */
    tgsi_transform_sampler_decl(ctx, pctx->freeSampler);
 
+   /* if the src shader has SVIEW decl's for each SAMP decl, we
+    * need to continue the trend and ensure there is a matching
+    * SVIEW for the new SAMP we just created
+    */
+   if (pctx->hasSview) {
+      tgsi_transform_sampler_view_decl(ctx,
+                                       pctx->freeSampler,
+                                       TGSI_TEXTURE_2D,
+                                       TGSI_RETURN_TYPE_FLOAT);
+   }
+
    /* declare new temp regs */
    tgsi_transform_temp_decl(ctx, pctx->texTemp);
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index be3e834..405e648 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -28,40 +28,12 @@
 #include <stddef.h>
 
 #include <llvm-c/Core.h>
-#include <llvm/Target/TargetMachine.h>
-#include <llvm/Target/TargetInstrInfo.h>
+#include <llvm-c/Disassembler.h>
 #include <llvm/Support/raw_ostream.h>
 #include <llvm/Support/Format.h>
-
-#if HAVE_LLVM >= 0x0306
-#include <llvm/Target/TargetSubtargetInfo.h>
-#else
-#include <llvm/Support/MemoryObject.h>
-#endif
-
-#include <llvm/Support/TargetRegistry.h>
-#include <llvm/MC/MCSubtargetInfo.h>
-
 #include <llvm/Support/Host.h>
-
 #include <llvm/IR/Module.h>
 
-#include <llvm/MC/MCDisassembler.h>
-#include <llvm/MC/MCAsmInfo.h>
-#include <llvm/MC/MCInst.h>
-#include <llvm/MC/MCInstPrinter.h>
-#include <llvm/MC/MCRegisterInfo.h>
-
-#if HAVE_LLVM >= 0x0305
-#define OwningPtr std::unique_ptr
-#else
-#include <llvm/ADT/OwningPtr.h>
-#endif
-
-#if HAVE_LLVM >= 0x0305
-#include <llvm/MC/MCContext.h>
-#endif
-
 #include "util/u_math.h"
 #include "util/u_debug.h"
 
@@ -133,7 +105,7 @@ lp_get_module_id(LLVMModuleRef module)
 extern "C" void
 lp_debug_dump_value(LLVMValueRef value)
 {
-#if (defined(PIPE_OS_WINDOWS) && !defined(PIPE_CC_MSVC)) || defined(PIPE_OS_EMBDDED)
+#if (defined(PIPE_OS_WINDOWS) && !defined(PIPE_CC_MSVC)) || defined(PIPE_OS_EMBEDDED)
    raw_debug_ostream os;
    llvm::unwrap(value)->print(os);
    os.flush();
@@ -143,46 +115,6 @@ lp_debug_dump_value(LLVMValueRef value)
 }
 
 
-#if HAVE_LLVM < 0x0306
-
-/*
- * MemoryObject wrapper around a buffer of memory, to be used by MC
- * disassembler.
- */
-class BufferMemoryObject:
-   public llvm::MemoryObject
-{
-private:
-   const uint8_t *Bytes;
-   uint64_t Length;
-public:
-   BufferMemoryObject(const uint8_t *bytes, uint64_t length) :
-      Bytes(bytes), Length(length)
-   {
-   }
-
-   uint64_t getBase() const
-   {
-      return 0;
-   }
-
-   uint64_t getExtent() const
-   {
-      return Length;
-   }
-
-   int readByte(uint64_t addr, uint8_t *byte) const
-   {
-      if (addr > getExtent())
-         return -1;
-      *byte = Bytes[addr];
-      return 0;
-   }
-};
-
-#endif /* HAVE_LLVM < 0x0306 */
-
-
 /*
  * Disassemble a function, using the LLVM MC disassembler.
  *
@@ -193,8 +125,6 @@ public:
 static size_t
 disassemble(const void* func, llvm::raw_ostream & Out)
 {
-   using namespace llvm;
-
    const uint8_t *bytes = (const uint8_t *)func;
 
    /*
@@ -202,99 +132,23 @@ disassemble(const void* func, llvm::raw_ostream & Out)
     */
    const uint64_t extent = 96 * 1024;
 
-   uint64_t max_pc = 0;
-
    /*
     * Initialize all used objects.
     */
 
-   std::string Triple = sys::getDefaultTargetTriple();
-
-   std::string Error;
-   const Target *T = TargetRegistry::lookupTarget(Triple, Error);
-
-#if HAVE_LLVM >= 0x0304
-   OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(*T->createMCRegInfo(Triple), Triple));
-#else
-   OwningPtr<const MCAsmInfo> AsmInfo(T->createMCAsmInfo(Triple));
-#endif
-
-   if (!AsmInfo) {
-      Out << "error: no assembly info for target " << Triple << "\n";
-      Out.flush();
-      return 0;
-   }
-
-   unsigned int AsmPrinterVariant = AsmInfo->getAssemblerDialect();
-
-   OwningPtr<const MCRegisterInfo> MRI(T->createMCRegInfo(Triple));
-   if (!MRI) {
-      Out << "error: no register info for target " << Triple.c_str() << "\n";
-      Out.flush();
-      return 0;
-   }
+   std::string Triple = llvm::sys::getProcessTriple();
+   LLVMDisasmContextRef D = LLVMCreateDisasm(Triple.c_str(), NULL, 0, NULL, NULL);
+   char outline[1024];
 
-   OwningPtr<const MCInstrInfo> MII(T->createMCInstrInfo());
-   if (!MII) {
-      Out << "error: no instruction info for target " << Triple.c_str() << "\n";
-      Out.flush();
+   if (!D) {
+      Out << "error: couldn't create disassembler for triple " << Triple << "\n";
       return 0;
    }
 
-#if HAVE_LLVM >= 0x0305
-   OwningPtr<const MCSubtargetInfo> STI(T->createMCSubtargetInfo(Triple, sys::getHostCPUName(), ""));
-   OwningPtr<MCContext> MCCtx(new MCContext(AsmInfo.get(), MRI.get(), 0));
-   OwningPtr<const MCDisassembler> DisAsm(T->createMCDisassembler(*STI, *MCCtx));
-#else
-   OwningPtr<const MCSubtargetInfo> STI(T->createMCSubtargetInfo(Triple, sys::getHostCPUName(), ""));
-   OwningPtr<const MCDisassembler> DisAsm(T->createMCDisassembler(*STI));
-#endif
-   if (!DisAsm) {
-      Out << "error: no disassembler for target " << Triple << "\n";
-      Out.flush();
-      return 0;
-   }
-
-
-#if HAVE_LLVM >= 0x0307
-   OwningPtr<MCInstPrinter> Printer(
-         T->createMCInstPrinter(llvm::Triple(Triple), AsmPrinterVariant, *AsmInfo, *MII, *MRI));
-#else
-   OwningPtr<MCInstPrinter> Printer(
-         T->createMCInstPrinter(AsmPrinterVariant, *AsmInfo, *MII, *MRI, *STI));
-#endif
-   if (!Printer) {
-      Out << "error: no instruction printer for target " << Triple.c_str() << "\n";
-      Out.flush();
-      return 0;
-   }
-
-   TargetOptions options;
-#if defined(DEBUG) && HAVE_LLVM < 0x0307
-   options.JITEmitDebugInfo = true;
-#endif
-#if defined(PIPE_ARCH_X86)
-   options.StackAlignmentOverride = 4;
-#endif
-#if defined(DEBUG) || defined(PROFILE)
-   options.NoFramePointerElim = true;
-#endif
-   OwningPtr<TargetMachine> TM(T->createTargetMachine(Triple, sys::getHostCPUName(), "", options));
-
-   /*
-    * Wrap the data in a MemoryObject
-    */
-#if HAVE_LLVM >= 0x0306
-   ArrayRef<uint8_t> memoryObject((const uint8_t *)bytes, extent);
-#else
-   BufferMemoryObject memoryObject((const uint8_t *)bytes, extent);
-#endif
-
    uint64_t pc;
    pc = 0;
-   while (true) {
-      MCInst Inst;
-      uint64_t Size;
+   while (pc < extent) {
+      size_t Size;
 
       /*
        * Print address.  We use addresses relative to the start of the function,
@@ -303,11 +157,13 @@ disassemble(const void* func, llvm::raw_ostream & Out)
 
       Out << llvm::format("%6lu:\t", (unsigned long)pc);
 
-      if (!DisAsm->getInstruction(Inst, Size, memoryObject,
-                                 pc,
-				  nulls(), nulls())) {
-         Out << "invalid";
+      Size = LLVMDisasmInstruction(D, (uint8_t *)bytes + pc, extent - pc, 0, outline,
+                                   sizeof outline);
+
+      if (!Size) {
+         Out << "invalid\n";
          pc += 1;
+         break;
       }
 
       /*
@@ -317,7 +173,7 @@ disassemble(const void* func, llvm::raw_ostream & Out)
       if (0) {
          unsigned i;
          for (i = 0; i < Size; ++i) {
-            Out << llvm::format("%02x ", ((const uint8_t*)bytes)[pc + i]);
+            Out << llvm::format("%02x ", bytes[pc + i]);
          }
          for (; i < 16; ++i) {
             Out << "   ";
@@ -327,81 +183,27 @@ disassemble(const void* func, llvm::raw_ostream & Out)
       /*
        * Print the instruction.
        */
-#if HAVE_LLVM >= 0x0307
-      Printer->printInst(&Inst, Out, "", *STI);
-#else
-      Printer->printInst(&Inst, Out, "");
-#endif
 
-      /*
-       * Advance.
-       */
+      Out << outline;
 
-      pc += Size;
-
-      const MCInstrDesc &TID = MII->get(Inst.getOpcode());
+      Out << "\n";
 
       /*
-       * Keep track of forward jumps to a nearby address.
+       * Stop disassembling at the first single-byte return instruction
+       * (0xc3); forward jumps past it are no longer tracked.
+       *
+       * XXX: This currently assumes x86
        */
 
-      if (TID.isBranch()) {
-         for (unsigned i = 0; i < Inst.getNumOperands(); ++i) {
-            const MCOperand &operand = Inst.getOperand(i);
-            if (operand.isImm()) {
-               uint64_t jump;
-
-               /*
-                * FIXME: Handle both relative and absolute addresses correctly.
-                * EDInstInfo actually has this info, but operandTypes and
-                * operandFlags enums are not exposed in the public interface.
-                */
-
-               if (1) {
-                  /*
-                   * PC relative addr.
-                   */
-
-                  jump = pc + operand.getImm();
-               } else {
-                  /*
-                   * Absolute addr.
-                   */
-
-                  jump = (uint64_t)operand.getImm();
-               }
-
-               /*
-                * Output the address relative to the function start, given
-                * that MC will print the addresses relative the current pc.
-                */
-               Out << "\t\t; " << jump;
-
-               /*
-                * Ignore far jumps given it could be actually a tail return to
-                * a random address.
-                */
-
-               if (jump > max_pc &&
-                   jump < extent) {
-                  max_pc = jump;
-               }
-            }
-         }
+      if (Size == 1 && bytes[pc] == 0xc3) {
+         break;
       }
 
-      Out << "\n";
-
       /*
-       * Stop disassembling on return statements, if there is no record of a
-       * jump to a successive address.
+       * Advance.
        */
 
-      if (TID.isReturn()) {
-         if (pc > max_pc) {
-            break;
-         }
-      }
+      pc += Size;
 
       if (pc >= extent) {
          Out << "disassembly larger than " << extent << "bytes, aborting\n";
@@ -412,6 +214,8 @@ disassemble(const void* func, llvm::raw_ostream & Out)
    Out << "\n";
    Out.flush();
 
+   LLVMDisasmDispose(D);
+
    /*
     * Print GDB command, useful to verify output.
     */
@@ -442,7 +246,7 @@ lp_disassemble(LLVMValueRef func, const void *code) {
 extern "C" void
 lp_profile(LLVMValueRef func, const void *code)
 {
-#if defined(__linux__) && (defined(DEBUG) || defined(PROFILE))
+#if defined(__linux__) && defined(PROFILE)
    static boolean first_time = TRUE;
    static FILE *perf_map_file = NULL;
    static int perf_asm_fd = -1;
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
index 3c25c32..efe7170 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_aos.c
@@ -405,6 +405,7 @@ lp_build_fetch_rgba_aos(struct gallivm_state *gallivm,
                                aligned, base_ptr, offset, TRUE);
 
       assert(format_desc->block.bits <= vec_len);
+      (void) vec_len; /* silence unused var warning for non-debug build */
 
       packed = LLVMBuildBitCast(gallivm->builder, packed, dst_vec_type, "");
       return lp_build_format_swizzle_aos(format_desc, &bld, packed);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.c b/src/gallium/auxiliary/gallivm/lp_bld_init.c
index 7b906c2..384ea86 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_init.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_init.c
@@ -533,6 +533,16 @@ gallivm_compile_module(struct gallivm_state *gallivm)
       if (0) {
          debug_printf("optimizing func %s...\n", LLVMGetValueName(func));
       }
+
+      /* Disable frame pointer omission on debug/profile builds */
+      /* XXX: And work around http://llvm.org/PR21435 */
+#if HAVE_LLVM >= 0x0307 && \
+    (defined(DEBUG) || defined(PROFILE) || \
+     defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64))
+      LLVMAddTargetDependentFunctionAttr(func, "no-frame-pointer-elim", "true");
+      LLVMAddTargetDependentFunctionAttr(func, "no-frame-pointer-elim-non-leaf", "true");
+#endif
+
       LLVMRunFunctionPassManager(gallivm->passmgr, func);
       func = LLVMGetNextFunction(func);
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_limits.h b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
index c5c51c1..db50351 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_limits.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_limits.h
@@ -51,8 +51,12 @@
 
 #define LP_MAX_TGSI_PREDS 16
 
+#define LP_MAX_TGSI_CONSTS 4096
+
 #define LP_MAX_TGSI_CONST_BUFFERS 16
 
+#define LP_MAX_TGSI_CONST_BUFFER_SIZE (LP_MAX_TGSI_CONSTS * sizeof(float[4]))
+
 /*
  * For quick access we cache registers in statically
  * allocated arrays. Here we define the maximum size
@@ -100,7 +104,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_MAX_OUTPUTS:
       return 32;
    case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
-      return sizeof(float[4]) * 4096;
+      return LP_MAX_TGSI_CONST_BUFFER_SIZE;
    case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
       return PIPE_MAX_CONSTANT_BUFFERS;
    case PIPE_SHADER_CAP_MAX_TEMPS:
@@ -125,6 +129,7 @@ gallivm_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_PREFERRED_IR:
       return PIPE_SHADER_IR_TGSI;
    case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 1;
    case PIPE_SHADER_CAP_DOUBLES:
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
index 5e8a634..5e25819 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_misc.cpp
@@ -50,6 +50,12 @@
 
 #include <stddef.h>
 
+// Workaround http://llvm.org/PR23628
+#if HAVE_LLVM >= 0x0307
+#  pragma push_macro("DEBUG")
+#  undef DEBUG
+#endif
+
 #include <llvm-c/Core.h>
 #include <llvm-c/ExecutionEngine.h>
 #include <llvm/Target/TargetOptions.h>
@@ -70,6 +76,11 @@
 #include <llvm/IR/Module.h>
 #include <llvm/Support/CBindingWrapping.h>
 
+// Workaround http://llvm.org/PR23628
+#if HAVE_LLVM >= 0x0307
+#  pragma pop_macro("DEBUG")
+#endif
+
 #include "pipe/p_config.h"
 #include "util/u_debug.h"
 #include "util/u_cpu_detect.h"
@@ -439,8 +450,10 @@ lp_build_create_jit_compiler_for_module(LLVMExecutionEngineRef *OutJIT,
 #if HAVE_LLVM < 0x0304
    options.NoFramePointerElimNonLeaf = true;
 #endif
+#if HAVE_LLVM < 0x0307
    options.NoFramePointerElim = true;
 #endif
+#endif
 
    builder.setEngineKind(EngineKind::JIT)
           .setErrorStr(&Error)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 5b22045..4befb3a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -113,7 +113,7 @@ lp_sampler_static_texture_state(struct lp_static_texture_state *state,
    state->swizzle_b         = view->swizzle_b;
    state->swizzle_a         = view->swizzle_a;
 
-   state->target            = texture->target;
+   state->target            = view->target;
    state->pot_width         = util_is_power_of_two(texture->width0);
    state->pot_height        = util_is_power_of_two(texture->height0);
    state->pot_depth         = util_is_power_of_two(texture->depth0);
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 1a60ca9..b5c06b6 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -2501,7 +2501,7 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
        * all zero as mandated by d3d10 in this case.
        */
       unsigned chan;
-      LLVMValueRef zero = lp_build_const_vec(gallivm, type, 0.0F);
+      LLVMValueRef zero = lp_build_zero(gallivm, type);
       for (chan = 0; chan < 4; chan++) {
          texel_out[chan] = zero;
       }
@@ -2748,11 +2748,37 @@ lp_build_sample_soa_code(struct gallivm_state *gallivm,
    else {
       LLVMValueRef lod_fpart = NULL, lod_positive = NULL;
       LLVMValueRef ilevel0 = NULL, ilevel1 = NULL;
-      boolean use_aos = util_format_fits_8unorm(bld.format_desc) &&
-                        op_is_tex &&
-                        /* not sure this is strictly needed or simply impossible */
-                        derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
-                        lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
+      boolean use_aos;
+
+      if (util_format_is_pure_integer(static_texture_state->format) &&
+          !util_format_has_depth(bld.format_desc) &&
+          (static_sampler_state->min_mip_filter == PIPE_TEX_MIPFILTER_LINEAR ||
+           static_sampler_state->min_img_filter == PIPE_TEX_FILTER_LINEAR ||
+           static_sampler_state->mag_img_filter == PIPE_TEX_FILTER_LINEAR)) {
+         /*
+          * Bail if impossible filtering is specified (the awkward additional
+          * depth check is because it is legal in gallium to have things like S8Z24
+          * here, which report as pure int even though such formats should sample
+          * the depth component).
+          * In GL such filters make the texture incomplete, this makes it robust
+          * against state trackers which set this up regardless (we'd crash in the
+          * lerp later (except for gather)).
+          * Must do this after fetch_texel code since with GL state tracker we'll
+          * get some junk sampler for buffer textures.
+          */
+         unsigned chan;
+         LLVMValueRef zero = lp_build_zero(gallivm, type);
+         for (chan = 0; chan < 4; chan++) {
+            texel_out[chan] = zero;
+         }
+         return;
+      }
+
+      use_aos = util_format_fits_8unorm(bld.format_desc) &&
+                op_is_tex &&
+                /* not sure this is strictly needed or simply impossible */
+                derived_sampler_state.compare_mode == PIPE_TEX_COMPARE_NONE &&
+                lp_is_simple_wrap_mode(derived_sampler_state.wrap_s);
 
       use_aos &= bld.num_lods <= num_quads ||
                  derived_sampler_state.min_img_filter ==
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
index 3f76b79..967373c 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h
@@ -165,6 +165,7 @@ struct lp_bld_tgsi_system_values {
    LLVMValueRef vertex_id_nobase;
    LLVMValueRef prim_id;
    LLVMValueRef basevertex;
+   LLVMValueRef invocation_id;
 };
 
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
index 738d5e9..610283d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_aos.c
@@ -232,23 +232,9 @@ lp_emit_store_aos(
    /*
     * Saturate the value
     */
-
-   switch (inst->Instruction.Saturate) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
+   if (inst->Instruction.Saturate) {
       value = lp_build_max(&bld->bld_base.base, value, bld->bld_base.base.zero);
       value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one);
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      value = lp_build_max(&bld->bld_base.base, value, lp_build_const_vec(bld->bld_base.base.gallivm, bld->bld_base.base.type, -1.0));
-      value = lp_build_min(&bld->bld_base.base, value, bld->bld_base.base.one);
-      break;
-
-   default:
-      assert(0);
    }
 
    /*
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
index 448c99d..268379e 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_soa.c
@@ -1532,6 +1532,11 @@ emit_fetch_system_value(
       atype = TGSI_TYPE_UNSIGNED;
       break;
 
+   case TGSI_SEMANTIC_INVOCATIONID:
+      res = lp_build_broadcast_scalar(&bld_base->uint_bld, bld->system_values.invocation_id);
+      atype = TGSI_TYPE_UNSIGNED;
+      break;
+
    default:
       assert(!"unexpected semantic in emit_fetch_system_value");
       res = bld_base->base.zero;
@@ -1670,30 +1675,11 @@ emit_store_chan(
     *
     * It is always assumed to be float.
     */
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
+   if (inst->Instruction.Saturate) {
       assert(dtype == TGSI_TYPE_FLOAT ||
              dtype == TGSI_TYPE_UNTYPED);
       value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
       value = lp_build_clamp_zero_one_nanzero(float_bld, value);
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert(dtype == TGSI_TYPE_FLOAT ||
-             dtype == TGSI_TYPE_UNTYPED);
-      value = LLVMBuildBitCast(builder, value, float_bld->vec_type, "");
-      /* This will give -1.0 for NaN which is probably not what we want. */
-      value = lp_build_max_ext(float_bld, value,
-                               lp_build_const_vec(gallivm, float_bld->type, -1.0),
-                               GALLIVM_NAN_RETURN_OTHER_SECOND_NONNAN);
-      value = lp_build_min(float_bld, value, float_bld->one);
-      break;
-
-   default:
-      assert(0);
    }
 
    if (reg->Register.Indirect) {
diff --git a/src/gallium/auxiliary/hud/hud_context.c b/src/gallium/auxiliary/hud/hud_context.c
index 00ec205..6a124f7 100644
--- a/src/gallium/auxiliary/hud/hud_context.c
+++ b/src/gallium/auxiliary/hud/hud_context.c
@@ -423,6 +423,8 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
    cso_save_viewport(cso);
    cso_save_stream_outputs(cso);
    cso_save_geometry_shader(cso);
+   cso_save_tessctrl_shader(cso);
+   cso_save_tesseval_shader(cso);
    cso_save_vertex_shader(cso);
    cso_save_vertex_elements(cso);
    cso_save_aux_vertex_buffer_slot(cso);
@@ -456,6 +458,8 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
    cso_set_rasterizer(cso, &hud->rasterizer);
    cso_set_viewport(cso, &viewport);
    cso_set_stream_outputs(cso, 0, NULL, NULL);
+   cso_set_tessctrl_shader_handle(cso, NULL);
+   cso_set_tesseval_shader_handle(cso, NULL);
    cso_set_geometry_shader_handle(cso, NULL);
    cso_set_vertex_shader_handle(cso, hud->vs);
    cso_set_vertex_elements(cso, 2, hud->velems);
@@ -548,6 +552,8 @@ hud_draw(struct hud_context *hud, struct pipe_resource *tex)
    cso_restore_rasterizer(cso);
    cso_restore_viewport(cso);
    cso_restore_stream_outputs(cso);
+   cso_restore_tessctrl_shader(cso);
+   cso_restore_tesseval_shader(cso);
    cso_restore_geometry_shader(cso);
    cso_restore_vertex_shader(cso);
    cso_restore_vertex_elements(cso);
diff --git a/src/gallium/auxiliary/nir/tgsi_to_nir.c b/src/gallium/auxiliary/nir/tgsi_to_nir.c
index 59aaf67..061f39a 100644
--- a/src/gallium/auxiliary/nir/tgsi_to_nir.c
+++ b/src/gallium/auxiliary/nir/tgsi_to_nir.c
@@ -58,6 +58,9 @@ struct ttn_compile {
    struct ttn_reg_info *temp_regs;
    nir_ssa_def **imm_defs;
 
+   unsigned num_samp_types;
+   nir_alu_type *samp_types;
+
    nir_register *addr_reg;
 
    /**
@@ -156,6 +159,30 @@ ttn_emit_declaration(struct ttn_compile *c)
       /* Nothing to record for system values. */
    } else if (file == TGSI_FILE_SAMPLER) {
       /* Nothing to record for samplers. */
+   } else if (file == TGSI_FILE_SAMPLER_VIEW) {
+      struct tgsi_declaration_sampler_view *sview = &decl->SamplerView;
+      nir_alu_type type;
+
+      assert((sview->ReturnTypeX == sview->ReturnTypeY) &&
+             (sview->ReturnTypeX == sview->ReturnTypeZ) &&
+             (sview->ReturnTypeX == sview->ReturnTypeW));
+
+      switch (sview->ReturnTypeX) {
+      case TGSI_RETURN_TYPE_SINT:
+         type = nir_type_int;
+         break;
+      case TGSI_RETURN_TYPE_UINT:
+         type = nir_type_unsigned;
+         break;
+      case TGSI_RETURN_TYPE_FLOAT:
+      default:
+         type = nir_type_float;
+         break;
+      }
+
+      for (i = 0; i < array_size; i++) {
+         c->samp_types[decl->Range.First + i] = type;
+      }
    } else {
       nir_variable *var;
       assert(file == TGSI_FILE_INPUT ||
@@ -401,7 +428,6 @@ ttn_src_for_file_and_index(struct ttn_compile *c, unsigned file, unsigned index,
 
       load->num_components = 4;
       load->const_index[0] = index;
-      load->const_index[1] = 1;
       if (dim) {
          if (dimind) {
             load->src[srcn] =
@@ -1027,7 +1053,7 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
    struct tgsi_full_instruction *tgsi_inst = &c->token->FullInstruction;
    nir_tex_instr *instr;
    nir_texop op;
-   unsigned num_srcs, samp = 1, i;
+   unsigned num_srcs, samp = 1, sview, i;
 
    switch (tgsi_inst->Instruction.Opcode) {
    case TGSI_OPCODE_TEX:
@@ -1106,6 +1132,18 @@ ttn_tex(struct ttn_compile *c, nir_alu_dest dest, nir_ssa_def **src)
    assert(tgsi_inst->Src[samp].Register.File == TGSI_FILE_SAMPLER);
    instr->sampler_index = tgsi_inst->Src[samp].Register.Index;
 
+   /* TODO: if we supported any opcodes which take an explicit SVIEW
+    * src, we would use that here instead.  But for the "legacy"
+    * texture opcodes the SVIEW index is the same as the SAMP index:
+    */
+   sview = instr->sampler_index;
+
+   if (sview < c->num_samp_types) {
+      instr->dest_type = c->samp_types[sview];
+   } else {
+      instr->dest_type = nir_type_float;
+   }
+
    unsigned src_number = 0;
 
    instr->src[src_number].src =
@@ -1286,6 +1324,7 @@ static const nir_op op_trans[TGSI_OPCODE_LAST] = {
    [TGSI_OPCODE_SEQ] = nir_op_seq,
    [TGSI_OPCODE_SGT] = 0,
    [TGSI_OPCODE_SIN] = nir_op_fsin,
+   [TGSI_OPCODE_SNE] = nir_op_sne,
    [TGSI_OPCODE_SLE] = 0,
    [TGSI_OPCODE_TEX] = 0,
    [TGSI_OPCODE_TXD] = 0,
@@ -1625,7 +1664,6 @@ ttn_emit_instruction(struct ttn_compile *c)
    }
 
    if (tgsi_inst->Instruction.Saturate) {
-      assert(tgsi_inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
       assert(!dest.dest.is_ssa);
       ttn_move_dest(b, dest, nir_fsat(b, ttn_src_for_dest(b, &dest)));
    }
@@ -1672,7 +1710,6 @@ ttn_add_output_stores(struct ttn_compile *c)
             nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
          store->num_components = 4;
          store->const_index[0] = var->data.driver_location + i;
-         store->const_index[1] = 1;
          store->src[0].reg.reg = c->output_regs[var->data.driver_location].reg;
          nir_instr_insert_after_cf_list(b->cf_node_list, &store->instr);
       }
@@ -1713,6 +1750,9 @@ tgsi_to_nir(const void *tgsi_tokens,
    c->imm_defs = rzalloc_array(c, nir_ssa_def *,
                                scan.file_max[TGSI_FILE_IMMEDIATE] + 1);
 
+   c->num_samp_types = scan.file_max[TGSI_FILE_SAMPLER_VIEW] + 1;
+   c->samp_types = rzalloc_array(c, nir_alu_type, c->num_samp_types);
+
    c->if_stack = rzalloc_array(c, struct exec_list *,
                                (scan.opcode_count[TGSI_OPCODE_IF] +
                                 scan.opcode_count[TGSI_OPCODE_UIF]) * 2);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 3bd9cd7..fc81e11 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -376,6 +376,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
          /* TODO: remove consequents buffers with the same fence? */
 
          assert(!destroyed);
+         (void) destroyed; /* silence unused var warning for non-debug build */
 
          fenced_buf->flags &= ~PB_USAGE_GPU_READ_WRITE;
 
diff --git a/src/gallium/auxiliary/postprocess/postprocess.h b/src/gallium/auxiliary/postprocess/postprocess.h
index c72f2c4..9b9f981 100644
--- a/src/gallium/auxiliary/postprocess/postprocess.h
+++ b/src/gallium/auxiliary/postprocess/postprocess.h
@@ -30,6 +30,10 @@
 
 #include "pipe/p_state.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct cso_context;
 
 struct pp_queue_t;              /* Forward definition */
@@ -85,4 +89,9 @@ void pp_celshade_free(struct pp_queue_t *, unsigned int);
 void pp_nocolor_free(struct pp_queue_t *, unsigned int);
 void pp_jimenezmlaa_free(struct pp_queue_t *, unsigned int);
 
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/gallium/auxiliary/postprocess/pp_run.c b/src/gallium/auxiliary/postprocess/pp_run.c
index 06281c8..e76ce85 100644
--- a/src/gallium/auxiliary/postprocess/pp_run.c
+++ b/src/gallium/auxiliary/postprocess/pp_run.c
@@ -119,6 +119,8 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    cso_save_depth_stencil_alpha(cso);
    cso_save_fragment_shader(cso);
    cso_save_framebuffer(cso);
+   cso_save_tessctrl_shader(cso);
+   cso_save_tesseval_shader(cso);
    cso_save_geometry_shader(cso);
    cso_save_rasterizer(cso);
    cso_save_sample_mask(cso);
@@ -139,6 +141,8 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    cso_set_sample_mask(cso, ~0);
    cso_set_min_samples(cso, 1);
    cso_set_stream_outputs(cso, 0, NULL, NULL);
+   cso_set_tessctrl_shader_handle(cso, NULL);
+   cso_set_tesseval_shader_handle(cso, NULL);
    cso_set_geometry_shader_handle(cso, NULL);
    cso_set_render_condition(cso, NULL, FALSE, 0);
 
@@ -186,6 +190,8 @@ pp_run(struct pp_queue_t *ppq, struct pipe_resource *in,
    cso_restore_depth_stencil_alpha(cso);
    cso_restore_fragment_shader(cso);
    cso_restore_framebuffer(cso);
+   cso_restore_tessctrl_shader(cso);
+   cso_restore_tesseval_shader(cso);
    cso_restore_geometry_shader(cso);
    cso_restore_rasterizer(cso);
    cso_restore_sample_mask(cso);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index 8c3dbef..f7e605e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -49,7 +49,7 @@
 #include <windows.h>
 #endif
 
-#if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS) || defined(PIPE_OS_HAIKU) || defined(PIPE_OS_CYGWIN)
+#if defined(PIPE_OS_UNIX)
 
 
 /*
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 39a4296..fdb7feb 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -610,7 +610,7 @@ tgsi_default_instruction( void )
    instruction.Type = TGSI_TOKEN_TYPE_INSTRUCTION;
    instruction.NrTokens = 0;
    instruction.Opcode = TGSI_OPCODE_MOV;
-   instruction.Saturate = TGSI_SAT_NONE;
+   instruction.Saturate = 0;
    instruction.Predicate = 0;
    instruction.NumDstRegs = 1;
    instruction.NumSrcRegs = 1;
@@ -632,7 +632,7 @@ tgsi_build_instruction(unsigned opcode,
    struct tgsi_instruction instruction;
 
    assert (opcode <= TGSI_OPCODE_LAST);
-   assert (saturate <= TGSI_SAT_MINUS_PLUS_ONE);
+   assert (saturate <= 1);
    assert (num_dst_regs <= 3);
    assert (num_src_regs <= 15);
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index 13d6769..c80d7a2 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -271,14 +271,30 @@ iter_declaration(
    struct tgsi_full_declaration *decl )
 {
    struct dump_ctx *ctx = (struct dump_ctx *)iter;
+   boolean patch = decl->Semantic.Name == TGSI_SEMANTIC_PATCH ||
+      decl->Semantic.Name == TGSI_SEMANTIC_TESSINNER ||
+      decl->Semantic.Name == TGSI_SEMANTIC_TESSOUTER ||
+      decl->Semantic.Name == TGSI_SEMANTIC_PRIMID;
 
    TXT( "DCL " );
 
    TXT(tgsi_file_name(decl->Declaration.File));
 
-   /* all geometry shader inputs are two dimensional */
+   /* all geometry shader inputs and non-patch tessellation shader inputs are
+    * two dimensional
+    */
    if (decl->Declaration.File == TGSI_FILE_INPUT &&
-       iter->processor.Processor == TGSI_PROCESSOR_GEOMETRY) {
+       (iter->processor.Processor == TGSI_PROCESSOR_GEOMETRY ||
+        (!patch &&
+         (iter->processor.Processor == TGSI_PROCESSOR_TESS_CTRL ||
+          iter->processor.Processor == TGSI_PROCESSOR_TESS_EVAL)))) {
+      TXT("[]");
+   }
+
+   /* all non-patch tess ctrl shader outputs are two dimensional */
+   if (decl->Declaration.File == TGSI_FILE_OUTPUT &&
+       !patch &&
+       iter->processor.Processor == TGSI_PROCESSOR_TESS_CTRL) {
       TXT("[]");
    }
 
@@ -523,17 +539,8 @@ iter_instruction(
 
    TXT( info->mnemonic );
 
-   switch (inst->Instruction.Saturate) {
-   case TGSI_SAT_NONE:
-      break;
-   case TGSI_SAT_ZERO_ONE:
+   if (inst->Instruction.Saturate) {
       TXT( "_SAT" );
-      break;
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      TXT( "_SATNV" );
-      break;
-   default:
-      assert( 0 );
    }
 
    for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index d9e4050..44000ff 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1765,14 +1765,12 @@ store_dest(struct tgsi_exec_machine *mach,
    if (!dst)
       return;
 
-   switch (inst->Instruction.Saturate) {
-   case TGSI_SAT_NONE:
+   if (!inst->Instruction.Saturate) {
       for (i = 0; i < TGSI_QUAD_SIZE; i++)
          if (execmask & (1 << i))
             dst->i[i] = chan->i[i];
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
+   }
+   else {
       for (i = 0; i < TGSI_QUAD_SIZE; i++)
          if (execmask & (1 << i)) {
             if (chan->f[i] < 0.0f)
@@ -1782,22 +1780,6 @@ store_dest(struct tgsi_exec_machine *mach,
             else
                dst->i[i] = chan->i[i];
          }
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      for (i = 0; i < TGSI_QUAD_SIZE; i++)
-         if (execmask & (1 << i)) {
-            if (chan->f[i] < -1.0f)
-               dst->f[i] = -1.0f;
-            else if (chan->f[i] > 1.0f)
-               dst->f[i] = 1.0f;
-            else
-               dst->i[i] = chan->i[i];
-         }
-      break;
-
-   default:
-      assert( 0 );
    }
 }
 
@@ -1952,7 +1934,7 @@ fetch_texel( struct tgsi_sampler *sampler,
 #define TEX_MODIFIER_LOD_BIAS       2
 #define TEX_MODIFIER_EXPLICIT_LOD   3
 #define TEX_MODIFIER_LEVEL_ZERO     4
-
+#define TEX_MODIFIER_GATHER         5
 
 /*
  * Fetch all 3 (for s,t,r coords) texel offsets, put them into int array.
@@ -2006,6 +1988,54 @@ fetch_assign_deriv_channel(struct tgsi_exec_machine *mach,
    derivs[1][3] = d.f[3];
 }
 
+/**
+ * Return the sampler unit addressed by source operand \p sampler of \p inst.
+ *
+ * A direct operand yields Register.Index.  For an indirect operand the
+ * address register is fetched for the whole quad and the offset of the first
+ * channel enabled in the execution mask is added to Register.Index.  Only one
+ * unit is returned per quad, so the remaining channels are not consulted.  If
+ * no channel is enabled, Register.Index is returned unchanged.
+ */
+static uint
+fetch_sampler_unit(struct tgsi_exec_machine *mach,
+                   const struct tgsi_full_instruction *inst,
+                   uint sampler)
+{
+   const struct tgsi_full_src_register *reg = &inst->Src[sampler];
+   uint unit = reg->Register.Index;
+
+   if (reg->Register.Indirect) {
+      union tgsi_exec_channel indir_index, index2;
+      const uint execmask = mach->ExecMask;
+      uint i;
+
+      index2.i[0] =
+      index2.i[1] =
+      index2.i[2] =
+      index2.i[3] = reg->Indirect.Index;
+
+      fetch_src_file_channel(mach,
+                             0,
+                             reg->Indirect.File,
+                             reg->Indirect.Swizzle,
+                             &index2,
+                             &ZeroVec,
+                             &indir_index);
+
+      /* Register stores are masked by the exec mask, so a disabled channel
+       * (including channel 0) may hold a stale address.  Use the offset of
+       * the first channel that is actually executing.
+       */
+      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+         if (execmask & (1 << i)) {
+            unit += indir_index.i[i];
+            break;
+         }
+      }
+   }
+
+   return unit;
+}
 
 /*
  * execute a texture instruction.
@@ -2019,14 +2030,15 @@ exec_tex(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst,
          uint modifier, uint sampler)
 {
-   const uint unit = inst->Src[sampler].Register.Index;
    const union tgsi_exec_channel *args[5], *proj = NULL;
    union tgsi_exec_channel r[5];
    enum tgsi_sampler_control control =  tgsi_sampler_lod_none;
    uint chan;
+   uint unit;
    int8_t offsets[3];
    int dim, shadow_ref, i;
 
+   unit = fetch_sampler_unit(mach, inst, sampler);
    /* always fetch all 3 offsets, overkill but keeps code simple */
    fetch_texel_offsets(mach, inst, offsets);
 
@@ -2069,6 +2081,8 @@ exec_tex(struct tgsi_exec_machine *mach,
          control = tgsi_sampler_lod_explicit;
       else if (modifier == TEX_MODIFIER_LOD_BIAS)
          control = tgsi_sampler_lod_bias;
+      else if (modifier == TEX_MODIFIER_GATHER)
+         control = tgsi_sampler_gather;
    }
    else {
       for (i = dim; i < Elements(args); i++)
@@ -2123,12 +2137,13 @@ static void
 exec_txd(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst)
 {
-   const uint unit = inst->Src[3].Register.Index;
    union tgsi_exec_channel r[4];
    float derivs[3][2][TGSI_QUAD_SIZE];
    uint chan;
+   uint unit;
    int8_t offsets[3];
 
+   unit = fetch_sampler_unit(mach, inst, 3);
    /* always fetch all 3 offsets, overkill but keeps code simple */
    fetch_texel_offsets(mach, inst, offsets);
 
@@ -2230,14 +2245,15 @@ static void
 exec_txf(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst)
 {
-   const uint unit = inst->Src[1].Register.Index;
    union tgsi_exec_channel r[4];
    uint chan;
+   uint unit;
    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
    int j;
    int8_t offsets[3];
    unsigned target;
 
+   unit = fetch_sampler_unit(mach, inst, 1);
    /* always fetch all 3 offsets, overkill but keeps code simple */
    fetch_texel_offsets(mach, inst, offsets);
 
@@ -2312,12 +2328,14 @@ static void
 exec_txq(struct tgsi_exec_machine *mach,
          const struct tgsi_full_instruction *inst)
 {
-   const uint unit = inst->Src[1].Register.Index;
    int result[4];
    union tgsi_exec_channel r[4], src;
    uint chan;
+   uint unit;
    int i,j;
 
+   unit = fetch_sampler_unit(mach, inst, 1);
+
    fetch_source(mach, &src, &inst->Src[0], TGSI_CHAN_X, TGSI_EXEC_DATA_INT);
 
    /* XXX: This interface can't return per-pixel values */
@@ -3315,16 +3333,14 @@ store_double_channel(struct tgsi_exec_machine *mach,
    union tgsi_double_channel temp;
    const uint execmask = mach->ExecMask;
 
-   switch (inst->Instruction.Saturate) {
-   case TGSI_SAT_NONE:
+   if (!inst->Instruction.Saturate) {
       for (i = 0; i < TGSI_QUAD_SIZE; i++)
          if (execmask & (1 << i)) {
             dst[0].u[i] = chan->u[i][0];
             dst[1].u[i] = chan->u[i][1];
          }
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
+   }
+   else {
       for (i = 0; i < TGSI_QUAD_SIZE; i++)
          if (execmask & (1 << i)) {
             if (chan->d[i] < 0.0)
@@ -3337,25 +3353,6 @@ store_double_channel(struct tgsi_exec_machine *mach,
             dst[0].u[i] = temp.u[i][0];
             dst[1].u[i] = temp.u[i][1];
          }
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      for (i = 0; i < TGSI_QUAD_SIZE; i++)
-         if (execmask & (1 << i)) {
-            if (chan->d[i] < -1.0)
-               temp.d[i] = -1.0;
-            else if (chan->d[i] > 1.0)
-               temp.d[i] = 1.0;
-            else
-               temp.d[i] = chan->d[i];
-
-            dst[0].u[i] = temp.u[i][0];
-            dst[1].u[i] = temp.u[i][1];
-         }
-      break;
-
-   default:
-      assert( 0 );
    }
 
    store_dest_double(mach, &dst[0], reg, inst, chan_0, TGSI_EXEC_DATA_UINT);
@@ -4374,6 +4371,13 @@ exec_instruction(
       exec_tex(mach, inst, TEX_MODIFIER_PROJECTED, 1);
       break;
 
+   case TGSI_OPCODE_TG4:
+      /* src[0] = texcoord */
+      /* src[1] = component */
+      /* src[2] = sampler unit */
+      exec_tex(mach, inst, TEX_MODIFIER_GATHER, 2);
+      break;
+
    case TGSI_OPCODE_UP2H:
       assert (0);
       break;
@@ -4431,8 +4435,12 @@ exec_instruction(
          mach->BreakStack[mach->BreakStackTop++] = mach->BreakType;
          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
 
-         /* Finally, jump to the subroutine */
+         /* Finally, jump to the subroutine.  The label is a pointer
+          * (an instruction number) to the BGNSUB instruction.
+          */
          *pc = inst->Label.Label;
+         assert(mach->Instructions[*pc].Instruction.Opcode
+                == TGSI_OPCODE_BGNSUB);
       }
       break;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 0e59b88..208640c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -93,7 +93,8 @@ enum tgsi_sampler_control {
    tgsi_sampler_lod_bias,
    tgsi_sampler_lod_explicit,
    tgsi_sampler_lod_zero,
-   tgsi_sampler_derivs_explicit
+   tgsi_sampler_derivs_explicit,
+   tgsi_sampler_gather,
 };
 
 /**
@@ -457,6 +458,7 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
       return 1;
    case PIPE_SHADER_CAP_DOUBLES:
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 1;
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index 3cab86e..9295311 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -302,6 +302,10 @@ tgsi_get_processor_name( uint processor )
       return "fragment shader";
    case TGSI_PROCESSOR_GEOMETRY:
       return "geometry shader";
+   case TGSI_PROCESSOR_TESS_CTRL:
+      return "tessellation control shader";
+   case TGSI_PROCESSOR_TESS_EVAL:
+      return "tessellation evaluation shader";
    default:
       return "unknown shader type!";
    }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_lowering.c b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
index 4954c11..a3b90bd 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_lowering.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_lowering.c
@@ -1133,8 +1133,7 @@ transform_samp(struct tgsi_transform_context *tctx,
 
    /* MOV_SAT tmpA.<mask>, tmpA */
    if (mask) {
-      create_mov(tctx, &ctx->tmp[A].dst, &ctx->tmp[A].src, mask,
-                 TGSI_SAT_ZERO_ONE);
+      create_mov(tctx, &ctx->tmp[A].dst, &ctx->tmp[A].src, mask, 1);
    }
 
    /* modify the texture samp instruction to take fixed up coord: */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index fbfe652..be4851f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -58,6 +58,7 @@ struct sanity_check_ctx
    uint errors;
    uint warnings;
    uint implied_array_size;
+   uint implied_out_array_size;
 
    boolean print;
 };
@@ -406,16 +407,30 @@ iter_declaration(
    if (!check_file_name( ctx, file ))
       return TRUE;
    for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-      /* declared TGSI_FILE_INPUT's for geometry processor
+      /* declared TGSI_FILE_INPUT's for geometry and tessellation
        * have an implied second dimension */
-      if (file == TGSI_FILE_INPUT &&
-          ctx->iter.processor.Processor == TGSI_PROCESSOR_GEOMETRY) {
+      uint processor = ctx->iter.processor.Processor;
+      uint patch = decl->Semantic.Name == TGSI_SEMANTIC_PATCH ||
+         decl->Semantic.Name == TGSI_SEMANTIC_TESSOUTER ||
+         decl->Semantic.Name == TGSI_SEMANTIC_TESSINNER;
+      if (file == TGSI_FILE_INPUT && !patch && (
+                processor == TGSI_PROCESSOR_GEOMETRY ||
+                processor == TGSI_PROCESSOR_TESS_CTRL ||
+                processor == TGSI_PROCESSOR_TESS_EVAL)) {
          uint vert;
          for (vert = 0; vert < ctx->implied_array_size; ++vert) {
             scan_register *reg = MALLOC(sizeof(scan_register));
             fill_scan_register2d(reg, file, i, vert);
             check_and_declare(ctx, reg);
          }
+      } else if (file == TGSI_FILE_OUTPUT && !patch &&
+                 processor == TGSI_PROCESSOR_TESS_CTRL) {
+         uint vert;
+         for (vert = 0; vert < ctx->implied_out_array_size; ++vert) {
+            scan_register *reg = MALLOC(sizeof(scan_register));
+            fill_scan_register2d(reg, file, i, vert);
+            check_and_declare(ctx, reg);
+         }
       } else {
          scan_register *reg = MALLOC(sizeof(scan_register));
          if (decl->Declaration.Dimension) {
@@ -474,6 +489,19 @@ iter_property(
        prop->Property.PropertyName == TGSI_PROPERTY_GS_INPUT_PRIM) {
       ctx->implied_array_size = u_vertices_per_prim(prop->u[0].Data);
    }
+   if (iter->processor.Processor == TGSI_PROCESSOR_TESS_CTRL &&
+       prop->Property.PropertyName == TGSI_PROPERTY_TCS_VERTICES_OUT)
+      ctx->implied_out_array_size = prop->u[0].Data;
+   return TRUE;
+}
+
+static boolean
+prolog(struct tgsi_iterate_context *iter)
+{
+   struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
+   if (iter->processor.Processor == TGSI_PROCESSOR_TESS_CTRL ||
+       iter->processor.Processor == TGSI_PROCESSOR_TESS_EVAL)
+      ctx->implied_array_size = 32;
    return TRUE;
 }
 
@@ -532,7 +560,7 @@ tgsi_sanity_check(
 {
    struct sanity_check_ctx ctx;
 
-   ctx.iter.prolog = NULL;
+   ctx.iter.prolog = prolog;
    ctx.iter.iterate_instruction = iter_instruction;
    ctx.iter.iterate_declaration = iter_declaration;
    ctx.iter.iterate_immediate = iter_immediate;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index e6011d2..7523baf 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -62,6 +62,7 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
       info->file_max[i] = -1;
    for (i = 0; i < Elements(info->const_file_max); i++)
       info->const_file_max[i] = -1;
+   info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = 1;
 
    /**
     ** Setup to begin parsing input shader
@@ -74,6 +75,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
    assert(procType == TGSI_PROCESSOR_FRAGMENT ||
           procType == TGSI_PROCESSOR_VERTEX ||
           procType == TGSI_PROCESSOR_GEOMETRY ||
+          procType == TGSI_PROCESSOR_TESS_CTRL ||
+          procType == TGSI_PROCESSOR_TESS_EVAL ||
           procType == TGSI_PROCESSOR_COMPUTE);
    info->processor = procType;
 
@@ -165,13 +168,31 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                = &parse.FullToken.FullDeclaration;
             const uint file = fulldecl->Declaration.File;
             uint reg;
-            if (fulldecl->Declaration.Array)
-               info->array_max[file] = MAX2(info->array_max[file], fulldecl->Array.ArrayID);
+
+            if (fulldecl->Declaration.Array) {
+               unsigned array_id = fulldecl->Array.ArrayID;
+
+               switch (file) {
+               case TGSI_FILE_INPUT:
+                  assert(array_id < ARRAY_SIZE(info->input_array_first));
+                  info->input_array_first[array_id] = fulldecl->Range.First;
+                  info->input_array_last[array_id] = fulldecl->Range.Last;
+                  break;
+               case TGSI_FILE_OUTPUT:
+                  assert(array_id < ARRAY_SIZE(info->output_array_first));
+                  info->output_array_first[array_id] = fulldecl->Range.First;
+                  info->output_array_last[array_id] = fulldecl->Range.Last;
+                  break;
+               }
+               info->array_max[file] = MAX2(info->array_max[file], array_id);
+            }
+
             for (reg = fulldecl->Range.First;
                  reg <= fulldecl->Range.Last;
                  reg++) {
                unsigned semName = fulldecl->Semantic.Name;
-               unsigned semIndex = fulldecl->Semantic.Index;
+               unsigned semIndex =
+                  fulldecl->Semantic.Index + (reg - fulldecl->Range.First);
 
                /* only first 32 regs will appear in this bitfield */
                info->file_mask[file] |= (1 << reg);
@@ -228,6 +249,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                   }
                   else if (semName == TGSI_SEMANTIC_PRIMID) {
                      info->uses_primid = TRUE;
+                  } else if (semName == TGSI_SEMANTIC_INVOCATIONID) {
+                     info->uses_invocationid = TRUE;
                   }
                }
                else if (file == TGSI_FILE_OUTPUT) {
@@ -236,7 +259,9 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
                   info->num_outputs++;
 
                   if (procType == TGSI_PROCESSOR_VERTEX ||
-                      procType == TGSI_PROCESSOR_GEOMETRY) {
+                      procType == TGSI_PROCESSOR_GEOMETRY ||
+                      procType == TGSI_PROCESSOR_TESS_CTRL ||
+                      procType == TGSI_PROCESSOR_TESS_EVAL) {
                      if (semName == TGSI_SEMANTIC_CLIPDIST) {
                         info->num_written_clipdistance +=
                            util_bitcount(fulldecl->Declaration.UsageMask);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 0ea0e88..b81bdd7 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -65,6 +65,10 @@ struct tgsi_shader_info
    int file_max[TGSI_FILE_COUNT];  /**< highest index of declared registers */
    int const_file_max[PIPE_MAX_CONSTANT_BUFFERS];
 
+   ubyte input_array_first[PIPE_MAX_SHADER_INPUTS];
+   ubyte input_array_last[PIPE_MAX_SHADER_INPUTS];
+   ubyte output_array_first[PIPE_MAX_SHADER_OUTPUTS];
+   ubyte output_array_last[PIPE_MAX_SHADER_OUTPUTS];
    unsigned array_max[TGSI_FILE_COUNT];  /**< highest index array per register file */
 
    uint immediate_count; /**< number of immediates declared */
@@ -85,6 +89,7 @@ struct tgsi_shader_info
    boolean uses_basevertex;
    boolean uses_primid;
    boolean uses_frontface;
+   boolean uses_invocationid;
    boolean writes_psize;
    boolean writes_clipvertex;
    boolean writes_viewport_index;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c
index 9b727cf..6b6a14f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c
@@ -32,11 +32,13 @@
 #include "tgsi_strings.h"
 
 
-const char *tgsi_processor_type_names[4] =
+const char *tgsi_processor_type_names[6] =
 {
    "FRAG",
    "VERT",
    "GEOM",
+   "TESS_CTRL",
+   "TESS_EVAL",
    "COMP"
 };
 
@@ -88,6 +90,11 @@ const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT] =
    "INVOCATIONID",
    "VERTEXID_NOBASE",
    "BASEVERTEX",
+   "PATCH",
+   "TESSCOORD",
+   "TESSOUTER",
+   "TESSINNER",
+   "VERTICESIN",
 };
 
 const char *tgsi_texture_names[TGSI_TEXTURE_COUNT] =
@@ -124,7 +131,12 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] =
    "FS_DEPTH_LAYOUT",
    "VS_PROHIBIT_UCPS",
    "GS_INVOCATIONS",
-   "VS_WINDOW_SPACE_POSITION"
+   "VS_WINDOW_SPACE_POSITION",
+   "TCS_VERTICES_OUT",
+   "TES_PRIM_MODE",
+   "TES_SPACING",
+   "TES_VERTEX_ORDER_CW",
+   "TES_POINT_MODE",
 };
 
 const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] =
@@ -166,7 +178,8 @@ const char *tgsi_primitive_names[PIPE_PRIM_MAX] =
    "LINES_ADJACENCY",
    "LINE_STRIP_ADJACENCY",
    "TRIANGLES_ADJACENCY",
-   "TRIANGLE_STRIP_ADJACENCY"
+   "TRIANGLE_STRIP_ADJACENCY",
+   "PATCHES",
 };
 
 const char *tgsi_fs_coord_origin_names[2] =
diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.h b/src/gallium/auxiliary/tgsi/tgsi_strings.h
index 90014a2..71e7437 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_strings.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_strings.h
@@ -38,7 +38,7 @@ extern "C" {
 #endif
 
 
-extern const char *tgsi_processor_type_names[4];
+extern const char *tgsi_processor_type_names[6];
 
 extern const char *tgsi_semantic_names[TGSI_SEMANTIC_COUNT];
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c
index b6b3585..a6675c5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_text.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_text.c
@@ -297,6 +297,10 @@ static boolean parse_header( struct translate_ctx *ctx )
       processor = TGSI_PROCESSOR_VERTEX;
    else if (str_match_nocase_whole( &ctx->cur, "GEOM" ))
       processor = TGSI_PROCESSOR_GEOMETRY;
+   else if (str_match_nocase_whole( &ctx->cur, "TESS_CTRL" ))
+      processor = TGSI_PROCESSOR_TESS_CTRL;
+   else if (str_match_nocase_whole( &ctx->cur, "TESS_EVAL" ))
+      processor = TGSI_PROCESSOR_TESS_EVAL;
    else if (str_match_nocase_whole( &ctx->cur, "COMP" ))
       processor = TGSI_PROCESSOR_COMPUTE;
    else {
@@ -903,7 +907,7 @@ match_inst(const char **pcur,
    /* simple case: the whole string matches the instruction name */
    if (str_match_nocase_whole(&cur, info->mnemonic)) {
       *pcur = cur;
-      *saturate = TGSI_SAT_NONE;
+      *saturate = 0;
       return TRUE;
    }
 
@@ -911,13 +915,7 @@ match_inst(const char **pcur,
       /* the instruction has a suffix, figure it out */
       if (str_match_nocase_whole(&cur, "_SAT")) {
          *pcur = cur;
-         *saturate = TGSI_SAT_ZERO_ONE;
-         return TRUE;
-      }
-
-      if (str_match_nocase_whole(&cur, "_SATNV")) {
-         *pcur = cur;
-         *saturate = TGSI_SAT_MINUS_PLUS_ONE;
+         *saturate = 1;
          return TRUE;
       }
    }
@@ -931,7 +929,7 @@ parse_instruction(
    boolean has_label )
 {
    uint i;
-   uint saturate = TGSI_SAT_NONE;
+   uint saturate = 0;
    const struct tgsi_opcode_info *info;
    struct tgsi_full_instruction inst;
    const char *cur;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_transform.h b/src/gallium/auxiliary/tgsi/tgsi_transform.h
index 921aa90..39d7688 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_transform.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_transform.h
@@ -143,6 +143,27 @@ tgsi_transform_sampler_decl(struct tgsi_transform_context *ctx,
    ctx->emit_declaration(ctx, &decl);
 }
 
+static INLINE void
+tgsi_transform_sampler_view_decl(struct tgsi_transform_context *ctx,
+                                 unsigned index,
+                                 unsigned target,
+                                 enum tgsi_return_type type)
+{
+   struct tgsi_full_declaration decl;
+
+   decl = tgsi_default_full_declaration();
+   decl.Declaration.File = TGSI_FILE_SAMPLER_VIEW;
+   decl.Declaration.UsageMask = 0xf;
+   decl.Range.First =
+   decl.Range.Last = index;
+   decl.SamplerView.Resource = target;
+   decl.SamplerView.ReturnTypeX = type;
+   decl.SamplerView.ReturnTypeY = type;
+   decl.SamplerView.ReturnTypeZ = type;
+   decl.SamplerView.ReturnTypeW = type;
+
+   ctx->emit_declaration(ctx, &decl);
+}
 
 static INLINE void
 tgsi_transform_immediate_decl(struct tgsi_transform_context *ctx,
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
index a4c0fc5..201a849 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c
@@ -26,6 +26,7 @@
  **************************************************************************/
 
 
+#include "pipe/p_screen.h"
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_ureg.h"
@@ -73,7 +74,7 @@ struct ureg_tokens {
    unsigned count;
 };
 
-#define UREG_MAX_INPUT PIPE_MAX_ATTRIBS
+#define UREG_MAX_INPUT PIPE_MAX_SHADER_INPUTS
 #define UREG_MAX_SYSTEM_VALUE PIPE_MAX_ATTRIBS
 #define UREG_MAX_OUTPUT PIPE_MAX_SHADER_OUTPUTS
 #define UREG_MAX_CONSTANT_RANGE 32
@@ -96,7 +97,7 @@ struct const_decl {
 struct ureg_program
 {
    unsigned processor;
-   struct pipe_context *pipe;
+   bool supports_any_inout_decl_range;
 
    struct {
       unsigned semantic_name;
@@ -104,17 +105,13 @@ struct ureg_program
       unsigned interp;
       unsigned char cylindrical_wrap;
       unsigned interp_location;
-   } fs_input[UREG_MAX_INPUT];
-   unsigned nr_fs_inputs;
+      unsigned first;
+      unsigned last;
+      unsigned array_id;
+   } input[UREG_MAX_INPUT];
+   unsigned nr_inputs, nr_input_regs;
 
-   unsigned vs_inputs[UREG_MAX_INPUT/32];
-
-   struct {
-      unsigned index;
-      unsigned semantic_name;
-      unsigned semantic_index;
-   } gs_input[UREG_MAX_INPUT];
-   unsigned nr_gs_inputs;
+   unsigned vs_inputs[PIPE_MAX_ATTRIBS/32];
 
    struct {
       unsigned index;
@@ -127,8 +124,11 @@ struct ureg_program
       unsigned semantic_name;
       unsigned semantic_index;
       unsigned usage_mask; /* = TGSI_WRITEMASK_* */
+      unsigned first;
+      unsigned last;
+      unsigned array_id;
    } output[UREG_MAX_OUTPUT];
-   unsigned nr_outputs;
+   unsigned nr_outputs, nr_output_regs;
 
    struct {
       union {
@@ -254,30 +254,49 @@ ureg_DECL_fs_input_cyl_centroid(struct ureg_program *ureg,
                        unsigned semantic_index,
                        unsigned interp_mode,
                        unsigned cylindrical_wrap,
-                       unsigned interp_location)
+                       unsigned interp_location,
+                       unsigned array_id,
+                       unsigned array_size)
 {
    unsigned i;
 
-   for (i = 0; i < ureg->nr_fs_inputs; i++) {
-      if (ureg->fs_input[i].semantic_name == semantic_name &&
-          ureg->fs_input[i].semantic_index == semantic_index) {
+   for (i = 0; i < ureg->nr_inputs; i++) {
+      if (ureg->input[i].semantic_name == semantic_name &&
+          ureg->input[i].semantic_index == semantic_index) {
+         assert(ureg->input[i].interp == interp_mode);
+         assert(ureg->input[i].cylindrical_wrap == cylindrical_wrap);
+         assert(ureg->input[i].interp_location == interp_location);
+         assert(ureg->input[i].array_id == array_id);
          goto out;
       }
    }
 
-   if (ureg->nr_fs_inputs < UREG_MAX_INPUT) {
-      ureg->fs_input[i].semantic_name = semantic_name;
-      ureg->fs_input[i].semantic_index = semantic_index;
-      ureg->fs_input[i].interp = interp_mode;
-      ureg->fs_input[i].cylindrical_wrap = cylindrical_wrap;
-      ureg->fs_input[i].interp_location = interp_location;
-      ureg->nr_fs_inputs++;
+   if (ureg->nr_inputs < UREG_MAX_INPUT) {
+      assert(array_size >= 1);
+      ureg->input[i].semantic_name = semantic_name;
+      ureg->input[i].semantic_index = semantic_index;
+      ureg->input[i].interp = interp_mode;
+      ureg->input[i].cylindrical_wrap = cylindrical_wrap;
+      ureg->input[i].interp_location = interp_location;
+      ureg->input[i].first = ureg->nr_input_regs;
+      ureg->input[i].last = ureg->nr_input_regs + array_size - 1;
+      ureg->input[i].array_id = array_id;
+      ureg->nr_input_regs += array_size;
+      ureg->nr_inputs++;
    } else {
       set_bad(ureg);
    }
 
 out:
-   return ureg_src_register(TGSI_FILE_INPUT, i);
+   /* When the table is full the program has been marked bad and i equals
+    * UREG_MAX_INPUT, so input[i] must not be dereferenced.
+    */
+   if (i >= UREG_MAX_INPUT)
+      return ureg_src_array_register(TGSI_FILE_INPUT, ureg->nr_input_regs,
+                                     array_id);
+
+   return ureg_src_array_register(TGSI_FILE_INPUT, ureg->input[i].first,
+                                  array_id);
 }
 
 
@@ -286,29 +298,22 @@ ureg_DECL_vs_input( struct ureg_program *ureg,
                     unsigned index )
 {
    assert(ureg->processor == TGSI_PROCESSOR_VERTEX);
-   
+   assert(index / 32 < ARRAY_SIZE(ureg->vs_inputs));
+
    ureg->vs_inputs[index/32] |= 1 << (index % 32);
    return ureg_src_register( TGSI_FILE_INPUT, index );
 }
 
 
 struct ureg_src
-ureg_DECL_gs_input(struct ureg_program *ureg,
-                   unsigned index,
-                   unsigned semantic_name,
-                   unsigned semantic_index)
-{
-   if (ureg->nr_gs_inputs < UREG_MAX_INPUT) {
-      ureg->gs_input[ureg->nr_gs_inputs].index = index;
-      ureg->gs_input[ureg->nr_gs_inputs].semantic_name = semantic_name;
-      ureg->gs_input[ureg->nr_gs_inputs].semantic_index = semantic_index;
-      ureg->nr_gs_inputs++;
-   } else {
-      set_bad(ureg);
-   }
-
-   /* XXX: Add suport for true 2D input registers. */
-   return ureg_src_register(TGSI_FILE_INPUT, index);
+ureg_DECL_input(struct ureg_program *ureg,
+                unsigned semantic_name,
+                unsigned semantic_index,
+                unsigned array_id,
+                unsigned array_size)
+{
+   return ureg_DECL_fs_input_cyl_centroid(ureg, semantic_name, semantic_index,
+                                          0, 0, 0, array_id, array_size);
 }
 
 
@@ -332,10 +337,12 @@ ureg_DECL_system_value(struct ureg_program *ureg,
 
 
 struct ureg_dst 
-ureg_DECL_output_masked( struct ureg_program *ureg,
-                         unsigned name,
-                         unsigned index,
-                         unsigned usage_mask )
+ureg_DECL_output_masked(struct ureg_program *ureg,
+                        unsigned name,
+                        unsigned index,
+                        unsigned usage_mask,
+                        unsigned array_id,
+                        unsigned array_size)
 {
    unsigned i;
 
@@ -343,7 +350,8 @@ ureg_DECL_output_masked( struct ureg_program *ureg,
 
    for (i = 0; i < ureg->nr_outputs; i++) {
       if (ureg->output[i].semantic_name == name &&
-          ureg->output[i].semantic_index == index) { 
+          ureg->output[i].semantic_index == index) {
+         assert(ureg->output[i].array_id == array_id);
          ureg->output[i].usage_mask |= usage_mask;
          goto out;
       }
@@ -353,6 +361,10 @@ ureg_DECL_output_masked( struct ureg_program *ureg,
       ureg->output[i].semantic_name = name;
       ureg->output[i].semantic_index = index;
       ureg->output[i].usage_mask = usage_mask;
+      ureg->output[i].first = ureg->nr_output_regs;
+      ureg->output[i].last = ureg->nr_output_regs + array_size - 1;
+      ureg->output[i].array_id = array_id;
+      ureg->nr_output_regs += array_size;
       ureg->nr_outputs++;
    }
    else {
@@ -360,16 +372,37 @@ ureg_DECL_output_masked( struct ureg_program *ureg,
    }
 
 out:
-   return ureg_dst_register( TGSI_FILE_OUTPUT, i );
+   /* output[] holds UREG_MAX_OUTPUT entries; i can reach that bound when no
+    * slot was free, in which case output[i] must not be dereferenced.
+    */
+   if (i >= UREG_MAX_OUTPUT)
+      return ureg_dst_array_register(TGSI_FILE_OUTPUT, ureg->nr_output_regs,
+                                     array_id);
+
+   return ureg_dst_array_register(TGSI_FILE_OUTPUT, ureg->output[i].first,
+                                  array_id);
 }
 
 
 struct ureg_dst 
-ureg_DECL_output( struct ureg_program *ureg,
-                  unsigned name,
-                  unsigned index )
+ureg_DECL_output(struct ureg_program *ureg,
+                 unsigned name,
+                 unsigned index)
+{
+   return ureg_DECL_output_masked(ureg, name, index, TGSI_WRITEMASK_XYZW,
+                                  0, 1);
+}
+
+struct ureg_dst
+ureg_DECL_output_array(struct ureg_program *ureg,
+                       unsigned semantic_name,
+                       unsigned semantic_index,
+                       unsigned array_id,
+                       unsigned array_size)
 {
-   return ureg_DECL_output_masked(ureg, name, index, TGSI_WRITEMASK_XYZW);
+   return ureg_DECL_output_masked(ureg, semantic_name, semantic_index,
+                                  TGSI_WRITEMASK_XYZW,
+                                  array_id, array_size);
 }
 
 
@@ -882,7 +908,11 @@ ureg_emit_src( struct ureg_program *ureg,
       out[n].ind.File = src.IndirectFile;
       out[n].ind.Swizzle = src.IndirectSwizzle;
       out[n].ind.Index = src.IndirectIndex;
-      out[n].ind.ArrayID = src.ArrayID;
+      if (!ureg->supports_any_inout_decl_range &&
+          (src.File == TGSI_FILE_INPUT || src.File == TGSI_FILE_OUTPUT))
+         out[n].ind.ArrayID = 0;
+      else
+         out[n].ind.ArrayID = src.ArrayID;
       n++;
    }
 
@@ -898,7 +928,11 @@ ureg_emit_src( struct ureg_program *ureg,
          out[n].ind.File = src.DimIndFile;
          out[n].ind.Swizzle = src.DimIndSwizzle;
          out[n].ind.Index = src.DimIndIndex;
-         out[n].ind.ArrayID = src.ArrayID;
+         if (!ureg->supports_any_inout_decl_range &&
+             (src.File == TGSI_FILE_INPUT || src.File == TGSI_FILE_OUTPUT))
+            out[n].ind.ArrayID = 0;
+         else
+            out[n].ind.ArrayID = src.ArrayID;
       } else {
          out[n].dim.Indirect = 0;
          out[n].dim.Index = src.DimensionIndex;
@@ -914,8 +948,8 @@ void
 ureg_emit_dst( struct ureg_program *ureg,
                struct ureg_dst dst )
 {
-   unsigned size = (1 + 
-                    (dst.Indirect ? 1 : 0));
+   unsigned size = 1 + (dst.Indirect ? 1 : 0) +
+                   (dst.Dimension ? (dst.DimIndirect ? 2 : 1) : 0);
 
    union tgsi_any_token *out = get_tokens( ureg, DOMAIN_INSN, size );
    unsigned n = 0;
@@ -940,7 +974,35 @@ ureg_emit_dst( struct ureg_program *ureg,
       out[n].ind.File = dst.IndirectFile;
       out[n].ind.Swizzle = dst.IndirectSwizzle;
       out[n].ind.Index = dst.IndirectIndex;
-      out[n].ind.ArrayID = dst.ArrayID;
+      if (!ureg->supports_any_inout_decl_range &&
+          (dst.File == TGSI_FILE_INPUT || dst.File == TGSI_FILE_OUTPUT))
+         out[n].ind.ArrayID = 0;
+      else
+         out[n].ind.ArrayID = dst.ArrayID;
+      n++;
+   }
+
+   if (dst.Dimension) {
+      out[0].dst.Dimension = 1;
+      out[n].dim.Dimension = 0;
+      out[n].dim.Padding = 0;
+      if (dst.DimIndirect) {
+         out[n].dim.Indirect = 1;
+         out[n].dim.Index = dst.DimensionIndex;
+         n++;
+         out[n].value = 0;
+         out[n].ind.File = dst.DimIndFile;
+         out[n].ind.Swizzle = dst.DimIndSwizzle;
+         out[n].ind.Index = dst.DimIndIndex;
+         if (!ureg->supports_any_inout_decl_range &&
+             (dst.File == TGSI_FILE_INPUT || dst.File == TGSI_FILE_OUTPUT))
+            out[n].ind.ArrayID = 0;
+         else
+            out[n].ind.ArrayID = dst.ArrayID;
+      } else {
+         out[n].dim.Indirect = 0;
+         out[n].dim.Index = dst.DimensionIndex;
+      }
       n++;
    }
 
@@ -1007,6 +1069,12 @@ ureg_emit_insn(struct ureg_program *ureg,
 }
 
 
+/**
+ * Emit a label token.
+ * \param label_token  receives a token number indicating where the label
+ * needs to be patched later.  At that point, pass this value to the
+ * ureg_fixup_label() function.
+ */
 void
 ureg_emit_label(struct ureg_program *ureg,
                 unsigned extended_token,
@@ -1234,12 +1302,14 @@ ureg_label_insn(struct ureg_program *ureg,
 static void
 emit_decl_semantic(struct ureg_program *ureg,
                    unsigned file,
-                   unsigned index,
+                   unsigned first,
+                   unsigned last,
                    unsigned semantic_name,
                    unsigned semantic_index,
-                   unsigned usage_mask)
+                   unsigned usage_mask,
+                   unsigned array_id)
 {
-   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 3);
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, array_id ? 4 : 3);
 
    out[0].value = 0;
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
@@ -1247,28 +1317,37 @@ emit_decl_semantic(struct ureg_program *ureg,
    out[0].decl.File = file;
    out[0].decl.UsageMask = usage_mask;
    out[0].decl.Semantic = 1;
+   out[0].decl.Array = array_id != 0;
 
    out[1].value = 0;
-   out[1].decl_range.First = index;
-   out[1].decl_range.Last = index;
+   out[1].decl_range.First = first;
+   out[1].decl_range.Last = last;
 
    out[2].value = 0;
    out[2].decl_semantic.Name = semantic_name;
    out[2].decl_semantic.Index = semantic_index;
+
+   if (array_id) {
+      out[3].value = 0;
+      out[3].array.ArrayID = array_id;
+   }
 }
 
 
 static void
 emit_decl_fs(struct ureg_program *ureg,
              unsigned file,
-             unsigned index,
+             unsigned first,
+             unsigned last,
              unsigned semantic_name,
              unsigned semantic_index,
              unsigned interpolate,
              unsigned cylindrical_wrap,
-             unsigned interpolate_location)
+             unsigned interpolate_location,
+             unsigned array_id)
 {
-   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL, 4);
+   union tgsi_any_token *out = get_tokens(ureg, DOMAIN_DECL,
+                                          array_id ? 5 : 4);
 
    out[0].value = 0;
    out[0].decl.Type = TGSI_TOKEN_TYPE_DECLARATION;
@@ -1277,10 +1356,11 @@ emit_decl_fs(struct ureg_program *ureg,
    out[0].decl.UsageMask = TGSI_WRITEMASK_XYZW; /* FIXME! */
    out[0].decl.Interpolate = 1;
    out[0].decl.Semantic = 1;
+   out[0].decl.Array = array_id != 0;
 
    out[1].value = 0;
-   out[1].decl_range.First = index;
-   out[1].decl_range.Last = index;
+   out[1].decl_range.First = first;
+   out[1].decl_range.Last = last;
 
    out[2].value = 0;
    out[2].decl_interp.Interpolate = interpolate;
@@ -1290,6 +1370,11 @@ emit_decl_fs(struct ureg_program *ureg,
    out[3].value = 0;
    out[3].decl_semantic.Name = semantic_name;
    out[3].decl_semantic.Index = semantic_index;
+
+   if (array_id) {
+      out[4].value = 0;
+      out[4].array.ArrayID = array_id;
+   }
 }
 
 static void
@@ -1428,37 +1513,73 @@ emit_property(struct ureg_program *ureg,
 
 static void emit_decls( struct ureg_program *ureg )
 {
-   unsigned i;
+   unsigned i,j;
 
    for (i = 0; i < Elements(ureg->properties); i++)
       if (ureg->properties[i] != ~0)
          emit_property(ureg, i, ureg->properties[i]);
 
    if (ureg->processor == TGSI_PROCESSOR_VERTEX) {
-      for (i = 0; i < UREG_MAX_INPUT; i++) {
+      for (i = 0; i < PIPE_MAX_ATTRIBS; i++) {
          if (ureg->vs_inputs[i/32] & (1 << (i%32))) {
             emit_decl_range( ureg, TGSI_FILE_INPUT, i, 1 );
          }
       }
    } else if (ureg->processor == TGSI_PROCESSOR_FRAGMENT) {
-      for (i = 0; i < ureg->nr_fs_inputs; i++) {
-         emit_decl_fs(ureg,
-                      TGSI_FILE_INPUT,
-                      i,
-                      ureg->fs_input[i].semantic_name,
-                      ureg->fs_input[i].semantic_index,
-                      ureg->fs_input[i].interp,
-                      ureg->fs_input[i].cylindrical_wrap,
-                      ureg->fs_input[i].interp_location);
+      if (ureg->supports_any_inout_decl_range) {
+         for (i = 0; i < ureg->nr_inputs; i++) {
+            emit_decl_fs(ureg,
+                         TGSI_FILE_INPUT,
+                         ureg->input[i].first,
+                         ureg->input[i].last,
+                         ureg->input[i].semantic_name,
+                         ureg->input[i].semantic_index,
+                         ureg->input[i].interp,
+                         ureg->input[i].cylindrical_wrap,
+                         ureg->input[i].interp_location,
+                         ureg->input[i].array_id);
+         }
       }
-   } else {
-      for (i = 0; i < ureg->nr_gs_inputs; i++) {
-         emit_decl_semantic(ureg,
+      else {
+         for (i = 0; i < ureg->nr_inputs; i++) {
+            for (j = ureg->input[i].first; j <= ureg->input[i].last; j++) {
+               emit_decl_fs(ureg,
                             TGSI_FILE_INPUT,
-                            ureg->gs_input[i].index,
-                            ureg->gs_input[i].semantic_name,
-                            ureg->gs_input[i].semantic_index,
-                            TGSI_WRITEMASK_XYZW);
+                            j, j,
+                            ureg->input[i].semantic_name,
+                            ureg->input[i].semantic_index +
+                            (j - ureg->input[i].first),
+                            ureg->input[i].interp,
+                            ureg->input[i].cylindrical_wrap,
+                            ureg->input[i].interp_location, 0);
+            }
+         }
+      }
+   } else {
+      if (ureg->supports_any_inout_decl_range) {
+         for (i = 0; i < ureg->nr_inputs; i++) {
+            emit_decl_semantic(ureg,
+                               TGSI_FILE_INPUT,
+                               ureg->input[i].first,
+                               ureg->input[i].last,
+                               ureg->input[i].semantic_name,
+                               ureg->input[i].semantic_index,
+                               TGSI_WRITEMASK_XYZW,
+                               ureg->input[i].array_id);
+         }
+      }
+      else {
+         for (i = 0; i < ureg->nr_inputs; i++) {
+            for (j = ureg->input[i].first; j <= ureg->input[i].last; j++) {
+               emit_decl_semantic(ureg,
+                                  TGSI_FILE_INPUT,
+                                  j, j,
+                                  ureg->input[i].semantic_name,
+                                  ureg->input[i].semantic_index +
+                                  (j - ureg->input[i].first),
+                                  TGSI_WRITEMASK_XYZW, 0);
+            }
+         }
       }
    }
 
@@ -1466,18 +1587,36 @@ static void emit_decls( struct ureg_program *ureg )
       emit_decl_semantic(ureg,
                          TGSI_FILE_SYSTEM_VALUE,
                          ureg->system_value[i].index,
+                         ureg->system_value[i].index,
                          ureg->system_value[i].semantic_name,
                          ureg->system_value[i].semantic_index,
-                         TGSI_WRITEMASK_XYZW);
+                         TGSI_WRITEMASK_XYZW, 0);
    }
 
-   for (i = 0; i < ureg->nr_outputs; i++) {
-      emit_decl_semantic(ureg,
-                         TGSI_FILE_OUTPUT,
-                         i,
-                         ureg->output[i].semantic_name,
-                         ureg->output[i].semantic_index,
-                         ureg->output[i].usage_mask);
+   if (ureg->supports_any_inout_decl_range) {
+      for (i = 0; i < ureg->nr_outputs; i++) {
+         emit_decl_semantic(ureg,
+                            TGSI_FILE_OUTPUT,
+                            ureg->output[i].first,
+                            ureg->output[i].last,
+                            ureg->output[i].semantic_name,
+                            ureg->output[i].semantic_index,
+                            ureg->output[i].usage_mask,
+                            ureg->output[i].array_id);
+      }
+   }
+   else {
+      for (i = 0; i < ureg->nr_outputs; i++) {
+         for (j = ureg->output[i].first; j <= ureg->output[i].last; j++) {
+            emit_decl_semantic(ureg,
+                               TGSI_FILE_OUTPUT,
+                               j, j,
+                               ureg->output[i].semantic_name,
+                               ureg->output[i].semantic_index +
+                               (j - ureg->output[i].first),
+                               ureg->output[i].usage_mask, 0);
+         }
+      }
    }
 
    for (i = 0; i < ureg->nr_samplers; i++) {
@@ -1647,10 +1786,20 @@ void *ureg_create_shader( struct ureg_program *ureg,
    else
       memset(&state.stream_output, 0, sizeof(state.stream_output));
 
-   if (ureg->processor == TGSI_PROCESSOR_VERTEX)
-      return pipe->create_vs_state( pipe, &state );
-   else
-      return pipe->create_fs_state( pipe, &state );
+   switch (ureg->processor) {
+   case TGSI_PROCESSOR_VERTEX:
+      return pipe->create_vs_state(pipe, &state);
+   case TGSI_PROCESSOR_TESS_CTRL:
+      return pipe->create_tcs_state(pipe, &state);
+   case TGSI_PROCESSOR_TESS_EVAL:
+      return pipe->create_tes_state(pipe, &state);
+   case TGSI_PROCESSOR_GEOMETRY:
+      return pipe->create_gs_state(pipe, &state);
+   case TGSI_PROCESSOR_FRAGMENT:
+      return pipe->create_fs_state(pipe, &state);
+   default:
+      return NULL;
+   }
 }
 
 
@@ -1681,7 +1830,38 @@ void ureg_free_tokens( const struct tgsi_token *tokens )
 }
 
 
-struct ureg_program *ureg_create( unsigned processor )
+static INLINE unsigned
+pipe_shader_from_tgsi_processor(unsigned processor)
+{
+   switch (processor) { /* map TGSI_PROCESSOR_* to PIPE_SHADER_* */
+   case TGSI_PROCESSOR_VERTEX:
+      return PIPE_SHADER_VERTEX;
+   case TGSI_PROCESSOR_TESS_CTRL:
+      return PIPE_SHADER_TESS_CTRL;
+   case TGSI_PROCESSOR_TESS_EVAL:
+      return PIPE_SHADER_TESS_EVAL;
+   case TGSI_PROCESSOR_GEOMETRY:
+      return PIPE_SHADER_GEOMETRY;
+   case TGSI_PROCESSOR_FRAGMENT:
+      return PIPE_SHADER_FRAGMENT;
+   case TGSI_PROCESSOR_COMPUTE:
+      return PIPE_SHADER_COMPUTE;
+   default:
+      assert(0); /* unhandled processor value */
+      return PIPE_SHADER_VERTEX; /* reached only if the assert does not abort */
+   }
+}
+
+
+struct ureg_program *
+ureg_create(unsigned processor)
+{
+   return ureg_create_with_screen(processor, NULL);
+}
+
+
+struct ureg_program *
+ureg_create_with_screen(unsigned processor, struct pipe_screen *screen)
 {
    int i;
    struct ureg_program *ureg = CALLOC_STRUCT( ureg_program );
@@ -1689,6 +1869,11 @@ struct ureg_program *ureg_create( unsigned processor )
       goto no_ureg;
 
    ureg->processor = processor;
+   ureg->supports_any_inout_decl_range =
+      screen &&
+      screen->get_shader_param(screen,
+                               pipe_shader_from_tgsi_processor(processor),
+                               PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE) != 0;
 
    for (i = 0; i < Elements(ureg->properties); i++)
       ureg->properties[i] = ~0;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index 8a2ed0a..1891b06 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -36,6 +36,7 @@
 extern "C" {
 #endif
    
+struct pipe_screen;
 struct ureg_program;
 struct pipe_stream_output_info;
 
@@ -75,6 +76,8 @@ struct ureg_dst
    unsigned File            : 4;  /* TGSI_FILE_ */
    unsigned WriteMask       : 4;  /* TGSI_WRITEMASK_ */
    unsigned Indirect        : 1;  /* BOOL */
+   unsigned DimIndirect     : 1;  /* BOOL */
+   unsigned Dimension       : 1;  /* BOOL */
    unsigned Saturate        : 1;  /* BOOL */
    unsigned Predicate       : 1;
    unsigned PredNegate      : 1;  /* BOOL */
@@ -86,13 +89,20 @@ struct ureg_dst
    int      IndirectIndex   : 16; /* SINT */
    unsigned IndirectFile    : 4;  /* TGSI_FILE_ */
    int      IndirectSwizzle : 2;  /* TGSI_SWIZZLE_ */
+   unsigned DimIndFile      : 4;  /* TGSI_FILE_ */
+   unsigned DimIndSwizzle   : 2;  /* TGSI_SWIZZLE_ */
+   int      DimensionIndex  : 16; /* SINT */
+   int      DimIndIndex     : 16; /* SINT */
    unsigned ArrayID         : 10; /* UINT */
 };
 
 struct pipe_context;
 
 struct ureg_program *
-ureg_create( unsigned processor );
+ureg_create(unsigned processor);
+
+struct ureg_program *
+ureg_create_with_screen(unsigned processor, struct pipe_screen *screen);
 
 const struct tgsi_token *
 ureg_finalize( struct ureg_program * );
@@ -166,7 +176,9 @@ ureg_DECL_fs_input_cyl_centroid(struct ureg_program *,
                        unsigned semantic_index,
                        unsigned interp_mode,
                        unsigned cylindrical_wrap,
-                       unsigned interp_location);
+                       unsigned interp_location,
+                       unsigned array_id,
+                       unsigned array_size);
 
 static INLINE struct ureg_src
 ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
@@ -180,7 +192,7 @@ ureg_DECL_fs_input_cyl(struct ureg_program *ureg,
                                  semantic_index,
                                  interp_mode,
                                  cylindrical_wrap,
-                                 0);
+                                 0, 0, 1);
 }
 
 static INLINE struct ureg_src
@@ -193,7 +205,7 @@ ureg_DECL_fs_input(struct ureg_program *ureg,
                                  semantic_name,
                                  semantic_index,
                                  interp_mode,
-                                 0, 0);
+                                 0, 0, 0, 1);
 }
 
 struct ureg_src
@@ -201,10 +213,11 @@ ureg_DECL_vs_input( struct ureg_program *,
                     unsigned index );
 
 struct ureg_src
-ureg_DECL_gs_input(struct ureg_program *,
-                   unsigned index,
-                   unsigned semantic_name,
-                   unsigned semantic_index);
+ureg_DECL_input(struct ureg_program *,
+                unsigned semantic_name,
+                unsigned semantic_index,
+                unsigned array_id,
+                unsigned array_size);
 
 struct ureg_src
 ureg_DECL_system_value(struct ureg_program *,
@@ -213,15 +226,24 @@ ureg_DECL_system_value(struct ureg_program *,
                        unsigned semantic_index);
 
 struct ureg_dst
-ureg_DECL_output_masked( struct ureg_program *,
-                         unsigned semantic_name,
-                         unsigned semantic_index,
-                         unsigned usage_mask );
+ureg_DECL_output_masked(struct ureg_program *,
+                        unsigned semantic_name,
+                        unsigned semantic_index,
+                        unsigned usage_mask,
+                        unsigned array_id,
+                        unsigned array_size);
 
 struct ureg_dst
-ureg_DECL_output( struct ureg_program *,
-                  unsigned semantic_name,
-                  unsigned semantic_index );
+ureg_DECL_output(struct ureg_program *,
+                 unsigned semantic_name,
+                 unsigned semantic_index);
+
+struct ureg_dst
+ureg_DECL_output_array(struct ureg_program *ureg,
+                       unsigned semantic_name,
+                       unsigned semantic_index,
+                       unsigned array_id,
+                       unsigned array_size);
 
 struct ureg_src
 ureg_DECL_immediate( struct ureg_program *,
@@ -1108,6 +1130,16 @@ ureg_src_indirect( struct ureg_src reg, struct ureg_src addr )
    return reg;
 }
 
+static INLINE struct ureg_dst
+ureg_dst_dimension( struct ureg_dst reg, int index )
+{
+   assert(reg.File != TGSI_FILE_NULL);
+   reg.Dimension = 1;           /* enable the dimension index */
+   reg.DimIndirect = 0;         /* plain index, no indirect register */
+   reg.DimensionIndex = index;
+   return reg;                  /* modified copy; reg was passed by value */
+}
+
 static INLINE struct ureg_src
 ureg_src_dimension( struct ureg_src reg, int index )
 {
@@ -1118,6 +1150,19 @@ ureg_src_dimension( struct ureg_src reg, int index )
    return reg;
 }
 
+static INLINE struct ureg_dst
+ureg_dst_dimension_indirect( struct ureg_dst reg, struct ureg_src addr,
+                             int index )
+{
+   assert(reg.File != TGSI_FILE_NULL);
+   reg.Dimension = 1;           /* enable the dimension index */
+   reg.DimIndirect = 1;         /* addr supplies the indirect part */
+   reg.DimensionIndex = index;
+   reg.DimIndFile = addr.File;
+   reg.DimIndIndex = addr.Index;
+   reg.DimIndSwizzle = addr.SwizzleX; /* only addr's X swizzle is recorded */
+   return reg;                  /* modified copy; reg was passed by value */
+}
 
 static INLINE struct ureg_src
 ureg_src_dimension_indirect( struct ureg_src reg, struct ureg_src addr,
@@ -1133,17 +1178,24 @@ ureg_src_dimension_indirect( struct ureg_src reg, struct ureg_src addr,
    return reg;
 }
 
+static INLINE struct ureg_src
+ureg_src_array_offset(struct ureg_src reg, int offset)
+{
+   reg.Index += offset;
+   return reg;
+}
+
 static INLINE struct ureg_dst
 ureg_dst_array_offset( struct ureg_dst reg, int offset )
 {
-   assert(reg.File == TGSI_FILE_TEMPORARY);
    reg.Index += offset;
    return reg;
 }
 
 static INLINE struct ureg_dst
-ureg_dst_register( unsigned file,
-                   unsigned index )
+ureg_dst_array_register(unsigned file,
+                        unsigned index,
+                        unsigned array_id)
 {
    struct ureg_dst dst;
 
@@ -1161,12 +1213,25 @@ ureg_dst_register( unsigned file,
    dst.PredSwizzleZ = TGSI_SWIZZLE_Z;
    dst.PredSwizzleW = TGSI_SWIZZLE_W;
    dst.Index     = index;
-   dst.ArrayID = 0;
+   dst.Dimension = 0;
+   dst.DimensionIndex = 0;
+   dst.DimIndirect = 0;
+   dst.DimIndFile = TGSI_FILE_NULL;
+   dst.DimIndIndex = 0;
+   dst.DimIndSwizzle = 0;
+   dst.ArrayID = array_id;
 
    return dst;
 }
 
 static INLINE struct ureg_dst
+ureg_dst_register(unsigned file,
+                  unsigned index)
+{
+   return ureg_dst_array_register(file, index, 0);
+}
+
+static INLINE struct ureg_dst
 ureg_dst( struct ureg_src src )
 {
    struct ureg_dst dst;
@@ -1189,14 +1254,21 @@ ureg_dst( struct ureg_src src )
    dst.PredSwizzleZ = TGSI_SWIZZLE_Z;
    dst.PredSwizzleW = TGSI_SWIZZLE_W;
    dst.Index     = src.Index;
+   dst.Dimension = src.Dimension;
+   dst.DimensionIndex = src.DimensionIndex;
+   dst.DimIndirect = src.DimIndirect;
+   dst.DimIndFile = src.DimIndFile;
+   dst.DimIndIndex = src.DimIndIndex;
+   dst.DimIndSwizzle = src.DimIndSwizzle;
    dst.ArrayID = src.ArrayID;
 
    return dst;
 }
 
 static INLINE struct ureg_src
-ureg_src_register(unsigned file,
-                  unsigned index)
+ureg_src_array_register(unsigned file,
+                        unsigned index,
+                        unsigned array_id)
 {
    struct ureg_src src;
 
@@ -1218,12 +1290,19 @@ ureg_src_register(unsigned file,
    src.DimIndFile = TGSI_FILE_NULL;
    src.DimIndIndex = 0;
    src.DimIndSwizzle = 0;
-   src.ArrayID = 0;
+   src.ArrayID = array_id;
 
    return src;
 }
 
 static INLINE struct ureg_src
+ureg_src_register(unsigned file,
+                  unsigned index)
+{
+   return ureg_src_array_register(file, index, 0);
+}
+
+static INLINE struct ureg_src
 ureg_src( struct ureg_dst dst )
 {
    struct ureg_src src;
@@ -1240,12 +1319,12 @@ ureg_src( struct ureg_dst dst )
    src.Absolute  = 0;
    src.Index     = dst.Index;
    src.Negate    = 0;
-   src.Dimension = 0;
-   src.DimensionIndex = 0;
-   src.DimIndirect = 0;
-   src.DimIndFile = TGSI_FILE_NULL;
-   src.DimIndIndex = 0;
-   src.DimIndSwizzle = 0;
+   src.Dimension = dst.Dimension;
+   src.DimensionIndex = dst.DimensionIndex;
+   src.DimIndirect = dst.DimIndirect;
+   src.DimIndFile = dst.DimIndFile;
+   src.DimIndIndex = dst.DimIndIndex;
+   src.DimIndSwizzle = dst.DimIndSwizzle;
    src.ArrayID = dst.ArrayID;
 
    return src;
@@ -1272,6 +1351,12 @@ ureg_dst_undef( void )
    dst.PredSwizzleZ = TGSI_SWIZZLE_Z;
    dst.PredSwizzleW = TGSI_SWIZZLE_W;
    dst.Index     = 0;
+   dst.Dimension = 0;
+   dst.DimensionIndex = 0;
+   dst.DimIndirect = 0;
+   dst.DimIndFile = TGSI_FILE_NULL;
+   dst.DimIndIndex = 0;
+   dst.DimIndSwizzle = 0;
    dst.ArrayID = 0;
 
    return dst;
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 90408ff..e3f3055 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -65,7 +65,7 @@ struct blit_state
    struct pipe_vertex_element velem[2];
 
    void *vs;
-   void *fs[PIPE_MAX_TEXTURE_TYPES][TGSI_WRITEMASK_XYZW + 1];
+   void *fs[PIPE_MAX_TEXTURE_TYPES][TGSI_WRITEMASK_XYZW + 1][3];
 
    struct pipe_resource *vbuf;  /**< quad vertices */
    unsigned vbuf_slot;
@@ -135,15 +135,17 @@ void
 util_destroy_blit(struct blit_state *ctx)
 {
    struct pipe_context *pipe = ctx->pipe;
-   unsigned i, j;
+   unsigned i, j, k;
 
    if (ctx->vs)
       pipe->delete_vs_state(pipe, ctx->vs);
 
    for (i = 0; i < Elements(ctx->fs); i++) {
       for (j = 0; j < Elements(ctx->fs[i]); j++) {
-         if (ctx->fs[i][j])
-            pipe->delete_fs_state(pipe, ctx->fs[i][j]);
+         for (k = 0; k < Elements(ctx->fs[i][j]); k++) {
+            if (ctx->fs[i][j][k])
+               pipe->delete_fs_state(pipe, ctx->fs[i][j][k]);
+         }
       }
    }
 
@@ -158,18 +160,34 @@ util_destroy_blit(struct blit_state *ctx)
  */
 static INLINE void
 set_fragment_shader(struct blit_state *ctx, uint writemask,
+                    enum pipe_format format,
                     enum pipe_texture_target pipe_tex)
 {
-   if (!ctx->fs[pipe_tex][writemask]) {
+   enum tgsi_return_type stype;
+   unsigned idx;
+
+   if (util_format_is_pure_uint(format)) {
+      stype = TGSI_RETURN_TYPE_UINT;
+      idx = 0;
+   } else if (util_format_is_pure_sint(format)) {
+      stype = TGSI_RETURN_TYPE_SINT;
+      idx = 1;
+   } else {
+      stype = TGSI_RETURN_TYPE_FLOAT;
+      idx = 2;
+   }
+
+   if (!ctx->fs[pipe_tex][writemask][idx]) {
       unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(pipe_tex, 0);
 
-      ctx->fs[pipe_tex][writemask] =
+      ctx->fs[pipe_tex][writemask][idx] =
          util_make_fragment_tex_shader_writemask(ctx->pipe, tgsi_tex,
                                                  TGSI_INTERPOLATE_LINEAR,
-                                                 writemask);
+                                                 writemask,
+                                                 stype);
    }
 
-   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[pipe_tex][writemask]);
+   cso_set_fragment_shader_handle(ctx->cso, ctx->fs[pipe_tex][writemask][idx]);
 }
 
 
@@ -535,6 +553,8 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_save_framebuffer(ctx->cso);
    cso_save_fragment_shader(ctx->cso);
    cso_save_vertex_shader(ctx->cso);
+   cso_save_tessctrl_shader(ctx->cso);
+   cso_save_tesseval_shader(ctx->cso);
    cso_save_geometry_shader(ctx->cso);
    cso_save_vertex_elements(ctx->cso);
    cso_save_aux_vertex_buffer_slot(ctx->cso);
@@ -569,8 +589,11 @@ util_blit_pixels_tex(struct blit_state *ctx,
 
    /* shaders */
    set_fragment_shader(ctx, TGSI_WRITEMASK_XYZW,
+                       src_sampler_view->format,
                        src_sampler_view->texture->target);
    set_vertex_shader(ctx);
+   cso_set_tessctrl_shader_handle(ctx->cso, NULL);
+   cso_set_tesseval_shader_handle(ctx->cso, NULL);
    cso_set_geometry_shader_handle(ctx->cso, NULL);
 
    /* drawing dest */
@@ -611,6 +634,8 @@ util_blit_pixels_tex(struct blit_state *ctx,
    cso_restore_framebuffer(ctx->cso);
    cso_restore_fragment_shader(ctx->cso);
    cso_restore_vertex_shader(ctx->cso);
+   cso_restore_tessctrl_shader(ctx->cso);
+   cso_restore_tesseval_shader(ctx->cso);
    cso_restore_geometry_shader(ctx->cso);
    cso_restore_vertex_elements(ctx->cso);
    cso_restore_aux_vertex_buffer_slot(ctx->cso);
diff --git a/src/gallium/auxiliary/util/u_blitter.c b/src/gallium/auxiliary/util/u_blitter.c
index 9d087fe..b5ef9a2 100644
--- a/src/gallium/auxiliary/util/u_blitter.c
+++ b/src/gallium/auxiliary/util/u_blitter.c
@@ -81,6 +81,8 @@ struct blitter_context_priv
    /* FS which outputs a color from a texture,
       where the index is PIPE_TEXTURE_* to be sampled. */
    void *fs_texfetch_col[PIPE_MAX_TEXTURE_TYPES];
+   void *fs_texfetch_col_uint[PIPE_MAX_TEXTURE_TYPES];
+   void *fs_texfetch_col_sint[PIPE_MAX_TEXTURE_TYPES];
 
    /* FS which outputs a depth from a texture,
       where the index is PIPE_TEXTURE_* to be sampled. */
@@ -90,6 +92,8 @@ struct blitter_context_priv
 
    /* FS which outputs one sample from a multisample texture. */
    void *fs_texfetch_col_msaa[PIPE_MAX_TEXTURE_TYPES];
+   void *fs_texfetch_col_msaa_uint[PIPE_MAX_TEXTURE_TYPES];
+   void *fs_texfetch_col_msaa_sint[PIPE_MAX_TEXTURE_TYPES];
    void *fs_texfetch_depth_msaa[PIPE_MAX_TEXTURE_TYPES];
    void *fs_texfetch_depthstencil_msaa[PIPE_MAX_TEXTURE_TYPES];
    void *fs_texfetch_stencil_msaa[PIPE_MAX_TEXTURE_TYPES];
@@ -130,6 +134,7 @@ struct blitter_context_priv
    unsigned dst_height;
 
    boolean has_geometry_shader;
+   boolean has_tessellation;
    boolean has_layered;
    boolean has_stream_out;
    boolean has_stencil_export;
@@ -183,6 +188,11 @@ struct blitter_context *util_blitter_create(struct pipe_context *pipe)
    ctx->has_geometry_shader =
       pipe->screen->get_shader_param(pipe->screen, PIPE_SHADER_GEOMETRY,
                                      PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0;
+
+   ctx->has_tessellation =
+      pipe->screen->get_shader_param(pipe->screen, PIPE_SHADER_TESS_CTRL,
+                                     PIPE_SHADER_CAP_MAX_INSTRUCTIONS) > 0;
+
    ctx->has_stream_out =
       pipe->screen->get_param(pipe->screen,
                               PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS) != 0;
@@ -432,6 +442,10 @@ void util_blitter_destroy(struct blitter_context *blitter)
    for (i = 0; i < PIPE_MAX_TEXTURE_TYPES; i++) {
       if (ctx->fs_texfetch_col[i])
          ctx->delete_fs_state(pipe, ctx->fs_texfetch_col[i]);
+      if (ctx->fs_texfetch_col_sint[i])
+         ctx->delete_fs_state(pipe, ctx->fs_texfetch_col_sint[i]);
+      if (ctx->fs_texfetch_col_uint[i])
+         ctx->delete_fs_state(pipe, ctx->fs_texfetch_col_uint[i]);
       if (ctx->fs_texfetch_depth[i])
          ctx->delete_fs_state(pipe, ctx->fs_texfetch_depth[i]);
       if (ctx->fs_texfetch_depthstencil[i])
@@ -441,6 +455,10 @@ void util_blitter_destroy(struct blitter_context *blitter)
 
       if (ctx->fs_texfetch_col_msaa[i])
          ctx->delete_fs_state(pipe, ctx->fs_texfetch_col_msaa[i]);
+      if (ctx->fs_texfetch_col_msaa_sint[i])
+         ctx->delete_fs_state(pipe, ctx->fs_texfetch_col_msaa_sint[i]);
+      if (ctx->fs_texfetch_col_msaa_uint[i])
+         ctx->delete_fs_state(pipe, ctx->fs_texfetch_col_msaa_uint[i]);
       if (ctx->fs_texfetch_depth_msaa[i])
          ctx->delete_fs_state(pipe, ctx->fs_texfetch_depth_msaa[i]);
       if (ctx->fs_texfetch_depthstencil_msaa[i])
@@ -510,6 +528,8 @@ static void blitter_check_saved_vertex_states(struct blitter_context_priv *ctx)
    assert(ctx->base.saved_velem_state != INVALID_PTR);
    assert(ctx->base.saved_vs != INVALID_PTR);
    assert(!ctx->has_geometry_shader || ctx->base.saved_gs != INVALID_PTR);
+   assert(!ctx->has_tessellation || ctx->base.saved_tcs != INVALID_PTR);
+   assert(!ctx->has_tessellation || ctx->base.saved_tes != INVALID_PTR);
    assert(!ctx->has_stream_out || ctx->base.saved_num_so_targets != ~0);
    assert(ctx->base.saved_rs_state != INVALID_PTR);
 }
@@ -538,6 +558,13 @@ static void blitter_restore_vertex_states(struct blitter_context_priv *ctx)
       ctx->base.saved_gs = INVALID_PTR;
    }
 
+   if (ctx->has_tessellation) {
+      pipe->bind_tcs_state(pipe, ctx->base.saved_tcs);
+      pipe->bind_tes_state(pipe, ctx->base.saved_tes);
+      ctx->base.saved_tcs = INVALID_PTR;
+      ctx->base.saved_tes = INVALID_PTR;
+   }
+
    /* Stream outputs. */
    if (ctx->has_stream_out) {
       unsigned offsets[PIPE_MAX_SO_BUFFERS];
@@ -829,25 +856,29 @@ static void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
 {
    struct pipe_context *pipe = ctx->base.pipe;
    unsigned tgsi_tex = util_pipe_tex_to_tgsi_tex(target, src_nr_samples);
+   enum tgsi_return_type stype;
 
    assert(target < PIPE_MAX_TEXTURE_TYPES);
 
+   if (util_format_is_pure_uint(format))
+      stype = TGSI_RETURN_TYPE_UINT;
+   else if (util_format_is_pure_sint(format))
+      stype = TGSI_RETURN_TYPE_SINT;
+   else
+      stype = TGSI_RETURN_TYPE_FLOAT;
+
    if (src_nr_samples > 1) {
       void **shader;
 
       if (dst_nr_samples <= 1) {
          /* The destination has one sample, so we'll do color resolve. */
-         boolean is_uint, is_sint;
          unsigned index = GET_MSAA_RESOLVE_FS_IDX(src_nr_samples);
 
-         is_uint = util_format_is_pure_uint(format);
-         is_sint = util_format_is_pure_sint(format);
-
          assert(filter < 2);
 
-         if (is_uint)
+         if (stype == TGSI_RETURN_TYPE_UINT)
             shader = &ctx->fs_resolve_uint[target][index][filter];
-         else if (is_sint)
+         else if (stype == TGSI_RETURN_TYPE_SINT)
             shader = &ctx->fs_resolve_sint[target][index][filter];
          else
             shader = &ctx->fs_resolve[target][index][filter];
@@ -857,12 +888,12 @@ static void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
             if (filter == PIPE_TEX_FILTER_LINEAR) {
                *shader = util_make_fs_msaa_resolve_bilinear(pipe, tgsi_tex,
                                                    src_nr_samples,
-                                                   is_uint, is_sint);
+                                                   stype);
             }
             else {
                *shader = util_make_fs_msaa_resolve(pipe, tgsi_tex,
                                                    src_nr_samples,
-                                                   is_uint, is_sint);
+                                                   stype);
             }
          }
       }
@@ -870,24 +901,37 @@ static void *blitter_get_fs_texfetch_col(struct blitter_context_priv *ctx,
          /* The destination has multiple samples, we'll do
           * an MSAA->MSAA copy.
           */
-         shader = &ctx->fs_texfetch_col_msaa[target];
+          if (stype == TGSI_RETURN_TYPE_UINT)
+             shader = &ctx->fs_texfetch_col_msaa_uint[target];
+          else if (stype == TGSI_RETURN_TYPE_SINT)
+             shader = &ctx->fs_texfetch_col_msaa_sint[target];
+          else
+             shader = &ctx->fs_texfetch_col_msaa[target];
 
          /* Create the fragment shader on-demand. */
          if (!*shader) {
             assert(!ctx->cached_all_shaders);
-            *shader = util_make_fs_blit_msaa_color(pipe, tgsi_tex);
+            *shader = util_make_fs_blit_msaa_color(pipe, tgsi_tex, stype);
          }
       }
 
       return *shader;
    } else {
-      void **shader = &ctx->fs_texfetch_col[target];
+      void **shader;
+
+      if (stype == TGSI_RETURN_TYPE_UINT)
+         shader = &ctx->fs_texfetch_col_uint[target];
+      else if (stype == TGSI_RETURN_TYPE_SINT)
+         shader = &ctx->fs_texfetch_col_sint[target];
+      else
+         shader = &ctx->fs_texfetch_col[target];
 
       /* Create the fragment shader on-demand. */
       if (!*shader) {
          assert(!ctx->cached_all_shaders);
          *shader = util_make_fragment_tex_shader(pipe, tgsi_tex,
-                                                 TGSI_INTERPOLATE_LINEAR);
+                                                 TGSI_INTERPOLATE_LINEAR,
+                                                 stype);
       }
 
       return *shader;
@@ -1051,6 +1095,10 @@ void util_blitter_cache_all_shaders(struct blitter_context *blitter)
           */
          blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_FLOAT, target,
                                      samples, samples, 0);
+         blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_UINT, target,
+                                     samples, samples, 0);
+         blitter_get_fs_texfetch_col(ctx, PIPE_FORMAT_R32_SINT, target,
+                                     samples, samples, 0);
          blitter_get_fs_texfetch_depth(ctx, target, samples);
          if (ctx->has_stencil_export) {
             blitter_get_fs_texfetch_depthstencil(ctx, target, samples);
@@ -1108,6 +1156,10 @@ static void blitter_set_common_draw_rect_state(struct blitter_context_priv *ctx,
 
    if (ctx->has_geometry_shader)
       pipe->bind_gs_state(pipe, NULL);
+   if (ctx->has_tessellation) {
+      pipe->bind_tcs_state(pipe, NULL);
+      pipe->bind_tes_state(pipe, NULL);
+   }
    if (ctx->has_stream_out)
       pipe->set_stream_output_targets(pipe, 0, NULL, NULL);
 }
@@ -1306,6 +1358,7 @@ void util_blitter_default_src_texture(struct pipe_sampler_view *src_templ,
                                       unsigned srclevel)
 {
     memset(src_templ, 0, sizeof(*src_templ));
+    src_templ->target = src->target;
     src_templ->format = util_format_linear(src->format);
     src_templ->u.tex.first_level = srclevel;
     src_templ->u.tex.last_level = srclevel;
@@ -1966,6 +2019,10 @@ void util_blitter_copy_buffer(struct blitter_context *blitter,
    bind_vs_pos_only(ctx);
    if (ctx->has_geometry_shader)
       pipe->bind_gs_state(pipe, NULL);
+   if (ctx->has_tessellation) {
+      pipe->bind_tcs_state(pipe, NULL);
+      pipe->bind_tes_state(pipe, NULL);
+   }
    pipe->bind_rasterizer_state(pipe, ctx->rs_discard_state);
 
    so_target = pipe->create_stream_output_target(pipe, dst, dstx, size);
@@ -2026,6 +2083,10 @@ void util_blitter_clear_buffer(struct blitter_context *blitter,
    bind_vs_pos_only(ctx);
    if (ctx->has_geometry_shader)
       pipe->bind_gs_state(pipe, NULL);
+   if (ctx->has_tessellation) {
+      pipe->bind_tcs_state(pipe, NULL);
+      pipe->bind_tes_state(pipe, NULL);
+   }
    pipe->bind_rasterizer_state(pipe, ctx->rs_discard_state);
 
    so_target = pipe->create_stream_output_target(pipe, dst, offset, size);
diff --git a/src/gallium/auxiliary/util/u_blitter.h b/src/gallium/auxiliary/util/u_blitter.h
index 1568030..93b0e51 100644
--- a/src/gallium/auxiliary/util/u_blitter.h
+++ b/src/gallium/auxiliary/util/u_blitter.h
@@ -102,7 +102,7 @@ struct blitter_context
    void *saved_dsa_state;     /**< depth stencil alpha state */
    void *saved_velem_state;   /**< vertex elements state */
    void *saved_rs_state;      /**< rasterizer state */
-   void *saved_fs, *saved_vs, *saved_gs; /**< shaders */
+   void *saved_fs, *saved_vs, *saved_gs, *saved_tcs, *saved_tes; /**< shaders */
 
    struct pipe_framebuffer_state saved_fb_state;  /**< framebuffer state */
    struct pipe_stencil_ref saved_stencil_ref;     /**< stencil ref */
@@ -427,6 +427,20 @@ void util_blitter_save_geometry_shader(struct blitter_context *blitter,
    blitter->saved_gs = gs;
 }
 
+static INLINE void
+util_blitter_save_tessctrl_shader(struct blitter_context *blitter,
+                                  void *sh)
+{
+   blitter->saved_tcs = sh; /* record sh as the saved tessellation control shader */
+}
+
+static INLINE void
+util_blitter_save_tesseval_shader(struct blitter_context *blitter,
+                                  void *sh)
+{
+   blitter->saved_tes = sh; /* record sh as the saved tessellation evaluation shader */
+}
+
 static INLINE
 void util_blitter_save_framebuffer(struct blitter_context *blitter,
                                    const struct pipe_framebuffer_state *state)
diff --git a/src/gallium/auxiliary/util/u_dump_state.c b/src/gallium/auxiliary/util/u_dump_state.c
index e6614d5..7f620b5 100644
--- a/src/gallium/auxiliary/util/u_dump_state.c
+++ b/src/gallium/auxiliary/util/u_dump_state.c
@@ -750,6 +750,8 @@ util_dump_draw_info(FILE *stream, const struct pipe_draw_info *state)
    util_dump_member(stream, uint, state, start_instance);
    util_dump_member(stream, uint, state, instance_count);
 
+   util_dump_member(stream, uint, state, vertices_per_patch);
+
    util_dump_member(stream, int,  state, index_bias);
    util_dump_member(stream, uint, state, min_index);
    util_dump_member(stream, uint, state, max_index);
diff --git a/src/gallium/auxiliary/util/u_format_etc.c b/src/gallium/auxiliary/util/u_format_etc.c
index f909b16..63e03ff 100644
--- a/src/gallium/auxiliary/util/u_format_etc.c
+++ b/src/gallium/auxiliary/util/u_format_etc.c
@@ -65,11 +65,10 @@ util_format_etc1_rgb8_pack_rgba_float(uint8_t *dst_row, unsigned dst_stride, con
 void
 util_format_etc1_rgb8_fetch_rgba_float(float *dst, const uint8_t *src, unsigned i, unsigned j)
 {
-   const unsigned bw = 4, bh = 4;
    struct etc1_block block;
    uint8_t tmp[3];
 
-   assert(i < bw && j < bh);
+   assert(i < 4 && j < 4); /* check i, j against 4x4 block size */
 
    etc1_parse_block(&block, src);
    etc1_fetch_texel(&block, i, j, tmp);
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 3d27a59..3b4040f 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -42,6 +42,7 @@
 #include "pipe/p_compiler.h"
 
 #include "c99_math.h"
+#include <assert.h>
 #include <float.h>
 #include <stdarg.h>
 
@@ -424,6 +425,25 @@ util_last_bit(unsigned u)
 }
 
 /**
+ * Find last bit set in a 64-bit word.  The least significant bit is 1.
+ * Return 0 if no bits are set.
+ */
+static INLINE unsigned
+util_last_bit64(uint64_t u)
+{
+#if defined(HAVE___BUILTIN_CLZLL)
+   return u == 0 ? 0 : 64 - __builtin_clzll(u); /* u == 0 is handled before clzll is called */
+#else
+   unsigned r = 0; /* fallback: count the right shifts needed to clear u */
+   while (u) {
+       r++;
+       u >>= 1;
+   }
+   return r;
+#endif
+}
+
+/**
  * Find last bit in a word that does not match the sign bit. The least
  * significant bit is 1.
  * Return 0 if no bits are set.
diff --git a/src/gallium/auxiliary/util/u_pstipple.c b/src/gallium/auxiliary/util/u_pstipple.c
index 0a20bdb..1f65672 100644
--- a/src/gallium/auxiliary/util/u_pstipple.c
+++ b/src/gallium/auxiliary/util/u_pstipple.c
@@ -55,7 +55,7 @@
 #include "tgsi/tgsi_scan.h"
 
 /** Approx number of new tokens for instructions in pstip_transform_inst() */
-#define NUM_NEW_TOKENS 50
+#define NUM_NEW_TOKENS 53
 
 
 static void
@@ -262,6 +262,7 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
       (struct pstip_transform_context *) ctx;
    int wincoordInput;
    int texTemp;
+   int sampIdx;
 
    /* find free texture sampler */
    pctx->freeSampler = free_bit(pctx->samplersUsed);
@@ -280,9 +281,21 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
                                 TGSI_INTERPOLATE_LINEAR);
    }
 
+   sampIdx = pctx->hasFixedUnit ? pctx->fixedUnit : pctx->freeSampler;
+
    /* declare new sampler */
-   tgsi_transform_sampler_decl(ctx,
-         pctx->hasFixedUnit ? pctx->fixedUnit : pctx->freeSampler);
+   tgsi_transform_sampler_decl(ctx, sampIdx);
+
+   /* if the src shader has SVIEW decl's for each SAMP decl, we
+    * need to continue the trend and ensure there is a matching
+    * SVIEW for the new SAMP we just created
+    */
+   if (pctx->info.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) {
+      tgsi_transform_sampler_view_decl(ctx,
+                                       sampIdx,
+                                       TGSI_TEXTURE_2D,
+                                       TGSI_RETURN_TYPE_FLOAT);
+   }
 
    /* Declare temp[0] reg if not already declared.
     * We can always use temp[0] since this code is before
@@ -321,8 +334,7 @@ pstip_transform_prolog(struct tgsi_transform_context *ctx)
    tgsi_transform_tex_2d_inst(ctx,
                               TGSI_FILE_TEMPORARY, texTemp,
                               TGSI_FILE_TEMPORARY, texTemp,
-                              pctx->hasFixedUnit ? pctx->fixedUnit
-                                                 : pctx->freeSampler);
+                              sampIdx);
 
    /* KILL_IF -texTemp;   # if -texTemp < 0, kill fragment */
    tgsi_transform_kill_inst(ctx,
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.c b/src/gallium/auxiliary/util/u_simple_shaders.c
index c612b67..6d29cab 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.c
+++ b/src/gallium/auxiliary/util/u_simple_shaders.c
@@ -216,7 +216,8 @@ void *
 util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
                                         unsigned tex_target,
                                         unsigned interp_mode,
-                                        unsigned writemask )
+                                        unsigned writemask,
+                                        enum tgsi_return_type stype)
 {
    struct ureg_program *ureg;
    struct ureg_src sampler;
@@ -232,6 +233,8 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
    
    sampler = ureg_DECL_sampler( ureg, 0 );
 
+   ureg_DECL_sampler_view(ureg, 0, tex_target, stype, stype, stype, stype);
+
    tex = ureg_DECL_fs_input( ureg, 
                              TGSI_SEMANTIC_GENERIC, 0, 
                              interp_mode );
@@ -268,12 +271,14 @@ util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
  */
 void *
 util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target,
-                              unsigned interp_mode)
+                              unsigned interp_mode,
+                              enum tgsi_return_type stype)
 {
    return util_make_fragment_tex_shader_writemask( pipe,
                                                    tex_target,
                                                    interp_mode,
-                                                   TGSI_WRITEMASK_XYZW );
+                                                   TGSI_WRITEMASK_XYZW,
+                                                   stype );
 }
 
 
@@ -298,6 +303,12 @@ util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
 
    sampler = ureg_DECL_sampler( ureg, 0 );
 
+   ureg_DECL_sampler_view(ureg, 0, tex_target,
+                          TGSI_RETURN_TYPE_FLOAT,
+                          TGSI_RETURN_TYPE_FLOAT,
+                          TGSI_RETURN_TYPE_FLOAT,
+                          TGSI_RETURN_TYPE_FLOAT);
+
    tex = ureg_DECL_fs_input( ureg,
                              TGSI_SEMANTIC_GENERIC, 0,
                              interp_mode );
@@ -343,7 +354,17 @@ util_make_fragment_tex_shader_writedepthstencil(struct pipe_context *pipe,
       return NULL;
 
    depth_sampler = ureg_DECL_sampler( ureg, 0 );
+   ureg_DECL_sampler_view(ureg, 0, tex_target,
+                          TGSI_RETURN_TYPE_FLOAT,
+                          TGSI_RETURN_TYPE_FLOAT,
+                          TGSI_RETURN_TYPE_FLOAT,
+                          TGSI_RETURN_TYPE_FLOAT);
    stencil_sampler = ureg_DECL_sampler( ureg, 1 );
+   ureg_DECL_sampler_view(ureg, 1, tex_target, /* SVIEW[1] pairs with SAMP[1] (stencil) */
+                          TGSI_RETURN_TYPE_UINT,
+                          TGSI_RETURN_TYPE_UINT,
+                          TGSI_RETURN_TYPE_UINT,
+                          TGSI_RETURN_TYPE_UINT);
 
    tex = ureg_DECL_fs_input( ureg,
                              TGSI_SEMANTIC_GENERIC, 0,
@@ -398,6 +419,12 @@ util_make_fragment_tex_shader_writestencil(struct pipe_context *pipe,
 
    stencil_sampler = ureg_DECL_sampler( ureg, 0 );
 
+   ureg_DECL_sampler_view(ureg, 0, tex_target,
+                          TGSI_RETURN_TYPE_UINT,
+                          TGSI_RETURN_TYPE_UINT,
+                          TGSI_RETURN_TYPE_UINT,
+                          TGSI_RETURN_TYPE_UINT);
+
    tex = ureg_DECL_fs_input( ureg,
                              TGSI_SEMANTIC_GENERIC, 0,
                              interp_mode );
@@ -512,6 +539,7 @@ util_make_fragment_cloneinput_shader(struct pipe_context *pipe, int num_cbufs,
 static void *
 util_make_fs_blit_msaa_gen(struct pipe_context *pipe,
                            unsigned tgsi_tex,
+                           const char *samp_type,
                            const char *output_semantic,
                            const char *output_mask)
 {
@@ -519,6 +547,7 @@ util_make_fs_blit_msaa_gen(struct pipe_context *pipe,
          "FRAG\n"
          "DCL IN[0], GENERIC[0], LINEAR\n"
          "DCL SAMP[0]\n"
+         "DCL SVIEW[0], %s, %s\n"
          "DCL OUT[0], %s\n"
          "DCL TEMP[0]\n"
 
@@ -534,7 +563,8 @@ util_make_fs_blit_msaa_gen(struct pipe_context *pipe,
    assert(tgsi_tex == TGSI_TEXTURE_2D_MSAA ||
           tgsi_tex == TGSI_TEXTURE_2D_ARRAY_MSAA);
 
-   sprintf(text, shader_templ, output_semantic, output_mask, type);
+   sprintf(text, shader_templ, type, samp_type,
+           output_semantic, output_mask, type);
 
    if (!tgsi_text_translate(text, tokens, Elements(tokens))) {
       puts(text);
@@ -556,9 +586,19 @@ util_make_fs_blit_msaa_gen(struct pipe_context *pipe,
  */
 void *
 util_make_fs_blit_msaa_color(struct pipe_context *pipe,
-                             unsigned tgsi_tex)
+                             unsigned tgsi_tex,
+                             enum tgsi_return_type stype)
 {
-   return util_make_fs_blit_msaa_gen(pipe, tgsi_tex,
+   const char *samp_type;
+
+   if (stype == TGSI_RETURN_TYPE_UINT)
+      samp_type = "UINT";
+   else if (stype == TGSI_RETURN_TYPE_SINT)
+      samp_type = "SINT";
+   else
+      samp_type = "FLOAT";
+
+   return util_make_fs_blit_msaa_gen(pipe, tgsi_tex, samp_type,
                                      "COLOR[0]", "");
 }
 
@@ -572,7 +612,7 @@ void *
 util_make_fs_blit_msaa_depth(struct pipe_context *pipe,
                              unsigned tgsi_tex)
 {
-   return util_make_fs_blit_msaa_gen(pipe, tgsi_tex,
+   return util_make_fs_blit_msaa_gen(pipe, tgsi_tex, "FLOAT",
                                      "POSITION", ".z");
 }
 
@@ -586,7 +626,7 @@ void *
 util_make_fs_blit_msaa_stencil(struct pipe_context *pipe,
                                unsigned tgsi_tex)
 {
-   return util_make_fs_blit_msaa_gen(pipe, tgsi_tex,
+   return util_make_fs_blit_msaa_gen(pipe, tgsi_tex, "UINT",
                                      "STENCIL", ".y");
 }
 
@@ -640,7 +680,7 @@ util_make_fs_blit_msaa_depthstencil(struct pipe_context *pipe,
 void *
 util_make_fs_msaa_resolve(struct pipe_context *pipe,
                           unsigned tgsi_tex, unsigned nr_samples,
-                          boolean is_uint, boolean is_sint)
+                          enum tgsi_return_type stype)
 {
    struct ureg_program *ureg;
    struct ureg_src sampler, coord;
@@ -653,6 +693,7 @@ util_make_fs_msaa_resolve(struct pipe_context *pipe,
 
    /* Declarations. */
    sampler = ureg_DECL_sampler(ureg, 0);
+   ureg_DECL_sampler_view(ureg, 0, tgsi_tex, stype, stype, stype, stype);
    coord = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 0,
                               TGSI_INTERPOLATE_LINEAR);
    out = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
@@ -670,9 +711,9 @@ util_make_fs_msaa_resolve(struct pipe_context *pipe,
                ureg_imm1u(ureg, i));
       ureg_TXF(ureg, tmp, tgsi_tex, ureg_src(tmp_coord), sampler);
 
-      if (is_uint)
+      if (stype == TGSI_RETURN_TYPE_UINT)
          ureg_U2F(ureg, tmp, ureg_src(tmp));
-      else if (is_sint)
+      else if (stype == TGSI_RETURN_TYPE_SINT)
          ureg_I2F(ureg, tmp, ureg_src(tmp));
 
       /* Add it to the sum.*/
@@ -683,9 +724,9 @@ util_make_fs_msaa_resolve(struct pipe_context *pipe,
    ureg_MUL(ureg, tmp_sum, ureg_src(tmp_sum),
             ureg_imm1f(ureg, 1.0 / nr_samples));
 
-   if (is_uint)
+   if (stype == TGSI_RETURN_TYPE_UINT)
       ureg_F2U(ureg, out, ureg_src(tmp_sum));
-   else if (is_sint)
+   else if (stype == TGSI_RETURN_TYPE_SINT)
       ureg_F2I(ureg, out, ureg_src(tmp_sum));
    else
       ureg_MOV(ureg, out, ureg_src(tmp_sum));
@@ -699,7 +740,7 @@ util_make_fs_msaa_resolve(struct pipe_context *pipe,
 void *
 util_make_fs_msaa_resolve_bilinear(struct pipe_context *pipe,
                                    unsigned tgsi_tex, unsigned nr_samples,
-                                   boolean is_uint, boolean is_sint)
+                                   enum tgsi_return_type stype)
 {
    struct ureg_program *ureg;
    struct ureg_src sampler, coord;
@@ -713,6 +754,7 @@ util_make_fs_msaa_resolve_bilinear(struct pipe_context *pipe,
 
    /* Declarations. */
    sampler = ureg_DECL_sampler(ureg, 0);
+   ureg_DECL_sampler_view(ureg, 0, tgsi_tex, stype, stype, stype, stype);
    coord = ureg_DECL_fs_input(ureg, TGSI_SEMANTIC_GENERIC, 0,
                               TGSI_INTERPOLATE_LINEAR);
    out = ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 0);
@@ -744,9 +786,9 @@ util_make_fs_msaa_resolve_bilinear(struct pipe_context *pipe,
                   ureg_imm1u(ureg, i));
          ureg_TXF(ureg, tmp, tgsi_tex, ureg_src(tmp_coord[c]), sampler);
 
-         if (is_uint)
+         if (stype == TGSI_RETURN_TYPE_UINT)
             ureg_U2F(ureg, tmp, ureg_src(tmp));
-         else if (is_sint)
+         else if (stype == TGSI_RETURN_TYPE_SINT)
             ureg_I2F(ureg, tmp, ureg_src(tmp));
 
          /* Add it to the sum.*/
@@ -778,9 +820,9 @@ util_make_fs_msaa_resolve_bilinear(struct pipe_context *pipe,
             ureg_src(top));
 
    /* Convert to the texture format and return. */
-   if (is_uint)
+   if (stype == TGSI_RETURN_TYPE_UINT)
       ureg_F2U(ureg, out, ureg_src(tmp));
-   else if (is_sint)
+   else if (stype == TGSI_RETURN_TYPE_SINT)
       ureg_F2I(ureg, out, ureg_src(tmp));
    else
       ureg_MOV(ureg, out, ureg_src(tmp));
diff --git a/src/gallium/auxiliary/util/u_simple_shaders.h b/src/gallium/auxiliary/util/u_simple_shaders.h
index dd282e0..08d798e 100644
--- a/src/gallium/auxiliary/util/u_simple_shaders.h
+++ b/src/gallium/auxiliary/util/u_simple_shaders.h
@@ -68,15 +68,16 @@ extern void *
 util_make_layered_clear_geometry_shader(struct pipe_context *pipe);
 
 extern void *
-util_make_fragment_tex_shader_writemask(struct pipe_context *pipe, 
+util_make_fragment_tex_shader_writemask(struct pipe_context *pipe,
                                         unsigned tex_target,
                                         unsigned interp_mode,
-                                        unsigned writemask);
+                                        unsigned writemask,
+                                        enum tgsi_return_type stype);
 
 extern void *
 util_make_fragment_tex_shader(struct pipe_context *pipe, unsigned tex_target,
-                              unsigned interp_mode);
-
+                              unsigned interp_mode,
+                              enum tgsi_return_type stype);
 
 extern void *
 util_make_fragment_tex_shader_writedepth(struct pipe_context *pipe,
@@ -115,7 +116,8 @@ util_make_fragment_cloneinput_shader(struct pipe_context *pipe, int num_cbufs,
 
 extern void *
 util_make_fs_blit_msaa_color(struct pipe_context *pipe,
-                             unsigned tgsi_tex);
+                             unsigned tgsi_tex,
+                             enum tgsi_return_type stype);
 
 
 extern void *
@@ -136,13 +138,13 @@ util_make_fs_blit_msaa_stencil(struct pipe_context *pipe,
 void *
 util_make_fs_msaa_resolve(struct pipe_context *pipe,
                           unsigned tgsi_tex, unsigned nr_samples,
-                          boolean is_uint, boolean is_sint);
+                          enum tgsi_return_type stype);
 
 
 void *
 util_make_fs_msaa_resolve_bilinear(struct pipe_context *pipe,
                                    unsigned tgsi_tex, unsigned nr_samples,
-                                   boolean is_uint, boolean is_sint);
+                                   enum tgsi_return_type stype);
 
 #ifdef __cplusplus
 }
diff --git a/src/gallium/auxiliary/util/u_tests.c b/src/gallium/auxiliary/util/u_tests.c
index fe54972..6a489d6 100644
--- a/src/gallium/auxiliary/util/u_tests.c
+++ b/src/gallium/auxiliary/util/u_tests.c
@@ -373,7 +373,8 @@ null_sampler_view(struct pipe_context *ctx, unsigned tgsi_tex_target)
 
    /* Fragment shader. */
    fs = util_make_fragment_tex_shader(ctx, tgsi_tex_target,
-                                      TGSI_INTERPOLATE_LINEAR);
+                                      TGSI_INTERPOLATE_LINEAR,
+                                      TGSI_RETURN_TYPE_FLOAT);
    cso_set_fragment_shader_handle(cso, fs);
 
    /* Vertex shader. */
diff --git a/src/gallium/auxiliary/util/u_tests.h b/src/gallium/auxiliary/util/u_tests.h
index 49ae54f..106b0a0 100644
--- a/src/gallium/auxiliary/util/u_tests.h
+++ b/src/gallium/auxiliary/util/u_tests.h
@@ -30,8 +30,16 @@
 
 #include "pipe/p_compiler.h"
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct pipe_screen;
 
 void util_run_tests(struct pipe_screen *screen);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/gallium/auxiliary/util/u_vbuf.c b/src/gallium/auxiliary/util/u_vbuf.c
index b1b89bf..02ae0b8 100644
--- a/src/gallium/auxiliary/util/u_vbuf.c
+++ b/src/gallium/auxiliary/util/u_vbuf.c
@@ -781,10 +781,11 @@ u_vbuf_create_vertex_elements(struct u_vbuf *mgr, unsigned count,
    ve->compatible_vb_mask_all = ~ve->incompatible_vb_mask_any & used_buffers;
    ve->incompatible_vb_mask_all = ~ve->compatible_vb_mask_any & used_buffers;
 
-   /* Align the formats to the size of DWORD if needed. */
+   /* Align the formats and offsets to the size of DWORD if needed. */
    if (!mgr->caps.velem_src_offset_unaligned) {
       for (i = 0; i < count; i++) {
          ve->native_format_size[i] = align(ve->native_format_size[i], 4);
+         driver_attribs[i].src_offset = align(ve->ve[i].src_offset, 4);
       }
    }
 
diff --git a/src/gallium/docs/source/context.rst b/src/gallium/docs/source/context.rst
index 5861f46..0908ee7 100644
--- a/src/gallium/docs/source/context.rst
+++ b/src/gallium/docs/source/context.rst
@@ -79,6 +79,11 @@ objects. They all follow simple, one-method binding calls, e.g.
   should be the same as the number of set viewports and can be up to
   PIPE_MAX_VIEWPORTS.
 * ``set_viewport_states``
+* ``set_tess_state`` configures the default tessellation parameters:
+  * ``default_outer_level`` is the default value for the outer tessellation
+    levels. This corresponds to GL's ``PATCH_DEFAULT_OUTER_LEVEL``.
+  * ``default_inner_level`` is the default value for the inner tessellation
+    levels. This corresponds to GL's ``PATCH_DEFAULT_INNER_LEVEL``.
 
 
 Sampler Views
diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst
index 68931cf..8f64817 100644
--- a/src/gallium/docs/source/screen.rst
+++ b/src/gallium/docs/source/screen.rst
@@ -252,6 +252,8 @@ The integer capabilities:
   existing user memory into the device address space for direct device access.
   The create function is pipe_screen::resource_from_user_memory. The address
   and size must be page-aligned.
+* ``PIPE_CAP_DEVICE_RESET_STATUS_QUERY``:
+  Whether pipe_context::get_device_reset_status is implemented.
 
 
 .. _pipe_capf:
@@ -338,6 +340,8 @@ to be 0.
   DLDEXP are supported.
 * ``PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED``: Whether FMA and DFMA (doubles only)
   are supported.
+* ``PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE``: Whether the driver doesn't
+  ignore tgsi_declaration_range::Last for shader inputs and outputs.
 
 
 .. _pipe_compute_cap:
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 7771136..89ca172 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2894,6 +2894,43 @@ and only the X component is used.
 FIXME: This right now can be either a ordinary input or a system value...
 
 
+TGSI_SEMANTIC_PATCH
+"""""""""""""""""""
+
+For tessellation evaluation/control shaders, this semantic label indicates a
+generic per-patch attribute. Such semantics will not implicitly be per-vertex
+arrays.
+
+TGSI_SEMANTIC_TESSCOORD
+"""""""""""""""""""""""
+
+For tessellation evaluation shaders, this semantic label indicates the
+coordinates of the vertex being processed. This is available in XYZ; W is
+undefined.
+
+TGSI_SEMANTIC_TESSOUTER
+"""""""""""""""""""""""
+
+For tessellation evaluation/control shaders, this semantic label indicates the
+outer tessellation levels of the patch. Isoline tessellation will only have XY
+defined, triangle will have XYZ and quads will have XYZW defined. This
+corresponds to gl_TessLevelOuter.
+
+TGSI_SEMANTIC_TESSINNER
+"""""""""""""""""""""""
+
+For tessellation evaluation/control shaders, this semantic label indicates the
+inner tessellation levels of the patch. The X value is only defined for
+triangle tessellation, while quads will have XY defined. This is entirely
+undefined for isoline tessellation.
+
+TGSI_SEMANTIC_VERTICESIN
+""""""""""""""""""""""""
+
+For tessellation evaluation/control shaders, this semantic label indicates the
+number of vertices provided in the input patch. Only the X value is defined.
+
+
 Declaration Interpolate
 ^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -2928,6 +2965,18 @@ resource can be one of BUFFER, 1D, 2D, 3D, 1DArray and 2DArray.
 type must be 1 or 4 entries (if specifying on a per-component
 level) out of UNORM, SNORM, SINT, UINT and FLOAT.
 
+For TEX\* style texture sample opcodes (as opposed to SAMPLE\* opcodes
+which take an explicit SVIEW[#] source register), SVIEW[#] declarations
+are optional.  When they are present, the SVIEW index is implied by the
+SAMP index, and there must be a corresponding SVIEW[#] declaration for
+each SAMP[#] declaration.  Drivers are free to ignore this if they wish.
+But note in particular that some drivers need to know the sampler type
+(float/int/unsigned) in order to generate the correct code, so in cases
+where integer textures are sampled, SVIEW[#] declarations should be
+used.
+
+NOTE: It is NOT legal to mix SAMPLE\* style opcodes and TEX\* opcodes
+in the same shader.
 
 Declaration Resource
 ^^^^^^^^^^^^^^^^^^^^
@@ -3034,6 +3083,39 @@ Naturally, clipping is not performed on window coordinates either.
 The effect of this property is undefined if a geometry or tessellation shader
 are in use.
 
+TCS_VERTICES_OUT
+""""""""""""""""
+
+The number of vertices written by the tessellation control shader. This
+effectively defines the patch input size of the tessellation evaluation shader
+as well.
+
+TES_PRIM_MODE
+"""""""""""""
+
+This sets the tessellation primitive mode, one of ``PIPE_PRIM_TRIANGLES``,
+``PIPE_PRIM_QUADS``, or ``PIPE_PRIM_LINES``. (Unlike in GL, there is no
+separate isolines setting; regular lines are assumed to mean isolines.)
+
+TES_SPACING
+"""""""""""
+
+This sets the spacing mode of the tessellation generator, one of
+``PIPE_TESS_SPACING_*``.
+
+TES_VERTEX_ORDER_CW
+"""""""""""""""""""
+
+This sets the vertex order to be clockwise if the value is 1, or
+counter-clockwise if set to 0.
+
+TES_POINT_MODE
+""""""""""""""
+
+If set to a non-zero value, this turns on point mode for the tessellator,
+which means that points will be generated instead of primitives.
+
+
 Texture Sampling and Texture Formats
 ------------------------------------
 
diff --git a/src/gallium/drivers/freedreno/Makefile.am b/src/gallium/drivers/freedreno/Makefile.am
index 4b2629f..cbf62c6 100644
--- a/src/gallium/drivers/freedreno/Makefile.am
+++ b/src/gallium/drivers/freedreno/Makefile.am
@@ -21,15 +21,16 @@ libfreedreno_la_SOURCES = \
 
 noinst_PROGRAMS = ir3_compiler
 
+# XXX: Required due to the C++ sources in libnir/libglsl_util
+nodist_EXTRA_ir3_compiler_SOURCES = dummy.cpp
 ir3_compiler_SOURCES = \
 	ir3/ir3_cmdline.c
 
 ir3_compiler_LDADD = \
 	libfreedreno.la \
-	../../auxiliary/libgallium.la \
+	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/glsl/libnir.la \
 	$(top_builddir)/src/libglsl_util.la \
-	-lstdc++ \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_COMMON_LIB_DEPS) \
 	$(FREEDRENO_LIBS)
diff --git a/src/gallium/drivers/freedreno/Makefile.sources b/src/gallium/drivers/freedreno/Makefile.sources
index a565a9c..baae914 100644
--- a/src/gallium/drivers/freedreno/Makefile.sources
+++ b/src/gallium/drivers/freedreno/Makefile.sources
@@ -120,18 +120,17 @@ ir3_SOURCES := \
 	ir3/disasm-a3xx.c \
 	ir3/instr-a3xx.h \
 	ir3/ir3.c \
-	ir3/ir3_compiler.c \
 	ir3/ir3_compiler_nir.c \
+	ir3/ir3_compiler.c \
 	ir3/ir3_compiler.h \
 	ir3/ir3_cp.c \
 	ir3/ir3_depth.c \
-	ir3/ir3_dump.c \
-	ir3/ir3_flatten.c \
 	ir3/ir3_group.c \
 	ir3/ir3.h \
 	ir3/ir3_legalize.c \
 	ir3/ir3_nir.h \
 	ir3/ir3_nir_lower_if_else.c \
+	ir3/ir3_print.c \
 	ir3/ir3_ra.c \
 	ir3/ir3_sched.c \
 	ir3/ir3_shader.c \
diff --git a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
index e4acc7e..b48fb46 100644
--- a/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
+++ b/src/gallium/drivers/freedreno/a2xx/fd2_compiler.c
@@ -414,32 +414,16 @@ add_src_reg(struct fd2_compile_context *ctx, struct ir2_instruction *alu,
 static void
 add_vector_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
 {
-	switch (inst->Instruction.Saturate) {
-	case TGSI_SAT_NONE:
-		break;
-	case TGSI_SAT_ZERO_ONE:
+	if (inst->Instruction.Saturate) {
 		alu->alu.vector_clamp = true;
-		break;
-	case TGSI_SAT_MINUS_PLUS_ONE:
-		DBG("unsupported saturate");
-		assert(0);
-		break;
 	}
 }
 
 static void
 add_scalar_clamp(struct tgsi_full_instruction *inst, struct ir2_instruction *alu)
 {
-	switch (inst->Instruction.Saturate) {
-	case TGSI_SAT_NONE:
-		break;
-	case TGSI_SAT_ZERO_ONE:
+	if (inst->Instruction.Saturate) {
 		alu->alu.scalar_clamp = true;
-		break;
-	case TGSI_SAT_MINUS_PLUS_ONE:
-		DBG("unsupported saturate");
-		assert(0);
-		break;
 	}
 }
 
@@ -758,7 +742,7 @@ translate_tex(struct fd2_compile_context *ctx,
 	struct tgsi_src_register tmp_src;
 	const struct tgsi_src_register *coord;
 	bool using_temp = (inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) ||
-			(inst->Instruction.Saturate != TGSI_SAT_NONE);
+			inst->Instruction.Saturate;
 	int idx;
 
 	if (using_temp || (opc == TGSI_OPCODE_TXP))
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_context.h b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
index 4e3f521..77e4605 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_context.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_context.h
@@ -105,9 +105,6 @@ struct fd3_context {
 	 */
 	unsigned fsaturate_s, fsaturate_t, fsaturate_r;
 
-	/* bitmask of integer texture samplers */
-	uint16_t vinteger_s, finteger_s;
-
 	/* some state changes require a different shader variant.  Keep
 	 * track of this so we know when we need to re-emit shader state
 	 * due to variant change.  See fixup_shader_state()
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index b522cf8..b5838b5 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -104,14 +104,12 @@ fixup_shader_state(struct fd_context *ctx, struct ir3_shader_key *key)
 		if (last_key->has_per_samp || key->has_per_samp) {
 			if ((last_key->vsaturate_s != key->vsaturate_s) ||
 					(last_key->vsaturate_t != key->vsaturate_t) ||
-					(last_key->vsaturate_r != key->vsaturate_r) ||
-					(last_key->vinteger_s != key->vinteger_s))
+					(last_key->vsaturate_r != key->vsaturate_r))
 				ctx->prog.dirty |= FD_SHADER_DIRTY_VP;
 
 			if ((last_key->fsaturate_s != key->fsaturate_s) ||
 					(last_key->fsaturate_t != key->fsaturate_t) ||
-					(last_key->fsaturate_r != key->fsaturate_r) ||
-					(last_key->finteger_s != key->finteger_s))
+					(last_key->fsaturate_r != key->fsaturate_r))
 				ctx->prog.dirty |= FD_SHADER_DIRTY_FP;
 		}
 
@@ -140,16 +138,13 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 			// TODO set .half_precision based on render target format,
 			// ie. float16 and smaller use half, float32 use full..
 			.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
-			.has_per_samp = (fd3_ctx->fsaturate || fd3_ctx->vsaturate ||
-							 fd3_ctx->vinteger_s || fd3_ctx->finteger_s),
+			.has_per_samp = (fd3_ctx->fsaturate || fd3_ctx->vsaturate),
 			.vsaturate_s = fd3_ctx->vsaturate_s,
 			.vsaturate_t = fd3_ctx->vsaturate_t,
 			.vsaturate_r = fd3_ctx->vsaturate_r,
 			.fsaturate_s = fd3_ctx->fsaturate_s,
 			.fsaturate_t = fd3_ctx->fsaturate_t,
 			.fsaturate_r = fd3_ctx->fsaturate_r,
-			.vinteger_s = fd3_ctx->vinteger_s,
-			.finteger_s = fd3_ctx->finteger_s,
 		},
 		.rasterflat = ctx->rasterizer && ctx->rasterizer->flatshade,
 		.sprite_coord_enable = ctx->rasterizer ? ctx->rasterizer->sprite_coord_enable : 0,
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index a6824ef..57fcaa9 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -413,12 +413,15 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 				}
 			}
 
-			/* TODO: Figure out if there's a way to make it spit out 0's and
-			 * 1's for the .z and .w components.
+			/* Replace the .xy coordinates with S/T from the point sprite. Set
+			 * interpolation bits for .zw such that they become .01
 			 */
-			if (emit->sprite_coord_enable & (1 << sem2idx(fp->inputs[j].semantic)))
+			if (emit->sprite_coord_enable & (1 << sem2idx(fp->inputs[j].semantic))) {
 				vpsrepl[inloc / 16] |= (emit->sprite_coord_mode ? 0x0d : 0x09)
 					<< ((inloc % 16) * 2);
+				vinterp[(inloc + 2) / 16] |= 2 << (((inloc + 2) % 16) * 2);
+				vinterp[(inloc + 3) / 16] |= 3 << (((inloc + 3) % 16) * 2);
+			}
 		}
 
 		OUT_PKT0(ring, REG_A3XX_VPC_ATTR, 2);
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
index 3497921..094dcf3 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c
@@ -32,6 +32,7 @@
 #include "fd3_screen.h"
 #include "fd3_context.h"
 #include "fd3_format.h"
+#include "ir3_compiler.h"
 
 static boolean
 fd3_screen_is_format_supported(struct pipe_screen *pscreen,
@@ -103,7 +104,9 @@ fd3_screen_is_format_supported(struct pipe_screen *pscreen,
 void
 fd3_screen_init(struct pipe_screen *pscreen)
 {
-	fd_screen(pscreen)->max_rts = 4;
+	struct fd_screen *screen = fd_screen(pscreen);
+	screen->max_rts = 4;
+	screen->compiler = ir3_compiler_create(screen->gpu_id);
 	pscreen->context_create = fd3_context_create;
 	pscreen->is_format_supported = fd3_screen_is_format_supported;
 }
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
index 6f44ee3..a278bf5 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_texture.c
@@ -263,44 +263,11 @@ fd3_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 	return &so->base;
 }
 
-static void
-fd3_set_sampler_views(struct pipe_context *pctx, unsigned shader,
-					  unsigned start, unsigned nr,
-					  struct pipe_sampler_view **views)
-{
-	struct fd_context *ctx = fd_context(pctx);
-	struct fd3_context *fd3_ctx = fd3_context(ctx);
-	struct fd_texture_stateobj *tex;
-	uint16_t integer_s = 0, *ptr;
-	int i;
-
-	fd_set_sampler_views(pctx, shader, start, nr, views);
-
-	switch (shader) {
-	case PIPE_SHADER_FRAGMENT:
-		tex = &ctx->fragtex;
-		ptr = &fd3_ctx->finteger_s;
-		break;
-	case PIPE_SHADER_VERTEX:
-		tex = &ctx->verttex;
-		ptr = &fd3_ctx->vinteger_s;
-		break;
-	default:
-		return;
-	}
-
-	for (i = 0; i < tex->num_textures; i++)
-		if (util_format_is_pure_integer(tex->textures[i]->format))
-			integer_s |= 1 << i;
-	*ptr = integer_s;
-}
-
-
 void
 fd3_texture_init(struct pipe_context *pctx)
 {
 	pctx->create_sampler_state = fd3_sampler_state_create;
 	pctx->bind_sampler_states = fd3_sampler_states_bind;
 	pctx->create_sampler_view = fd3_sampler_view_create;
-	pctx->set_sampler_views = fd3_set_sampler_views;
+	pctx->set_sampler_views = fd_set_sampler_views;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
index 384602a..53e1bf6 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h
@@ -83,9 +83,6 @@ struct fd4_context {
 	 */
 	uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
 
-	/* bitmask of integer texture samplers */
-	uint16_t vinteger_s, finteger_s;
-
 	/* some state changes require a different shader variant.  Keep
 	 * track of this so we know when we need to re-emit shader state
 	 * due to variant change.  See fixup_shader_state()
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
index ae407f7..de5a306 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c
@@ -82,8 +82,7 @@ fixup_shader_state(struct fd_context *ctx, struct ir3_shader_key *key)
 		if (last_key->has_per_samp || key->has_per_samp) {
 			if ((last_key->vsaturate_s != key->vsaturate_s) ||
 					(last_key->vsaturate_t != key->vsaturate_t) ||
-					(last_key->vsaturate_r != key->vsaturate_r) ||
-					(last_key->vinteger_s != key->vinteger_s))
+					(last_key->vsaturate_r != key->vsaturate_r))
 				ctx->prog.dirty |= FD_SHADER_DIRTY_VP;
 
 			if ((last_key->fsaturate_s != key->fsaturate_s) ||
@@ -122,16 +121,13 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 			// TODO set .half_precision based on render target format,
 			// ie. float16 and smaller use half, float32 use full..
 			.half_precision = !!(fd_mesa_debug & FD_DBG_FRAGHALF),
-			.has_per_samp = (fd4_ctx->fsaturate || fd4_ctx->vsaturate ||
-					fd4_ctx->vinteger_s || fd4_ctx->finteger_s),
+			.has_per_samp = (fd4_ctx->fsaturate || fd4_ctx->vsaturate),
 			.vsaturate_s = fd4_ctx->vsaturate_s,
 			.vsaturate_t = fd4_ctx->vsaturate_t,
 			.vsaturate_r = fd4_ctx->vsaturate_r,
 			.fsaturate_s = fd4_ctx->fsaturate_s,
 			.fsaturate_t = fd4_ctx->fsaturate_t,
 			.fsaturate_r = fd4_ctx->fsaturate_r,
-			.vinteger_s = fd4_ctx->vinteger_s,
-			.finteger_s = fd4_ctx->finteger_s,
 		},
 		.format = fd4_emit_format(pfb->cbufs[0]),
 		.pformat = pipe_surface_format(pfb->cbufs[0]),
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
index f5b4668..e8cbb2d 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c
@@ -32,6 +32,7 @@
 #include "fd4_screen.h"
 #include "fd4_context.h"
 #include "fd4_format.h"
+#include "ir3_compiler.h"
 
 static boolean
 fd4_screen_is_format_supported(struct pipe_screen *pscreen,
@@ -100,7 +101,9 @@ fd4_screen_is_format_supported(struct pipe_screen *pscreen,
 void
 fd4_screen_init(struct pipe_screen *pscreen)
 {
-	fd_screen(pscreen)->max_rts = 1;
+	struct fd_screen *screen = fd_screen(pscreen);
+	screen->max_rts = 1;
+	screen->compiler = ir3_compiler_create(screen->gpu_id);
 	pscreen->context_create = fd4_context_create;
 	pscreen->is_format_supported = fd4_screen_is_format_supported;
 }
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
index ff1ff8f..6ba25d0 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_texture.c
@@ -205,43 +205,11 @@ fd4_sampler_view_create(struct pipe_context *pctx, struct pipe_resource *prsc,
 	return &so->base;
 }
 
-static void
-fd4_set_sampler_views(struct pipe_context *pctx, unsigned shader,
-		unsigned start, unsigned nr, struct pipe_sampler_view **views)
-{
-	struct fd_context *ctx = fd_context(pctx);
-	struct fd4_context *fd4_ctx = fd4_context(ctx);
-	struct fd_texture_stateobj *tex;
-	uint16_t integer_s = 0, *ptr;
-	int i;
-
-	fd_set_sampler_views(pctx, shader, start, nr, views);
-
-	switch (shader) {
-	case PIPE_SHADER_FRAGMENT:
-		tex = &ctx->fragtex;
-		ptr = &fd4_ctx->finteger_s;
-		break;
-	case PIPE_SHADER_VERTEX:
-		tex = &ctx->verttex;
-		ptr = &fd4_ctx->vinteger_s;
-		break;
-	default:
-		return;
-	}
-
-	for (i = 0; i < tex->num_textures; i++)
-		if (util_format_is_pure_integer(tex->textures[i]->format))
-			integer_s |= 1 << i;
-
-	*ptr = integer_s;
-}
-
 void
 fd4_texture_init(struct pipe_context *pctx)
 {
 	pctx->create_sampler_state = fd4_sampler_state_create;
 	pctx->bind_sampler_states = fd_sampler_states_bind;
 	pctx->create_sampler_view = fd4_sampler_view_create;
-	pctx->set_sampler_views = fd4_set_sampler_views;
+	pctx->set_sampler_views = fd_set_sampler_views;
 }
diff --git a/src/gallium/drivers/freedreno/freedreno_context.h b/src/gallium/drivers/freedreno/freedreno_context.h
index 2c816b4..e420f1e 100644
--- a/src/gallium/drivers/freedreno/freedreno_context.h
+++ b/src/gallium/drivers/freedreno/freedreno_context.h
@@ -297,7 +297,7 @@ struct fd_context {
 	 */
 	struct fd_gmem_stateobj gmem;
 	struct fd_vsc_pipe      pipe[8];
-	struct fd_tile          tile[64];
+	struct fd_tile          tile[256];
 
 	/* which state objects need to be re-emit'd: */
 	enum {
diff --git a/src/gallium/drivers/freedreno/freedreno_fence.c b/src/gallium/drivers/freedreno/freedreno_fence.c
index 46b057d..375e58f 100644
--- a/src/gallium/drivers/freedreno/freedreno_fence.c
+++ b/src/gallium/drivers/freedreno/freedreno_fence.c
@@ -35,6 +35,7 @@
 struct pipe_fence_handle {
 	struct pipe_reference reference;
 	struct fd_context *ctx;
+	struct fd_screen *screen;
 	uint32_t timestamp;
 };
 
@@ -68,7 +69,7 @@ boolean fd_screen_fence_finish(struct pipe_screen *screen,
 		struct pipe_fence_handle *fence,
 		uint64_t timeout)
 {
-	if (fd_pipe_wait(fence->ctx->screen->pipe, fence->timestamp))
+	if (fd_pipe_wait(fence->screen->pipe, fence->timestamp))
 		return false;
 
 	return true;
@@ -86,6 +87,7 @@ struct pipe_fence_handle * fd_fence_create(struct pipe_context *pctx)
 	pipe_reference_init(&fence->reference, 1);
 
 	fence->ctx = ctx;
+	fence->screen = ctx->screen;
 	fence->timestamp = fd_ringbuffer_timestamp(ctx->ring);
 
 	return fence;
diff --git a/src/gallium/drivers/freedreno/freedreno_gmem.c b/src/gallium/drivers/freedreno/freedreno_gmem.c
index 11a1b62..c105378 100644
--- a/src/gallium/drivers/freedreno/freedreno_gmem.c
+++ b/src/gallium/drivers/freedreno/freedreno_gmem.c
@@ -117,6 +117,7 @@ calculate_tiles(struct fd_context *ctx)
 	uint32_t i, j, t, xoff, yoff;
 	uint32_t tpp_x, tpp_y;
 	bool has_zs = !!(ctx->resolve & (FD_BUFFER_DEPTH | FD_BUFFER_STENCIL));
+	int tile_n[ARRAY_SIZE(ctx->pipe)];
 
 	if (has_zs) {
 		struct fd_resource *rsc = fd_resource(pfb->zsbuf->texture);
@@ -247,6 +248,7 @@ calculate_tiles(struct fd_context *ctx)
 	/* configure tiles: */
 	t = 0;
 	yoff = miny;
+	memset(tile_n, 0, sizeof(tile_n));
 	for (i = 0; i < nbins_y; i++) {
 		uint32_t bw, bh;
 
@@ -257,20 +259,17 @@ calculate_tiles(struct fd_context *ctx)
 
 		for (j = 0; j < nbins_x; j++) {
 			struct fd_tile *tile = &ctx->tile[t];
-			uint32_t n, p;
+			uint32_t p;
 
 			assert(t < ARRAY_SIZE(ctx->tile));
 
 			/* pipe number: */
 			p = ((i / tpp_y) * div_round_up(nbins_x, tpp_x)) + (j / tpp_x);
 
-			/* slot number: */
-			n = ((i % tpp_y) * tpp_x) + (j % tpp_x);
-
 			/* clip bin width: */
 			bw = MIN2(bin_w, minx + width - xoff);
 
-			tile->n = n;
+			tile->n = tile_n[p]++;
 			tile->p = p;
 			tile->bin_w = bw;
 			tile->bin_h = bh;
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c
index 556c8ab..b3b5462 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.c
+++ b/src/gallium/drivers/freedreno/freedreno_screen.c
@@ -68,10 +68,7 @@ static const struct debug_named_value debug_options[] = {
 		{"fraghalf",  FD_DBG_FRAGHALF, "Use half-precision in fragment shader"},
 		{"nobin",     FD_DBG_NOBIN,  "Disable hw binning"},
 		{"optmsgs",   FD_DBG_OPTMSGS,"Enable optimizer debug messages"},
-		{"optdump",   FD_DBG_OPTDUMP,"Dump shader DAG to .dot files"},
 		{"glsl120",   FD_DBG_GLSL120,"Temporary flag to force GLSL 120 (rather than 130) on a3xx+"},
-		{"nocp",      FD_DBG_NOCP,   "Disable copy-propagation"},
-		{"nir",       FD_DBG_NIR,    "Enable experimental NIR compiler"},
 		DEBUG_NAMED_VALUE_END
 };
 
@@ -220,6 +217,7 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 		return 0;
 
 	case PIPE_CAP_MAX_VIEWPORTS:
@@ -374,6 +372,7 @@ fd_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
 	case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+	case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
 		return 0;
 	case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED:
 		return 1;
@@ -519,6 +518,7 @@ fd_screen_create(struct fd_device *dev)
 	case 220:
 		fd2_screen_init(pscreen);
 		break;
+	case 307:
 	case 320:
 	case 330:
 		fd3_screen_init(pscreen);
diff --git a/src/gallium/drivers/freedreno/freedreno_screen.h b/src/gallium/drivers/freedreno/freedreno_screen.h
index 3b470d1..dbc2808 100644
--- a/src/gallium/drivers/freedreno/freedreno_screen.h
+++ b/src/gallium/drivers/freedreno/freedreno_screen.h
@@ -46,7 +46,9 @@ struct fd_screen {
 	uint32_t device_id;
 	uint32_t gpu_id;         /* 220, 305, etc */
 	uint32_t chip_id;        /* coreid:8 majorrev:8 minorrev:8 patch:8 */
-	uint32_t max_rts;
+	uint32_t max_rts;        /* max # of render targets */
+
+	void *compiler;          /* currently unused for a2xx */
 
 	struct fd_device *dev;
 	struct fd_pipe *pipe;
diff --git a/src/gallium/drivers/freedreno/freedreno_util.h b/src/gallium/drivers/freedreno/freedreno_util.h
index 2735ae4..deb0e60 100644
--- a/src/gallium/drivers/freedreno/freedreno_util.h
+++ b/src/gallium/drivers/freedreno/freedreno_util.h
@@ -62,11 +62,8 @@ enum adreno_stencil_op fd_stencil_op(unsigned op);
 #define FD_DBG_NOBYPASS 0x0040
 #define FD_DBG_FRAGHALF 0x0080
 #define FD_DBG_NOBIN    0x0100
-#define FD_DBG_OPTMSGS  0x0400
-#define FD_DBG_OPTDUMP  0x0800
-#define FD_DBG_GLSL120  0x1000
-#define FD_DBG_NOCP     0x2000
-#define FD_DBG_NIR      0x4000
+#define FD_DBG_OPTMSGS  0x0200
+#define FD_DBG_GLSL120  0x0400
 
 extern int fd_mesa_debug;
 extern bool fd_binning_enabled;
diff --git a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
index a5136c6..48ae7c7 100644
--- a/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
+++ b/src/gallium/drivers/freedreno/ir3/disasm-a3xx.c
@@ -133,16 +133,16 @@ static void print_instr_cat0(instr_t *instr)
 		break;
 	case OPC_BR:
 		printf(" %sp0.%c, #%d", cat0->inv ? "!" : "",
-				component[cat0->comp], cat0->immed);
+				component[cat0->comp], cat0->a3xx.immed);
 		break;
 	case OPC_JUMP:
 	case OPC_CALL:
-		printf(" #%d", cat0->immed);
+		printf(" #%d", cat0->a3xx.immed);
 		break;
 	}
 
-	if ((debug & PRINT_VERBOSE) && (cat0->dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4))
-		printf("\t{0: %x,%x,%x,%x}", cat0->dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4);
+	if ((debug & PRINT_VERBOSE) && (cat0->a3xx.dummy1|cat0->dummy2|cat0->dummy3|cat0->dummy4))
+		printf("\t{0: %x,%x,%x,%x}", cat0->a3xx.dummy1, cat0->dummy2, cat0->dummy3, cat0->dummy4);
 }
 
 static void print_instr_cat1(instr_t *instr)
diff --git a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
index cffa62b..efb07ea 100644
--- a/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
+++ b/src/gallium/drivers/freedreno/ir3/instr-a3xx.h
@@ -191,9 +191,9 @@ typedef enum {
 	OPC_LDLV = 31,
 
 	/* meta instructions (category -1): */
-	/* placeholder instr to mark inputs/outputs: */
+	/* placeholder instr to mark shader inputs: */
 	OPC_META_INPUT = 0,
-	OPC_META_OUTPUT = 1,
+	OPC_META_PHI = 1,
 	/* The "fan-in" and "fan-out" instructions are used for keeping
 	 * track of instructions that write to multiple dst registers
 	 * (fan-out) like texture sample instructions, or read multiple
@@ -201,9 +201,6 @@ typedef enum {
 	 */
 	OPC_META_FO = 2,
 	OPC_META_FI = 3,
-	/* branches/flow control */
-	OPC_META_FLOW = 4,
-	OPC_META_PHI = 5,
 
 } opc_t;
 
@@ -281,8 +278,16 @@ static inline int reg_special(reg_t reg)
 
 typedef struct PACKED {
 	/* dword0: */
-	int16_t  immed    : 16;
-	uint32_t dummy1   : 16;
+	union PACKED {
+		struct PACKED {
+			int16_t  immed    : 16;
+			uint32_t dummy1   : 16;
+		} a3xx;
+		struct PACKED {
+			int32_t  immed    : 20;
+			uint32_t dummy1   : 12;
+		} a4xx;
+	};
 
 	/* dword1: */
 	uint32_t dummy2   : 8;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.c b/src/gallium/drivers/freedreno/ir3/ir3.c
index e015de9..a166b67 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3.c
@@ -66,11 +66,22 @@ void * ir3_alloc(struct ir3 *shader, int sz)
 	return ptr;
 }
 
-struct ir3 * ir3_create(void)
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+		unsigned nin, unsigned nout)
 {
-	struct ir3 *shader =
-			calloc(1, sizeof(struct ir3));
+	struct ir3 *shader = calloc(1, sizeof(struct ir3));
+
 	grow_heap(shader);
+
+	shader->compiler = compiler;
+	shader->ninputs = nin;
+	shader->inputs = ir3_alloc(shader, sizeof(shader->inputs[0]) * nin);
+
+	shader->noutputs = nout;
+	shader->outputs = ir3_alloc(shader, sizeof(shader->outputs[0]) * nout);
+
+	list_inithead(&shader->block_list);
+
 	return shader;
 }
 
@@ -81,7 +92,8 @@ void ir3_destroy(struct ir3 *shader)
 		shader->chunk = chunk->next;
 		free(chunk);
 	}
-	free(shader->instrs);
+	free(shader->indirects);
+	free(shader->predicates);
 	free(shader->baryfs);
 	free(shader);
 }
@@ -142,7 +154,11 @@ static int emit_cat0(struct ir3_instruction *instr, void *ptr,
 {
 	instr_cat0_t *cat0 = ptr;
 
-	cat0->immed    = instr->cat0.immed;
+	if (info->gpu_id >= 400) {
+		cat0->a4xx.immed = instr->cat0.immed;
+	} else {
+		cat0->a3xx.immed = instr->cat0.immed;
+	}
 	cat0->repeat   = instr->repeat;
 	cat0->ss       = !!(instr->flags & IR3_INSTR_SS);
 	cat0->inv      = instr->cat0.inv;
@@ -535,32 +551,40 @@ void * ir3_assemble(struct ir3 *shader, struct ir3_info *info,
 		uint32_t gpu_id)
 {
 	uint32_t *ptr, *dwords;
-	uint32_t i;
 
+	info->gpu_id        = gpu_id;
 	info->max_reg       = -1;
 	info->max_half_reg  = -1;
 	info->max_const     = -1;
 	info->instrs_count  = 0;
+	info->sizedwords    = 0;
+
+	list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			info->sizedwords += 2;
+		}
+	}
 
 	/* need a integer number of instruction "groups" (sets of 16
 	 * instructions on a4xx or sets of 4 instructions on a3xx),
 	 * so pad out w/ NOPs if needed: (NOTE each instruction is 64bits)
 	 */
 	if (gpu_id >= 400) {
-		info->sizedwords = 2 * align(shader->instrs_count, 16);
+		info->sizedwords = align(info->sizedwords, 16 * 2);
 	} else {
-		info->sizedwords = 2 * align(shader->instrs_count, 4);
+		info->sizedwords = align(info->sizedwords, 4 * 2);
 	}
 
 	ptr = dwords = calloc(4, info->sizedwords);
 
-	for (i = 0; i < shader->instrs_count; i++) {
-		struct ir3_instruction *instr = shader->instrs[i];
-		int ret = emit[instr->category](instr, dwords, info);
-		if (ret)
-			goto fail;
-		info->instrs_count += 1 + instr->repeat;
-		dwords += 2;
+	list_for_each_entry (struct ir3_block, block, &shader->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			int ret = emit[instr->category](instr, dwords, info);
+			if (ret)
+				goto fail;
+			info->instrs_count += 1 + instr->repeat;
+			dwords += 2;
+		}
 	}
 
 	return ptr;
@@ -581,50 +605,30 @@ static struct ir3_register * reg_create(struct ir3 *shader,
 	return reg;
 }
 
-static void insert_instr(struct ir3 *shader,
+static void insert_instr(struct ir3_block *block,
 		struct ir3_instruction *instr)
 {
+	struct ir3 *shader = block->shader;
 #ifdef DEBUG
 	static uint32_t serialno = 0;
 	instr->serialno = ++serialno;
 #endif
-	array_insert(shader->instrs, instr);
+	list_addtail(&instr->node, &block->instr_list);
 
 	if (is_input(instr))
 		array_insert(shader->baryfs, instr);
 }
 
-struct ir3_block * ir3_block_create(struct ir3 *shader,
-		unsigned ntmp, unsigned nin, unsigned nout)
+struct ir3_block * ir3_block_create(struct ir3 *shader)
 {
-	struct ir3_block *block;
-	unsigned size;
-	char *ptr;
-
-	size = sizeof(*block);
-	size += sizeof(block->temporaries[0]) * ntmp;
-	size += sizeof(block->inputs[0]) * nin;
-	size += sizeof(block->outputs[0]) * nout;
-
-	ptr = ir3_alloc(shader, size);
-
-	block = (void *)ptr;
-	ptr += sizeof(*block);
-
-	block->temporaries = (void *)ptr;
-	block->ntemporaries = ntmp;
-	ptr += sizeof(block->temporaries[0]) * ntmp;
-
-	block->inputs = (void *)ptr;
-	block->ninputs = nin;
-	ptr += sizeof(block->inputs[0]) * nin;
-
-	block->outputs = (void *)ptr;
-	block->noutputs = nout;
-	ptr += sizeof(block->outputs[0]) * nout;
-
+	struct ir3_block *block = ir3_alloc(shader, sizeof(*block));
+#ifdef DEBUG
+	static uint32_t serialno = 0;
+	block->serialno = ++serialno;
+#endif
 	block->shader = shader;
-
+	list_inithead(&block->node);
+	list_inithead(&block->instr_list);
 	return block;
 }
 
@@ -652,7 +656,7 @@ struct ir3_instruction * ir3_instr_create2(struct ir3_block *block,
 	instr->block = block;
 	instr->category = category;
 	instr->opc = opc;
-	insert_instr(block->shader, instr);
+	insert_instr(block, instr);
 	return instr;
 }
 
@@ -677,7 +681,7 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
 	*new_instr = *instr;
 	new_instr->regs = regs;
 
-	insert_instr(instr->block->shader, new_instr);
+	insert_instr(instr->block, new_instr);
 
 	/* clone registers: */
 	new_instr->regs_count = 0;
@@ -694,10 +698,40 @@ struct ir3_instruction * ir3_instr_clone(struct ir3_instruction *instr)
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 		int num, int flags)
 {
-	struct ir3_register *reg = reg_create(instr->block->shader, num, flags);
+	struct ir3 *shader = instr->block->shader;
+	struct ir3_register *reg = reg_create(shader, num, flags);
 #ifdef DEBUG
 	debug_assert(instr->regs_count < instr->regs_max);
 #endif
 	instr->regs[instr->regs_count++] = reg;
 	return reg;
 }
+
+void
+ir3_block_clear_mark(struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+		instr->flags &= ~IR3_INSTR_MARK;
+}
+
+void
+ir3_clear_mark(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		ir3_block_clear_mark(block);
+	}
+}
+
+/* note: writing instr->ip clobbers instr->depth, so don't call this until after sched! */
+void
+ir3_count_instructions(struct ir3 *ir)
+{
+	unsigned ip = 0;
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			instr->ip = ip++;
+		}
+		block->start_ip = list_first_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+		block->end_ip = list_last_entry(&block->instr_list, struct ir3_instruction, node)->ip;
+	}
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3.h b/src/gallium/drivers/freedreno/ir3/ir3.h
index c0a14a0..9c35a76 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3.h
@@ -28,17 +28,20 @@
 #include <stdbool.h>
 
 #include "util/u_debug.h"
+#include "util/list.h"
 
 #include "instr-a3xx.h"
 #include "disasm.h"  /* TODO move 'enum shader_t' somewhere else.. */
 
 /* low level intermediate representation of an adreno shader program */
 
+struct ir3_compiler;
 struct ir3;
 struct ir3_instruction;
 struct ir3_block;
 
 struct ir3_info {
+	uint32_t gpu_id;
 	uint16_t sizedwords;
 	uint16_t instrs_count;   /* expanded to account for rpt's */
 	/* NOTE: max_reg, etc, does not include registers not touched
@@ -80,8 +83,8 @@ struct ir3_register {
 		 * before register assignment is done:
 		 */
 		IR3_REG_SSA    = 0x2000,   /* 'instr' is ptr to assigning instr */
-		IR3_REG_IA     = 0x4000,   /* meta-input dst is "assigned" */
-		IR3_REG_ADDR   = 0x8000,   /* register is a0.x */
+		IR3_REG_PHI_SRC= 0x4000,   /* phi src, regs[0]->instr points to phi */
+
 	} flags;
 	union {
 		/* normal registers:
@@ -185,6 +188,7 @@ struct ir3_instruction {
 			char inv;
 			char comp;
 			int  immed;
+			struct ir3_block *target;
 		} cat0;
 		struct {
 			type_t src_type, dst_type;
@@ -218,14 +222,14 @@ struct ir3_instruction {
 			int aid;
 		} fi;
 		struct {
-			struct ir3_block *if_block, *else_block;
-		} flow;
+			/* used to temporarily hold reference to nir_phi_instr
+			 * until we resolve the phi srcs
+			 */
+			void *nphi;
+		} phi;
 		struct {
 			struct ir3_block *block;
 		} inout;
-
-		/* XXX keep this as big as all other union members! */
-		uint32_t info[3];
 	};
 
 	/* transient values used during various algorithms: */
@@ -243,6 +247,13 @@ struct ir3_instruction {
 		 */
 #define DEPTH_UNUSED  ~0
 		unsigned depth;
+		/* When we get to the RA stage, we no longer need depth, but
+		 * we do need the instruction's position/name:
+		 */
+		struct {
+			uint16_t ip;
+			uint16_t name;
+		};
 	};
 
 	/* Used during CP and RA stages.  For fanin and shader inputs/
@@ -290,7 +301,9 @@ struct ir3_instruction {
 	 */
 	struct ir3_instruction *fanin;
 
-	struct ir3_instruction *next;
+	/* Entry in ir3_block's instruction list: */
+	struct list_head node;
+
 #ifdef DEBUG
 	uint32_t serialno;
 #endif
@@ -321,8 +334,11 @@ static inline int ir3_neighbor_count(struct ir3_instruction *instr)
 struct ir3_heap_chunk;
 
 struct ir3 {
-	unsigned instrs_count, instrs_sz;
-	struct ir3_instruction **instrs;
+	struct ir3_compiler *compiler;
+
+	unsigned ninputs, noutputs;
+	struct ir3_instruction **inputs;
+	struct ir3_instruction **outputs;
 
 	/* Track bary.f (and ldlv) instructions.. this is needed in
 	 * scheduling to ensure that all varying fetches happen before
@@ -345,33 +361,54 @@ struct ir3 {
 	 */
 	unsigned indirects_count, indirects_sz;
 	struct ir3_instruction **indirects;
+	/* and same for instructions that consume predicate register: */
+	unsigned predicates_count, predicates_sz;
+	struct ir3_instruction **predicates;
+
+	/* List of blocks: */
+	struct list_head block_list;
 
-	struct ir3_block *block;
 	unsigned heap_idx;
 	struct ir3_heap_chunk *chunk;
 };
 
+typedef struct nir_block nir_block;
+
 struct ir3_block {
+	struct list_head node;
 	struct ir3 *shader;
-	unsigned ntemporaries, ninputs, noutputs;
-	/* maps TGSI_FILE_TEMPORARY index back to the assigning instruction: */
-	struct ir3_instruction **temporaries;
-	struct ir3_instruction **inputs;
-	struct ir3_instruction **outputs;
-	/* only a single address register: */
-	struct ir3_instruction *address;
-	struct ir3_block *parent;
-	struct ir3_instruction *head;
+
+	nir_block *nblock;
+
+	struct list_head instr_list;  /* list of ir3_instruction */
+
+	/* each block has either one or two successors.. in case of
+	 * two successors, 'condition' decides which one to follow.
+	 * A block preceding an if/else has two successors.
+	 */
+	struct ir3_instruction *condition;
+	struct ir3_block *successors[2];
+
+	uint16_t start_ip, end_ip;
+
+	/* used for per-pass extra block data.  Mainly used right
+	 * now in RA step to track livein/liveout.
+	 */
+	void *bd;
+
+#ifdef DEBUG
+	uint32_t serialno;
+#endif
 };
 
-struct ir3 * ir3_create(void);
+struct ir3 * ir3_create(struct ir3_compiler *compiler,
+		unsigned nin, unsigned nout);
 void ir3_destroy(struct ir3 *shader);
 void * ir3_assemble(struct ir3 *shader,
 		struct ir3_info *info, uint32_t gpu_id);
 void * ir3_alloc(struct ir3 *shader, int sz);
 
-struct ir3_block * ir3_block_create(struct ir3 *shader,
-		unsigned ntmp, unsigned nin, unsigned nout);
+struct ir3_block * ir3_block_create(struct ir3 *shader);
 
 struct ir3_instruction * ir3_instr_create(struct ir3_block *block,
 		int category, opc_t opc);
@@ -383,7 +420,6 @@ const char *ir3_instr_name(struct ir3_instruction *instr);
 struct ir3_register * ir3_reg_create(struct ir3_instruction *instr,
 		int num, int flags);
 
-
 static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
 {
 	if (instr->flags & IR3_INSTR_MARK)
@@ -392,22 +428,10 @@ static inline bool ir3_instr_check_mark(struct ir3_instruction *instr)
 	return false;
 }
 
-static inline void ir3_clear_mark(struct ir3 *shader)
-{
-	/* TODO would be nice to drop the instruction array.. for
-	 * new compiler, _clear_mark() is all we use it for, and
-	 * we could probably manage a linked list instead..
-	 *
-	 * Also, we'll probably want to mark instructions within
-	 * a block, so tracking the list of instrs globally is
-	 * unlikely to be what we want.
-	 */
-	unsigned i;
-	for (i = 0; i < shader->instrs_count; i++) {
-		struct ir3_instruction *instr = shader->instrs[i];
-		instr->flags &= ~IR3_INSTR_MARK;
-	}
-}
+void ir3_block_clear_mark(struct ir3_block *block);
+void ir3_clear_mark(struct ir3 *shader);
+
+void ir3_count_instructions(struct ir3 *ir);
 
 static inline int ir3_instr_regno(struct ir3_instruction *instr,
 		struct ir3_register *reg)
@@ -501,6 +525,28 @@ static inline bool is_mem(struct ir3_instruction *instr)
 	return (instr->category == 6);
 }
 
+static inline bool
+is_store(struct ir3_instruction *instr)
+{
+	if (is_mem(instr)) {
+		/* for these instructions, the "destination" register is
+		 * actually a source: the address to store to.
+		 */
+		switch (instr->opc) {
+		case OPC_STG:
+		case OPC_STP:
+		case OPC_STL:
+		case OPC_STLW:
+		case OPC_L2G:
+		case OPC_G2L:
+			return true;
+		default:
+			break;
+		}
+	}
+	return false;
+}
+
 static inline bool is_input(struct ir3_instruction *instr)
 {
 	/* in some cases, ldlv is used to fetch varying without
@@ -525,7 +571,7 @@ static inline bool writes_addr(struct ir3_instruction *instr)
 {
 	if (instr->regs_count > 0) {
 		struct ir3_register *dst = instr->regs[0];
-		return !!(dst->flags & IR3_REG_ADDR);
+		return reg_num(dst) == REG_A0;
 	}
 	return false;
 }
@@ -556,13 +602,29 @@ static inline bool conflicts(struct ir3_instruction *a,
 
 static inline bool reg_gpr(struct ir3_register *r)
 {
-	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED | IR3_REG_ADDR))
+	if (r->flags & (IR3_REG_CONST | IR3_REG_IMMED))
 		return false;
 	if ((reg_num(r) == REG_A0) || (reg_num(r) == REG_P0))
 		return false;
 	return true;
 }
 
+static inline type_t half_type(type_t type)
+{
+	switch (type) {
+	case TYPE_F32: return TYPE_F16;
+	case TYPE_U32: return TYPE_U16;
+	case TYPE_S32: return TYPE_S16;
+	case TYPE_F16:
+	case TYPE_U16:
+	case TYPE_S16:
+		return type;
+	default:
+		assert(0);
+		return ~0;
+	}
+}
+
 /* some cat2 instructions (ie. those which are not float) can embed an
  * immediate:
  */
@@ -747,37 +809,31 @@ static inline struct ir3_instruction * __ssa_src_n(struct ir3_instruction *instr
 
 
 /* dump: */
-#include <stdio.h>
-void ir3_dump(struct ir3 *shader, const char *name,
-		struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
-		FILE *f);
-void ir3_dump_instr_single(struct ir3_instruction *instr);
-void ir3_dump_instr_list(struct ir3_instruction *instr);
-
-/* flatten if/else: */
-int ir3_block_flatten(struct ir3_block *block);
+void ir3_print(struct ir3 *ir);
+void ir3_print_instr(struct ir3_instruction *instr);
 
 /* depth calculation: */
 int ir3_delayslots(struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned n);
-void ir3_block_depth(struct ir3_block *block);
+void ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list);
+void ir3_depth(struct ir3 *ir);
 
 /* copy-propagate: */
-void ir3_block_cp(struct ir3_block *block);
+void ir3_cp(struct ir3 *ir);
 
-/* group neightbors and insert mov's to resolve conflicts: */
-void ir3_block_group(struct ir3_block *block);
+/* group neighbors and insert mov's to resolve conflicts: */
+void ir3_group(struct ir3 *ir);
 
 /* scheduling: */
-int ir3_block_sched(struct ir3_block *block);
+int ir3_sched(struct ir3 *ir);
 
 /* register assignment: */
-int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+struct ir3_ra_reg_set * ir3_ra_alloc_reg_set(void *memctx);
+int ir3_ra(struct ir3 *ir3, enum shader_t type,
 		bool frag_coord, bool frag_face);
 
 /* legalize: */
-void ir3_block_legalize(struct ir3_block *block,
-		bool *has_samp, int *max_bary);
+void ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary);
 
 /* ************************************************************************* */
 /* instruction helpers */
@@ -807,6 +863,21 @@ ir3_COV(struct ir3_block *block, struct ir3_instruction *src,
 	return instr;
 }
 
+static inline struct ir3_instruction *
+ir3_NOP(struct ir3_block *block)
+{
+	return ir3_instr_create(block, 0, OPC_NOP);  /* category 0, no registers added */
+}
+
+/* generate an ir3_<name>() builder which only creates the instruction
+ * itself, without adding any registers: */
+#define INSTR0(CAT, name)                                                \
+static inline struct ir3_instruction *                                   \
+ir3_##name(struct ir3_block *block)                                      \
+{                                                                        \
+	return ir3_instr_create(block, CAT, OPC_##name);                     \
+}
+
 #define INSTR1(CAT, name)                                                \
 static inline struct ir3_instruction *                                   \
 ir3_##name(struct ir3_block *block,                                      \
@@ -850,7 +921,10 @@ ir3_##name(struct ir3_block *block,                                      \
 }
 
 /* cat0 instructions: */
+INSTR0(0, BR);
+INSTR0(0, JUMP);
 INSTR1(0, KILL);
+INSTR0(0, END);
 
 /* cat2 instructions, most 2 src but some 1 src: */
 INSTR2(2, ADD_F)
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
index d0517aa..ad9d271 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c
@@ -30,6 +30,7 @@
 #include <fcntl.h>
 #include <stdint.h>
 #include <stdlib.h>
+#include <stdio.h>
 #include <err.h>
 
 #include "tgsi/tgsi_parse.h"
@@ -65,34 +66,34 @@ static void dump_info(struct ir3_shader_variant *so, const char *str)
 	// TODO make gpu_id configurable on cmdline
 	bin = ir3_shader_assemble(so, 320);
 	if (fd_mesa_debug & FD_DBG_DISASM) {
-		struct ir3_block *block = so->ir->block;
+		struct ir3 *ir = so->ir;
 		struct ir3_register *reg;
 		uint8_t regid;
 		unsigned i;
 
 		debug_printf("; %s: %s\n", type, str);
 
-		for (i = 0; i < block->ninputs; i++) {
-			if (!block->inputs[i]) {
+		for (i = 0; i < ir->ninputs; i++) {
+			if (!ir->inputs[i]) {
 				debug_printf("; in%d unused\n", i);
 				continue;
 			}
-			reg = block->inputs[i]->regs[0];
+			reg = ir->inputs[i]->regs[0];
 			regid = reg->num;
 			debug_printf("@in(%sr%d.%c)\tin%d\n",
 					(reg->flags & IR3_REG_HALF) ? "h" : "",
 					(regid >> 2), "xyzw"[regid & 0x3], i);
 		}
 
-		for (i = 0; i < block->noutputs; i++) {
-			if (!block->outputs[i]) {
+		for (i = 0; i < ir->noutputs; i++) {
+			if (!ir->outputs[i]) {
 				debug_printf("; out%d unused\n", i);
 				continue;
 			}
 			/* kill shows up as a virtual output.. skip it! */
-			if (is_kill(block->outputs[i]))
+			if (is_kill(ir->outputs[i]))
 				continue;
-			reg = block->outputs[i]->regs[0];
+			reg = ir->outputs[i]->regs[0];
 			regid = reg->num;
 			debug_printf("@out(%sr%d.%c)\tout%d\n",
 					(reg->flags & IR3_REG_HALF) ? "h" : "",
@@ -194,16 +195,6 @@ read_file(const char *filename, void **ptr, size_t *size)
 	return 0;
 }
 
-static void reset_variant(struct ir3_shader_variant *v, const char *msg)
-{
-	printf("; %s\n", msg);
-	v->inputs_count = 0;
-	v->outputs_count = 0;
-	v->total_in = 0;
-	v->has_samp = false;
-	v->immediates_count = 0;
-}
-
 static void print_usage(void)
 {
 	printf("Usage: ir3_compiler [OPTIONS]... FILE\n");
@@ -225,12 +216,12 @@ int main(int argc, char **argv)
 	const char *filename;
 	struct tgsi_token toks[65536];
 	struct tgsi_parse_context parse;
+	struct ir3_compiler *compiler;
 	struct ir3_shader_variant v;
 	struct ir3_shader_key key = {};
 	const char *info;
 	void *ptr;
 	size_t size;
-	int use_nir = 0;
 
 	fd_mesa_debug |= FD_DBG_DISASM;
 
@@ -243,7 +234,7 @@ int main(int argc, char **argv)
 
 	while (n < argc) {
 		if (!strcmp(argv[n], "--verbose")) {
-			fd_mesa_debug |=  FD_DBG_OPTDUMP | FD_DBG_MSGS | FD_DBG_OPTMSGS;
+			fd_mesa_debug |= FD_DBG_MSGS | FD_DBG_OPTMSGS;
 			n++;
 			continue;
 		}
@@ -290,17 +281,6 @@ int main(int argc, char **argv)
 			continue;
 		}
 
-		if (!strcmp(argv[n], "--nocp")) {
-			fd_mesa_debug |= FD_DBG_NOCP;
-			n++;
-			continue;
-		}
-		if (!strcmp(argv[n], "--nir")) {
-			use_nir = true;
-			n++;
-			continue;
-		}
-
 		if (!strcmp(argv[n], "--help")) {
 			print_usage();
 			return 0;
@@ -340,31 +320,14 @@ int main(int argc, char **argv)
 		break;
 	}
 
-	if (use_nir) {
-		info = "NIR compiler";
-		ret = ir3_compile_shader_nir(&v, toks, key);
-	} else {
-		info = "TGSI compiler";
-		ret = ir3_compile_shader(&v, toks, key, true);
-	}
-
-	if (ret) {
-		reset_variant(&v, "compiler failed, trying without copy propagation!");
-		info = "compiler (no copy propagation)";
-		ret = ir3_compile_shader(&v, toks, key, false);
-	}
+	/* TODO cmdline option to target different gpus: */
+	compiler = ir3_compiler_create(320);
 
+	info = "NIR compiler";
+	ret = ir3_compile_shader_nir(compiler, &v, toks, key);
 	if (ret) {
 		fprintf(stderr, "compiler failed!\n");
 		return ret;
 	}
 	dump_info(&v, info);
 }
-
-void _mesa_error_no_memory(const char *caller);
-
-void
-_mesa_error_no_memory(const char *caller)
-{
-	fprintf(stderr, "Mesa error: out of memory in %s", caller);
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
index 43f4c95..7c8eccb 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c
@@ -1,7 +1,7 @@
 /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
 
 /*
- * Copyright (C) 2013 Rob Clark <robclark@freedesktop.org>
+ * Copyright (C) 2015 Rob Clark <robclark@freedesktop.org>
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -26,3710 +26,19 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
-#include <stdarg.h>
-
-#include "pipe/p_state.h"
-#include "util/u_string.h"
-#include "util/u_memory.h"
-#include "util/u_inlines.h"
-#include "tgsi/tgsi_lowering.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_ureg.h"
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_strings.h"
-#include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_scan.h"
-
-#include "freedreno_util.h"
+#include "util/ralloc.h"
 
 #include "ir3_compiler.h"
-#include "ir3_shader.h"
-
-#include "instr-a3xx.h"
-#include "ir3.h"
-
-struct ir3_compile_context {
-	const struct tgsi_token *tokens;
-	bool free_tokens;
-	struct ir3 *ir;
-	struct ir3_shader_variant *so;
-	uint16_t integer_s;
-
-	struct ir3_block *block;
-	struct ir3_instruction *current_instr;
-
-	/* we need to defer updates to block->outputs[] until the end
-	 * of an instruction (so we don't see new value until *after*
-	 * the src registers are processed)
-	 */
-	struct {
-		struct ir3_instruction *instr, **instrp;
-	} output_updates[64];
-	unsigned num_output_updates;
-
-	/* are we in a sequence of "atomic" instructions?
-	 */
-	bool atomic;
-
-	/* For fragment shaders, from the hw perspective the only
-	 * actual input is r0.xy position register passed to bary.f.
-	 * But TGSI doesn't know that, it still declares things as
-	 * IN[] registers.  So we do all the input tracking normally
-	 * and fix things up after compile_instructions()
-	 *
-	 * NOTE that frag_pos is the hardware position (possibly it
-	 * is actually an index or tag or some such.. it is *not*
-	 * values that can be directly used for gl_FragCoord..)
-	 */
-	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
-
-	/* For vertex shaders, keep track of the system values sources */
-	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
-
-	struct tgsi_parse_context parser;
-	unsigned type;
-
-	struct tgsi_shader_info info;
-
-	/* hmm, would be nice if tgsi_scan_shader figured this out
-	 * for us:
-	 */
-	struct {
-		unsigned first, last;
-		struct ir3_instruction *fanin;
-	} array[MAX_ARRAYS];
-	uint32_t array_dirty;
-	/* offset into array[], per file, of first array info */
-	uint8_t array_offsets[TGSI_FILE_COUNT];
-
-	/* for calculating input/output positions/linkages: */
-	unsigned next_inloc;
-
-	/* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
-	 * so we need to use ldlv.u32 to load the varying directly:
-	 */
-	bool flat_bypass;
-
-	unsigned num_internal_temps;
-	struct tgsi_src_register internal_temps[8];
-
-	/* for looking up which system value is which */
-	unsigned sysval_semantics[8];
-
-	/* idx/slot for last compiler generated immediate */
-	unsigned immediate_idx;
-
-	/* stack of branch instructions that mark (potentially nested)
-	 * branch if/else/loop/etc
-	 */
-	struct {
-		struct ir3_instruction *instr, *cond;
-		bool inv;   /* true iff in else leg of branch */
-	} branch[16];
-	unsigned int branch_count;
-
-	/* list of kill instructions: */
-	struct ir3_instruction *kill[16];
-	unsigned int kill_count;
-
-	/* used when dst is same as one of the src, to avoid overwriting a
-	 * src element before the remaining scalar instructions that make
-	 * up the vector operation
-	 */
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register *tmp_src;
-
-	/* just for catching incorrect use of get_dst()/put_dst():
-	 */
-	bool using_tmp_dst;
-};
-
-
-static void vectorize(struct ir3_compile_context *ctx,
-		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
-		int nsrcs, ...);
-static void create_mov(struct ir3_compile_context *ctx,
-		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
-static type_t get_ftype(struct ir3_compile_context *ctx);
-static type_t get_utype(struct ir3_compile_context *ctx);
-
-static unsigned setup_arrays(struct ir3_compile_context *ctx, unsigned file, unsigned i)
-{
-	/* ArrayID 0 for a given file is the legacy array spanning the entire file: */
-	ctx->array[i].first = 0;
-	ctx->array[i].last = ctx->info.file_max[file];
-	ctx->array_offsets[file] = i;
-	i += ctx->info.array_max[file] + 1;
-	return i;
-}
-
-static unsigned
-compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens)
-{
-	unsigned ret, i;
-	struct tgsi_shader_info *info = &ctx->info;
-	struct tgsi_lowering_config lconfig = {
-			.color_two_side = so->key.color_two_side,
-			.lower_DST  = true,
-			.lower_XPD  = true,
-			.lower_SCS  = true,
-			.lower_LRP  = true,
-			.lower_FRC  = true,
-			.lower_POW  = true,
-			.lower_LIT  = true,
-			.lower_EXP  = true,
-			.lower_LOG  = true,
-			.lower_DP4  = true,
-			.lower_DP3  = true,
-			.lower_DPH  = true,
-			.lower_DP2  = true,
-			.lower_DP2A = true,
-	};
-
-	switch (so->type) {
-	case SHADER_FRAGMENT:
-	case SHADER_COMPUTE:
-		lconfig.saturate_s = so->key.fsaturate_s;
-		lconfig.saturate_t = so->key.fsaturate_t;
-		lconfig.saturate_r = so->key.fsaturate_r;
-		ctx->integer_s = so->key.finteger_s;
-		break;
-	case SHADER_VERTEX:
-		lconfig.saturate_s = so->key.vsaturate_s;
-		lconfig.saturate_t = so->key.vsaturate_t;
-		lconfig.saturate_r = so->key.vsaturate_r;
-		ctx->integer_s = so->key.vinteger_s;
-		break;
-	}
-
-	if (!so->shader) {
-		/* hack for standalone compiler which does not have
-		 * screen/context:
-		 */
-	} else if (ir3_shader_gpuid(so->shader) >= 400) {
-		/* a4xx seems to have *no* sam.p */
-		lconfig.lower_TXP = ~0;  /* lower all txp */
-		/* need special handling for "flat" */
-		ctx->flat_bypass = true;
-	} else {
-		/* a3xx just needs to avoid sam.p for 3d tex */
-		lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
-		/* no special handling for "flat" */
-		ctx->flat_bypass = false;
-	}
-
-	ctx->tokens = tgsi_transform_lowering(&lconfig, tokens, &ctx->info);
-	ctx->free_tokens = !!ctx->tokens;
-	if (!ctx->tokens) {
-		/* no lowering */
-		ctx->tokens = tokens;
-	}
-	ctx->ir = so->ir;
-	ctx->so = so;
-	ctx->array_dirty = 0;
-	ctx->next_inloc = 8;
-	ctx->num_internal_temps = 0;
-	ctx->branch_count = 0;
-	ctx->kill_count = 0;
-	ctx->block = NULL;
-	ctx->current_instr = NULL;
-	ctx->num_output_updates = 0;
-	ctx->atomic = false;
-	ctx->frag_pos = NULL;
-	ctx->frag_face = NULL;
-	ctx->vertex_id = NULL;
-	ctx->instance_id = NULL;
-	ctx->tmp_src = NULL;
-	ctx->using_tmp_dst = false;
-
-	memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));
-	memset(ctx->array, 0, sizeof(ctx->array));
-	memset(ctx->array_offsets, 0, sizeof(ctx->array_offsets));
-
-#define FM(x) (1 << TGSI_FILE_##x)
-	/* NOTE: if relative addressing is used, we set constlen in
-	 * the compiler (to worst-case value) since we don't know in
-	 * the assembler what the max addr reg value can be:
-	 */
-	if (info->indirect_files & FM(CONSTANT))
-		so->constlen = MIN2(255, ctx->info.const_file_max[0] + 1);
-
-	i = 0;
-	i += setup_arrays(ctx, TGSI_FILE_INPUT, i);
-	i += setup_arrays(ctx, TGSI_FILE_TEMPORARY, i);
-	i += setup_arrays(ctx, TGSI_FILE_OUTPUT, i);
-	/* any others? we don't track arrays for const..*/
-
-	/* Immediates go after constants: */
-	so->first_immediate = so->first_driver_param =
-		info->const_file_max[0] + 1;
-	/* 1 unit for the vertex id base */
-	if (so->type == SHADER_VERTEX)
-		so->first_immediate++;
-	/* 4 (vec4) units for ubo base addresses */
-	so->first_immediate += 4;
-	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
-
-	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
-	if (ret != TGSI_PARSE_OK)
-		return ret;
-
-	ctx->type = ctx->parser.FullHeader.Processor.Processor;
-
-	return ret;
-}
-
-static void
-compile_error(struct ir3_compile_context *ctx, const char *format, ...)
-{
-	va_list ap;
-	va_start(ap, format);
-	_debug_vprintf(format, ap);
-	va_end(ap);
-	tgsi_dump(ctx->tokens, 0);
-	debug_assert(0);
-}
-
-#define compile_assert(ctx, cond) do { \
-		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
-	} while (0)
-
-static void
-compile_free(struct ir3_compile_context *ctx)
-{
-	if (ctx->free_tokens)
-		free((void *)ctx->tokens);
-	tgsi_parse_free(&ctx->parser);
-}
-
-struct instr_translater {
-	void (*fxn)(const struct instr_translater *t,
-			struct ir3_compile_context *ctx,
-			struct tgsi_full_instruction *inst);
-	unsigned tgsi_opc;
-	opc_t opc;
-	opc_t hopc;    /* opc to use for half_precision mode, if different */
-	unsigned arg;
-};
-
-static void
-instr_finish(struct ir3_compile_context *ctx)
-{
-	unsigned i;
-
-	if (ctx->atomic)
-		return;
-
-	for (i = 0; i < ctx->num_output_updates; i++)
-		*(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;
-
-	ctx->num_output_updates = 0;
-
-	while (ctx->array_dirty) {
-		unsigned aid = ffs(ctx->array_dirty) - 1;
-		ctx->array[aid].fanin = NULL;
-		ctx->array_dirty &= ~(1 << aid);
-	}
-}
-
-/* For "atomic" groups of instructions, for example the four scalar
- * instructions to perform a vec4 operation.  Basically this just
- * blocks out handling of output_updates so the next scalar instruction
- * still sees the result from before the start of the atomic group.
- *
- * NOTE: when used properly, this could probably replace get/put_dst()
- * stuff.
- */
-static void
-instr_atomic_start(struct ir3_compile_context *ctx)
-{
-	ctx->atomic = true;
-}
-
-static void
-instr_atomic_end(struct ir3_compile_context *ctx)
-{
-	ctx->atomic = false;
-	instr_finish(ctx);
-}
-
-static struct ir3_instruction *
-instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
-{
-	instr_finish(ctx);
-	return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
-}
-
-static struct ir3_block *
-push_block(struct ir3_compile_context *ctx)
-{
-	struct ir3_block *block;
-	unsigned ntmp, nin, nout;
-
-#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))
-
-	/* hmm, give ourselves room to create 8 extra temporaries (vec4):
-	 */
-	ntmp = SCALAR_REGS(TEMPORARY);
-	ntmp += 8 * 4;
-
-	nout = SCALAR_REGS(OUTPUT);
-	nin  = SCALAR_REGS(INPUT) + SCALAR_REGS(SYSTEM_VALUE);
-
-	/* for outermost block, 'inputs' are the actual shader INPUT
-	 * register file.  Reads from INPUT registers always go back to
-	 * top block.  For nested blocks, 'inputs' is used to track any
-	 * TEMPORARY file register from one of the enclosing blocks that
-	 * is ready in this block.
-	 */
-	if (!ctx->block) {
-		/* NOTE: fragment shaders actually have two inputs (r0.xy, the
-		 * position)
-		 */
-		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-			int n = 2;
-			if (ctx->info.reads_position)
-				n += 4;
-			if (ctx->info.uses_frontface)
-				n += 4;
-			nin = MAX2(n, nin);
-			nout += ARRAY_SIZE(ctx->kill);
-		}
-	} else {
-		nin = ntmp;
-	}
-
-	block = ir3_block_create(ctx->ir, ntmp, nin, nout);
-
-	if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
-		block->noutputs -= ARRAY_SIZE(ctx->kill);
-
-	block->parent = ctx->block;
-	ctx->block = block;
-
-	return block;
-}
-
-static void
-pop_block(struct ir3_compile_context *ctx)
-{
-	ctx->block = ctx->block->parent;
-	compile_assert(ctx, ctx->block);
-}
-
-static struct ir3_instruction *
-create_output(struct ir3_block *block, struct ir3_instruction *instr,
-		unsigned n)
-{
-	struct ir3_instruction *out;
-
-	out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
-	out->inout.block = block;
-	ir3_reg_create(out, n, 0);
-	if (instr)
-		ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;
-
-	return out;
-}
-
-static struct ir3_instruction *
-create_input(struct ir3_block *block, struct ir3_instruction *instr,
-		unsigned n)
-{
-	struct ir3_instruction *in;
-
-	in = ir3_instr_create(block, -1, OPC_META_INPUT);
-	in->inout.block = block;
-	ir3_reg_create(in, n, 0);
-	if (instr)
-		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
-
-	return in;
-}
-
-static struct ir3_instruction *
-block_input(struct ir3_block *block, unsigned n)
-{
-	/* references to INPUT register file always go back up to
-	 * top level:
-	 */
-	if (block->parent)
-		return block_input(block->parent, n);
-	return block->inputs[n];
-}
-
-/* return temporary in scope, creating if needed meta-input node
- * to track block inputs
- */
-static struct ir3_instruction *
-block_temporary(struct ir3_block *block, unsigned n)
-{
-	/* references to TEMPORARY register file, find the nearest
-	 * enclosing block which has already assigned this temporary,
-	 * creating meta-input instructions along the way to keep
-	 * track of block inputs
-	 */
-	if (block->parent && !block->temporaries[n]) {
-		/* if already have input for this block, reuse: */
-		if (!block->inputs[n])
-			block->inputs[n] = block_temporary(block->parent, n);
-
-		/* and create new input to return: */
-		return create_input(block, block->inputs[n], n);
-	}
-	return block->temporaries[n];
-}
-
-static struct ir3_instruction *
-create_immed(struct ir3_compile_context *ctx, float val)
-{
-	/* NOTE: *don't* use instr_create() here!
-	 */
-	struct ir3_instruction *instr;
-	instr = ir3_instr_create(ctx->block, 1, 0);
-	instr->cat1.src_type = get_ftype(ctx);
-	instr->cat1.dst_type = get_ftype(ctx);
-	ir3_reg_create(instr, 0, 0);
-	ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
-	return instr;
-}
-
-static void
-ssa_instr_set(struct ir3_compile_context *ctx, unsigned file, unsigned n,
-		struct ir3_instruction *instr)
-{
-	struct ir3_block *block = ctx->block;
-	unsigned idx = ctx->num_output_updates;
-
-	compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));
-
-	/* NOTE: defer update of temporaries[idx] or output[idx]
-	 * until instr_finish(), so that if the current instruction
-	 * reads the same TEMP/OUT[] it gets the old value:
-	 *
-	 * bleh.. this might be a bit easier to just figure out
-	 * in instr_finish().  But at that point we've already
-	 * lost information about OUTPUT vs TEMPORARY register
-	 * file..
-	 */
-
-	switch (file) {
-	case TGSI_FILE_OUTPUT:
-		compile_assert(ctx, n < block->noutputs);
-		ctx->output_updates[idx].instrp = &block->outputs[n];
-		ctx->output_updates[idx].instr = instr;
-		ctx->num_output_updates++;
-		break;
-	case TGSI_FILE_TEMPORARY:
-		compile_assert(ctx, n < block->ntemporaries);
-		ctx->output_updates[idx].instrp = &block->temporaries[n];
-		ctx->output_updates[idx].instr = instr;
-		ctx->num_output_updates++;
-		break;
-	case TGSI_FILE_ADDRESS:
-		compile_assert(ctx, n < 1);
-		ctx->output_updates[idx].instrp = &block->address;
-		ctx->output_updates[idx].instr = instr;
-		ctx->num_output_updates++;
-		break;
-	}
-}
-
-static struct ir3_instruction *
-ssa_instr_get(struct ir3_compile_context *ctx, unsigned file, unsigned n)
-{
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *instr = NULL;
-
-	switch (file) {
-	case TGSI_FILE_INPUT:
-		instr = block_input(ctx->block, n);
-		break;
-	case TGSI_FILE_OUTPUT:
-		/* really this should just happen in case of 'MOV_SAT OUT[n], ..',
-		 * for the following clamp instructions:
-		 */
-		instr = block->outputs[n];
-		/* we don't have to worry about read from an OUTPUT that was
-		 * assigned outside of the current block, because the _SAT
-		 * clamp instructions will always be in the same block as
-		 * the original instruction which wrote the OUTPUT
-		 */
-		compile_assert(ctx, instr);
-		break;
-	case TGSI_FILE_TEMPORARY:
-		instr = block_temporary(ctx->block, n);
-		if (!instr) {
-			/* this can happen when registers (or components of a TGSI
-			 * register) are used as src before they have been assigned
-			 * (undefined contents).  To avoid confusing the rest of the
-			 * compiler, and to generally keep things peachy, substitute
-			 * an instruction that sets the src to 0.0.  Or to keep
-			 * things undefined, I could plug in a random number? :-P
-			 *
-			 * NOTE: *don't* use instr_create() here!
-			 */
-			instr = create_immed(ctx, 0.0);
-			/* no need to recreate the immed for every access: */
-			block->temporaries[n] = instr;
-		}
-		break;
-	case TGSI_FILE_SYSTEM_VALUE:
-		switch (ctx->sysval_semantics[n >> 2]) {
-		case TGSI_SEMANTIC_VERTEXID_NOBASE:
-			instr = ctx->vertex_id;
-			break;
-		case TGSI_SEMANTIC_BASEVERTEX:
-			instr = ctx->basevertex;
-			break;
-		case TGSI_SEMANTIC_INSTANCEID:
-			instr = ctx->instance_id;
-			break;
-		}
-		break;
-	}
-
-	return instr;
-}
-
-static int dst_array_id(struct ir3_compile_context *ctx,
-		const struct tgsi_dst_register *dst)
-{
-	// XXX complete hack to recover tgsi_full_dst_register...
-	// nothing that isn't wrapped in a tgsi_full_dst_register
-	// should be indirect
-	const struct tgsi_full_dst_register *fdst = (const void *)dst;
-	return fdst->Indirect.ArrayID + ctx->array_offsets[dst->File];
-}
-
-static int src_array_id(struct ir3_compile_context *ctx,
-		const struct tgsi_src_register *src)
-{
-	// XXX complete hack to recover tgsi_full_src_register...
-	// nothing that isn't wrapped in a tgsi_full_src_register
-	// should be indirect
-	const struct tgsi_full_src_register *fsrc = (const void *)src;
-	debug_assert(src->File != TGSI_FILE_CONSTANT);
-	return fsrc->Indirect.ArrayID + ctx->array_offsets[src->File];
-}
-
-static struct ir3_instruction *
-array_fanin(struct ir3_compile_context *ctx, unsigned aid, unsigned file)
-{
-	struct ir3_instruction *instr;
-
-	if (ctx->array[aid].fanin) {
-		instr = ctx->array[aid].fanin;
-	} else {
-		unsigned first = ctx->array[aid].first;
-		unsigned last  = ctx->array[aid].last;
-		unsigned i, j;
-
-		instr = ir3_instr_create2(ctx->block, -1, OPC_META_FI,
-				1 + (4 * (last + 1 - first)));
-		ir3_reg_create(instr, 0, 0);
-		for (i = first; i <= last; i++) {
-			for (j = 0; j < 4; j++) {
-				unsigned n = regid(i, j);
-				ir3_reg_create(instr, 0, IR3_REG_SSA)->instr =
-						ssa_instr_get(ctx, file, n);
-			}
-		}
-		ctx->array[aid].fanin = instr;
-		ctx->array_dirty |= (1 << aid);
-	}
-
-	return instr;
-}
-
-static void
-ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
-		const struct tgsi_dst_register *dst, unsigned chan)
-{
-	if (dst->Indirect) {
-		struct ir3_register *reg = instr->regs[0];
-		unsigned i, aid = dst_array_id(ctx, dst);
-		unsigned first = ctx->array[aid].first;
-		unsigned last  = ctx->array[aid].last;
-		unsigned off   = dst->Index - first; /* vec4 offset */
-
-		reg->size = 4 * (1 + last - first);
-		reg->offset = regid(off, chan);
-
-		instr->fanin = array_fanin(ctx, aid, dst->File);
-
-		/* annotate with the array-id, to help out the register-
-		 * assignment stage.  At least for the case of indirect
-		 * writes, we should capture enough dependencies to
-		 * preserve the order of reads/writes of the array, so
-		 * the multiple "names" for the array should end up all
-		 * assigned to the same registers.
-		 */
-		instr->fanin->fi.aid = aid;
-
-		/* Since we are scalarizing vec4 tgsi instructions/regs, we
-		 * run into a slight complication here.  To do the naive thing
-		 * and setup a fanout for each scalar array element would end
-		 * up with the result that the instructions generated for each
-		 * component of the vec4 would end up clobbering each other.
-		 * So we take advantage here of knowing that the array index
-		 * (after the shl.b) will be a multiple of four, and only set
-		 * every fourth scalar component in the array.  See also
-		 * fixup_ssa_dst_array()
-		 */
-		for (i = first; i <= last; i++) {
-			struct ir3_instruction *split;
-			unsigned n = regid(i, chan);
-			int off = (4 * (i - first)) + chan;
-
-			if (is_meta(instr) && (instr->opc == OPC_META_FO))
-				off -= instr->fo.off;
-
-			split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
-			split->fo.off = off;
-			ir3_reg_create(split, 0, 0);
-			ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
-
-			ssa_instr_set(ctx, dst->File, n, split);
-		}
-	} else {
-		/* normal case (not relative addressed GPR) */
-		ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr);
-	}
-}
-
-static void
-ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
-		const struct tgsi_src_register *src, unsigned chan)
-{
-	struct ir3_instruction *instr;
-
-	if (src->Indirect && (src->File != TGSI_FILE_CONSTANT)) {
-		/* for relative addressing of gpr's (due to register assignment)
-		 * we must generate a fanin instruction to collect all possible
-		 * array elements that the instruction could address together:
-		 */
-		unsigned aid   = src_array_id(ctx, src);
-		unsigned first = ctx->array[aid].first;
-		unsigned last  = ctx->array[aid].last;
-		unsigned off   = src->Index - first; /* vec4 offset */
-
-		reg->size = 4 * (1 + last - first);
-		reg->offset = regid(off, chan);
-
-		instr = array_fanin(ctx, aid, src->File);
-	} else if (src->File == TGSI_FILE_CONSTANT && src->Dimension) {
-		const struct tgsi_full_src_register *fsrc = (const void *)src;
-		struct ir3_instruction *temp = NULL;
-		int ubo_regid = regid(ctx->so->first_driver_param, 0) +
-			fsrc->Dimension.Index - 1;
-		int offset = 0;
-
-		/* We don't handle indirect UBO array accesses... yet. */
-		compile_assert(ctx, !fsrc->Dimension.Indirect);
-		/* UBOs start at index 1. */
-		compile_assert(ctx, fsrc->Dimension.Index > 0);
-
-		if (src->Indirect) {
-			/* In case of an indirect index, it will have been loaded into an
-			 * address register. There will be a sequence of
-			 *
-			 *   shl.b x, val, 2
-			 *   mova a0, x
-			 *
-			 * We rely on this sequence to get the original val out and shift
-			 * it by 4, since we're dealing in vec4 units.
-			 */
-			compile_assert(ctx, ctx->block->address);
-			compile_assert(ctx, ctx->block->address->regs[1]->instr->opc ==
-						   OPC_SHL_B);
-
-			temp = instr = instr_create(ctx, 2, OPC_SHL_B);
-			ir3_reg_create(instr, 0, 0);
-			ir3_reg_create(instr, 0, IR3_REG_HALF | IR3_REG_SSA)->instr =
-				ctx->block->address->regs[1]->instr->regs[1]->instr;
-			ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
-		} else if (src->Index >= 64) {
-			/* Otherwise it's a plain index (in vec4 units). Move it into a
-			 * register.
-			 */
-			temp = instr = instr_create(ctx, 1, 0);
-			instr->cat1.src_type = get_utype(ctx);
-			instr->cat1.dst_type = get_utype(ctx);
-			ir3_reg_create(instr, 0, 0);
-			ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = src->Index * 16;
-		} else {
-			/* The offset is small enough to fit into the ldg instruction
-			 * directly.
-			 */
-			offset = src->Index * 16;
-		}
-
-		if (temp) {
-			/* If there was an offset (most common), add it to the buffer
-			 * address.
-			 */
-			instr = instr_create(ctx, 2, OPC_ADD_S);
-			ir3_reg_create(instr, 0, 0);
-			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
-			ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
-		} else {
-			/* Otherwise just load the buffer address directly */
-			instr = instr_create(ctx, 1, 0);
-			instr->cat1.src_type = get_utype(ctx);
-			instr->cat1.dst_type = get_utype(ctx);
-			ir3_reg_create(instr, 0, 0);
-			ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
-		}
-
-		temp = instr;
-
-		instr = instr_create(ctx, 6, OPC_LDG);
-		instr->cat6.type = TYPE_U32;
-		instr->cat6.offset = offset + chan * 4;
-		ir3_reg_create(instr, 0, 0);
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-
-		reg->flags &= ~(IR3_REG_RELATIV | IR3_REG_CONST);
-	} else {
-		/* normal case (not relative addressed GPR) */
-		instr = ssa_instr_get(ctx, src->File, regid(src->Index, chan));
-	}
-
-	if (instr) {
-		reg->flags |= IR3_REG_SSA;
-		reg->instr = instr;
-	} else if (reg->flags & IR3_REG_SSA) {
-		/* special hack for trans_samp() which calls ssa_src() directly
-		 * to build up the collect (fanin) for const src.. (so SSA flag
-		 * set but no src instr... it basically gets lucky because we
-		 * default to 0.0 for "undefined" src instructions, which is
-		 * what it wants.  We probably need to give it a better way to
-		 * do this, but for now this hack:
-		 */
-		reg->instr = create_immed(ctx, 0.0);
-	}
-}
-
-static struct ir3_register *
-add_dst_reg_wrmask(struct ir3_compile_context *ctx,
-		struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
-		unsigned chan, unsigned wrmask)
-{
-	unsigned flags = 0, num = 0;
-	struct ir3_register *reg;
-
-	switch (dst->File) {
-	case TGSI_FILE_OUTPUT:
-	case TGSI_FILE_TEMPORARY:
-		/* uses SSA */
-		break;
-	case TGSI_FILE_ADDRESS:
-		flags |= IR3_REG_ADDR;
-		/* uses SSA */
-		break;
-	default:
-		compile_error(ctx, "unsupported dst register file: %s\n",
-			tgsi_file_name(dst->File));
-		break;
-	}
-
-	if (dst->Indirect) {
-		flags |= IR3_REG_RELATIV;
-
-		/* shouldn't happen, and we can't cope with it below: */
-		compile_assert(ctx, wrmask == 0x1);
-
-		compile_assert(ctx, ctx->block->address);
-		if (instr->address)
-			compile_assert(ctx, ctx->block->address == instr->address);
-
-		instr->address = ctx->block->address;
-		array_insert(ctx->ir->indirects, instr);
-	}
-
-	reg = ir3_reg_create(instr, regid(num, chan), flags);
-	reg->wrmask = wrmask;
-
-	if (wrmask == 0x1) {
-		/* normal case */
-		ssa_dst(ctx, instr, dst, chan);
-	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
-			(dst->File == TGSI_FILE_OUTPUT) ||
-			(dst->File == TGSI_FILE_ADDRESS)) {
-		struct ir3_instruction *prev = NULL;
-		unsigned i;
-
-		compile_assert(ctx, !dst->Indirect);
-
-		/* if instruction writes multiple, we need to create
-		 * some place-holder collect the registers:
-		 */
-		for (i = 0; i < 4; i++) {
-			/* NOTE: slightly ugly that we setup neighbor ptrs
-			 * for FO here, but handle FI in CP pass.. we should
-			 * probably just always setup neighbor ptrs in the
-			 * frontend?
-			 */
-			struct ir3_instruction *split =
-					ir3_instr_create(ctx->block, -1, OPC_META_FO);
-			split->fo.off = i;
-			/* unused dst reg: */
-			/* NOTE: set SSA flag on dst here, because unused FO's
-			 * which don't get scheduled will end up not in the
-			 * instruction list when RA sets SSA flag on each dst.
-			 * Slight hack.  We really should set SSA flag on
-			 * every dst register in the frontend.
-			 */
-			ir3_reg_create(split, 0, IR3_REG_SSA);
-			/* and src reg used to hold original instr */
-			ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
-			if (prev) {
-				split->cp.left = prev;
-				split->cp.left_cnt++;
-				prev->cp.right = split;
-				prev->cp.right_cnt++;
-			}
-			if ((wrmask & (1 << i)) && !ctx->atomic)
-				ssa_dst(ctx, split, dst, chan+i);
-			prev = split;
-		}
-	}
-
-	return reg;
-}
-
-static struct ir3_register *
-add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
-		const struct tgsi_dst_register *dst, unsigned chan)
-{
-	return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
-}
-
-static struct ir3_register *
-add_src_reg_wrmask(struct ir3_compile_context *ctx,
-		struct ir3_instruction *instr, const struct tgsi_src_register *src,
-		unsigned chan, unsigned wrmask)
-{
-	unsigned flags = 0, num = 0;
-	struct ir3_register *reg;
-
-	switch (src->File) {
-	case TGSI_FILE_IMMEDIATE:
-		/* TODO if possible, use actual immediate instead of const.. but
-		 * TGSI has vec4 immediates, we can only embed scalar (of limited
-		 * size, depending on instruction..)
-		 */
-		flags |= IR3_REG_CONST;
-		num = src->Index + ctx->so->first_immediate;
-		break;
-	case TGSI_FILE_CONSTANT:
-		flags |= IR3_REG_CONST;
-		num = src->Index;
-		break;
-	case TGSI_FILE_OUTPUT:
-		/* NOTE: we should only end up w/ OUTPUT file for things like
-		 * clamp()'ing saturated dst instructions
-		 */
-	case TGSI_FILE_INPUT:
-	case TGSI_FILE_TEMPORARY:
-	case TGSI_FILE_SYSTEM_VALUE:
-		/* uses SSA */
-		break;
-	default:
-		compile_error(ctx, "unsupported src register file: %s\n",
-			tgsi_file_name(src->File));
-		break;
-	}
-
-	/* We seem to have 8 bits (6.2) for dst register always, so I think
-	 * it is safe to assume GPR cannot be >=64
-	 *
-	 * cat3 instructions only have 8 bits for src2, but cannot take a
-	 * const for src2
-	 *
-	 * cat5 and cat6 in some cases only has 8 bits, but cannot take a
-	 * const for any src.
-	 *
-	 * Other than that we seem to have 12 bits to encode const src,
-	 * except for cat1 which may only have 11 bits (but that seems like
-	 * a bug)
-	 */
-	if (flags & IR3_REG_CONST)
-		compile_assert(ctx, src->Index < (1 << 9));
-	else
-		compile_assert(ctx, src->Index < (1 << 6));
-
-	/* NOTE: abs/neg modifiers in tgsi only apply to float */
-	if (src->Absolute)
-		flags |= IR3_REG_FABS;
-	if (src->Negate)
-		flags |= IR3_REG_FNEG;
-
-	if (src->Indirect) {
-		flags |= IR3_REG_RELATIV;
-
-		/* shouldn't happen, and we can't cope with it below: */
-		compile_assert(ctx, wrmask == 0x1);
-
-		compile_assert(ctx, ctx->block->address);
-		if (instr->address)
-			compile_assert(ctx, ctx->block->address == instr->address);
-
-		instr->address = ctx->block->address;
-		array_insert(ctx->ir->indirects, instr);
-	}
-
-	reg = ir3_reg_create(instr, regid(num, chan), flags);
-	reg->wrmask = wrmask;
-
-	if (wrmask == 0x1) {
-		/* normal case */
-		ssa_src(ctx, reg, src, chan);
-	} else if ((src->File == TGSI_FILE_TEMPORARY) ||
-			(src->File == TGSI_FILE_OUTPUT) ||
-			(src->File == TGSI_FILE_INPUT)) {
-		struct ir3_instruction *collect;
-		unsigned i;
-
-		compile_assert(ctx, !src->Indirect);
-
-		/* if instruction reads multiple, we need to create
-		 * some place-holder collect the registers:
-		 */
-		collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
-		ir3_reg_create(collect, 0, 0);   /* unused dst reg */
-
-		for (i = 0; i < 4; i++) {
-			if (wrmask & (1 << i)) {
-				/* and src reg used point to the original instr */
-				ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
-						src, chan + i);
-			} else if (wrmask & ~((i << i) - 1)) {
-				/* if any remaining components, then dummy
-				 * placeholder src reg to fill in the blanks:
-				 */
-				ir3_reg_create(collect, 0, 0);
-			}
-		}
-
-		reg->flags |= IR3_REG_SSA;
-		reg->instr = collect;
-	}
-
-	return reg;
-}
-
-static struct ir3_register *
-add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
-		const struct tgsi_src_register *src, unsigned chan)
-{
-	return add_src_reg_wrmask(ctx, instr, src, chan, 0x1);
-}
-
-static void
-src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
-{
-	src->File      = dst->File;
-	src->Indirect  = dst->Indirect;
-	src->Dimension = dst->Dimension;
-	src->Index     = dst->Index;
-	src->Absolute  = 0;
-	src->Negate    = 0;
-	src->SwizzleX  = TGSI_SWIZZLE_X;
-	src->SwizzleY  = TGSI_SWIZZLE_Y;
-	src->SwizzleZ  = TGSI_SWIZZLE_Z;
-	src->SwizzleW  = TGSI_SWIZZLE_W;
-}
-
-/* Get internal-temp src/dst to use for a sequence of instructions
- * generated by a single TGSI op.
- */
-static struct tgsi_src_register *
-get_internal_temp(struct ir3_compile_context *ctx,
-		struct tgsi_dst_register *tmp_dst)
-{
-	struct tgsi_src_register *tmp_src;
-	int n;
-
-	tmp_dst->File      = TGSI_FILE_TEMPORARY;
-	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
-	tmp_dst->Indirect  = 0;
-	tmp_dst->Dimension = 0;
-
-	/* assign next temporary: */
-	n = ctx->num_internal_temps++;
-	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
-	tmp_src = &ctx->internal_temps[n];
-
-	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
-
-	src_from_dst(tmp_src, tmp_dst);
-
-	return tmp_src;
-}
-
-static inline bool
-is_const(struct tgsi_src_register *src)
-{
-	return (src->File == TGSI_FILE_CONSTANT) ||
-			(src->File == TGSI_FILE_IMMEDIATE);
-}
-
-static inline bool
-is_relative(struct tgsi_src_register *src)
-{
-	return src->Indirect;
-}
-
-static inline bool
-is_rel_or_const(struct tgsi_src_register *src)
-{
-	return is_relative(src) || is_const(src);
-}
-
-static type_t
-get_ftype(struct ir3_compile_context *ctx)
-{
-	return TYPE_F32;
-}
-
-static type_t
-get_utype(struct ir3_compile_context *ctx)
-{
-	return TYPE_U32;
-}
-
-static type_t
-get_stype(struct ir3_compile_context *ctx)
-{
-	return TYPE_S32;
-}
-
-static unsigned
-src_swiz(struct tgsi_src_register *src, int chan)
-{
-	switch (chan) {
-	case 0: return src->SwizzleX;
-	case 1: return src->SwizzleY;
-	case 2: return src->SwizzleZ;
-	case 3: return src->SwizzleW;
-	}
-	assert(0);
-	return 0;
-}
-
-/* for instructions that cannot take a const register as src, if needed
- * generate a move to temporary gpr:
- */
-static struct tgsi_src_register *
-get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
-{
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register *tmp_src;
-
-	compile_assert(ctx, is_rel_or_const(src));
-
-	tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-	create_mov(ctx, &tmp_dst, src);
-
-	return tmp_src;
-}
-
-static void
-get_immediate(struct ir3_compile_context *ctx,
-		struct tgsi_src_register *reg, uint32_t val)
-{
-	unsigned neg, swiz, idx, i;
-	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
-	static const unsigned swiz2tgsi[] = {
-			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
-	};
-
-	for (i = 0; i < ctx->immediate_idx; i++) {
-		swiz = i % 4;
-		idx  = i / 4;
-
-		if (ctx->so->immediates[idx].val[swiz] == val) {
-			neg = 0;
-			break;
-		}
-
-		if (ctx->so->immediates[idx].val[swiz] == -val) {
-			neg = 1;
-			break;
-		}
-	}
-
-	if (i == ctx->immediate_idx) {
-		/* need to generate a new immediate: */
-		swiz = i % 4;
-		idx  = i / 4;
-		neg  = 0;
-		ctx->so->immediates[idx].val[swiz] = val;
-		ctx->so->immediates_count = idx + 1;
-		ctx->immediate_idx++;
-	}
-
-	reg->File      = TGSI_FILE_IMMEDIATE;
-	reg->Indirect  = 0;
-	reg->Dimension = 0;
-	reg->Index     = idx;
-	reg->Absolute  = 0;
-	reg->Negate    = neg;
-	reg->SwizzleX  = swiz2tgsi[swiz];
-	reg->SwizzleY  = swiz2tgsi[swiz];
-	reg->SwizzleZ  = swiz2tgsi[swiz];
-	reg->SwizzleW  = swiz2tgsi[swiz];
-}
-
-static void
-create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
-		struct tgsi_src_register *src)
-{
-	type_t type_mov = get_ftype(ctx);
-	unsigned i;
-
-	for (i = 0; i < 4; i++) {
-		/* move to destination: */
-		if (dst->WriteMask & (1 << i)) {
-			struct ir3_instruction *instr;
-
-			if (src->Absolute || src->Negate) {
-				/* can't have abs or neg on a mov instr, so use
-				 * absneg.f instead to handle these cases:
-				 */
-				instr = instr_create(ctx, 2, OPC_ABSNEG_F);
-			} else {
-				instr = instr_create(ctx, 1, 0);
-				instr->cat1.src_type = type_mov;
-				instr->cat1.dst_type = type_mov;
-			}
-
-			add_dst_reg(ctx, instr, dst, i);
-			add_src_reg(ctx, instr, src, src_swiz(src, i));
-		}
-	}
-}
-
-static void
-create_clamp(struct ir3_compile_context *ctx,
-		struct tgsi_dst_register *dst, struct tgsi_src_register *val,
-		struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
-{
-	struct ir3_instruction *instr;
-
-	instr = instr_create(ctx, 2, OPC_MAX_F);
-	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
-
-	instr = instr_create(ctx, 2, OPC_MIN_F);
-	vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
-}
-
-static void
-create_clamp_imm(struct ir3_compile_context *ctx,
-		struct tgsi_dst_register *dst,
-		uint32_t minval, uint32_t maxval)
-{
-	struct tgsi_src_register minconst, maxconst;
-	struct tgsi_src_register src;
-
-	src_from_dst(&src, dst);
-
-	get_immediate(ctx, &minconst, minval);
-	get_immediate(ctx, &maxconst, maxval);
-
-	create_clamp(ctx, dst, &src, &minconst, &maxconst);
-}
-
-static struct tgsi_dst_register *
-get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
-{
-	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-	unsigned i;
-
-	compile_assert(ctx, !ctx->using_tmp_dst);
-	ctx->using_tmp_dst = true;
-
-	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-		struct tgsi_src_register *src = &inst->Src[i].Register;
-		if ((src->File == dst->File) && (src->Index == dst->Index)) {
-			if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
-					(src->SwizzleX == TGSI_SWIZZLE_X) &&
-					(src->SwizzleY == TGSI_SWIZZLE_Y) &&
-					(src->SwizzleZ == TGSI_SWIZZLE_Z) &&
-					(src->SwizzleW == TGSI_SWIZZLE_W))
-				continue;
-			ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
-			ctx->tmp_dst.WriteMask = dst->WriteMask;
-			dst = &ctx->tmp_dst;
-			break;
-		}
-	}
-	return dst;
-}
-
-static void
-put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
-		struct tgsi_dst_register *dst)
-{
-	compile_assert(ctx, ctx->using_tmp_dst);
-	ctx->using_tmp_dst = false;
-
-	/* if necessary, add mov back into original dst: */
-	if (dst != &inst->Dst[0].Register) {
-		create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
-	}
-}
-
-/* helper to generate the necessary repeat and/or additional instructions
- * to turn a scalar instruction into a vector operation:
- */
-static void
-vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
-		struct tgsi_dst_register *dst, int nsrcs, ...)
-{
-	va_list ap;
-	int i, j, n = 0;
-
-	instr_atomic_start(ctx);
-
-	for (i = 0; i < 4; i++) {
-		if (dst->WriteMask & (1 << i)) {
-			struct ir3_instruction *cur;
-
-			if (n++ == 0) {
-				cur = instr;
-			} else {
-				cur = instr_create(ctx, instr->category, instr->opc);
-				memcpy(cur->info, instr->info, sizeof(cur->info));
-			}
-
-			add_dst_reg(ctx, cur, dst, i);
-
-			va_start(ap, nsrcs);
-			for (j = 0; j < nsrcs; j++) {
-				struct tgsi_src_register *src =
-						va_arg(ap, struct tgsi_src_register *);
-				unsigned flags = va_arg(ap, unsigned);
-				struct ir3_register *reg;
-				if (flags & IR3_REG_IMMED) {
-					reg = ir3_reg_create(cur, 0, IR3_REG_IMMED);
-					/* this is an ugly cast.. should have put flags first! */
-					reg->iim_val = *(int *)&src;
-				} else {
-					reg = add_src_reg(ctx, cur, src, src_swiz(src, i));
-				}
-				reg->flags |= flags & ~(IR3_REG_FNEG | IR3_REG_SNEG);
-				if (flags & IR3_REG_FNEG)
-					reg->flags ^= IR3_REG_FNEG;
-				if (flags & IR3_REG_SNEG)
-					reg->flags ^= IR3_REG_SNEG;
-			}
-			va_end(ap);
-		}
-	}
-
-	instr_atomic_end(ctx);
-}
-
-/*
- * Handlers for TGSI instructions which do not have a 1:1 mapping to
- * native instructions:
- */
-
-static void
-trans_clamp(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *src0 = &inst->Src[0].Register;
-	struct tgsi_src_register *src1 = &inst->Src[1].Register;
-	struct tgsi_src_register *src2 = &inst->Src[2].Register;
-
-	create_clamp(ctx, dst, src0, src1, src2);
-
-	put_dst(ctx, inst, dst);
-}
-
-/* ARL(x) = x, but mova from hrN.x to a0.. */
-static void
-trans_arl(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register *tmp_src;
-	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-	unsigned chan = src->SwizzleX;
-
-	compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);
-
-	/* NOTE: we allocate a temporary from a flat register
-	 * namespace (ignoring half vs full).  It turns out
-	 * not to really matter since registers get reassigned
-	 * later in ir3_ra which (hopefully!) can deal a bit
-	 * better with mixed half and full precision.
-	 */
-	tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-	/* cov.{u,f}{32,16}s16 Rtmp, Rsrc */
-	instr = instr_create(ctx, 1, 0);
-	instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ?
-			get_ftype(ctx) : get_utype(ctx);
-	instr->cat1.dst_type = TYPE_S16;
-	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
-	add_src_reg(ctx, instr, src, chan);
-
-	/* shl.b Rtmp, Rtmp, 2 */
-	instr = instr_create(ctx, 2, OPC_SHL_B);
-	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
-	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
-	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
-
-	/* mova a0, Rtmp */
-	instr = instr_create(ctx, 1, 0);
-	instr->cat1.src_type = TYPE_S16;
-	instr->cat1.dst_type = TYPE_S16;
-	add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
-	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
-}
-
-/*
- * texture fetch/sample instructions:
- */
-
-struct tex_info {
-	int8_t order[4];
-	int8_t args;
-	unsigned src_wrmask, flags;
-};
-
-struct target_info {
-	uint8_t dims;
-	uint8_t cube;
-	uint8_t array;
-	uint8_t shadow;
-};
-
-static const struct target_info tex_targets[] = {
-	[TGSI_TEXTURE_1D]               = { 1, 0, 0, 0 },
-	[TGSI_TEXTURE_2D]               = { 2, 0, 0, 0 },
-	[TGSI_TEXTURE_3D]               = { 3, 0, 0, 0 },
-	[TGSI_TEXTURE_CUBE]             = { 3, 1, 0, 0 },
-	[TGSI_TEXTURE_RECT]             = { 2, 0, 0, 0 },
-	[TGSI_TEXTURE_SHADOW1D]         = { 1, 0, 0, 1 },
-	[TGSI_TEXTURE_SHADOW2D]         = { 2, 0, 0, 1 },
-	[TGSI_TEXTURE_SHADOWRECT]       = { 2, 0, 0, 1 },
-	[TGSI_TEXTURE_1D_ARRAY]         = { 1, 0, 1, 0 },
-	[TGSI_TEXTURE_2D_ARRAY]         = { 2, 0, 1, 0 },
-	[TGSI_TEXTURE_SHADOW1D_ARRAY]   = { 1, 0, 1, 1 },
-	[TGSI_TEXTURE_SHADOW2D_ARRAY]   = { 2, 0, 1, 1 },
-	[TGSI_TEXTURE_SHADOWCUBE]       = { 3, 1, 0, 1 },
-	[TGSI_TEXTURE_2D_MSAA]          = { 2, 0, 0, 0 },
-	[TGSI_TEXTURE_2D_ARRAY_MSAA]    = { 2, 0, 1, 0 },
-	[TGSI_TEXTURE_CUBE_ARRAY]       = { 3, 1, 1, 0 },
-	[TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 },
-};
-
-static void
-fill_tex_info(struct ir3_compile_context *ctx,
-			  struct tgsi_full_instruction *inst,
-			  struct tex_info *info)
-{
-	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
-
-	if (tgt->dims == 3)
-		info->flags |= IR3_INSTR_3D;
-	if (tgt->array)
-		info->flags |= IR3_INSTR_A;
-	if (tgt->shadow)
-		info->flags |= IR3_INSTR_S;
-
-	switch (inst->Instruction.Opcode) {
-	case TGSI_OPCODE_TXB:
-	case TGSI_OPCODE_TXB2:
-	case TGSI_OPCODE_TXL:
-	case TGSI_OPCODE_TXF:
-		info->args = 2;
-		break;
-	case TGSI_OPCODE_TXP:
-		info->flags |= IR3_INSTR_P;
-		/* fallthrough */
-	case TGSI_OPCODE_TEX:
-	case TGSI_OPCODE_TXD:
-		info->args = 1;
-		break;
-	}
-
-	/*
-	 * lay out the first argument in the proper order:
-	 *  - actual coordinates first
-	 *  - shadow reference
-	 *  - array index
-	 *  - projection w
-	 *
-	 * bias/lod go into the second arg
-	 */
-	int arg, pos = 0;
-	for (arg = 0; arg < tgt->dims; arg++)
-		info->order[arg] = pos++;
-	if (tgt->dims == 1)
-		info->order[pos++] = -1;
-	if (tgt->shadow)
-		info->order[pos++] = MAX2(arg + tgt->array, 2);
-	if (tgt->array)
-		info->order[pos++] = arg++;
-	if (info->flags & IR3_INSTR_P)
-		info->order[pos++] = 3;
-
-	info->src_wrmask = (1 << pos) - 1;
-
-	for (; pos < 4; pos++)
-		info->order[pos] = -1;
-
-	assert(pos <= 4);
-}
-
-static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4])
-{
-	unsigned i;
-	for (i = 1; (i < 4) && order[i] >= 0; i++)
-		if (src_swiz(src, i) != (src_swiz(src, 0) + order[i]))
-			return false;
-	return true;
-}
-
-static bool is_1d(unsigned tex)
-{
-	return tex_targets[tex].dims == 1;
-}
-
-static struct tgsi_src_register *
-get_tex_coord(struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst,
-		const struct tex_info *tinf)
-{
-	struct tgsi_src_register *coord = &inst->Src[0].Register;
-	struct ir3_instruction *instr;
-	unsigned tex = inst->Texture.Texture;
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register *tmp_src;
-	type_t type_mov = get_ftype(ctx);
-	unsigned j;
-
-	/* need to move things around: */
-	tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-	for (j = 0; j < 4; j++) {
-		if (tinf->order[j] < 0)
-			continue;
-		instr = instr_create(ctx, 1, 0);  /* mov */
-		instr->cat1.src_type = type_mov;
-		instr->cat1.dst_type = type_mov;
-		add_dst_reg(ctx, instr, &tmp_dst, j);
-		add_src_reg(ctx, instr, coord,
-				src_swiz(coord, tinf->order[j]));
-	}
-
-	/* fix up .y coord: */
-	if (is_1d(tex)) {
-		struct ir3_register *imm;
-		instr = instr_create(ctx, 1, 0);  /* mov */
-		instr->cat1.src_type = type_mov;
-		instr->cat1.dst_type = type_mov;
-		add_dst_reg(ctx, instr, &tmp_dst, 1);  /* .y */
-		imm = ir3_reg_create(instr, 0, IR3_REG_IMMED);
-		if (inst->Instruction.Opcode == TGSI_OPCODE_TXF)
-			imm->iim_val = 0;
-		else
-			imm->fim_val = 0.5;
-	}
-
-	return tmp_src;
-}
-
-static void
-trans_samp(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr, *collect;
-	struct ir3_register *reg;
-	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-	struct tgsi_src_register *orig, *coord, *samp, *offset, *dpdx, *dpdy;
-	struct tgsi_src_register zero;
-	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
-	struct tex_info tinf;
-	int i;
-
-	memset(&tinf, 0, sizeof(tinf));
-	fill_tex_info(ctx, inst, &tinf);
-	coord = get_tex_coord(ctx, inst, &tinf);
-	get_immediate(ctx, &zero, 0);
-
-	switch (inst->Instruction.Opcode) {
-	case TGSI_OPCODE_TXB2:
-		orig = &inst->Src[1].Register;
-		samp = &inst->Src[2].Register;
-		break;
-	case TGSI_OPCODE_TXD:
-		orig = &inst->Src[0].Register;
-		dpdx = &inst->Src[1].Register;
-		dpdy = &inst->Src[2].Register;
-		samp = &inst->Src[3].Register;
-		if (is_rel_or_const(dpdx))
-				dpdx = get_unconst(ctx, dpdx);
-		if (is_rel_or_const(dpdy))
-				dpdy = get_unconst(ctx, dpdy);
-		break;
-	default:
-		orig = &inst->Src[0].Register;
-		samp = &inst->Src[1].Register;
-		break;
-	}
-	if (tinf.args > 1 && is_rel_or_const(orig))
-		orig = get_unconst(ctx, orig);
-
-	/* scale up integer coords for TXF based on the LOD */
-	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
-		struct tgsi_dst_register tmp_dst;
-		struct tgsi_src_register *tmp_src;
-		type_t type_mov = get_utype(ctx);
-
-		tmp_src = get_internal_temp(ctx, &tmp_dst);
-		for (i = 0; i < tgt->dims; i++) {
-			instr = instr_create(ctx, 2, OPC_SHL_B);
-			add_dst_reg(ctx, instr, &tmp_dst, i);
-			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
-			add_src_reg(ctx, instr, orig, orig->SwizzleW);
-		}
-		if (tgt->dims < 2) {
-			instr = instr_create(ctx, 1, 0);
-			instr->cat1.src_type = type_mov;
-			instr->cat1.dst_type = type_mov;
-			add_dst_reg(ctx, instr, &tmp_dst, i);
-			add_src_reg(ctx, instr, &zero, 0);
-			i++;
-		}
-		if (tgt->array) {
-			instr = instr_create(ctx, 1, 0);
-			instr->cat1.src_type = type_mov;
-			instr->cat1.dst_type = type_mov;
-			add_dst_reg(ctx, instr, &tmp_dst, i);
-			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
-		}
-		coord = tmp_src;
-	}
-
-	if (inst->Texture.NumOffsets) {
-		struct tgsi_texture_offset *tex_offset = &inst->TexOffsets[0];
-		struct tgsi_src_register offset_src = {0};
-
-		offset_src.File = tex_offset->File;
-		offset_src.Index = tex_offset->Index;
-		offset_src.SwizzleX = tex_offset->SwizzleX;
-		offset_src.SwizzleY = tex_offset->SwizzleY;
-		offset_src.SwizzleZ = tex_offset->SwizzleZ;
-		offset = get_unconst(ctx, &offset_src);
-		tinf.flags |= IR3_INSTR_O;
-	}
-
-	instr = instr_create(ctx, 5, t->opc);
-	if (ctx->integer_s & (1 << samp->Index))
-		instr->cat5.type = get_utype(ctx);
-	else
-		instr->cat5.type = get_ftype(ctx);
-	instr->cat5.samp = samp->Index;
-	instr->cat5.tex  = samp->Index;
-	instr->flags |= tinf.flags;
-
-	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
-
-	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);
-
-	collect = ir3_instr_create2(ctx->block, -1, OPC_META_FI, 12);
-	ir3_reg_create(collect, 0, 0);
-	for (i = 0; i < 4; i++) {
-		if (tinf.src_wrmask & (1 << i))
-			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
-					coord, src_swiz(coord, i));
-		else if (tinf.src_wrmask & ~((1 << i) - 1))
-			ir3_reg_create(collect, 0, 0);
-	}
-
-	/* Attach derivatives onto the end of the fan-in. Derivatives start after
-	 * the 4th argument, so make sure that fi is padded up to 4 first.
-	 */
-	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
-		while (collect->regs_count < 5)
-			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
-		for (i = 0; i < tgt->dims; i++)
-			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdx, i);
-		if (tgt->dims < 2)
-			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
-		for (i = 0; i < tgt->dims; i++)
-			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdy, i);
-		if (tgt->dims < 2)
-			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
-		tinf.src_wrmask |= ((1 << (2 * MAX2(tgt->dims, 2))) - 1) << 4;
-	}
-
-	reg->instr = collect;
-	reg->wrmask = tinf.src_wrmask;
-
-	/* The second argument contains the offsets, followed by the lod/bias
-	 * argument. This is constructed more manually due to the dynamic nature.
-	 */
-	if (inst->Texture.NumOffsets == 0 && tinf.args == 1)
-		return;
-
-	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);
-
-	collect = ir3_instr_create2(ctx->block, -1, OPC_META_FI, 5);
-	ir3_reg_create(collect, 0, 0);
-
-	if (inst->Texture.NumOffsets) {
-		for (i = 0; i < tgt->dims; i++)
-			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
-					offset, i);
-		if (tgt->dims < 2)
-			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), &zero, 0);
-	}
-	if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2)
-		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
-				orig, orig->SwizzleX);
-	else if (tinf.args > 1)
-		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
-				orig, orig->SwizzleW);
-
-	reg->instr = collect;
-	reg->wrmask = (1 << (collect->regs_count - 1)) - 1;
-}
-
-static void
-trans_txq(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-	struct tgsi_src_register *level = &inst->Src[0].Register;
-	struct tgsi_src_register *samp = &inst->Src[1].Register;
-	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
-	struct tex_info tinf;
-
-	memset(&tinf, 0, sizeof(tinf));
-	fill_tex_info(ctx, inst, &tinf);
-	if (is_rel_or_const(level))
-		level = get_unconst(ctx, level);
-
-	instr = instr_create(ctx, 5, OPC_GETSIZE);
-	instr->cat5.type = get_utype(ctx);
-	instr->cat5.samp = samp->Index;
-	instr->cat5.tex  = samp->Index;
-	instr->flags |= tinf.flags;
-
-	if (tgt->array && (dst->WriteMask & (1 << tgt->dims))) {
-		/* Array size actually ends up in .w rather than .z. This doesn't
-		 * matter for miplevel 0, but for higher mips the value in z is
-		 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
-		 * returned, which means that we have to add 1 to it for arrays.
-		 */
-		struct tgsi_dst_register tmp_dst;
-		struct tgsi_src_register *tmp_src;
-		type_t type_mov = get_utype(ctx);
-
-		tmp_src = get_internal_temp(ctx, &tmp_dst);
-		add_dst_reg_wrmask(ctx, instr, &tmp_dst, 0,
-						   dst->WriteMask | TGSI_WRITEMASK_W);
-		add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
-
-		if (dst->WriteMask & TGSI_WRITEMASK_X) {
-			instr = instr_create(ctx, 1, 0);
-			instr->cat1.src_type = type_mov;
-			instr->cat1.dst_type = type_mov;
-			add_dst_reg(ctx, instr, dst, 0);
-			add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 0));
-		}
-
-		if (tgt->dims == 2) {
-			if (dst->WriteMask & TGSI_WRITEMASK_Y) {
-				instr = instr_create(ctx, 1, 0);
-				instr->cat1.src_type = type_mov;
-				instr->cat1.dst_type = type_mov;
-				add_dst_reg(ctx, instr, dst, 1);
-				add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 1));
-			}
-		}
-
-		instr = instr_create(ctx, 2, OPC_ADD_U);
-		add_dst_reg(ctx, instr, dst, tgt->dims);
-		add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 3));
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-	} else {
-		add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
-		add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
-	}
-
-	if (dst->WriteMask & TGSI_WRITEMASK_W) {
-		/* The # of levels comes from getinfo.z. We need to add 1 to it, since
-		 * the value in TEX_CONST_0 is zero-based.
-		 */
-		struct tgsi_dst_register tmp_dst;
-		struct tgsi_src_register *tmp_src;
-
-		tmp_src = get_internal_temp(ctx, &tmp_dst);
-		instr = instr_create(ctx, 5, OPC_GETINFO);
-		instr->cat5.type = get_utype(ctx);
-		instr->cat5.samp = samp->Index;
-		instr->cat5.tex  = samp->Index;
-		add_dst_reg_wrmask(ctx, instr, &tmp_dst, 0, TGSI_WRITEMASK_Z);
-
-		instr = instr_create(ctx, 2, OPC_ADD_U);
-		add_dst_reg(ctx, instr, dst, 3);
-		add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 2));
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-	}
-}
-
-/* DDX/DDY */
-static void
-trans_deriv(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-	static const int8_t order[4] = {0, 1, 2, 3};
-
-	if (!check_swiz(src, order)) {
-		struct tgsi_dst_register tmp_dst;
-		struct tgsi_src_register *tmp_src;
-
-		tmp_src = get_internal_temp(ctx, &tmp_dst);
-		create_mov(ctx, &tmp_dst, src);
-
-		src = tmp_src;
-	}
-
-	/* This might be a workaround for hw bug?  Blob compiler always
-	 * seems to work two components at a time for dsy/dsx.  It does
-	 * actually seem to work in some cases (or at least some piglit
-	 * tests) for four components at a time.  But seems more reliable
-	 * to split this into two instructions like the blob compiler
-	 * does:
-	 */
-
-	instr = instr_create(ctx, 5, t->opc);
-	instr->cat5.type = get_ftype(ctx);
-	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3);
-	add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3);
-
-	instr = instr_create(ctx, 5, t->opc);
-	instr->cat5.type = get_ftype(ctx);
-	add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3);
-	add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3);
-}
-
-/*
- * SEQ(a,b) = (a == b) ? 1.0 : 0.0
- *   cmps.f.eq tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SNE(a,b) = (a != b) ? 1.0 : 0.0
- *   cmps.f.ne tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SGE(a,b) = (a >= b) ? 1.0 : 0.0
- *   cmps.f.ge tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SLE(a,b) = (a <= b) ? 1.0 : 0.0
- *   cmps.f.le tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SGT(a,b) = (a > b)  ? 1.0 : 0.0
- *   cmps.f.gt tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * SLT(a,b) = (a < b)  ? 1.0 : 0.0
- *   cmps.f.lt tmp0, a, b
- *   cov.u16f16 dst, tmp0
- *
- * CMP(a,b,c) = (a < 0.0) ? b : c
- *   cmps.f.lt tmp0, a, {0.0}
- *   sel.b16 dst, b, tmp0, c
- */
-static void
-trans_cmp(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register *tmp_src;
-	struct tgsi_src_register constval0;
-	/* final instruction for CMP() uses orig src1 and src2: */
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *a0, *a1, *a2;
-	unsigned condition;
-
-	tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-	a0 = &inst->Src[0].Register;  /* a */
-	a1 = &inst->Src[1].Register;  /* b */
-
-	switch (t->tgsi_opc) {
-	case TGSI_OPCODE_SEQ:
-	case TGSI_OPCODE_FSEQ:
-		condition = IR3_COND_EQ;
-		break;
-	case TGSI_OPCODE_SNE:
-	case TGSI_OPCODE_FSNE:
-		condition = IR3_COND_NE;
-		break;
-	case TGSI_OPCODE_SGE:
-	case TGSI_OPCODE_FSGE:
-		condition = IR3_COND_GE;
-		break;
-	case TGSI_OPCODE_SLT:
-	case TGSI_OPCODE_FSLT:
-		condition = IR3_COND_LT;
-		break;
-	case TGSI_OPCODE_SLE:
-		condition = IR3_COND_LE;
-		break;
-	case TGSI_OPCODE_SGT:
-		condition = IR3_COND_GT;
-		break;
-	case TGSI_OPCODE_CMP:
-		get_immediate(ctx, &constval0, fui(0.0));
-		a0 = &inst->Src[0].Register;  /* a */
-		a1 = &constval0;              /* {0.0} */
-		condition = IR3_COND_LT;
-		break;
-	default:
-		compile_assert(ctx, 0);
-		return;
-	}
-
-	if (is_const(a0) && is_const(a1))
-		a0 = get_unconst(ctx, a0);
-
-	/* cmps.f.<cond> tmp, a0, a1 */
-	instr = instr_create(ctx, 2, OPC_CMPS_F);
-	instr->cat2.condition = condition;
-	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
-
-	switch (t->tgsi_opc) {
-	case TGSI_OPCODE_SEQ:
-	case TGSI_OPCODE_SGE:
-	case TGSI_OPCODE_SLE:
-	case TGSI_OPCODE_SNE:
-	case TGSI_OPCODE_SGT:
-	case TGSI_OPCODE_SLT:
-		/* cov.u16f16 dst, tmp0 */
-		instr = instr_create(ctx, 1, 0);
-		instr->cat1.src_type = get_utype(ctx);
-		instr->cat1.dst_type = get_ftype(ctx);
-		vectorize(ctx, instr, dst, 1, tmp_src, 0);
-		break;
-	case TGSI_OPCODE_FSEQ:
-	case TGSI_OPCODE_FSGE:
-	case TGSI_OPCODE_FSNE:
-	case TGSI_OPCODE_FSLT:
-		/* absneg.s dst, (neg)tmp0 */
-		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
-		vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_SNEG);
-		break;
-	case TGSI_OPCODE_CMP:
-		a1 = &inst->Src[1].Register;
-		a2 = &inst->Src[2].Register;
-		/* sel.{b32,b16} dst, src2, tmp, src1 */
-		instr = instr_create(ctx, 3, OPC_SEL_B32);
-		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);
 
-		break;
-	}
-
-	put_dst(ctx, inst, dst);
-}
-
-/*
- * USNE(a,b) = (a != b) ? ~0 : 0
- *   cmps.u32.ne dst, a, b
- *
- * USEQ(a,b) = (a == b) ? ~0 : 0
- *   cmps.u32.eq dst, a, b
- *
- * ISGE(a,b) = (a > b) ? ~0 : 0
- *   cmps.s32.ge dst, a, b
- *
- * USGE(a,b) = (a > b) ? ~0 : 0
- *   cmps.u32.ge dst, a, b
- *
- * ISLT(a,b) = (a < b) ? ~0 : 0
- *   cmps.s32.lt dst, a, b
- *
- * USLT(a,b) = (a < b) ? ~0 : 0
- *   cmps.u32.lt dst, a, b
- *
- */
-static void
-trans_icmp(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register *tmp_src;
-	struct tgsi_src_register *a0, *a1;
-	unsigned condition;
-
-	a0 = &inst->Src[0].Register;  /* a */
-	a1 = &inst->Src[1].Register;  /* b */
-
-	switch (t->tgsi_opc) {
-	case TGSI_OPCODE_USNE:
-		condition = IR3_COND_NE;
-		break;
-	case TGSI_OPCODE_USEQ:
-		condition = IR3_COND_EQ;
-		break;
-	case TGSI_OPCODE_ISGE:
-	case TGSI_OPCODE_USGE:
-		condition = IR3_COND_GE;
-		break;
-	case TGSI_OPCODE_ISLT:
-	case TGSI_OPCODE_USLT:
-		condition = IR3_COND_LT;
-		break;
-
-	default:
-		compile_assert(ctx, 0);
-		return;
-	}
-
-	if (is_const(a0) && is_const(a1))
-		a0 = get_unconst(ctx, a0);
-
-	tmp_src = get_internal_temp(ctx, &tmp_dst);
-	/* cmps.{u32,s32}.<cond> tmp, a0, a1 */
-	instr = instr_create(ctx, 2, t->opc);
-	instr->cat2.condition = condition;
-	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);
-
-	/* absneg.s dst, (neg)tmp */
-	instr = instr_create(ctx, 2, OPC_ABSNEG_S);
-	vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_SNEG);
-
-	put_dst(ctx, inst, dst);
-}
-
-/*
- * UCMP(a,b,c) = a ? b : c
- *   sel.b16 dst, b, a, c
- */
-static void
-trans_ucmp(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *a0, *a1, *a2;
-
-	a0 = &inst->Src[0].Register;  /* a */
-	a1 = &inst->Src[1].Register;  /* b */
-	a2 = &inst->Src[2].Register;  /* c */
-
-	if (is_rel_or_const(a0))
-		a0 = get_unconst(ctx, a0);
-
-	/* sel.{b32,b16} dst, b, a, c */
-	instr = instr_create(ctx, 3, OPC_SEL_B32);
-	vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0);
-	put_dst(ctx, inst, dst);
-}
-
-/*
- * ISSG(a) = a < 0 ? -1 : a > 0 ? 1 : 0
- *   cmps.s.lt tmp_neg, a, 0  # 1 if a is negative
- *   cmps.s.gt tmp_pos, a, 0  # 1 if a is positive
- *   sub.u dst, tmp_pos, tmp_neg
- */
-static void
-trans_issg(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *a = &inst->Src[0].Register;
-	struct tgsi_dst_register neg_dst, pos_dst;
-	struct tgsi_src_register *neg_src, *pos_src;
-
-	neg_src = get_internal_temp(ctx, &neg_dst);
-	pos_src = get_internal_temp(ctx, &pos_dst);
-
-	/* cmps.s.lt neg, a, 0 */
-	instr = instr_create(ctx, 2, OPC_CMPS_S);
-	instr->cat2.condition = IR3_COND_LT;
-	vectorize(ctx, instr, &neg_dst, 2, a, 0, 0, IR3_REG_IMMED);
-
-	/* cmps.s.gt pos, a, 0 */
-	instr = instr_create(ctx, 2, OPC_CMPS_S);
-	instr->cat2.condition = IR3_COND_GT;
-	vectorize(ctx, instr, &pos_dst, 2, a, 0, 0, IR3_REG_IMMED);
-
-	/* sub.u dst, pos, neg */
-	instr = instr_create(ctx, 2, OPC_SUB_U);
-	vectorize(ctx, instr, dst, 2, pos_src, 0, neg_src, 0);
-
-	put_dst(ctx, inst, dst);
-}
-
-
-
-/*
- * Conditional / Flow control
- */
-
-static void
-push_branch(struct ir3_compile_context *ctx, bool inv,
-		struct ir3_instruction *instr, struct ir3_instruction *cond)
-{
-	unsigned int idx = ctx->branch_count++;
-	compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
-	ctx->branch[idx].instr = instr;
-	ctx->branch[idx].inv = inv;
-	/* else side of branch has same condition: */
-	if (!inv)
-		ctx->branch[idx].cond = cond;
-}
-
-static struct ir3_instruction *
-pop_branch(struct ir3_compile_context *ctx)
-{
-	unsigned int idx = --ctx->branch_count;
-	return ctx->branch[idx].instr;
-}
-
-static void
-trans_if(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr, *cond;
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-	struct tgsi_dst_register tmp_dst;
-	struct tgsi_src_register *tmp_src;
-	struct tgsi_src_register constval;
-
-	get_immediate(ctx, &constval, fui(0.0));
-	tmp_src = get_internal_temp(ctx, &tmp_dst);
-
-	if (is_const(src))
-		src = get_unconst(ctx, src);
-
-	/* cmps.{f,u}.ne tmp0, b, {0.0} */
-	instr = instr_create(ctx, 2, t->opc);
-	add_dst_reg(ctx, instr, &tmp_dst, 0);
-	add_src_reg(ctx, instr, src, src->SwizzleX);
-	add_src_reg(ctx, instr, &constval, constval.SwizzleX);
-	instr->cat2.condition = IR3_COND_NE;
-
-	compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
-	cond = instr->regs[1]->instr;
-
-	/* meta:flow tmp0 */
-	instr = instr_create(ctx, -1, OPC_META_FLOW);
-	ir3_reg_create(instr, 0, 0);  /* dummy dst */
-	add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);
-
-	push_branch(ctx, false, instr, cond);
-	instr->flow.if_block = push_block(ctx);
-}
-
-static void
-trans_else(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-
-	pop_block(ctx);
-
-	instr = pop_branch(ctx);
-
-	compile_assert(ctx, (instr->category == -1) &&
-			(instr->opc == OPC_META_FLOW));
-
-	push_branch(ctx, true, instr, NULL);
-	instr->flow.else_block = push_block(ctx);
-}
-
-static struct ir3_instruction *
-find_temporary(struct ir3_block *block, unsigned n)
-{
-	if (block->parent && !block->temporaries[n])
-		return find_temporary(block->parent, n);
-	return block->temporaries[n];
-}
-
-static struct ir3_instruction *
-find_output(struct ir3_block *block, unsigned n)
-{
-	if (block->parent && !block->outputs[n])
-		return find_output(block->parent, n);
-	return block->outputs[n];
-}
-
-static struct ir3_instruction *
-create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
-		struct ir3_instruction *a, struct ir3_instruction *b)
-{
-	struct ir3_instruction *phi;
-
-	compile_assert(ctx, cond);
-
-	/* Either side of the condition could be null..  which
-	 * indicates a variable written on only one side of the
-	 * branch.  Normally this should only be variables not
-	 * used outside of that side of the branch.  So we could
-	 * just 'return a ? a : b;' in that case.  But for better
-	 * defined undefined behavior we just stick in imm{0.0}.
-	 * In the common case of a value only used within the
-	 * one side of the branch, the PHI instruction will not
-	 * get scheduled
-	 */
-	if (!a)
-		a = create_immed(ctx, 0.0);
-	if (!b)
-		b = create_immed(ctx, 0.0);
-
-	phi = instr_create(ctx, -1, OPC_META_PHI);
-	ir3_reg_create(phi, 0, 0);  /* dummy dst */
-	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
-	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
-	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;
-
-	return phi;
-}
-
-static void
-trans_endif(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct ir3_block *ifb, *elseb;
-	struct ir3_instruction **ifout, **elseout;
-	unsigned i, ifnout = 0, elsenout = 0;
-
-	pop_block(ctx);
-
-	instr = pop_branch(ctx);
-
-	compile_assert(ctx, (instr->category == -1) &&
-			(instr->opc == OPC_META_FLOW));
-
-	ifb = instr->flow.if_block;
-	elseb = instr->flow.else_block;
-	/* if there is no else block, the parent block is used for the
-	 * branch-not-taken src of the PHI instructions:
-	 */
-	if (!elseb)
-		elseb = ifb->parent;
-
-	/* worst case sizes: */
-	ifnout = ifb->ntemporaries + ifb->noutputs;
-	elsenout = elseb->ntemporaries + elseb->noutputs;
-
-	ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
-	if (elseb != ifb->parent)
-		elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);
-
-	ifnout = 0;
-	elsenout = 0;
-
-	/* generate PHI instructions for any temporaries written: */
-	for (i = 0; i < ifb->ntemporaries; i++) {
-		struct ir3_instruction *a = ifb->temporaries[i];
-		struct ir3_instruction *b = elseb->temporaries[i];
-
-		/* if temporary written in if-block, or if else block
-		 * is present and temporary written in else-block:
-		 */
-		if (a || ((elseb != ifb->parent) && b)) {
-			struct ir3_instruction *phi;
-
-			/* if only written on one side, find the closest
-			 * enclosing update on other side:
-			 */
-			if (!a)
-				a = find_temporary(ifb, i);
-			if (!b)
-				b = find_temporary(elseb, i);
-
-			ifout[ifnout] = a;
-			a = create_output(ifb, a, ifnout++);
-
-			if (elseb != ifb->parent) {
-				elseout[elsenout] = b;
-				b = create_output(elseb, b, elsenout++);
-			}
-
-			phi = create_phi(ctx, instr, a, b);
-			ctx->block->temporaries[i] = phi;
-		}
-	}
-
-	compile_assert(ctx, ifb->noutputs == elseb->noutputs);
-
-	/* .. and any outputs written: */
-	for (i = 0; i < ifb->noutputs; i++) {
-		struct ir3_instruction *a = ifb->outputs[i];
-		struct ir3_instruction *b = elseb->outputs[i];
-
-		/* if output written in if-block, or if else block
-		 * is present and output written in else-block:
-		 */
-		if (a || ((elseb != ifb->parent) && b)) {
-			struct ir3_instruction *phi;
-
-			/* if only written on one side, find the closest
-			 * enclosing update on other side:
-			 */
-			if (!a)
-				a = find_output(ifb, i);
-			if (!b)
-				b = find_output(elseb, i);
-
-			ifout[ifnout] = a;
-			a = create_output(ifb, a, ifnout++);
-
-			if (elseb != ifb->parent) {
-				elseout[elsenout] = b;
-				b = create_output(elseb, b, elsenout++);
-			}
-
-			phi = create_phi(ctx, instr, a, b);
-			ctx->block->outputs[i] = phi;
-		}
-	}
-
-	ifb->noutputs = ifnout;
-	ifb->outputs = ifout;
-
-	if (elseb != ifb->parent) {
-		elseb->noutputs = elsenout;
-		elseb->outputs = elseout;
-	}
-
-	// TODO maybe we want to compact block->inputs?
-}
-
-/*
- * Kill
- */
-
-static void
-trans_kill(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr, *immed, *cond = NULL;
-	bool inv = false;
-
-	/* unconditional kill, use enclosing if condition: */
-	if (ctx->branch_count > 0) {
-		unsigned int idx = ctx->branch_count - 1;
-		cond = ctx->branch[idx].cond;
-		inv = ctx->branch[idx].inv;
-	} else {
-		cond = create_immed(ctx, 1.0);
-	}
-
-	compile_assert(ctx, cond);
-
-	immed = create_immed(ctx, 0.0);
-
-	/* cmps.f.ne p0.x, cond, {0.0} */
-	instr = instr_create(ctx, 2, OPC_CMPS_F);
-	instr->cat2.condition = IR3_COND_NE;
-	ir3_reg_create(instr, regid(REG_P0, 0), 0);
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
-	cond = instr;
-
-	/* kill p0.x */
-	instr = instr_create(ctx, 0, OPC_KILL);
-	instr->cat0.inv = inv;
-	ir3_reg_create(instr, 0, 0);  /* dummy dst */
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
-
-	ctx->kill[ctx->kill_count++] = instr;
-
-	ctx->so->has_kill = true;
-}
-
-/*
- * Kill-If
- */
-
-static void
-trans_killif(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-	struct ir3_instruction *instr, *immed, *cond = NULL;
-	bool inv = false;
-
-	immed = create_immed(ctx, 0.0);
-
-	/* cmps.f.ne p0.x, cond, {0.0} */
-	instr = instr_create(ctx, 2, OPC_CMPS_F);
-	instr->cat2.condition = IR3_COND_NE;
-	ir3_reg_create(instr, regid(REG_P0, 0), 0);
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
-	add_src_reg(ctx, instr, src, src->SwizzleX);
-
-	cond = instr;
-
-	/* kill p0.x */
-	instr = instr_create(ctx, 0, OPC_KILL);
-	instr->cat0.inv = inv;
-	ir3_reg_create(instr, 0, 0);  /* dummy dst */
-	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
-
-	ctx->kill[ctx->kill_count++] = instr;
-
-	ctx->so->has_kill = true;
-
-}
-/*
- * I2F / U2F / F2I / F2U
- */
-
-static void
-trans_cov(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-
-	// cov.f32s32 dst, tmp0 /
-	instr = instr_create(ctx, 1, 0);
-	switch (t->tgsi_opc) {
-	case TGSI_OPCODE_U2F:
-		instr->cat1.src_type = TYPE_U32;
-		instr->cat1.dst_type = TYPE_F32;
-		break;
-	case TGSI_OPCODE_I2F:
-		instr->cat1.src_type = TYPE_S32;
-		instr->cat1.dst_type = TYPE_F32;
-		break;
-	case TGSI_OPCODE_F2U:
-		instr->cat1.src_type = TYPE_F32;
-		instr->cat1.dst_type = TYPE_U32;
-		break;
-	case TGSI_OPCODE_F2I:
-		instr->cat1.src_type = TYPE_F32;
-		instr->cat1.dst_type = TYPE_S32;
-		break;
-
-	}
-	vectorize(ctx, instr, dst, 1, src, 0);
-	put_dst(ctx, inst, dst);
-}
-
-/*
- * UMUL / UMAD
- *
- * There is no 32-bit multiply instruction, so splitting a and b into high and
- * low components, we get that
- *
- * dst = al * bl + ah * bl << 16 + al * bh << 16
- *
- *  mull.u tmp0, a, b (mul low, i.e. al * bl)
- *  madsh.m16 tmp1, a, b, tmp0 (mul-add shift high mix, i.e. ah * bl << 16)
- *  madsh.m16 dst, b, a, tmp1 (i.e. al * bh << 16)
- *
- * For UMAD, add in the extra argument after mull.u.
- */
-static void
-trans_umul(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *a = &inst->Src[0].Register;
-	struct tgsi_src_register *b = &inst->Src[1].Register;
-
-	struct tgsi_dst_register tmp0_dst, tmp1_dst;
-	struct tgsi_src_register *tmp0_src, *tmp1_src;
-
-	tmp0_src = get_internal_temp(ctx, &tmp0_dst);
-	tmp1_src = get_internal_temp(ctx, &tmp1_dst);
-
-	if (is_rel_or_const(a))
-		a = get_unconst(ctx, a);
-	if (is_rel_or_const(b))
-		b = get_unconst(ctx, b);
-
-	/* mull.u tmp0, a, b */
-	instr = instr_create(ctx, 2, OPC_MULL_U);
-	vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0);
-
-	if (t->tgsi_opc == TGSI_OPCODE_UMAD) {
-		struct tgsi_src_register *c = &inst->Src[2].Register;
-
-		/* add.u tmp0, tmp0, c */
-		instr = instr_create(ctx, 2, OPC_ADD_U);
-		vectorize(ctx, instr, &tmp0_dst, 2, tmp0_src, 0, c, 0);
-	}
-
-	/* madsh.m16 tmp1, a, b, tmp0 */
-	instr = instr_create(ctx, 3, OPC_MADSH_M16);
-	vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0);
-
-	/* madsh.m16 dst, b, a, tmp1 */
-	instr = instr_create(ctx, 3, OPC_MADSH_M16);
-	vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0);
-	put_dst(ctx, inst, dst);
-}
-
-/*
- * IDIV / UDIV / MOD / UMOD
- *
- * See NV50LegalizeSSA::handleDIV for the origin of this implementation. For
- * MOD/UMOD, it becomes a - [IU]DIV(a, modulus) * modulus.
- */
-static void
-trans_idiv(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct ir3_instruction *instr;
-	struct tgsi_dst_register *dst = get_dst(ctx, inst), *premod_dst = dst;
-	struct tgsi_src_register *a = &inst->Src[0].Register;
-	struct tgsi_src_register *b = &inst->Src[1].Register;
-
-	struct tgsi_dst_register af_dst, bf_dst, q_dst, r_dst, a_dst, b_dst;
-	struct tgsi_src_register *af_src, *bf_src, *q_src, *r_src, *a_src, *b_src;
-
-	struct tgsi_src_register negative_2, thirty_one;
-	type_t src_type;
-
-	if (t->tgsi_opc == TGSI_OPCODE_IDIV || t->tgsi_opc == TGSI_OPCODE_MOD)
-		src_type = get_stype(ctx);
-	else
-		src_type = get_utype(ctx);
-
-	af_src = get_internal_temp(ctx, &af_dst);
-	bf_src = get_internal_temp(ctx, &bf_dst);
-	q_src = get_internal_temp(ctx, &q_dst);
-	r_src = get_internal_temp(ctx, &r_dst);
-	a_src = get_internal_temp(ctx, &a_dst);
-	b_src = get_internal_temp(ctx, &b_dst);
-
-	get_immediate(ctx, &negative_2, -2);
-	get_immediate(ctx, &thirty_one, 31);
-
-	if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD)
-		premod_dst = &q_dst;
-
-	/* cov.[us]32f32 af, numerator */
-	instr = instr_create(ctx, 1, 0);
-	instr->cat1.src_type = src_type;
-	instr->cat1.dst_type = get_ftype(ctx);
-	vectorize(ctx, instr, &af_dst, 1, a, 0);
-
-	/* cov.[us]32f32 bf, denominator */
-	instr = instr_create(ctx, 1, 0);
-	instr->cat1.src_type = src_type;
-	instr->cat1.dst_type = get_ftype(ctx);
-	vectorize(ctx, instr, &bf_dst, 1, b, 0);
-
-	/* Get the absolute values for IDIV */
-	if (type_sint(src_type)) {
-		/* absneg.f af, (abs)af */
-		instr = instr_create(ctx, 2, OPC_ABSNEG_F);
-		vectorize(ctx, instr, &af_dst, 1, af_src, IR3_REG_FABS);
-
-		/* absneg.f bf, (abs)bf */
-		instr = instr_create(ctx, 2, OPC_ABSNEG_F);
-		vectorize(ctx, instr, &bf_dst, 1, bf_src, IR3_REG_FABS);
-
-		/* absneg.s a, (abs)numerator */
-		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
-		vectorize(ctx, instr, &a_dst, 1, a, IR3_REG_SABS);
-
-		/* absneg.s b, (abs)denominator */
-		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
-		vectorize(ctx, instr, &b_dst, 1, b, IR3_REG_SABS);
-	} else {
-		/* mov.u32u32 a, numerator */
-		instr = instr_create(ctx, 1, 0);
-		instr->cat1.src_type = src_type;
-		instr->cat1.dst_type = src_type;
-		vectorize(ctx, instr, &a_dst, 1, a, 0);
-
-		/* mov.u32u32 b, denominator */
-		instr = instr_create(ctx, 1, 0);
-		instr->cat1.src_type = src_type;
-		instr->cat1.dst_type = src_type;
-		vectorize(ctx, instr, &b_dst, 1, b, 0);
-	}
-
-	/* rcp.f bf, bf */
-	instr = instr_create(ctx, 4, OPC_RCP);
-	vectorize(ctx, instr, &bf_dst, 1, bf_src, 0);
-
-	/* That's right, subtract 2 as an integer from the float */
-	/* add.u bf, bf, -2 */
-	instr = instr_create(ctx, 2, OPC_ADD_U);
-	vectorize(ctx, instr, &bf_dst, 2, bf_src, 0, &negative_2, 0);
-
-	/* mul.f q, af, bf */
-	instr = instr_create(ctx, 2, OPC_MUL_F);
-	vectorize(ctx, instr, &q_dst, 2, af_src, 0, bf_src, 0);
-
-	/* cov.f32[us]32 q, q */
-	instr = instr_create(ctx, 1, 0);
-	instr->cat1.src_type = get_ftype(ctx);
-	instr->cat1.dst_type = src_type;
-	vectorize(ctx, instr, &q_dst, 1, q_src, 0);
-
-	/* integer multiply q by b */
-	/* mull.u r, q, b */
-	instr = instr_create(ctx, 2, OPC_MULL_U);
-	vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
-
-	/* madsh.m16 r, q, b, r */
-	instr = instr_create(ctx, 3, OPC_MADSH_M16);
-	vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
-
-	/* madsh.m16, r, b, q, r */
-	instr = instr_create(ctx, 3, OPC_MADSH_M16);
-	vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
-
-	/* sub.u r, a, r */
-	instr = instr_create(ctx, 2, OPC_SUB_U);
-	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
-
-	/* cov.u32f32, r, r */
-	instr = instr_create(ctx, 1, 0);
-	instr->cat1.src_type = get_utype(ctx);
-	instr->cat1.dst_type = get_ftype(ctx);
-	vectorize(ctx, instr, &r_dst, 1, r_src, 0);
-
-	/* mul.f r, r, bf */
-	instr = instr_create(ctx, 2, OPC_MUL_F);
-	vectorize(ctx, instr, &r_dst, 2, r_src, 0, bf_src, 0);
-
-	/* cov.f32u32 r, r */
-	instr = instr_create(ctx, 1, 0);
-	instr->cat1.src_type = get_ftype(ctx);
-	instr->cat1.dst_type = get_utype(ctx);
-	vectorize(ctx, instr, &r_dst, 1, r_src, 0);
-
-	/* add.u q, q, r */
-	instr = instr_create(ctx, 2, OPC_ADD_U);
-	vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
-
-	/* mull.u r, q, b */
-	instr = instr_create(ctx, 2, OPC_MULL_U);
-	vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);
-
-	/* madsh.m16 r, q, b, r */
-	instr = instr_create(ctx, 3, OPC_MADSH_M16);
-	vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);
-
-	/* madsh.m16 r, b, q, r */
-	instr = instr_create(ctx, 3, OPC_MADSH_M16);
-	vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);
-
-	/* sub.u r, a, r */
-	instr = instr_create(ctx, 2, OPC_SUB_U);
-	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);
-
-	/* cmps.u.ge r, r, b */
-	instr = instr_create(ctx, 2, OPC_CMPS_U);
-	instr->cat2.condition = IR3_COND_GE;
-	vectorize(ctx, instr, &r_dst, 2, r_src, 0, b_src, 0);
-
-	if (type_uint(src_type)) {
-		/* add.u dst, q, r */
-		instr = instr_create(ctx, 2, OPC_ADD_U);
-		vectorize(ctx, instr, premod_dst, 2, q_src, 0, r_src, 0);
-	} else {
-		/* add.u q, q, r */
-		instr = instr_create(ctx, 2, OPC_ADD_U);
-		vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);
-
-		/* negate result based on the original arguments */
-		if (is_const(a) && is_const(b))
-			a = get_unconst(ctx, a);
-
-		/* xor.b r, numerator, denominator */
-		instr = instr_create(ctx, 2, OPC_XOR_B);
-		vectorize(ctx, instr, &r_dst, 2, a, 0, b, 0);
-
-		/* shr.b r, r, 31 */
-		instr = instr_create(ctx, 2, OPC_SHR_B);
-		vectorize(ctx, instr, &r_dst, 2, r_src, 0, &thirty_one, 0);
-
-		/* absneg.s b, (neg)q */
-		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
-		vectorize(ctx, instr, &b_dst, 1, q_src, IR3_REG_SNEG);
-
-		/* sel.b dst, b, r, q */
-		instr = instr_create(ctx, 3, OPC_SEL_B32);
-		vectorize(ctx, instr, premod_dst, 3, b_src, 0, r_src, 0, q_src, 0);
-	}
-
-	if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) {
-		/* The division result will have ended up in q. */
-
-		if (is_rel_or_const(b))
-			b = get_unconst(ctx, b);
-
-		/* mull.u r, q, b */
-		instr = instr_create(ctx, 2, OPC_MULL_U);
-		vectorize(ctx, instr, &r_dst, 2, q_src, 0, b, 0);
-
-		/* madsh.m16 r, q, b, r */
-		instr = instr_create(ctx, 3, OPC_MADSH_M16);
-		vectorize(ctx, instr, &r_dst, 3, q_src, 0, b, 0, r_src, 0);
-
-		/* madsh.m16 r, b, q, r */
-		instr = instr_create(ctx, 3, OPC_MADSH_M16);
-		vectorize(ctx, instr, &r_dst, 3, b, 0, q_src, 0, r_src, 0);
-
-		/* sub.u dst, a, r */
-		instr = instr_create(ctx, 2, OPC_SUB_U);
-		vectorize(ctx, instr, dst, 2, a, 0, r_src, 0);
-	}
-
-	put_dst(ctx, inst, dst);
-}
-
-/*
- * Handlers for TGSI instructions which do have 1:1 mapping to native
- * instructions:
- */
-
-static void
-instr_cat0(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
+struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id)
 {
-	instr_create(ctx, 0, t->opc);
+	struct ir3_compiler *c = rzalloc(NULL, struct ir3_compiler);
+	c->gpu_id = gpu_id;  /* record the caller-supplied GPU id */
+	c->set = ir3_ra_alloc_reg_set(c);  /* the new compiler object is itself the argument */
+	return c;
 }
 
-static void
-instr_cat1(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
+void ir3_compiler_destroy(struct ir3_compiler *compiler)
 {
-	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-
-	/* NOTE: atomic start/end, rather than in create_mov() since
-	 * create_mov() is used already w/in atomic sequences (and
-	 * we aren't clever enough to deal with the nesting)
-	 */
-	instr_atomic_start(ctx);
-	create_mov(ctx, dst, src);
-	instr_atomic_end(ctx);
-}
-
-static void
-instr_cat2(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *src0 = &inst->Src[0].Register;
-	struct tgsi_src_register *src1 = &inst->Src[1].Register;
-	struct ir3_instruction *instr;
-	unsigned src0_flags = 0, src1_flags = 0;
-
-	switch (t->tgsi_opc) {
-	case TGSI_OPCODE_ABS:
-		src0_flags = IR3_REG_FABS;
-		break;
-	case TGSI_OPCODE_IABS:
-		src0_flags = IR3_REG_SABS;
-		break;
-	case TGSI_OPCODE_INEG:
-		src0_flags = IR3_REG_SNEG;
-		break;
-	case TGSI_OPCODE_SUB:
-		src1_flags = IR3_REG_FNEG;
-		break;
-	}
-
-	switch (t->opc) {
-	case OPC_ABSNEG_F:
-	case OPC_ABSNEG_S:
-	case OPC_CLZ_B:
-	case OPC_CLZ_S:
-	case OPC_SIGN_F:
-	case OPC_FLOOR_F:
-	case OPC_CEIL_F:
-	case OPC_RNDNE_F:
-	case OPC_RNDAZ_F:
-	case OPC_TRUNC_F:
-	case OPC_NOT_B:
-	case OPC_BFREV_B:
-	case OPC_SETRM:
-	case OPC_CBITS_B:
-		/* these only have one src reg */
-		instr = instr_create(ctx, 2, t->opc);
-		vectorize(ctx, instr, dst, 1, src0, src0_flags);
-		break;
-	default:
-		if (is_const(src0) && is_const(src1))
-			src0 = get_unconst(ctx, src0);
-
-		instr = instr_create(ctx, 2, t->opc);
-		vectorize(ctx, instr, dst, 2, src0, src0_flags,
-				src1, src1_flags);
-		break;
-	}
-
-	put_dst(ctx, inst, dst);
-}
-
-static void
-instr_cat3(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *src0 = &inst->Src[0].Register;
-	struct tgsi_src_register *src1 = &inst->Src[1].Register;
-	struct ir3_instruction *instr;
-
-	/* in particular, can't handle const for src1 for cat3..
-	 * for mad, we can swap first two src's if needed:
-	 */
-	if (is_rel_or_const(src1)) {
-		if (is_mad(t->opc) && !is_rel_or_const(src0)) {
-			struct tgsi_src_register *tmp;
-			tmp = src0;
-			src0 = src1;
-			src1 = tmp;
-		} else {
-			src1 = get_unconst(ctx, src1);
-		}
-	}
-
-	instr = instr_create(ctx, 3, t->opc);
-	vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
-			&inst->Src[2].Register, 0);
-	put_dst(ctx, inst, dst);
-}
-
-static void
-instr_cat4(const struct instr_translater *t,
-		struct ir3_compile_context *ctx,
-		struct tgsi_full_instruction *inst)
-{
-	struct tgsi_dst_register *dst = get_dst(ctx, inst);
-	struct tgsi_src_register *src = &inst->Src[0].Register;
-	struct ir3_instruction *instr;
-	unsigned i;
-
-	/* seems like blob compiler avoids const as src.. */
-	if (is_const(src))
-		src = get_unconst(ctx, src);
-
-	/* we need to replicate into each component: */
-	for (i = 0; i < 4; i++) {
-		if (dst->WriteMask & (1 << i)) {
-			instr = instr_create(ctx, 4, t->opc);
-			add_dst_reg(ctx, instr, dst, i);
-			add_src_reg(ctx, instr, src, src->SwizzleX);
-		}
-	}
-
-	put_dst(ctx, inst, dst);
-}
-
-static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
-#define INSTR(n, f, ...) \
-	[TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
-
-	INSTR(MOV,          instr_cat1),
-	INSTR(RCP,          instr_cat4, .opc = OPC_RCP),
-	INSTR(RSQ,          instr_cat4, .opc = OPC_RSQ),
-	INSTR(SQRT,         instr_cat4, .opc = OPC_SQRT),
-	INSTR(MUL,          instr_cat2, .opc = OPC_MUL_F),
-	INSTR(ADD,          instr_cat2, .opc = OPC_ADD_F),
-	INSTR(SUB,          instr_cat2, .opc = OPC_ADD_F),
-	INSTR(MIN,          instr_cat2, .opc = OPC_MIN_F),
-	INSTR(MAX,          instr_cat2, .opc = OPC_MAX_F),
-	INSTR(UADD,         instr_cat2, .opc = OPC_ADD_U),
-	INSTR(IMIN,         instr_cat2, .opc = OPC_MIN_S),
-	INSTR(UMIN,         instr_cat2, .opc = OPC_MIN_U),
-	INSTR(IMAX,         instr_cat2, .opc = OPC_MAX_S),
-	INSTR(UMAX,         instr_cat2, .opc = OPC_MAX_U),
-	INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
-	INSTR(OR,           instr_cat2, .opc = OPC_OR_B),
-	INSTR(NOT,          instr_cat2, .opc = OPC_NOT_B),
-	INSTR(XOR,          instr_cat2, .opc = OPC_XOR_B),
-	INSTR(UMUL,         trans_umul),
-	INSTR(UMAD,         trans_umul),
-	INSTR(UDIV,         trans_idiv),
-	INSTR(IDIV,         trans_idiv),
-	INSTR(MOD,          trans_idiv),
-	INSTR(UMOD,         trans_idiv),
-	INSTR(SHL,          instr_cat2, .opc = OPC_SHL_B),
-	INSTR(USHR,         instr_cat2, .opc = OPC_SHR_B),
-	INSTR(ISHR,         instr_cat2, .opc = OPC_ASHR_B),
-	INSTR(IABS,         instr_cat2, .opc = OPC_ABSNEG_S),
-	INSTR(INEG,         instr_cat2, .opc = OPC_ABSNEG_S),
-	INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
-	INSTR(MAD,          instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
-	INSTR(TRUNC,        instr_cat2, .opc = OPC_TRUNC_F),
-	INSTR(CLAMP,        trans_clamp),
-	INSTR(FLR,          instr_cat2, .opc = OPC_FLOOR_F),
-	INSTR(ROUND,        instr_cat2, .opc = OPC_RNDNE_F),
-	INSTR(SSG,          instr_cat2, .opc = OPC_SIGN_F),
-	INSTR(CEIL,         instr_cat2, .opc = OPC_CEIL_F),
-	INSTR(ARL,          trans_arl),
-	INSTR(UARL,         trans_arl),
-	INSTR(EX2,          instr_cat4, .opc = OPC_EXP2),
-	INSTR(LG2,          instr_cat4, .opc = OPC_LOG2),
-	INSTR(ABS,          instr_cat2, .opc = OPC_ABSNEG_F),
-	INSTR(COS,          instr_cat4, .opc = OPC_COS),
-	INSTR(SIN,          instr_cat4, .opc = OPC_SIN),
-	INSTR(TEX,          trans_samp, .opc = OPC_SAM),
-	INSTR(TXP,          trans_samp, .opc = OPC_SAM),
-	INSTR(TXB,          trans_samp, .opc = OPC_SAMB),
-	INSTR(TXB2,         trans_samp, .opc = OPC_SAMB),
-	INSTR(TXL,          trans_samp, .opc = OPC_SAML),
-	INSTR(TXD,          trans_samp, .opc = OPC_SAMGQ),
-	INSTR(TXF,          trans_samp, .opc = OPC_ISAML),
-	INSTR(TXQ,          trans_txq),
-	INSTR(DDX,          trans_deriv, .opc = OPC_DSX),
-	INSTR(DDY,          trans_deriv, .opc = OPC_DSY),
-	INSTR(SGT,          trans_cmp),
-	INSTR(SLT,          trans_cmp),
-	INSTR(FSLT,         trans_cmp),
-	INSTR(SGE,          trans_cmp),
-	INSTR(FSGE,         trans_cmp),
-	INSTR(SLE,          trans_cmp),
-	INSTR(SNE,          trans_cmp),
-	INSTR(FSNE,         trans_cmp),
-	INSTR(SEQ,          trans_cmp),
-	INSTR(FSEQ,         trans_cmp),
-	INSTR(CMP,          trans_cmp),
-	INSTR(USNE,         trans_icmp, .opc = OPC_CMPS_U),
-	INSTR(USEQ,         trans_icmp, .opc = OPC_CMPS_U),
-	INSTR(ISGE,         trans_icmp, .opc = OPC_CMPS_S),
-	INSTR(USGE,         trans_icmp, .opc = OPC_CMPS_U),
-	INSTR(ISLT,         trans_icmp, .opc = OPC_CMPS_S),
-	INSTR(USLT,         trans_icmp, .opc = OPC_CMPS_U),
-	INSTR(UCMP,         trans_ucmp),
-	INSTR(ISSG,         trans_issg),
-	INSTR(IF,           trans_if,   .opc = OPC_CMPS_F),
-	INSTR(UIF,          trans_if,   .opc = OPC_CMPS_U),
-	INSTR(ELSE,         trans_else),
-	INSTR(ENDIF,        trans_endif),
-	INSTR(END,          instr_cat0, .opc = OPC_END),
-	INSTR(KILL,         trans_kill, .opc = OPC_KILL),
-	INSTR(KILL_IF,      trans_killif, .opc = OPC_KILL),
-	INSTR(I2F,          trans_cov),
-	INSTR(U2F,          trans_cov),
-	INSTR(F2I,          trans_cov),
-	INSTR(F2U,          trans_cov),
-};
-
-static ir3_semantic
-decl_semantic(const struct tgsi_declaration_semantic *sem)
-{
-	return ir3_semantic_name(sem->Name, sem->Index);
-}
-
-static struct ir3_instruction *
-decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
-		unsigned j, unsigned inloc, bool use_ldlv)
-{
-	struct ir3_instruction *instr;
-	struct ir3_register *src;
-
-	if (use_ldlv) {
-		/* ldlv.u32 dst, l[#inloc], 1 */
-		instr = instr_create(ctx, 6, OPC_LDLV);
-		instr->cat6.type = TYPE_U32;
-		instr->cat6.iim_val = 1;
-		ir3_reg_create(instr, regid, 0);   /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-
-		return instr;
-	}
-
-	/* bary.f dst, #inloc, r0.x */
-	instr = instr_create(ctx, 2, OPC_BARY_F);
-	ir3_reg_create(instr, regid, 0);   /* dummy dst */
-	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
-	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
-	src->wrmask = 0x3;
-	src->instr = ctx->frag_pos;
-
-	return instr;
-}
-
-/* TGSI_SEMANTIC_POSITION
- * """"""""""""""""""""""
- *
- * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
- * fragment shader input contains the fragment's window position.  The X
- * component starts at zero and always increases from left to right.
- * The Y component starts at zero and always increases but Y=0 may either
- * indicate the top of the window or the bottom depending on the fragment
- * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
- * The Z coordinate ranges from 0 to 1 to represent depth from the front
- * to the back of the Z buffer.  The W component contains the reciprocol
- * of the interpolated vertex position W component.
- */
-static struct ir3_instruction *
-decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
-		unsigned j)
-{
-	struct ir3_instruction *instr, *src;
-
-	compile_assert(ctx, !ctx->frag_coord[j]);
-
-	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);
-
-
-	switch (j) {
-	case 0: /* .x */
-	case 1: /* .y */
-		/* for frag_coord, we get unsigned values.. we need
-		 * to subtract (integer) 8 and divide by 16 (right-
-		 * shift by 4) then convert to float:
-		 */
-
-		/* add.s tmp, src, -8 */
-		instr = instr_create(ctx, 2, OPC_ADD_S);
-		ir3_reg_create(instr, regid, 0);    /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
-		src = instr;
-
-		/* shr.b tmp, tmp, 4 */
-		instr = instr_create(ctx, 2, OPC_SHR_B);
-		ir3_reg_create(instr, regid, 0);    /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
-		src = instr;
-
-		/* mov.u32f32 dst, tmp */
-		instr = instr_create(ctx, 1, 0);
-		instr->cat1.src_type = TYPE_U32;
-		instr->cat1.dst_type = TYPE_F32;
-		ir3_reg_create(instr, regid, 0);    /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-
-		break;
-	case 2: /* .z */
-	case 3: /* .w */
-		/* seems that we can use these as-is: */
-		instr = ctx->frag_coord[j];
-		break;
-	default:
-		compile_error(ctx, "invalid channel\n");
-		instr = create_immed(ctx, 0.0);
-		break;
-	}
-
-	return instr;
-}
-
-/* TGSI_SEMANTIC_FACE
- * """"""""""""""""""
- *
- * This label applies to fragment shader inputs only and indicates that
- * the register contains front/back-face information of the form (F, 0,
- * 0, 1).  The first component will be positive when the fragment belongs
- * to a front-facing polygon, and negative when the fragment belongs to a
- * back-facing polygon.
- */
-static struct ir3_instruction *
-decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
-		unsigned j)
-{
-	struct ir3_instruction *instr, *src;
-
-	switch (j) {
-	case 0: /* .x */
-		compile_assert(ctx, !ctx->frag_face);
-
-		ctx->frag_face = create_input(ctx->block, NULL, 0);
-
-		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
-		 * positive vs negative float.. and piglit further seems to
-		 * expect -1.0 or 1.0:
-		 *
-		 *    mul.s tmp, hr0.x, 2
-		 *    add.s tmp, tmp, 1
-		 *    mov.s16f32, dst, tmp
-		 *
-		 */
-
-		instr = instr_create(ctx, 2, OPC_MUL_S);
-		ir3_reg_create(instr, regid, 0);    /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
-		src = instr;
-
-		instr = instr_create(ctx, 2, OPC_ADD_S);
-		ir3_reg_create(instr, regid, 0);    /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
-		src = instr;
-
-		instr = instr_create(ctx, 1, 0); /* mov */
-		instr->cat1.src_type = TYPE_S32;
-		instr->cat1.dst_type = TYPE_F32;
-		ir3_reg_create(instr, regid, 0);    /* dummy dst */
-		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
-
-		break;
-	case 1: /* .y */
-	case 2: /* .z */
-		instr = create_immed(ctx, 0.0);
-		break;
-	case 3: /* .w */
-		instr = create_immed(ctx, 1.0);
-		break;
-	default:
-		compile_error(ctx, "invalid channel\n");
-		instr = create_immed(ctx, 0.0);
-		break;
-	}
-
-	return instr;
-}
-
-static void
-decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	unsigned name = decl->Semantic.Name;
-	unsigned i;
-
-	/* I don't think we should get frag shader input without
-	 * semantic info?  Otherwise how do inputs get linked to
-	 * vert outputs?
-	 */
-	compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
-			decl->Declaration.Semantic);
-
-	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-		unsigned n = so->inputs_count++;
-		unsigned r = regid(i, 0);
-		unsigned ncomp, j;
-
-		/* we'll figure out the actual components used after scheduling */
-		ncomp = 4;
-
-		DBG("decl in -> r%d", i);
-
-		compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
-
-		so->inputs[n].semantic = decl_semantic(&decl->Semantic);
-		so->inputs[n].compmask = (1 << ncomp) - 1;
-		so->inputs[n].regid = r;
-		so->inputs[n].inloc = ctx->next_inloc;
-		so->inputs[n].interpolate = decl->Interp.Interpolate;
-
-		for (j = 0; j < ncomp; j++) {
-			struct ir3_instruction *instr = NULL;
-
-			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-				/* for fragment shaders, POSITION and FACE are handled
-				 * specially, not using normal varying / bary.f
-				 */
-				if (name == TGSI_SEMANTIC_POSITION) {
-					so->inputs[n].bary = false;
-					so->frag_coord = true;
-					instr = decl_in_frag_coord(ctx, r + j, j);
-				} else if (name == TGSI_SEMANTIC_FACE) {
-					so->inputs[n].bary = false;
-					so->frag_face = true;
-					instr = decl_in_frag_face(ctx, r + j, j);
-				} else {
-					bool use_ldlv = false;
-
-					/* if no interpolation given, pick based on
-					 * semantic:
-					 */
-					if (!decl->Declaration.Interpolate) {
-						switch (decl->Semantic.Name) {
-						case TGSI_SEMANTIC_COLOR:
-							so->inputs[n].interpolate =
-									TGSI_INTERPOLATE_COLOR;
-							break;
-						default:
-							so->inputs[n].interpolate =
-									TGSI_INTERPOLATE_LINEAR;
-						}
-					}
-
-					if (ctx->flat_bypass) {
-						switch (so->inputs[n].interpolate) {
-						case TGSI_INTERPOLATE_COLOR:
-							if (!ctx->so->key.rasterflat)
-								break;
-							/* fallthrough */
-						case TGSI_INTERPOLATE_CONSTANT:
-							use_ldlv = true;
-							break;
-						}
-					}
-
-					so->inputs[n].bary = true;
-
-					instr = decl_in_frag_bary(ctx, r + j, j,
-							so->inputs[n].inloc + j - 8, use_ldlv);
-				}
-			} else {
-				instr = create_input(ctx->block, NULL, (i * 4) + j);
-			}
-
-			ctx->block->inputs[(i * 4) + j] = instr;
-		}
-
-		if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
-			ctx->next_inloc += ncomp;
-			so->total_in += ncomp;
-		}
-	}
-}
-
-static void
-decl_sv(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	unsigned r = regid(so->inputs_count, 0);
-	unsigned n = so->inputs_count++;
-
-	DBG("decl sv -> r%d", n);
-
-	compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
-	compile_assert(ctx, decl->Range.First < ARRAY_SIZE(ctx->sysval_semantics));
-
-	ctx->sysval_semantics[decl->Range.First] = decl->Semantic.Name;
-	so->inputs[n].semantic = decl_semantic(&decl->Semantic);
-	so->inputs[n].compmask = 1;
-	so->inputs[n].regid = r;
-	so->inputs[n].inloc = ctx->next_inloc;
-	so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
-
-	struct ir3_instruction *instr = NULL;
-
-	switch (decl->Semantic.Name) {
-	case TGSI_SEMANTIC_VERTEXID_NOBASE:
-		ctx->vertex_id = instr = create_input(ctx->block, NULL, r);
-		break;
-	case TGSI_SEMANTIC_BASEVERTEX:
-		ctx->basevertex = instr = instr_create(ctx, 1, 0);
-		instr->cat1.src_type = get_stype(ctx);
-		instr->cat1.dst_type = get_stype(ctx);
-		ir3_reg_create(instr, 0, 0);
-		ir3_reg_create(instr, regid(so->first_driver_param + 4, 0),
-					   IR3_REG_CONST);
-		break;
-	case TGSI_SEMANTIC_INSTANCEID:
-		ctx->instance_id = instr = create_input(ctx->block, NULL, r);
-		break;
-	default:
-		compile_error(ctx, "Unknown semantic: %s\n",
-					  tgsi_semantic_names[decl->Semantic.Name]);
-	}
-
-	ctx->block->inputs[r] = instr;
-	ctx->next_inloc++;
-	so->total_in++;
-}
-
-static void
-decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	unsigned comp = 0;
-	unsigned name = decl->Semantic.Name;
-	unsigned i;
-
-	compile_assert(ctx, decl->Declaration.Semantic);
-
-	DBG("decl out[%d] -> r%d", name, decl->Range.First);
-
-	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
-		switch (name) {
-		case TGSI_SEMANTIC_POSITION:
-			so->writes_pos = true;
-			break;
-		case TGSI_SEMANTIC_PSIZE:
-			so->writes_psize = true;
-			break;
-		case TGSI_SEMANTIC_COLOR:
-		case TGSI_SEMANTIC_BCOLOR:
-		case TGSI_SEMANTIC_GENERIC:
-		case TGSI_SEMANTIC_FOG:
-		case TGSI_SEMANTIC_TEXCOORD:
-			break;
-		default:
-			compile_error(ctx, "unknown VS semantic name: %s\n",
-					tgsi_semantic_names[name]);
-		}
-	} else {
-		switch (name) {
-		case TGSI_SEMANTIC_POSITION:
-			comp = 2;  /* tgsi will write to .z component */
-			so->writes_pos = true;
-			break;
-		case TGSI_SEMANTIC_COLOR:
-			break;
-		default:
-			compile_error(ctx, "unknown FS semantic name: %s\n",
-					tgsi_semantic_names[name]);
-		}
-	}
-
-	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
-		unsigned n = so->outputs_count++;
-		unsigned ncomp, j;
-
-		ncomp = 4;
-
-		compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
-
-		so->outputs[n].semantic = decl_semantic(&decl->Semantic);
-		so->outputs[n].regid = regid(i, comp);
-
-		/* avoid undefined outputs, stick a dummy mov from imm{0.0},
-		 * which if the output is actually assigned will be over-
-		 * written
-		 */
-		for (j = 0; j < ncomp; j++)
-			ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
-	}
-}
-
-/* from TGSI perspective, we actually have inputs.  But most of the "inputs"
- * for a fragment shader are just bary.f instructions.  The *actual* inputs
- * from the hw perspective are the frag_pos and optionally frag_coord and
- * frag_face.
- */
-static void
-fixup_frag_inputs(struct ir3_compile_context *ctx)
-{
-	struct ir3_shader_variant *so = ctx->so;
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction **inputs;
-	struct ir3_instruction *instr;
-	int n, regid = 0;
-
-	block->ninputs = 0;
-
-	n  = 4;  /* always have frag_pos */
-	n += COND(so->frag_face, 4);
-	n += COND(so->frag_coord, 4);
-
-	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));
-
-	if (so->frag_face) {
-		/* this ultimately gets assigned to hr0.x so doesn't conflict
-		 * with frag_coord/frag_pos..
-		 */
-		inputs[block->ninputs++] = ctx->frag_face;
-		ctx->frag_face->regs[0]->num = 0;
-
-		/* remaining channels not used, but let's avoid confusing
-		 * other parts that expect inputs to come in groups of vec4
-		 */
-		inputs[block->ninputs++] = NULL;
-		inputs[block->ninputs++] = NULL;
-		inputs[block->ninputs++] = NULL;
-	}
-
-	/* since we don't know where to set the regid for frag_coord,
-	 * we have to use r0.x for it.  But we don't want to *always*
-	 * use r1.x for frag_pos as that could increase the register
-	 * footprint on simple shaders:
-	 */
-	if (so->frag_coord) {
-		ctx->frag_coord[0]->regs[0]->num = regid++;
-		ctx->frag_coord[1]->regs[0]->num = regid++;
-		ctx->frag_coord[2]->regs[0]->num = regid++;
-		ctx->frag_coord[3]->regs[0]->num = regid++;
-
-		inputs[block->ninputs++] = ctx->frag_coord[0];
-		inputs[block->ninputs++] = ctx->frag_coord[1];
-		inputs[block->ninputs++] = ctx->frag_coord[2];
-		inputs[block->ninputs++] = ctx->frag_coord[3];
-	}
-
-	/* we always have frag_pos: */
-	so->pos_regid = regid;
-
-	/* r0.x */
-	instr = create_input(block, NULL, block->ninputs);
-	instr->regs[0]->num = regid++;
-	inputs[block->ninputs++] = instr;
-	ctx->frag_pos->regs[1]->instr = instr;
-
-	/* r0.y */
-	instr = create_input(block, NULL, block->ninputs);
-	instr->regs[0]->num = regid++;
-	inputs[block->ninputs++] = instr;
-	ctx->frag_pos->regs[2]->instr = instr;
-
-	block->inputs = inputs;
-}
-
-static void
-compile_instructions(struct ir3_compile_context *ctx)
-{
-	push_block(ctx);
-
-	/* for fragment shader, we have a single input register (usually
-	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
-	 */
-	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-		struct ir3_instruction *instr;
-		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
-		ir3_reg_create(instr, 0, 0);
-		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
-		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
-		ctx->frag_pos = instr;
-	}
-
-	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
-		tgsi_parse_token(&ctx->parser);
-
-		switch (ctx->parser.FullToken.Token.Type) {
-		case TGSI_TOKEN_TYPE_DECLARATION: {
-			struct tgsi_full_declaration *decl =
-					&ctx->parser.FullToken.FullDeclaration;
-			unsigned file = decl->Declaration.File;
-			if (file == TGSI_FILE_OUTPUT) {
-				decl_out(ctx, decl);
-			} else if (file == TGSI_FILE_INPUT) {
-				decl_in(ctx, decl);
-			} else if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
-				decl_sv(ctx, decl);
-			}
-
-			if ((file != TGSI_FILE_CONSTANT) && decl->Declaration.Array) {
-				int aid = decl->Array.ArrayID + ctx->array_offsets[file];
-
-				compile_assert(ctx, aid < ARRAY_SIZE(ctx->array));
-
-				/* legacy ArrayID==0 stuff probably isn't going to work
-				 * well (and is at least untested).. let's just scream:
-				 */
-				compile_assert(ctx, aid != 0);
-
-				ctx->array[aid].first = decl->Range.First;
-				ctx->array[aid].last  = decl->Range.Last;
-			}
-			break;
-		}
-		case TGSI_TOKEN_TYPE_IMMEDIATE: {
-			/* TODO: if we know the immediate is small enough, and only
-			 * used with instructions that can embed an immediate, we
-			 * can skip this:
-			 */
-			struct tgsi_full_immediate *imm =
-					&ctx->parser.FullToken.FullImmediate;
-			unsigned n = ctx->so->immediates_count++;
-			compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
-			memcpy(ctx->so->immediates[n].val, imm->u, 16);
-			break;
-		}
-		case TGSI_TOKEN_TYPE_INSTRUCTION: {
-			struct tgsi_full_instruction *inst =
-					&ctx->parser.FullToken.FullInstruction;
-			unsigned opc = inst->Instruction.Opcode;
-			const struct instr_translater *t = &translaters[opc];
-
-			if (t->fxn) {
-				t->fxn(t, ctx, inst);
-				ctx->num_internal_temps = 0;
-
-				compile_assert(ctx, !ctx->using_tmp_dst);
-			} else {
-				compile_error(ctx, "unknown TGSI opc: %s\n",
-						tgsi_get_opcode_name(opc));
-			}
-
-			switch (inst->Instruction.Saturate) {
-			case TGSI_SAT_ZERO_ONE:
-				create_clamp_imm(ctx, &inst->Dst[0].Register,
-						fui(0.0), fui(1.0));
-				break;
-			case TGSI_SAT_MINUS_PLUS_ONE:
-				create_clamp_imm(ctx, &inst->Dst[0].Register,
-						fui(-1.0), fui(1.0));
-				break;
-			}
-
-			instr_finish(ctx);
-
-			break;
-		}
-		case TGSI_TOKEN_TYPE_PROPERTY: {
-			struct tgsi_full_property *prop =
-				&ctx->parser.FullToken.FullProperty;
-			switch (prop->Property.PropertyName) {
-			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
-				ctx->so->color0_mrt = !!prop->u[0].Data;
-				break;
-			}
-		}
-		default:
-			break;
-		}
-	}
-}
-
-static void
-compile_dump(struct ir3_compile_context *ctx)
-{
-	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
-	static unsigned n = 0;
-	char fname[16];
-	FILE *f;
-	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
-	f = fopen(fname, "w");
-	if (!f)
-		return;
-	ir3_block_depth(ctx->block);
-	ir3_dump(ctx->ir, name, ctx->block, f);
-	fclose(f);
-}
-
-int
-ir3_compile_shader(struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens, struct ir3_shader_key key,
-		bool cp)
-{
-	struct ir3_compile_context ctx;
-	struct ir3_block *block;
-	struct ir3_instruction **inputs;
-	unsigned i, j, actual_in;
-	int ret = 0, max_bary;
-
-	assert(!so->ir);
-
-	so->ir = ir3_create();
-
-	assert(so->ir);
-
-	if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
-		DBG("INIT failed!");
-		ret = -1;
-		goto out;
-	}
-
-	/* for now, until the edge cases are worked out: */
-	if (ctx.info.indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
-		cp = false;
-
-	compile_instructions(&ctx);
-
-	block = ctx.block;
-	so->ir->block = block;
-
-	/* keep track of the inputs from TGSI perspective.. */
-	inputs = block->inputs;
-
-	/* but fixup actual inputs for frag shader: */
-	if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
-		fixup_frag_inputs(&ctx);
-
-	/* at this point, for binning pass, throw away unneeded outputs: */
-	if (key.binning_pass) {
-		for (i = 0, j = 0; i < so->outputs_count; i++) {
-			unsigned name = sem2name(so->outputs[i].semantic);
-			unsigned idx = sem2idx(so->outputs[i].semantic);
-
-			/* throw away everything but first position/psize */
-			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
-					(name == TGSI_SEMANTIC_PSIZE))) {
-				if (i != j) {
-					so->outputs[j] = so->outputs[i];
-					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
-					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
-					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
-					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
-				}
-				j++;
-			}
-		}
-		so->outputs_count = j;
-		block->noutputs = j * 4;
-	}
-
-	/* if we want half-precision outputs, mark the output registers
-	 * as half:
-	 */
-	if (key.half_precision) {
-		for (i = 0; i < block->noutputs; i++) {
-			if (!block->outputs[i])
-				continue;
-			block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
-		}
-	}
-
-	/* at this point, we want the kill's in the outputs array too,
-	 * so that they get scheduled (since they have no dst).. we've
-	 * already ensured that the array is big enough in push_block():
-	 */
-	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
-		for (i = 0; i < ctx.kill_count; i++)
-			block->outputs[block->noutputs++] = ctx.kill[i];
-	}
-
-	if (fd_mesa_debug & FD_DBG_OPTDUMP)
-		compile_dump(&ctx);
-
-	ret = ir3_block_flatten(block);
-	if (ret < 0) {
-		DBG("FLATTEN failed!");
-		goto out;
-	}
-	if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
-		compile_dump(&ctx);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("BEFORE CP:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	ir3_block_depth(block);
-
-	/* First remove all the extra mov's (which we could skip if the
-	 * front-end was clever enough not to insert them in the first
-	 * place).  Then figure out left/right neighbors, re-inserting
-	 * extra mov's when needed to avoid conflicts.
-	 */
-	if (cp && !(fd_mesa_debug & FD_DBG_NOCP))
-		ir3_block_cp(block);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("BEFORE GROUPING:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	/* Group left/right neighbors, inserting mov's where needed to
-	 * solve conflicts:
-	 */
-	ir3_block_group(block);
-
-	if (fd_mesa_debug & FD_DBG_OPTDUMP)
-		compile_dump(&ctx);
-
-	ir3_block_depth(block);
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("AFTER DEPTH:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	ret = ir3_block_sched(block);
-	if (ret) {
-		DBG("SCHED failed!");
-		goto out;
-	}
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("AFTER SCHED:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
-	if (ret) {
-		DBG("RA failed!");
-		goto out;
-	}
-
-	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
-		printf("AFTER RA:\n");
-		ir3_dump_instr_list(block->head);
-	}
-
-	ir3_block_legalize(block, &so->has_samp, &max_bary);
-
-	/* fixup input/outputs: */
-	for (i = 0; i < so->outputs_count; i++) {
-		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
-		/* preserve hack for depth output.. tgsi writes depth to .z,
-		 * but what we give the hw is the scalar register:
-		 */
-		if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
-			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
-			so->outputs[i].regid += 2;
-	}
-	/* Note that some or all channels of an input may be unused: */
-	actual_in = 0;
-	for (i = 0; i < so->inputs_count; i++) {
-		unsigned j, regid = ~0, compmask = 0;
-		so->inputs[i].ncomp = 0;
-		for (j = 0; j < 4; j++) {
-			struct ir3_instruction *in = inputs[(i*4) + j];
-			if (in) {
-				compmask |= (1 << j);
-				regid = in->regs[0]->num - j;
-				actual_in++;
-				so->inputs[i].ncomp++;
-			}
-		}
-		so->inputs[i].regid = regid;
-		so->inputs[i].compmask = compmask;
-	}
-
-	/* fragment shader always gets full vec4's even if it doesn't
-	 * fetch all components, but vertex shader we need to update
-	 * with the actual number of components fetch, otherwise thing
-	 * will hang due to mismaptch between VFD_DECODE's and
-	 * TOTALATTRTOVS
-	 */
-	if (so->type == SHADER_VERTEX)
-		so->total_in = actual_in;
-	else
-		so->total_in = align(max_bary + 1, 4);
-
-out:
-	if (ret) {
-		ir3_destroy(so->ir);
-		so->ir = NULL;
-	}
-	compile_free(&ctx);
-
-	return ret;
+	ralloc_free(compiler);
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
index 9213386..86b1161 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h
@@ -31,12 +31,19 @@
 
 #include "ir3_shader.h"
 
+struct ir3_ra_reg_set;
 
-int ir3_compile_shader_nir(struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens, struct ir3_shader_key key);
+struct ir3_compiler {               /* returned by ir3_compiler_create(); passed to ir3_compile_shader_nir() */
+	uint32_t gpu_id;                /* compile paths compare this against 400 to select a4xx behaviour */
+	struct ir3_ra_reg_set *set;     /* opaque here; presumably the register-allocator reg set -- TODO confirm */
+};
 
-int ir3_compile_shader(struct ir3_shader_variant *so,
+struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id);
+void ir3_compiler_destroy(struct ir3_compiler *compiler);
+
+int ir3_compile_shader_nir(struct ir3_compiler *compiler,
+		struct ir3_shader_variant *so,
 		const struct tgsi_token *tokens,
-		struct ir3_shader_key key, bool cp);
+		struct ir3_shader_key key);
 
 #endif /* IR3_COMPILER_H_ */
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 05e7049..48b1d8f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -48,19 +48,19 @@
 #include "ir3.h"
 
 
-static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
-
 struct ir3_compile {
+	struct ir3_compiler *compiler;
+
 	const struct tgsi_token *tokens;
 	struct nir_shader *s;
 
 	struct ir3 *ir;
 	struct ir3_shader_variant *so;
 
-	/* bitmask of which samplers are integer: */
-	uint16_t integer_s;
+	struct ir3_block *block;      /* the current block */
+	struct ir3_block *in_block;   /* block created for shader inputs */
 
-	struct ir3_block *block;
+	nir_function_impl *impl;
 
 	/* For fragment shaders, from the hw perspective the only
 	 * actual input is r0.xy position register passed to bary.f.
@@ -92,6 +92,11 @@ struct ir3_compile {
 	 */
 	struct hash_table *addr_ht;
 
+	/* maps nir_block to ir3_block, mostly for the purpose of
+	 * figuring out a block's successors
+	 */
+	struct hash_table *block_ht;
+
 	/* for calculating input/output positions/linkages: */
 	unsigned next_inloc;
 
@@ -104,6 +109,11 @@ struct ir3_compile {
 	 */
 	bool levels_add_one;
 
+	/* on a3xx, we need to scale up integer coords for isaml based
+	 * on LoD:
+	 */
+	bool unminify_coords;
+
 	/* for looking up which system value is which */
 	unsigned sysval_semantics[8];
 
@@ -118,6 +128,9 @@ struct ir3_compile {
 };
 
 
+static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
+static struct ir3_block * get_block(struct ir3_compile *ctx, nir_block *nblock);
+
 static struct nir_shader *to_nir(const struct tgsi_token *tokens)
 {
 	struct nir_shader_compiler_options options = {
@@ -146,6 +159,7 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens)
 
 		nir_lower_vars_to_ssa(s);
 		nir_lower_alu_to_scalar(s);
+		nir_lower_phis_to_scalar(s);
 
 		progress |= nir_copy_prop(s);
 		progress |= nir_opt_dce(s);
@@ -170,7 +184,8 @@ static struct nir_shader *to_nir(const struct tgsi_token *tokens)
 
 /* TODO nir doesn't lower everything for us yet, but ideally it would: */
 static const struct tgsi_token *
-lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
+lower_tgsi(struct ir3_compile *ctx, const struct tgsi_token *tokens,
+		struct ir3_shader_variant *so)
 {
 	struct tgsi_shader_info info;
 	struct tgsi_lowering_config lconfig = {
@@ -192,11 +207,7 @@ lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
 		break;
 	}
 
-	if (!so->shader) {
-		/* hack for standalone compiler which does not have
-		 * screen/context:
-		 */
-	} else if (ir3_shader_gpuid(so->shader) >= 400) {
+	if (ctx->compiler->gpu_id >= 400) {
 		/* a4xx seems to have *no* sam.p */
 		lconfig.lower_TXP = ~0;  /* lower all txp */
 	} else {
@@ -208,36 +219,26 @@ lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
 }
 
 static struct ir3_compile *
-compile_init(struct ir3_shader_variant *so,
+compile_init(struct ir3_compiler *compiler,
+		struct ir3_shader_variant *so,
 		const struct tgsi_token *tokens)
 {
 	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
 	const struct tgsi_token *lowered_tokens;
 
-	if (!so->shader) {
-		/* hack for standalone compiler which does not have
-		 * screen/context:
-		 */
-	} else if (ir3_shader_gpuid(so->shader) >= 400) {
+	if (compiler->gpu_id >= 400) {
 		/* need special handling for "flat" */
 		ctx->flat_bypass = true;
 		ctx->levels_add_one = false;
+		ctx->unminify_coords = false;
 	} else {
 		/* no special handling for "flat" */
 		ctx->flat_bypass = false;
 		ctx->levels_add_one = true;
+		ctx->unminify_coords = true;
 	}
 
-	switch (so->type) {
-	case SHADER_FRAGMENT:
-	case SHADER_COMPUTE:
-		ctx->integer_s = so->key.finteger_s;
-		break;
-	case SHADER_VERTEX:
-		ctx->integer_s = so->key.vinteger_s;
-		break;
-	}
-
+	ctx->compiler = compiler;
 	ctx->ir = so->ir;
 	ctx->so = so;
 	ctx->next_inloc = 8;
@@ -247,8 +248,10 @@ compile_init(struct ir3_shader_variant *so,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
 	ctx->addr_ht = _mesa_hash_table_create(ctx,
 			_mesa_hash_pointer, _mesa_key_pointer_equal);
+	ctx->block_ht = _mesa_hash_table_create(ctx,
+			_mesa_hash_pointer, _mesa_key_pointer_equal);
 
-	lowered_tokens = lower_tgsi(tokens, so);
+	lowered_tokens = lower_tgsi(ctx, tokens, so);
 	if (!lowered_tokens)
 		lowered_tokens = tokens;
 	ctx->s = to_nir(lowered_tokens);
@@ -290,33 +293,206 @@ compile_free(struct ir3_compile *ctx)
 	ralloc_free(ctx);
 }
 
-
+/* global per-array information: */
 struct ir3_array {
 	unsigned length, aid;
+};
+
+/* per-block array state: */
+struct ir3_array_value {
+	/* TODO drop length/aid, and just have ptr back to ir3_array */
+	unsigned length, aid;
+	/* initial array element values are phi's, other than for the
+	 * entry block.  The phi src's get added later in a resolve step
+	 * after we have visited all the blocks, to account for back
+	 * edges in the cfg.
+	 */
+	struct ir3_instruction **phis;
+	/* current array element values (as block is processed).  When
+	 * the array phi's are resolved, it will contain the array state
+	 * at exit of block, so successor blocks can use it to add their
+	 * phi srcs.
+	 */
 	struct ir3_instruction *arr[];
 };
 
+/* track array assignments per basic block.  When an array is read
+ * outside of the same basic block, we can use NIR's dominance-frontier
+ * information to figure out where phi nodes are needed.
+ */
+struct ir3_nir_block_data {
+	unsigned foo;   /* NOTE(review): not read anywhere in view; presumably only present because C needs a named member before a flexible array member -- confirm */
+	/* indexed by array-id (aid); aids start at 1, so slot 0 is never used: */
+	struct ir3_array_value *arrs[];
+};
+
+static struct ir3_nir_block_data *
+get_block_data(struct ir3_compile *ctx, struct ir3_block *block)
+{
+	if (!block->bd) {
+		/* allocated on first use (aids 1..num_arrays), zeroed so unfilled arrs[] slots read as NULL: */
+		block->bd = rzalloc_size(ctx, sizeof(struct ir3_nir_block_data) +
+				((ctx->num_arrays + 1) * sizeof(struct ir3_array_value *)));
+	}
+	return block->bd;
+}
+
 static void
 declare_var(struct ir3_compile *ctx, nir_variable *var)
 {
 	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
-	struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) +
-			(length * sizeof(arr->arr[0])));
+	struct ir3_array *arr = ralloc(ctx, struct ir3_array);
 	arr->length = length;
 	arr->aid = ++ctx->num_arrays;
-	/* Some shaders end up reading array elements without first writing..
-	 * so initialize things to prevent null instr ptrs later:
-	 */
-	for (unsigned i = 0; i < length; i++)
-		arr->arr[i] = create_immed(ctx->block, 0);
 	_mesa_hash_table_insert(ctx->var_ht, var, arr);
 }
 
-static struct ir3_array *
+static nir_block *
+nir_block_pred(nir_block *block)
+{
+	/* only meaningful for blocks with zero or one predecessor: */
+	assert(block->predecessors->entries < 2);
+	return (block->predecessors->entries == 0) ? NULL :
+		(nir_block *)_mesa_set_next_entry(block->predecessors, NULL)->key;
+}
+
+static struct ir3_array_value *
 get_var(struct ir3_compile *ctx, nir_variable *var)
 {
 	struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
-	return entry->data;
+	struct ir3_block *block = ctx->block;
+	struct ir3_nir_block_data *bd = get_block_data(ctx, block);
+	struct ir3_array *arr = entry->data;
+	/* first use of this array in the current block: build the block's view of its values */
+	if (!bd->arrs[arr->aid]) {
+		struct ir3_array_value *av = rzalloc_size(bd, sizeof(*av) +
+				(arr->length * sizeof(av->arr[0])));   /* zeroed: av->phis must be NULL unless phis are created */
+		struct ir3_array_value *defn = NULL;
+		nir_block *pred_block;
+
+		av->length = arr->length;
+		av->aid = arr->aid;
+
+		/* For loops, we have to consider that we have not visited some
+		 * of the blocks who should feed into the phi (ie. back-edges in
+		 * the cfg).. for example:
+		 *
+		 *   loop {
+		 *      block { load_var; ... }
+		 *      if then block {} else block {}
+		 *      block { store_var; ... }
+		 *      if then block {} else block {}
+		 *      block {...}
+		 *   }
+		 *
+		 * We can skip the phi if we can chase the block predecessors
+		 * until finding the block previously defining the array without
+		 * crossing a block that has more than one predecessor.
+		 *
+		 * Otherwise create phi's and resolve them as a post-pass after
+		 * all the blocks have been visited (to handle back-edges).
+		 */
+
+		for (pred_block = block->nblock;
+				pred_block && (pred_block->predecessors->entries < 2) && !defn;
+				pred_block = nir_block_pred(pred_block)) {
+			struct ir3_block *pblock = get_block(ctx, pred_block);
+			struct ir3_nir_block_data *pbd = pblock->bd;
+			if (!pbd)
+				continue;
+			defn = pbd->arrs[arr->aid];
+		}
+
+		if (defn) {
+			/* only one possible definer: */
+			for (unsigned i = 0; i < arr->length; i++)
+				av->arr[i] = defn->arr[i];
+		} else if (pred_block) {
+			/* not the first block, and multiple potential definers: */
+			av->phis = ralloc_size(av, arr->length * sizeof(av->phis[0]));
+
+			for (unsigned i = 0; i < arr->length; i++) {
+				struct ir3_instruction *phi;
+
+				phi = ir3_instr_create2(block, -1, OPC_META_PHI,
+						1 + ctx->impl->num_blocks);
+				ir3_reg_create(phi, 0, 0);         /* dst */
+
+				/* phi's should go at head of block: */
+				list_delinit(&phi->node);
+				list_add(&phi->node, &block->instr_list);
+
+				av->phis[i] = av->arr[i] = phi;
+			}
+		} else {
+			/* Some shaders end up reading array elements without
+			 * first writing.. so initialize things to prevent null
+			 * instr ptrs later:
+			 */
+			for (unsigned i = 0; i < arr->length; i++)
+				av->arr[i] = create_immed(block, 0);
+		}
+
+		bd->arrs[arr->aid] = av;
+	}
+
+	return bd->arrs[arr->aid];
+}
+
+static void
+add_array_phi_srcs(struct ir3_compile *ctx, nir_block *nblock,
+		struct ir3_array_value *av, BITSET_WORD *visited)
+{
+	struct ir3_block *block;
+	struct ir3_nir_block_data *bd;
+	/* add phi srcs for av from nblock, or failing that from its predecessors (recursively): */
+	if (BITSET_TEST(visited, nblock->index))   /* never process the same block twice in one walk */
+		return;
+
+	BITSET_SET(visited, nblock->index);
+
+	block = get_block(ctx, nblock);
+	bd = block->bd;
+
+	if (bd && bd->arrs[av->aid]) {   /* nblock holds state for this array: one src per element */
+		struct ir3_array_value *dav = bd->arrs[av->aid];
+		for (unsigned i = 0; i < av->length; i++) {
+			ir3_reg_create(av->phis[i], 0, IR3_REG_SSA)->instr =
+					dav->arr[i];
+		}
+	} else {
+		/* didn't find defn, recurse predecessors: */
+		struct set_entry *entry;
+		set_foreach(nblock->predecessors, entry) {
+			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+		}
+	}
+}
+
+static void
+resolve_array_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+	struct ir3_nir_block_data *bd = block->bd;
+	unsigned bitset_words = BITSET_WORDS(ctx->impl->num_blocks);
+	/* fill in the srcs of the array phi's recorded for this block: */
+	if (!bd)   /* no per-block array data was ever allocated for this block */
+		return;
+
+	/* TODO use nir dom_frontier to help us with this? */
+
+	for (unsigned i = 1; i <= ctx->num_arrays; i++) {   /* arrs[] is indexed by aid, starting at 1 */
+		struct ir3_array_value *av = bd->arrs[i];
+		BITSET_WORD visited[bitset_words];   /* one bit per nir block, reset for each array */
+		struct set_entry *entry;
+
+		if (!(av && av->phis))   /* skip arrays with no phis recorded in this block */
+			continue;
+
+		memset(visited, 0, sizeof(visited));
+		set_foreach(block->nblock->predecessors, entry) {
+			add_array_phi_srcs(ctx, (nir_block *)entry->key, av, visited);
+		}
+	}
 }
 
 /* allocate a n element value array (to be populated by caller) and
@@ -393,7 +569,8 @@ create_addr(struct ir3_block *block, struct ir3_instruction *src)
 	instr->regs[1]->flags |= IR3_REG_HALF;
 
 	instr = ir3_MOV(block, instr, TYPE_S16);
-	instr->regs[0]->flags |= IR3_REG_ADDR | IR3_REG_HALF;
+	instr->regs[0]->num = regid(REG_A0, 0);
+	instr->regs[0]->flags |= IR3_REG_HALF;
 	instr->regs[1]->flags |= IR3_REG_HALF;
 
 	return instr;
@@ -419,6 +596,22 @@ get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
 }
 
 static struct ir3_instruction *
+get_predicate(struct ir3_compile *ctx, struct ir3_instruction *src)
+{
+	struct ir3_instruction *zero, *pred;
+
+	/* predicate is (src != 0).  NOTE: only cmps.*.* can write p0.x: */
+	zero = create_immed(ctx->block, 0);
+	pred = ir3_CMPS_S(ctx->block, src, 0, zero, 0);
+	pred->cat2.condition = IR3_COND_NE;
+
+	/* condition always goes in predicate register: */
+	pred->regs[0]->num = regid(REG_P0, 0);
+
+	return pred;
+}
+
+static struct ir3_instruction *
 create_uniform(struct ir3_compile *ctx, unsigned n)
 {
 	struct ir3_instruction *mov;
@@ -461,7 +654,7 @@ create_collect(struct ir3_block *block, struct ir3_instruction **arr,
 		return NULL;
 
 	collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz);
-	ir3_reg_create(collect, 0, 0);
+	ir3_reg_create(collect, 0, 0);     /* dst */
 	for (unsigned i = 0; i < arrsz; i++)
 		ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
 
@@ -597,6 +790,7 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
 		compile_assert(ctx, !ctx->frag_face);
 
 		ctx->frag_face = create_input(block, NULL, 0);
+		ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
 
 		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
 		 * positive vs negative float.. and piglit further seems to
@@ -628,10 +822,10 @@ create_frag_face(struct ir3_compile *ctx, unsigned comp)
  */
 static void
 split_dest(struct ir3_block *block, struct ir3_instruction **dst,
-		struct ir3_instruction *src)
+		struct ir3_instruction *src, unsigned n)
 {
 	struct ir3_instruction *prev = NULL;
-	for (int i = 0, j = 0; i < 4; i++) {
+	for (int i = 0, j = 0; i < n; i++) {
 		struct ir3_instruction *split =
 				ir3_instr_create(block, -1, OPC_META_FO);
 		ir3_reg_create(split, 0, IR3_REG_SSA);
@@ -882,9 +1076,15 @@ emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
 	case nir_op_imax:
 		dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
 		break;
+	case nir_op_umax:
+		dst[0] = ir3_MAX_U(b, src[0], 0, src[1], 0);
+		break;
 	case nir_op_imin:
 		dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
 		break;
+	case nir_op_umin:
+		dst[0] = ir3_MIN_U(b, src[0], 0, src[1], 0);
+		break;
 	case nir_op_imul:
 		/*
 		 * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
@@ -1030,7 +1230,7 @@ emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array *arr = get_var(ctx, dvar->var);
+	struct ir3_array_value *arr = get_var(ctx, dvar->var);
 
 	compile_assert(ctx, dvar->deref.child &&
 		(dvar->deref.child->deref_type == nir_deref_type_array));
@@ -1070,7 +1270,7 @@ emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 {
 	nir_deref_var *dvar = intr->variables[0];
 	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
-	struct ir3_array *arr = get_var(ctx, dvar->var);
+	struct ir3_array_value *arr = get_var(ctx, dvar->var);
 	struct ir3_instruction **src;
 
 	compile_assert(ctx, dvar->deref.child &&
@@ -1140,8 +1340,8 @@ static void add_sysval_input(struct ir3_compile *ctx, unsigned name,
 	so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
 	so->total_in++;
 
-	ctx->block->ninputs = MAX2(ctx->block->ninputs, r + 1);
-	ctx->block->inputs[r] = instr;
+	ctx->ir->ninputs = MAX2(ctx->ir->ninputs, r + 1);
+	ctx->ir->inputs[r] = instr;
 }
 
 static void
@@ -1154,18 +1354,18 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 
 	if (info->has_dest) {
 		dst = get_dst(ctx, &intr->dest, intr->num_components);
+	} else {
+		dst = NULL;
 	}
 
 	switch (intr->intrinsic) {
 	case nir_intrinsic_load_uniform:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
 			dst[i] = create_uniform(ctx, n);
 		}
 		break;
 	case nir_intrinsic_load_uniform_indirect:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		src = get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
@@ -1178,21 +1378,20 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		emit_intrinsic_load_ubo(ctx, intr, dst);
 		break;
 	case nir_intrinsic_load_input:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
-			dst[i] = b->inputs[n];
+			dst[i] = ctx->ir->inputs[n];
 		}
 		break;
 	case nir_intrinsic_load_input_indirect:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		src = get_src(ctx, &intr->src[0]);
 		struct ir3_instruction *collect =
-				create_collect(b, b->inputs, b->ninputs);
+				create_collect(b, ctx->ir->inputs, ctx->ir->ninputs);
 		struct ir3_instruction *addr = get_addr(ctx, src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
-			dst[i] = create_indirect_load(ctx, b->ninputs, n, addr, collect);
+			dst[i] = create_indirect_load(ctx, ctx->ir->ninputs,
+					n, addr, collect);
 		}
 		break;
 	case nir_intrinsic_load_var:
@@ -1202,11 +1401,10 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		emit_intrinisic_store_var(ctx, intr);
 		break;
 	case nir_intrinsic_store_output:
-		compile_assert(ctx, intr->const_index[1] == 1);
 		src = get_src(ctx, &intr->src[0]);
 		for (int i = 0; i < intr->num_components; i++) {
 			unsigned n = idx * 4 + i;
-			b->outputs[n] = src[i];
+			ctx->ir->outputs[n] = src[i];
 		}
 		break;
 	case nir_intrinsic_load_base_vertex:
@@ -1248,6 +1446,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			cond = create_immed(b, 1);
 		}
 
+		/* NOTE: only cmps.*.* can write p0.x: */
 		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
 		cond->cat2.condition = IR3_COND_NE;
 
@@ -1255,6 +1454,7 @@ emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 		cond->regs[0]->num = regid(REG_P0, 0);
 
 		kill = ir3_KILL(b, cond, 0);
+		array_insert(ctx->ir->predicates, kill);
 
 		ctx->kill[ctx->kill_count++] = kill;
 		ctx->so->has_kill = true;
@@ -1318,6 +1518,8 @@ tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
 		coords = 3;
 		flags |= IR3_INSTR_3D;
 		break;
+	default:
+		unreachable("bad sampler_dim");
 	}
 
 	if (tex->is_shadow)
@@ -1340,7 +1542,10 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	unsigned i, coords, flags;
 	unsigned nsrc0 = 0, nsrc1 = 0;
 	type_t type;
-	opc_t opc;
+	opc_t opc = 0;
+
+	coord = off = ddx = ddy = NULL;
+	lod = proj = compare = NULL;
 
 	/* TODO: might just be one component for gathers? */
 	dst = get_dst(ctx, &tex->dest, 4);
@@ -1400,11 +1605,12 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	tex_info(tex, &flags, &coords);
 
 	/* scale up integer coords for TXF based on the LOD */
-	if (opc == OPC_ISAML) {
+	if (ctx->unminify_coords && (opc == OPC_ISAML)) {
 		assert(has_lod);
 		for (i = 0; i < coords; i++)
 			coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
 	}
+
 	/*
 	 * lay out the first argument in the proper order:
 	 *  - actual coordinates first
@@ -1484,6 +1690,8 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 	case nir_type_bool:
 		type = TYPE_U32;
 		break;
+	default:
+		unreachable("bad dest_type");
 	}
 
 	sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
@@ -1491,7 +1699,7 @@ emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
 			create_collect(b, src0, nsrc0),
 			create_collect(b, src1, nsrc1));
 
-	split_dest(b, dst, sam);
+	split_dest(b, dst, sam, 4);
 }
 
 static void
@@ -1508,7 +1716,7 @@ emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
 	/* even though there is only one component, since it ends
 	 * up in .z rather than .x, we need a split_dest()
 	 */
-	split_dest(b, dst, sam);
+	split_dest(b, dst, sam, 3);
 
 	/* The # of levels comes from getinfo.z. We need to add 1 to it, since
 	 * the value in TEX_CONST_0 is zero-based.
@@ -1536,7 +1744,7 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
 	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
 			tex->sampler_index, tex->sampler_index, lod, NULL);
 
-	split_dest(b, dst, sam);
+	split_dest(b, dst, sam, 4);
 
 	/* Array size actually ends up in .w rather than .z. This doesn't
 	 * matter for miplevel 0, but for higher mips the value in z is
@@ -1553,6 +1761,71 @@ emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
 }
 
 static void
+emit_phi(struct ir3_compile *ctx, nir_phi_instr *nphi)
+{
+	struct ir3_instruction **dst, *meta;
+	unsigned nregs;
+
+	/* NOTE: phi's should be lowered to scalar at this point */
+	compile_assert(ctx, nphi->dest.ssa.num_components == 1);
+
+	dst = get_dst(ctx, &nphi->dest, 1);
+	/* room for the dst reg plus one src reg per nir phi src: */
+	nregs = 1 + exec_list_length(&nphi->srcs);
+	meta = ir3_instr_create2(ctx->block, -1, OPC_META_PHI, nregs);
+	ir3_reg_create(meta, 0, 0);        /* dst */
+	meta->phi.nphi = nphi;             /* srcs get added by resolve_phis() */
+	dst[0] = meta;
+}
+
+/* phi instructions are left partially constructed.  We don't resolve
+ * their srcs until all blocks are emitted, since (eg. loops) one of
+ * the phi's srcs might be defined after the phi due to back edges in
+ * the CFG.
+ */
+static void
+resolve_phis(struct ir3_compile *ctx, struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		nir_phi_instr *nphi;
+
+		/* phi's only come at start of block: */
+		if (!(is_meta(instr) && (instr->opc == OPC_META_PHI)))
+			break;
+
+		/* array phi's (added at head of block) have no nir phi, skip: */
+		nphi = instr->phi.nphi;
+		if (!nphi)
+			continue;
+		instr->phi.nphi = NULL;
+
+		foreach_list_typed(nir_phi_src, nsrc, node, &nphi->srcs) {
+			struct ir3_instruction *src = get_src(ctx, &nsrc->src)[0];
+			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
+		}
+	}
+
+	resolve_array_phis(ctx, block);
+}
+
+/* break/continue emit nothing here; any other jump type is reported
+ * as a compile error:
+ */
+static void
+emit_jump(struct ir3_compile *ctx, nir_jump_instr *jump)
+{
+	bool loop_jump = (jump->type == nir_jump_break) ||
+			(jump->type == nir_jump_continue);
+
+	/* I *think* we can simply just ignore break/continue, and use
+	 * the successor block link to figure out where we need to jump:
+	 */
+	if (!loop_jump) {
+		compile_error(ctx, "Unhandled NIR jump type: %d\n", jump->type);
+	}
+}
+
+static void
 emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 {
 	switch (instr->type) {
@@ -1585,45 +1858,112 @@ emit_instr(struct ir3_compile *ctx, nir_instr *instr)
 		}
 		break;
 	}
-	case nir_instr_type_call:
-	case nir_instr_type_jump:
 	case nir_instr_type_phi:
+		emit_phi(ctx, nir_instr_as_phi(instr));
+		break;
+	case nir_instr_type_jump:
+		emit_jump(ctx, nir_instr_as_jump(instr));
+		break;
+	case nir_instr_type_call:
 	case nir_instr_type_parallel_copy:
 		compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
 		break;
 	}
 }
 
+/* find the ir3 block for a nir block, creating it on first lookup: */
+static struct ir3_block *
+get_block(struct ir3_compile *ctx, nir_block *nblock)
+{
+	struct hash_entry *he = _mesa_hash_table_search(ctx->block_ht, nblock);
+	struct ir3_block *blk;
+
+	if (he)
+		return he->data;
+
+	blk = ir3_block_create(ctx->ir);
+	blk->nblock = nblock;
+	_mesa_hash_table_insert(ctx->block_ht, nblock, blk);
+	return blk;
+}
+
 static void
-emit_block(struct ir3_compile *ctx, nir_block *block)
+emit_block(struct ir3_compile *ctx, nir_block *nblock)
 {
-	nir_foreach_instr(block, instr) {
+	struct ir3_block *block = get_block(ctx, nblock);
+
+	for (int i = 0; i < ARRAY_SIZE(block->successors); i++) {
+		if (nblock->successors[i]) {
+			block->successors[i] =     /* mirror the nir successor link */
+				get_block(ctx, nblock->successors[i]);
+		}
+	}
+
+	ctx->block = block;                /* becomes the current block */
+	list_addtail(&block->node, &ctx->ir->block_list);
+
+	nir_foreach_instr(nblock, instr) {
 		emit_instr(ctx, instr);
 		if (ctx->error)
 			return;
 	}
 }
 
+static void emit_cf_list(struct ir3_compile *ctx, struct exec_list *list);
+
 static void
-emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+emit_if(struct ir3_compile *ctx, nir_if *nif)
+{
+	struct ir3_instruction *cond = get_src(ctx, &nif->condition)[0];
+
+	/* stash the if-condition (as a predicate) on the current block: */
+	ctx->block->condition = get_predicate(ctx, ir3_b2n(cond->block, cond));
+
+	emit_cf_list(ctx, &nif->then_list);
+	emit_cf_list(ctx, &nif->else_list);
+}
+
+static void
+emit_loop(struct ir3_compile *ctx, nir_loop *nloop)
+{
+	emit_cf_list(ctx, &nloop->body);   /* the body is just a nested cf list */
+}
+
+static void
+emit_cf_list(struct ir3_compile *ctx, struct exec_list *list)
 {
-	foreach_list_typed(nir_cf_node, node, node, &impl->body) {
+	foreach_list_typed(nir_cf_node, node, node, list) {
 		switch (node->type) {
 		case nir_cf_node_block:
 			emit_block(ctx, nir_cf_node_as_block(node));
 			break;
 		case nir_cf_node_if:
+			emit_if(ctx, nir_cf_node_as_if(node));     /* recurses: then/else lists */
+			break;
 		case nir_cf_node_loop:
+			emit_loop(ctx, nir_cf_node_as_loop(node)); /* recurses: loop body */
+			break;
 		case nir_cf_node_function:
 			compile_error(ctx, "TODO\n");
 			break;
 		}
-		if (ctx->error)
-			return;
 	}
 }
 
 static void
+emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
+{
+	emit_cf_list(ctx, &impl->body);
+	emit_block(ctx, impl->end_block);
+
+	/* at this point ctx->block is the block for nir's end_block,
+	 * which should be empty; the 'end' instruction goes into it.
+	 */
+	compile_assert(ctx, list_empty(&ctx->block->instr_list));
+	ir3_END(ctx->block);
+}
+
+static void
 setup_input(struct ir3_compile *ctx, nir_variable *in)
 {
 	struct ir3_shader_variant *so = ctx->so;
@@ -1708,7 +2048,7 @@ setup_input(struct ir3_compile *ctx, nir_variable *in)
 			instr = create_input(ctx->block, NULL, idx);
 		}
 
-		ctx->block->inputs[idx] = instr;
+		ctx->ir->inputs[idx] = instr;
 	}
 
 	if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
@@ -1775,15 +2115,26 @@ setup_output(struct ir3_compile *ctx, nir_variable *out)
 	for (int i = 0; i < ncomp; i++) {
 		unsigned idx = (n * 4) + i;
 
-		ctx->block->outputs[idx] = create_immed(ctx->block, fui(0.0));
+		ctx->ir->outputs[idx] = create_immed(ctx->block, fui(0.0));
 	}
 }
 
 static void
 emit_instructions(struct ir3_compile *ctx)
 {
-	unsigned ninputs  = exec_list_length(&ctx->s->inputs) * 4;
-	unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4;
+	unsigned ninputs, noutputs;
+	nir_function_impl *fxn = NULL;
+
+	/* Find the main function: */
+	nir_foreach_overload(ctx->s, overload) {
+		compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
+		compile_assert(ctx, overload->impl);
+		fxn = overload->impl;
+		break;
+	}
+
+	ninputs  = exec_list_length(&ctx->s->inputs) * 4;
+	noutputs = exec_list_length(&ctx->s->outputs) * 4;
 
 	/* we need to allocate big enough outputs array so that
 	 * we can stuff the kill's at the end.  Likewise for vtx
@@ -1795,12 +2146,17 @@ emit_instructions(struct ir3_compile *ctx)
 		ninputs += 8;
 	}
 
-	ctx->block = ir3_block_create(ctx->ir, 0, ninputs, noutputs);
+	ctx->ir = ir3_create(ctx->compiler, ninputs, noutputs);
+
+	/* Create inputs in first block: */
+	ctx->block = get_block(ctx, fxn->start_block);
+	ctx->in_block = ctx->block;
+	list_addtail(&ctx->block->node, &ctx->ir->block_list);
 
 	if (ctx->so->type == SHADER_FRAGMENT) {
-		ctx->block->noutputs -= ARRAY_SIZE(ctx->kill);
+		ctx->ir->noutputs -= ARRAY_SIZE(ctx->kill);
 	} else if (ctx->so->type == SHADER_VERTEX) {
-		ctx->block->ninputs -= 8;
+		ctx->ir->ninputs -= 8;
 	}
 
 	/* for fragment shader, we have a single input register (usually
@@ -1831,13 +2187,12 @@ emit_instructions(struct ir3_compile *ctx)
 		declare_var(ctx, var);
 	}
 
-	/* Find the main function and emit the body: */
-	nir_foreach_overload(ctx->s, overload) {
-		compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
-		compile_assert(ctx, overload->impl);
-		emit_function(ctx, overload->impl);
-		if (ctx->error)
-			return;
+	/* And emit the body: */
+	ctx->impl = fxn;
+	emit_function(ctx, fxn);
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		resolve_phis(ctx, block);
 	}
 }
 
@@ -1850,12 +2205,12 @@ static void
 fixup_frag_inputs(struct ir3_compile *ctx)
 {
 	struct ir3_shader_variant *so = ctx->so;
-	struct ir3_block *block = ctx->block;
+	struct ir3 *ir = ctx->ir;
 	struct ir3_instruction **inputs;
 	struct ir3_instruction *instr;
 	int n, regid = 0;
 
-	block->ninputs = 0;
+	ir->ninputs = 0;
 
 	n  = 4;  /* always have frag_pos */
 	n += COND(so->frag_face, 4);
@@ -1867,15 +2222,15 @@ fixup_frag_inputs(struct ir3_compile *ctx)
 		/* this ultimately gets assigned to hr0.x so doesn't conflict
 		 * with frag_coord/frag_pos..
 		 */
-		inputs[block->ninputs++] = ctx->frag_face;
+		inputs[ir->ninputs++] = ctx->frag_face;
 		ctx->frag_face->regs[0]->num = 0;
 
 		/* remaining channels not used, but let's avoid confusing
 		 * other parts that expect inputs to come in groups of vec4
 		 */
-		inputs[block->ninputs++] = NULL;
-		inputs[block->ninputs++] = NULL;
-		inputs[block->ninputs++] = NULL;
+		inputs[ir->ninputs++] = NULL;
+		inputs[ir->ninputs++] = NULL;
+		inputs[ir->ninputs++] = NULL;
 	}
 
 	/* since we don't know where to set the regid for frag_coord,
@@ -1889,63 +2244,45 @@ fixup_frag_inputs(struct ir3_compile *ctx)
 		ctx->frag_coord[2]->regs[0]->num = regid++;
 		ctx->frag_coord[3]->regs[0]->num = regid++;
 
-		inputs[block->ninputs++] = ctx->frag_coord[0];
-		inputs[block->ninputs++] = ctx->frag_coord[1];
-		inputs[block->ninputs++] = ctx->frag_coord[2];
-		inputs[block->ninputs++] = ctx->frag_coord[3];
+		inputs[ir->ninputs++] = ctx->frag_coord[0];
+		inputs[ir->ninputs++] = ctx->frag_coord[1];
+		inputs[ir->ninputs++] = ctx->frag_coord[2];
+		inputs[ir->ninputs++] = ctx->frag_coord[3];
 	}
 
 	/* we always have frag_pos: */
 	so->pos_regid = regid;
 
 	/* r0.x */
-	instr = create_input(block, NULL, block->ninputs);
+	instr = create_input(ctx->in_block, NULL, ir->ninputs);
 	instr->regs[0]->num = regid++;
-	inputs[block->ninputs++] = instr;
+	inputs[ir->ninputs++] = instr;
 	ctx->frag_pos->regs[1]->instr = instr;
 
 	/* r0.y */
-	instr = create_input(block, NULL, block->ninputs);
+	instr = create_input(ctx->in_block, NULL, ir->ninputs);
 	instr->regs[0]->num = regid++;
-	inputs[block->ninputs++] = instr;
+	inputs[ir->ninputs++] = instr;
 	ctx->frag_pos->regs[2]->instr = instr;
 
-	block->inputs = inputs;
-}
-
-static void
-compile_dump(struct ir3_compile *ctx)
-{
-	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
-	static unsigned n = 0;
-	char fname[16];
-	FILE *f;
-	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
-	f = fopen(fname, "w");
-	if (!f)
-		return;
-	ir3_block_depth(ctx->block);
-	ir3_dump(ctx->ir, name, ctx->block, f);
-	fclose(f);
+	ir->inputs = inputs;
 }
 
 int
-ir3_compile_shader_nir(struct ir3_shader_variant *so,
-		const struct tgsi_token *tokens, struct ir3_shader_key key)
+ir3_compile_shader_nir(struct ir3_compiler *compiler,
+		struct ir3_shader_variant *so,
+		const struct tgsi_token *tokens,
+		struct ir3_shader_key key)
 {
 	struct ir3_compile *ctx;
-	struct ir3_block *block;
+	struct ir3 *ir;
 	struct ir3_instruction **inputs;
 	unsigned i, j, actual_in;
 	int ret = 0, max_bary;
 
 	assert(!so->ir);
 
-	so->ir = ir3_create();
-
-	assert(so->ir);
-
-	ctx = compile_init(so, tokens);
+	ctx = compile_init(compiler, so, tokens);
 	if (!ctx) {
 		DBG("INIT failed!");
 		ret = -1;
@@ -1960,11 +2297,10 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
 		goto out;
 	}
 
-	block = ctx->block;
-	so->ir->block = block;
+	ir = so->ir = ctx->ir;
 
 	/* keep track of the inputs from TGSI perspective.. */
-	inputs = block->inputs;
+	inputs = ir->inputs;
 
 	/* but fixup actual inputs for frag shader: */
 	if (so->type == SHADER_FRAGMENT)
@@ -1981,26 +2317,39 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
 					(name == TGSI_SEMANTIC_PSIZE))) {
 				if (i != j) {
 					so->outputs[j] = so->outputs[i];
-					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
-					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
-					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
-					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
+					ir->outputs[(j*4)+0] = ir->outputs[(i*4)+0];
+					ir->outputs[(j*4)+1] = ir->outputs[(i*4)+1];
+					ir->outputs[(j*4)+2] = ir->outputs[(i*4)+2];
+					ir->outputs[(j*4)+3] = ir->outputs[(i*4)+3];
 				}
 				j++;
 			}
 		}
 		so->outputs_count = j;
-		block->noutputs = j * 4;
+		ir->noutputs = j * 4;
 	}
 
 	/* if we want half-precision outputs, mark the output registers
 	 * as half:
 	 */
 	if (key.half_precision) {
-		for (i = 0; i < block->noutputs; i++) {
-			if (!block->outputs[i])
+		for (i = 0; i < ir->noutputs; i++) {
+			struct ir3_instruction *out = ir->outputs[i];
+			if (!out)
 				continue;
-			block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
+			out->regs[0]->flags |= IR3_REG_HALF;
+			/* output could be a fanout (ie. texture fetch output)
+			 * in which case we need to propagate the half-reg flag
+			 * up to the definer so that RA sees it:
+			 */
+			if (is_meta(out) && (out->opc == OPC_META_FO)) {
+				out = out->regs[1]->instr;
+				out->regs[0]->flags |= IR3_REG_HALF;
+			}
+
+			if (out->category == 1) {
+				out->cat1.dst_type = half_type(out->cat1.dst_type);
+			}
 		}
 	}
 
@@ -2010,42 +2359,34 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
 	 */
 	if (so->type == SHADER_FRAGMENT) {
 		for (i = 0; i < ctx->kill_count; i++)
-			block->outputs[block->noutputs++] = ctx->kill[i];
+			ir->outputs[ir->noutputs++] = ctx->kill[i];
 	}
 
-	if (fd_mesa_debug & FD_DBG_OPTDUMP)
-		compile_dump(ctx);
-
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("BEFORE CP:\n");
-		ir3_dump_instr_list(block->head);
+		ir3_print(ir);
 	}
 
-	ir3_block_depth(block);
-
-	ir3_block_cp(block);
+	ir3_cp(ir);
 
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("BEFORE GROUPING:\n");
-		ir3_dump_instr_list(block->head);
+		ir3_print(ir);
 	}
 
 	/* Group left/right neighbors, inserting mov's where needed to
 	 * solve conflicts:
 	 */
-	ir3_block_group(block);
-
-	if (fd_mesa_debug & FD_DBG_OPTDUMP)
-		compile_dump(ctx);
+	ir3_group(ir);
 
-	ir3_block_depth(block);
+	ir3_depth(ir);
 
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("AFTER DEPTH:\n");
-		ir3_dump_instr_list(block->head);
+		ir3_print(ir);
 	}
 
-	ret = ir3_block_sched(block);
+	ret = ir3_sched(ir);
 	if (ret) {
 		DBG("SCHED failed!");
 		goto out;
@@ -2053,10 +2394,10 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
 
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("AFTER SCHED:\n");
-		ir3_dump_instr_list(block->head);
+		ir3_print(ir);
 	}
 
-	ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
+	ret = ir3_ra(ir, so->type, so->frag_coord, so->frag_face);
 	if (ret) {
 		DBG("RA failed!");
 		goto out;
@@ -2064,14 +2405,19 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
 
 	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
 		printf("AFTER RA:\n");
-		ir3_dump_instr_list(block->head);
+		ir3_print(ir);
 	}
 
-	ir3_block_legalize(block, &so->has_samp, &max_bary);
+	ir3_legalize(ir, &so->has_samp, &max_bary);
+
+	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
+		printf("AFTER LEGALIZE:\n");
+		ir3_print(ir);
+	}
 
 	/* fixup input/outputs: */
 	for (i = 0; i < so->outputs_count; i++) {
-		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
+		so->outputs[i].regid = ir->outputs[i*4]->regs[0]->num;
 		/* preserve hack for depth output.. tgsi writes depth to .z,
 		 * but what we give the hw is the scalar register:
 		 */
@@ -2111,7 +2457,8 @@ ir3_compile_shader_nir(struct ir3_shader_variant *so,
 
 out:
 	if (ret) {
-		ir3_destroy(so->ir);
+		if (so->ir)
+			ir3_destroy(so->ir);
 		so->ir = NULL;
 	}
 	compile_free(ctx);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cp.c b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
index fa7d363..8c7c80f 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_cp.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_cp.c
@@ -41,7 +41,7 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
 		struct ir3_register *dst = instr->regs[0];
 		struct ir3_register *src = instr->regs[1];
 		struct ir3_instruction *src_instr = ssa(src);
-		if (dst->flags & (IR3_REG_ADDR | IR3_REG_RELATIV))
+		if (dst->flags & IR3_REG_RELATIV)
 			return false;
 		if (src->flags & IR3_REG_RELATIV)
 			return false;
@@ -54,6 +54,13 @@ static bool is_eligible_mov(struct ir3_instruction *instr, bool allow_flags)
 		/* TODO: remove this hack: */
 		if (is_meta(src_instr) && (src_instr->opc == OPC_META_FO))
 			return false;
+		/* TODO: we currently don't handle left/right neighbors
+		 * very well when inserting parallel-copies into phi..
+		 * to avoid problems don't eliminate a mov coming out
+		 * of phi..
+		 */
+		if (is_meta(src_instr) && (src_instr->opc == OPC_META_PHI))
+			return false;
 		return true;
 	}
 	return false;
@@ -354,13 +361,6 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags)
 {
 	struct ir3_register *reg;
 
-	/* stay within the block.. don't try to operate across
-	 * basic block boundaries or we'll have problems when
-	 * dealing with multiple basic blocks:
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
-		return instr;
-
 	if (is_eligible_mov(instr, !!flags)) {
 		struct ir3_register *reg = instr->regs[1];
 		struct ir3_instruction *src_instr = ssa(reg);
@@ -394,22 +394,22 @@ instr_cp(struct ir3_instruction *instr, unsigned *flags)
 	return instr;
 }
 
-static void block_cp(struct ir3_block *block)
+void
+ir3_cp(struct ir3 *ir)
 {
-	unsigned i;
+	ir3_clear_mark(ir);
 
-	for (i = 0; i < block->noutputs; i++) {
-		if (block->outputs[i]) {
+	for (unsigned i = 0; i < ir->noutputs; i++) {
+		if (ir->outputs[i]) {
 			struct ir3_instruction *out =
-					instr_cp(block->outputs[i], NULL);
+					instr_cp(ir->outputs[i], NULL);
 
-			block->outputs[i] = out;
+			ir->outputs[i] = out;      /* output now refers to instr_cp()'s result */
 		}
 	}
-}
 
-void ir3_block_cp(struct ir3_block *block)
-{
-	ir3_clear_mark(block->shader);
-	block_cp(block);
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		if (block->condition)          /* per-block condition gets the same pass */
+			block->condition = instr_cp(block->condition, NULL);
+	}
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_depth.c b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
index b899c66..3a10824 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_depth.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_depth.c
@@ -84,25 +84,25 @@ int ir3_delayslots(struct ir3_instruction *assigner,
 	}
 }
 
-static void insert_by_depth(struct ir3_instruction *instr)
+void
+ir3_insert_by_depth(struct ir3_instruction *instr, struct list_head *list)
 {
-	struct ir3_block *block = instr->block;
-	struct ir3_instruction *n = block->head;
-	struct ir3_instruction *p = NULL;
-
-	while (n && (n != instr) && (n->depth > instr->depth)) {
-		p = n;
-		n = n->next;
+	/* unlink instr from its current position before re-inserting: */
+	list_delinit(&instr->node);
+
+	/* re-insert next to the first entry with a greater depth: */
+	list_for_each_entry (struct ir3_instruction, pos, list, node) {
+		if (pos->depth > instr->depth) {
+			list_add(&instr->node, &pos->node);
+			return;
+		}
 	}
-
-	instr->next = n;
-	if (p)
-		p->next = instr;
-	else
-		block->head = instr;
+	/* nothing in the list is deeper than instr, so it goes at the tail: */
+	list_addtail(&instr->node, list);
 }
 
-static void ir3_instr_depth(struct ir3_instruction *instr)
+static void
+ir3_instr_depth(struct ir3_instruction *instr)
 {
 	struct ir3_instruction *src;
 
@@ -123,47 +123,54 @@ static void ir3_instr_depth(struct ir3_instruction *instr)
 		instr->depth = MAX2(instr->depth, sd);
 	}
 
-	/* meta-instructions don't add cycles, other than PHI.. which
-	 * might translate to a real instruction..
-	 *
-	 * well, not entirely true, fan-in/out, etc might need to need
-	 * to generate some extra mov's in edge cases, etc.. probably
-	 * we might want to do depth calculation considering the worst
-	 * case for these??
-	 */
 	if (!is_meta(instr))
 		instr->depth++;
 
-	insert_by_depth(instr);
+	ir3_insert_by_depth(instr, &instr->block->instr_list);
+}
+
+/* unlink every instruction in the block that has not been marked: */
+static void
+remove_unused_by_block(struct ir3_block *block)
+{
+	list_for_each_entry_safe (struct ir3_instruction, instr, &block->instr_list, node) {
+		/* anything that got marked stays put: */
+		if (ir3_instr_check_mark(instr))
+			continue;
+		/* and the 'end' instruction is always kept: */
+		if (is_flow(instr) && (instr->opc == OPC_END))
+			continue;
+		/* flag it (in case it is an input) and drop it from the list: */
+		instr->depth = DEPTH_UNUSED;
+		list_delinit(&instr->node);
+	}
+}
 
-void ir3_block_depth(struct ir3_block *block)
+void
+ir3_depth(struct ir3 *ir)
 {
 	unsigned i;
 
-	block->head = NULL;
+	ir3_clear_mark(ir);
+	for (i = 0; i < ir->noutputs; i++)
+		if (ir->outputs[i])
+			ir3_instr_depth(ir->outputs[i]);
 
-	ir3_clear_mark(block->shader);
-	for (i = 0; i < block->noutputs; i++)
-		if (block->outputs[i])
-			ir3_instr_depth(block->outputs[i]);
+	/* Each block's if-condition is walked too, just like the outputs: */
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		if (block->condition)
+			ir3_instr_depth(block->condition);
+	}
 
 	/* mark un-used instructions: */
-	for (i = 0; i < block->shader->instrs_count; i++) {
-		struct ir3_instruction *instr = block->shader->instrs[i];
-
-		/* just consider instructions within this block: */
-		if (instr->block != block)
-			continue;
-
-		if (!ir3_instr_check_mark(instr))
-			instr->depth = DEPTH_UNUSED;
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		remove_unused_by_block(block);     /* also unlinks them from the block */
 	}
 
 	/* cleanup unused inputs: */
-	for (i = 0; i < block->ninputs; i++) {
-		struct ir3_instruction *in = block->inputs[i];
+	for (i = 0; i < ir->ninputs; i++) {
+		struct ir3_instruction *in = ir->inputs[i];
 		if (in && (in->depth == DEPTH_UNUSED))
-			block->inputs[i] = NULL;
+			ir->inputs[i] = NULL;
 	}
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_dump.c b/src/gallium/drivers/freedreno/ir3/ir3_dump.c
deleted file mode 100644
index 1614d63..0000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_dump.c
+++ /dev/null
@@ -1,456 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <stdarg.h>
-
-#include "ir3.h"
-
-#define PTRID(x) ((unsigned long)(x))
-
-struct ir3_dump_ctx {
-	FILE *f;
-	bool verbose;
-};
-
-static void dump_instr_name(struct ir3_dump_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	/* for debugging: */
-	if (ctx->verbose) {
-#ifdef DEBUG
-		fprintf(ctx->f, "%04u:", instr->serialno);
-#endif
-		fprintf(ctx->f, "%03u: ", instr->depth);
-	}
-
-	if (instr->flags & IR3_INSTR_SY)
-		fprintf(ctx->f, "(sy)");
-	if (instr->flags & IR3_INSTR_SS)
-		fprintf(ctx->f, "(ss)");
-
-	if (is_meta(instr)) {
-		switch(instr->opc) {
-		case OPC_META_PHI:
-			fprintf(ctx->f, "&#934;");
-			break;
-		default:
-			/* shouldn't hit here.. just for debugging: */
-			switch (instr->opc) {
-			case OPC_META_INPUT:  fprintf(ctx->f, "_meta:in");   break;
-			case OPC_META_OUTPUT: fprintf(ctx->f, "_meta:out");  break;
-			case OPC_META_FO:     fprintf(ctx->f, "_meta:fo");   break;
-			case OPC_META_FI:     fprintf(ctx->f, "_meta:fi");   break;
-			case OPC_META_FLOW:   fprintf(ctx->f, "_meta:flow"); break;
-
-			default: fprintf(ctx->f, "_meta:%d", instr->opc); break;
-			}
-			break;
-		}
-	} else if (instr->category == 1) {
-		static const char *type[] = {
-				[TYPE_F16] = "f16",
-				[TYPE_F32] = "f32",
-				[TYPE_U16] = "u16",
-				[TYPE_U32] = "u32",
-				[TYPE_S16] = "s16",
-				[TYPE_S32] = "s32",
-				[TYPE_U8]  = "u8",
-				[TYPE_S8]  = "s8",
-		};
-		if (instr->cat1.src_type == instr->cat1.dst_type)
-			fprintf(ctx->f, "mov");
-		else
-			fprintf(ctx->f, "cov");
-		fprintf(ctx->f, ".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
-	} else {
-		fprintf(ctx->f, "%s", ir3_instr_name(instr));
-		if (instr->flags & IR3_INSTR_3D)
-			fprintf(ctx->f, ".3d");
-		if (instr->flags & IR3_INSTR_A)
-			fprintf(ctx->f, ".a");
-		if (instr->flags & IR3_INSTR_O)
-			fprintf(ctx->f, ".o");
-		if (instr->flags & IR3_INSTR_P)
-			fprintf(ctx->f, ".p");
-		if (instr->flags & IR3_INSTR_S)
-			fprintf(ctx->f, ".s");
-		if (instr->flags & IR3_INSTR_S2EN)
-			fprintf(ctx->f, ".s2en");
-	}
-}
-
-static void dump_reg_name(struct ir3_dump_ctx *ctx,
-		struct ir3_register *reg, bool followssa)
-{
-	if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
-			(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
-		fprintf(ctx->f, "(absneg)");
-	else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
-		fprintf(ctx->f, "(neg)");
-	else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
-		fprintf(ctx->f, "(abs)");
-
-	if (reg->flags & IR3_REG_IMMED) {
-		fprintf(ctx->f, "imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
-	} else if (reg->flags & IR3_REG_SSA) {
-		if (ctx->verbose) {
-			fprintf(ctx->f, "_");
-			if (followssa) {
-				fprintf(ctx->f, "[");
-				dump_instr_name(ctx, reg->instr);
-				fprintf(ctx->f, "]");
-			}
-		}
-	} else if (reg->flags & IR3_REG_RELATIV) {
-		if (reg->flags & IR3_REG_HALF)
-			fprintf(ctx->f, "h");
-		if (reg->flags & IR3_REG_CONST)
-			fprintf(ctx->f, "c<a0.x + %u>", reg->num);
-		else
-			fprintf(ctx->f, "\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size);
-	} else {
-		if (reg->flags & IR3_REG_HALF)
-			fprintf(ctx->f, "h");
-		if (reg->flags & IR3_REG_CONST)
-			fprintf(ctx->f, "c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
-		else
-			fprintf(ctx->f, "\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]);
-	}
-}
-
-static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
-		struct ir3_instruction *instr);
-static void ir3_block_dump(struct ir3_dump_ctx *ctx,
-		struct ir3_block *block, const char *name);
-
-static void dump_instr(struct ir3_dump_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	/* if we've already visited this instruction, bail now: */
-	if (ir3_instr_check_mark(instr))
-		return;
-
-	/* some meta-instructions need to be handled specially: */
-	if (is_meta(instr)) {
-		if ((instr->opc == OPC_META_FO) ||
-				(instr->opc == OPC_META_FI)) {
-			struct ir3_instruction *src;
-			foreach_ssa_src(src, instr)
-				dump_instr(ctx, src);
-		} else if (instr->opc == OPC_META_FLOW) {
-			struct ir3_register *reg = instr->regs[1];
-			ir3_block_dump(ctx, instr->flow.if_block, "if");
-			if (instr->flow.else_block)
-				ir3_block_dump(ctx, instr->flow.else_block, "else");
-			if (reg->flags & IR3_REG_SSA)
-				dump_instr(ctx, reg->instr);
-		} else if (instr->opc == OPC_META_PHI) {
-			/* treat like a normal instruction: */
-			ir3_instr_dump(ctx, instr);
-		}
-	} else {
-		ir3_instr_dump(ctx, instr);
-	}
-}
-
-/* arrarraggh!  if link is to something outside of the current block, we
- * need to defer emitting the link until the end of the block, since the
- * edge triggers pre-creation of the node it links to inside the cluster,
- * even though it is meant to be outside..
- */
-static struct {
-	char buf[40960];
-	unsigned n;
-} edge_buf;
-
-/* helper to print or defer: */
-static void printdef(struct ir3_dump_ctx *ctx,
-		bool defer, const char *fmt, ...)
-{
-	va_list ap;
-	va_start(ap, fmt);
-	if (defer) {
-		unsigned n = edge_buf.n;
-		n += vsnprintf(&edge_buf.buf[n], sizeof(edge_buf.buf) - n,
-				fmt, ap);
-		edge_buf.n = n;
-	} else {
-		vfprintf(ctx->f, fmt, ap);
-	}
-	va_end(ap);
-}
-
-static void dump_link2(struct ir3_dump_ctx *ctx,
-		struct ir3_instruction *instr, const char *target, bool defer)
-{
-	/* some meta-instructions need to be handled specially: */
-	if (is_meta(instr)) {
-		if (instr->opc == OPC_META_INPUT) {
-			printdef(ctx, defer, "input%lx:<in%u>:w -> %s",
-					PTRID(instr->inout.block),
-					instr->regs[0]->num, target);
-		} else if (instr->opc == OPC_META_FO) {
-			struct ir3_register *reg = instr->regs[1];
-			dump_link2(ctx, reg->instr, target, defer);
-			printdef(ctx, defer, "[label=\".%c\"]",
-					"xyzw"[instr->fo.off & 0x3]);
-		} else if (instr->opc == OPC_META_FI) {
-			struct ir3_instruction *src;
-
-			foreach_ssa_src_n(src, i, instr) {
-				dump_link2(ctx, src, target, defer);
-				printdef(ctx, defer, "[label=\".%c\"]",
-						"xyzw"[i & 0x3]);
-			}
-		} else if (instr->opc == OPC_META_OUTPUT) {
-			printdef(ctx, defer, "output%lx:<out%u>:w -> %s",
-					PTRID(instr->inout.block),
-					instr->regs[0]->num, target);
-		} else if (instr->opc == OPC_META_PHI) {
-			/* treat like a normal instruction: */
-			printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
-		}
-	} else {
-		printdef(ctx, defer, "instr%lx:<dst0> -> %s", PTRID(instr), target);
-	}
-}
-
-static void dump_link(struct ir3_dump_ctx *ctx,
-		struct ir3_instruction *instr,
-		struct ir3_block *block, const char *target)
-{
-	bool defer = instr->block != block;
-	dump_link2(ctx, instr, target, defer);
-	printdef(ctx, defer, "\n");
-}
-
-static struct ir3_register *follow_flow(struct ir3_register *reg)
-{
-	if (reg->flags & IR3_REG_SSA) {
-		struct ir3_instruction *instr = reg->instr;
-		/* go with the flow.. */
-		if (is_meta(instr) && (instr->opc == OPC_META_FLOW))
-			return instr->regs[1];
-	}
-	return reg;
-}
-
-static void ir3_instr_dump(struct ir3_dump_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	struct ir3_register *src;
-
-	fprintf(ctx->f, "instr%lx [shape=record,style=filled,fillcolor=lightgrey,label=\"{",
-			PTRID(instr));
-	dump_instr_name(ctx, instr);
-
-	/* destination register: */
-	fprintf(ctx->f, "|<dst0>");
-
-	/* source register(s): */
-	foreach_src_n(src, i, instr) {
-		struct ir3_register *reg = follow_flow(src);
-
-		fprintf(ctx->f, "|");
-
-		if (reg->flags & IR3_REG_SSA)
-			fprintf(ctx->f, "<src%u> ", i);
-
-		dump_reg_name(ctx, reg, true);
-	}
-
-	fprintf(ctx->f, "}\"];\n");
-
-	/* and recursively dump dependent instructions: */
-	foreach_src_n(src, i, instr) {
-		struct ir3_register *reg = follow_flow(src);
-		char target[32];  /* link target */
-
-		if (!(reg->flags & IR3_REG_SSA))
-			continue;
-
-		snprintf(target, sizeof(target), "instr%lx:<src%u>",
-				PTRID(instr), i);
-
-		dump_instr(ctx, reg->instr);
-		dump_link(ctx, reg->instr, instr->block, target);
-	}
-}
-
-static void ir3_block_dump(struct ir3_dump_ctx *ctx,
-		struct ir3_block *block, const char *name)
-{
-	unsigned i, n;
-
-	n = edge_buf.n;
-
-	fprintf(ctx->f, "subgraph cluster%lx {\n", PTRID(block));
-	fprintf(ctx->f, "label=\"%s\";\n", name);
-
-	/* draw inputs: */
-	fprintf(ctx->f, "input%lx [shape=record,label=\"inputs", PTRID(block));
-	for (i = 0; i < block->ninputs; i++)
-		if (block->inputs[i])
-			fprintf(ctx->f, "|<in%u> i%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
-	fprintf(ctx->f, "\"];\n");
-
-	/* draw instruction graph: */
-	for (i = 0; i < block->noutputs; i++)
-		if (block->outputs[i])
-			dump_instr(ctx, block->outputs[i]);
-
-	/* draw outputs: */
-	fprintf(ctx->f, "output%lx [shape=record,label=\"outputs", PTRID(block));
-	for (i = 0; i < block->noutputs; i++)
-		fprintf(ctx->f, "|<out%u> o%u.%c", i, (i >> 2), "xyzw"[i & 0x3]);
-	fprintf(ctx->f, "\"];\n");
-
-	/* and links to outputs: */
-	for (i = 0; i < block->noutputs; i++) {
-		char target[32];  /* link target */
-
-		/* NOTE: there could be outputs that are never assigned,
-		 * so skip them
-		 */
-		if (!block->outputs[i])
-			continue;
-
-		snprintf(target, sizeof(target), "output%lx:<out%u>:e",
-				PTRID(block), i);
-
-		dump_link(ctx, block->outputs[i], block, target);
-	}
-
-	fprintf(ctx->f, "}\n");
-
-	/* and links to inputs: */
-	if (block->parent) {
-		for (i = 0; i < block->ninputs; i++) {
-			char target[32];  /* link target */
-
-			if (!block->inputs[i])
-				continue;
-
-			dump_instr(ctx, block->inputs[i]);
-
-			snprintf(target, sizeof(target), "input%lx:<in%u>:e",
-					PTRID(block), i);
-
-			dump_link(ctx, block->inputs[i], block, target);
-		}
-	}
-
-	/* dump deferred edges: */
-	if (edge_buf.n > n) {
-		fprintf(ctx->f, "%*s", edge_buf.n - n, &edge_buf.buf[n]);
-		edge_buf.n = n;
-	}
-}
-
-void ir3_dump(struct ir3 *shader, const char *name,
-		struct ir3_block *block /* XXX maybe 'block' ptr should move to ir3? */,
-		FILE *f)
-{
-	struct ir3_dump_ctx ctx = {
-			.f = f,
-	};
-	ir3_clear_mark(shader);
-	fprintf(ctx.f, "digraph G {\n");
-	fprintf(ctx.f, "rankdir=RL;\n");
-	fprintf(ctx.f, "nodesep=0.25;\n");
-	fprintf(ctx.f, "ranksep=1.5;\n");
-	ir3_block_dump(&ctx, block, name);
-	fprintf(ctx.f, "}\n");
-}
-
-/*
- * For Debugging:
- */
-
-void
-ir3_dump_instr_single(struct ir3_instruction *instr)
-{
-	struct ir3_dump_ctx ctx = {
-			.f = stdout,
-			.verbose = true,
-	};
-	unsigned i;
-
-	dump_instr_name(&ctx, instr);
-	for (i = 0; i < instr->regs_count; i++) {
-		struct ir3_register *reg = instr->regs[i];
-		printf(i ? ", " : " ");
-		dump_reg_name(&ctx, reg, !!i);
-	}
-
-	if (instr->address) {
-		fprintf(ctx.f, ", address=_");
-		fprintf(ctx.f, "[");
-		dump_instr_name(&ctx, instr->address);
-		fprintf(ctx.f, "]");
-	}
-
-	if (instr->fanin) {
-		fprintf(ctx.f, ", fanin=_");
-		fprintf(ctx.f, "[");
-		dump_instr_name(&ctx, instr->fanin);
-		fprintf(ctx.f, "]");
-	}
-
-	if (is_meta(instr)) {
-		if (instr->opc == OPC_META_FO) {
-			printf(", off=%d", instr->fo.off);
-		} else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
-			printf(", aid=%d", instr->fi.aid);
-		}
-	}
-
-	printf("\n");
-}
-
-void
-ir3_dump_instr_list(struct ir3_instruction *instr)
-{
-	struct ir3_block *block = instr->block;
-	unsigned n = 0;
-
-	while (instr) {
-		ir3_dump_instr_single(instr);
-		if (!is_meta(instr))
-			n++;
-		instr = instr->next;
-	}
-	printf("%u instructions\n", n);
-
-	for (n = 0; n < block->noutputs; n++) {
-		if (!block->outputs[n])
-			continue;
-		printf("out%d: ", n);
-		ir3_dump_instr_single(block->outputs[n]);
-	}
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c b/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
deleted file mode 100644
index 419cd9d..0000000
--- a/src/gallium/drivers/freedreno/ir3/ir3_flatten.c
+++ /dev/null
@@ -1,152 +0,0 @@
-/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
-
-/*
- * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- * Authors:
- *    Rob Clark <robclark@freedesktop.org>
- */
-
-#include <stdarg.h>
-
-#include "ir3.h"
-
-/*
- * Flatten: flatten out legs of if/else, etc
- *
- * TODO probably should use some heuristic to decide to not flatten
- * if one side of the other is too large / deeply nested / whatever?
- */
-
-struct ir3_flatten_ctx {
-	struct ir3_block *block;
-	unsigned cnt;
-};
-
-static struct ir3_register *unwrap(struct ir3_register *reg)
-{
-
-	if (reg->flags & IR3_REG_SSA) {
-		struct ir3_instruction *instr = reg->instr;
-		if (is_meta(instr)) {
-			switch (instr->opc) {
-			case OPC_META_OUTPUT:
-			case OPC_META_FLOW:
-				if (instr->regs_count > 1)
-					return instr->regs[1];
-				return NULL;
-			default:
-				break;
-			}
-		}
-	}
-	return reg;
-}
-
-static void ir3_instr_flatten(struct ir3_flatten_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *src;
-
-	/* if we've already visited this instruction, bail now: */
-	if (ir3_instr_check_mark(instr))
-		return;
-
-	instr->block = ctx->block;
-
-	/* TODO: maybe some threshold to decide whether to
-	 * flatten or not??
-	 */
-	if (is_meta(instr)) {
-		if (instr->opc == OPC_META_PHI) {
-			struct ir3_register *cond, *t, *f;
-
-			cond = unwrap(instr->regs[1]);
-			t    = unwrap(instr->regs[2]);  /* true val */
-			f    = unwrap(instr->regs[3]);  /* false val */
-
-			/* must have cond, but t or f may be null if only written
-			 * one one side of the if/else (in which case we can just
-			 * convert the PHI to a simple move).
-			 */
-			assert(cond);
-			assert(t || f);
-
-			if (t && f) {
-				/* convert the PHI instruction to sel.{b16,b32} */
-				instr->category = 3;
-
-				/* instruction type based on dst size: */
-				if (instr->regs[0]->flags & IR3_REG_HALF)
-					instr->opc = OPC_SEL_B16;
-				else
-					instr->opc = OPC_SEL_B32;
-
-				instr->regs[1] = t;
-				instr->regs[2] = cond;
-				instr->regs[3] = f;
-			} else {
-				/* convert to simple mov: */
-				instr->category = 1;
-				instr->cat1.dst_type = TYPE_F32;
-				instr->cat1.src_type = TYPE_F32;
-				instr->regs_count = 2;
-				instr->regs[1] = t ? t : f;
-			}
-
-			ctx->cnt++;
-		} else if ((instr->opc == OPC_META_INPUT) &&
-				(instr->regs_count == 2)) {
-			type_t ftype;
-
-			if (instr->regs[0]->flags & IR3_REG_HALF)
-				ftype = TYPE_F16;
-			else
-				ftype = TYPE_F32;
-
-			/* convert meta:input to mov: */
-			instr->category = 1;
-			instr->cat1.src_type = ftype;
-			instr->cat1.dst_type = ftype;
-		}
-	}
-
-	/* recursively visit children: */
-	foreach_ssa_src(src, instr)
-		ir3_instr_flatten(ctx, src);
-}
-
-/* return >= 0 is # of phi's flattened, < 0 is error */
-int ir3_block_flatten(struct ir3_block *block)
-{
-	struct ir3_flatten_ctx ctx = {
-			.block = block,
-	};
-	unsigned i;
-
-	ir3_clear_mark(block->shader);
-	for(i = 0; i < block->noutputs; i++)
-		if (block->outputs[i])
-			ir3_instr_flatten(&ctx, block->outputs[i]);
-
-	return ctx.cnt;
-}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_group.c b/src/gallium/drivers/freedreno/ir3/ir3_group.c
index 782f6e8..70d9b08 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_group.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_group.c
@@ -34,35 +34,6 @@
  * Find/group instruction neighbors:
  */
 
-/* stop condition for iteration: */
-static bool check_stop(struct ir3_instruction *instr)
-{
-	if (ir3_instr_check_mark(instr))
-		return true;
-
-	/* stay within the block.. don't try to operate across
-	 * basic block boundaries or we'll have problems when
-	 * dealing with multiple basic blocks:
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_INPUT))
-		return true;
-
-	return false;
-}
-
-static struct ir3_instruction * create_mov(struct ir3_instruction *instr)
-{
-	struct ir3_instruction *mov;
-
-	mov = ir3_instr_create(instr->block, 1, 0);
-	mov->cat1.src_type = TYPE_F32;
-	mov->cat1.dst_type = TYPE_F32;
-	ir3_reg_create(mov, 0, 0);    /* dst */
-	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = instr;
-
-	return mov;
-}
-
 /* bleh.. we need to do the same group_n() thing for both inputs/outputs
  * (where we have a simple instr[] array), and fanin nodes (where we have
  * an extra indirection via reg->instr).
@@ -78,7 +49,8 @@ static struct ir3_instruction *arr_get(void *arr, int idx)
 }
 static void arr_insert_mov_out(void *arr, int idx, struct ir3_instruction *instr)
 {
-	((struct ir3_instruction **)arr)[idx] = create_mov(instr);
+	((struct ir3_instruction **)arr)[idx] =
+			ir3_MOV(instr->block, instr, TYPE_F32);
 }
 static void arr_insert_mov_in(void *arr, int idx, struct ir3_instruction *instr)
 {
@@ -111,14 +83,17 @@ static struct ir3_instruction *instr_get(void *arr, int idx)
 {
 	return ssa(((struct ir3_instruction *)arr)->regs[idx+1]);
 }
-static void instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr)
+static void
+instr_insert_mov(void *arr, int idx, struct ir3_instruction *instr)
 {
-	((struct ir3_instruction *)arr)->regs[idx+1]->instr = create_mov(instr);
+	((struct ir3_instruction *)arr)->regs[idx+1]->instr =
+			ir3_MOV(instr->block, instr, TYPE_F32);
 }
 static struct group_ops instr_ops = { instr_get, instr_insert_mov };
 
 
-static void group_n(struct group_ops *ops, void *arr, unsigned n)
+static void
+group_n(struct group_ops *ops, void *arr, unsigned n)
 {
 	unsigned i, j;
 
@@ -141,6 +116,10 @@ restart:
 			conflict = conflicts(instr->cp.left, left) ||
 				conflicts(instr->cp.right, right);
 
+			/* RA can't yet deal very well w/ group'd phi's: */
+			if (is_meta(instr) && (instr->opc == OPC_META_PHI))
+				conflict = true;
+
 			/* we also can't have an instr twice in the group: */
 			for (j = i + 1; (j < n) && !conflict; j++)
 				if (ops->get(arr, j) == instr)
@@ -181,11 +160,12 @@ restart:
 	}
 }
 
-static void instr_find_neighbors(struct ir3_instruction *instr)
+static void
+instr_find_neighbors(struct ir3_instruction *instr)
 {
 	struct ir3_instruction *src;
 
-	if (check_stop(instr))
+	if (ir3_instr_check_mark(instr))
 		return;
 
 	if (is_meta(instr) && (instr->opc == OPC_META_FI))
@@ -200,7 +180,8 @@ static void instr_find_neighbors(struct ir3_instruction *instr)
  * we need to insert dummy/padding instruction for grouping, and
  * then take it back out again before anyone notices.
  */
-static void pad_and_group_input(struct ir3_instruction **input, unsigned n)
+static void
+pad_and_group_input(struct ir3_instruction **input, unsigned n)
 {
 	int i, mask = 0;
 	struct ir3_block *block = NULL;
@@ -210,8 +191,8 @@ static void pad_and_group_input(struct ir3_instruction **input, unsigned n)
 		if (instr) {
 			block = instr->block;
 		} else if (block) {
-			instr = ir3_instr_create(block, 0, OPC_NOP);
-			ir3_reg_create(instr, 0, IR3_REG_SSA);    /* dst */
+			instr = ir3_NOP(block);
+			ir3_reg_create(instr, 0, IR3_REG_SSA);    /* dummy dst */
 			input[i] = instr;
 			mask |= (1 << i);
 		}
@@ -225,42 +206,41 @@ static void pad_and_group_input(struct ir3_instruction **input, unsigned n)
 	}
 }
 
-static void block_find_neighbors(struct ir3_block *block)
+static void
+find_neighbors(struct ir3 *ir)
 {
 	unsigned i;
 
-	for (i = 0; i < block->noutputs; i++) {
-		if (block->outputs[i]) {
-			struct ir3_instruction *instr = block->outputs[i];
-			instr_find_neighbors(instr);
-		}
-	}
-
 	/* shader inputs/outputs themselves must be contiguous as well:
+	 *
+	 * NOTE: group inputs first, since we only insert mov's
+	 * *before* the conflicted instr (and that would go badly
+	 * for inputs).  By doing inputs first, we should never
+	 * have a conflict on inputs.. pushing any conflict to
+	 * resolve to the outputs, for stuff like:
+	 *
+	 *     MOV OUT[n], IN[m].wzyx
+	 *
+	 * NOTE: we assume here inputs/outputs are grouped in vec4.
+	 * This logic won't quite cut it if smaller inputs/outputs
+	 * are not aligned on vec4 boundaries
 	 */
-	if (!block->parent) {
-		/* NOTE: group inputs first, since we only insert mov's
-		 * *before* the conflicted instr (and that would go badly
-		 * for inputs).  By doing inputs first, we should never
-		 * have a conflict on inputs.. pushing any conflict to
-		 * resolve to the outputs, for stuff like:
-		 *
-		 *     MOV OUT[n], IN[m].wzyx
-		 *
-		 * NOTE: we assume here inputs/outputs are grouped in vec4.
-		 * This logic won't quite cut it if we don't align smaller
-		 * on vec4 boundaries
-		 */
-		for (i = 0; i < block->ninputs; i += 4)
-			pad_and_group_input(&block->inputs[i], 4);
-		for (i = 0; i < block->noutputs; i += 4)
-			group_n(&arr_ops_out, &block->outputs[i], 4);
-
+	for (i = 0; i < ir->ninputs; i += 4)
+		pad_and_group_input(&ir->inputs[i], 4);
+	for (i = 0; i < ir->noutputs; i += 4)
+		group_n(&arr_ops_out, &ir->outputs[i], 4);
+
+	for (i = 0; i < ir->noutputs; i++) {
+		if (ir->outputs[i]) {
+			struct ir3_instruction *instr = ir->outputs[i];
+			instr_find_neighbors(instr);
+		}
 	}
 }
 
-void ir3_block_group(struct ir3_block *block)
+void
+ir3_group(struct ir3 *ir)
 {
-	ir3_clear_mark(block->shader);
-	block_find_neighbors(block);
+	ir3_clear_mark(ir);
+	find_neighbors(ir);
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
index 2455f7e..f4a4223 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_legalize.c
@@ -26,7 +26,6 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
-#include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 
 #include "freedreno_util.h"
@@ -43,20 +42,31 @@
  */
 
 struct ir3_legalize_ctx {
-	struct ir3_block *block;
 	bool has_samp;
 	int max_bary;
 };
 
-static void legalize(struct ir3_legalize_ctx *ctx)
+/* We want to evaluate each block from the position of any other
+ * predecessor block, in order that the flags set are the union
+ * of all possible program paths.  For stopping condition, we
+ * want to stop when the pair of <pred-block, current-block> has
+ * been visited already.
+ *
+ * XXX is that completely true?  We could have different needs_xyz
+ * flags set depending on path leading to pred-block.. we could
+ * do *most* of this based on chasing src instructions ptrs (and
+ * following all phi srcs).. except the write-after-read hazard.
+ *
+ * For now we just set ss/sy flag on first instruction on block,
+ * and handle everything within the block as before.
+ */
+
+static void
+legalize_block(struct ir3_legalize_ctx *ctx, struct ir3_block *block)
 {
-	struct ir3_block *block = ctx->block;
-	struct ir3_instruction *n;
-	struct ir3 *shader = block->shader;
-	struct ir3_instruction *end =
-			ir3_instr_create(block, 0, OPC_END);
 	struct ir3_instruction *last_input = NULL;
 	struct ir3_instruction *last_rel = NULL;
+	struct list_head instr_list;
 	regmask_t needs_ss_war;       /* write after read */
 	regmask_t needs_ss;
 	regmask_t needs_sy;
@@ -65,9 +75,13 @@ static void legalize(struct ir3_legalize_ctx *ctx)
 	regmask_init(&needs_ss);
 	regmask_init(&needs_sy);
 
-	shader->instrs_count = 0;
+	/* remove all the instructions from the list, we'll be adding
+	 * them back in as we go
+	 */
+	list_replace(&block->instr_list, &instr_list);
+	list_inithead(&block->instr_list);
 
-	for (n = block->head; n; n = n->next) {
+	list_for_each_entry_safe (struct ir3_instruction, n, &instr_list, node) {
 		struct ir3_register *reg;
 		unsigned i;
 
@@ -134,18 +148,18 @@ static void legalize(struct ir3_legalize_ctx *ctx)
 		 */
 		if ((n->flags & IR3_INSTR_SS) && (n->category >= 5)) {
 			struct ir3_instruction *nop;
-			nop = ir3_instr_create(block, 0, OPC_NOP);
+			nop = ir3_NOP(block);
 			nop->flags |= IR3_INSTR_SS;
 			n->flags &= ~IR3_INSTR_SS;
 		}
 
 		/* need to be able to set (ss) on first instruction: */
-		if ((shader->instrs_count == 0) && (n->category >= 5))
-			ir3_instr_create(block, 0, OPC_NOP);
+		if (list_empty(&block->instr_list) && (n->category >= 5))
+			ir3_NOP(block);
 
-		if (is_nop(n) && shader->instrs_count) {
-			struct ir3_instruction *last =
-					shader->instrs[shader->instrs_count-1];
+		if (is_nop(n) && !list_empty(&block->instr_list)) {
+			struct ir3_instruction *last = list_last_entry(&block->instr_list,
+					struct ir3_instruction, node);
 			if (is_nop(last) && (last->repeat < 5)) {
 				last->repeat++;
 				last->flags |= n->flags;
@@ -153,7 +167,7 @@ static void legalize(struct ir3_legalize_ctx *ctx)
 			}
 		}
 
-		shader->instrs[shader->instrs_count++] = n;
+		list_addtail(&n->node, &block->instr_list);
 
 		if (is_sfu(n))
 			regmask_set(&needs_ss, n->regs[0]);
@@ -192,35 +206,20 @@ static void legalize(struct ir3_legalize_ctx *ctx)
 		 * the (ei) flag:
 		 */
 		if (is_mem(last_input) && (last_input->opc == OPC_LDLV)) {
-			int i, cnt;
-
-			/* note that ir3_instr_create() inserts into
-			 * shader->instrs[] and increments the count..
-			 * so we need to bump up the cnt initially (to
-			 * avoid it clobbering the last real instr) and
-			 * restore it after.
-			 */
-			cnt = ++shader->instrs_count;
+			struct ir3_instruction *baryf;
 
-			/* inserting instructions would be a bit nicer if list.. */
-			for (i = cnt - 2; i >= 0; i--) {
-				if (shader->instrs[i] == last_input) {
+			/* (ss)bary.f (ei)r63.x, 0, r0.x */
+			baryf = ir3_instr_create(block, 2, OPC_BARY_F);
+			baryf->flags |= IR3_INSTR_SS;
+			ir3_reg_create(baryf, regid(63, 0), 0);
+			ir3_reg_create(baryf, 0, IR3_REG_IMMED)->iim_val = 0;
+			ir3_reg_create(baryf, regid(0, 0), 0);
 
-					/* (ss)bary.f (ei)r63.x, 0, r0.x */
-					last_input = ir3_instr_create(block, 2, OPC_BARY_F);
-					last_input->flags |= IR3_INSTR_SS;
-					ir3_reg_create(last_input, regid(63, 0), 0);
-					ir3_reg_create(last_input, 0, IR3_REG_IMMED)->iim_val = 0;
-					ir3_reg_create(last_input, regid(0, 0), 0);
+			/* insert the dummy bary.f after last_input: */
+			list_delinit(&baryf->node);
+			list_add(&baryf->node, &last_input->node);
 
-					shader->instrs[i + 1] = last_input;
-
-					break;
-				}
-				shader->instrs[i + 1] = shader->instrs[i];
-			}
-
-			shader->instrs_count = cnt;
+			last_input = baryf;
 		}
 		last_input->regs[0]->flags |= IR3_REG_EI;
 	}
@@ -228,21 +227,177 @@ static void legalize(struct ir3_legalize_ctx *ctx)
 	if (last_rel)
 		last_rel->flags |= IR3_INSTR_UL;
 
-	shader->instrs[shader->instrs_count++] = end;
+	list_first_entry(&block->instr_list, struct ir3_instruction, node)
+		->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+}
+
+/* NOTE: branch instructions are always the last instruction(s)
+ * in the block.  We take advantage of this as we resolve the
+ * branches, since "if (foo) break;" constructs turn into
+ * something like:
+ *
+ *   block3 {
+ *   	...
+ *   	0029:021: mov.s32s32 r62.x, r1.y
+ *   	0082:022: br !p0.x, target=block5
+ *   	0083:023: br p0.x, target=block4
+ *   	// succs: if _[0029:021: mov.s32s32] block4; else block5;
+ *   }
+ *   block4 {
+ *   	0084:024: jump, target=block6
+ *   	// succs: block6;
+ *   }
+ *   block5 {
+ *   	0085:025: jump, target=block7
+ *   	// succs: block7;
+ *   }
+ *
+ * ie. only instruction in block4/block5 is a jump, so when
+ * resolving branches we can easily detect this by checking
+ * that the first instruction in the target block is itself
+ * a jump, and set up the br directly to the jump's target
+ * (and strip back out the now unreached jump)
+ *
+ * TODO sometimes we end up with things like:
+ *
+ *    br !p0.x, #2
+ *    br p0.x, #12
+ *    add.u r0.y, r0.y, 1
+ *
+ * If we swapped the order of the branches, we could drop one.
+ */
+static struct ir3_block *
+resolve_dest_block(struct ir3_block *block)
+{
+	/* returns the effective dest for a jump to 'block'; last block is itself: */
+	if (!block->successors[0])
+		return block;
+
+	/* NOTE that we may or may not have inserted the jump
+	 * in the target block yet, so conditions to resolve
+	 * the dest to the dest block's successor are:
+	 *
+	 *   (1) successors[1] == NULL &&
+	 *   (2) (block-is-empty || only-instr-is-jump)
+	 */
+	if (block->successors[1] == NULL) {
+		if (list_empty(&block->instr_list)) {
+			return block->successors[0];   /* skip empty block (one hop only) */
+		} else if (list_length(&block->instr_list) == 1) {
+			struct ir3_instruction *instr = list_first_entry(
+					&block->instr_list, struct ir3_instruction, node);
+			if (is_flow(instr) && (instr->opc == OPC_JUMP))
+				return block->successors[0];   /* skip jump-only block */
+		}
+	}
+	return block;   /* anything else is kept as the destination */
+}
+
+static bool
+resolve_jump(struct ir3_instruction *instr)
+{
+	struct ir3_block *tblock =
+		resolve_dest_block(instr->cat0.target);
+	struct ir3_instruction *target;
+	/* returns true if a block or instr was unlinked (ie. the IR changed) */
+	if (tblock != instr->cat0.target) {
+		list_delinit(&instr->cat0.target->node);   /* unlink skipped block */
+		instr->cat0.target = tblock;
+		return true;
+	}
+
+	target = list_first_entry(&tblock->instr_list,
+				struct ir3_instruction, node);
+	/* NOTE(review): !target only helps if list_first_entry() can be NULL */
+	if ((!target) || (target->ip == (instr->ip + 1))) {
+		list_delinit(&instr->node);   /* jump to next instr: remove it */
+		return true;
+	} else {
+		instr->cat0.immed =
+			(int)target->ip - (int)instr->ip;   /* offset relative to instr */
+	}
+	return false;
+}
+
+/* Resolve jumps, removing jumps/branches to the immediately following
+ * instruction which we end up with from earlier stages.  Removing an
+ * instruction can invalidate earlier instructions' branch offsets, so
+ * we return true as soon as anything changed; the caller recounts the
+ * instructions and calls us again until no more changes are made.
+ */
+static bool
+resolve_jumps(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node)
+			if (is_flow(instr) && instr->cat0.target)
+				if (resolve_jump(instr))
+					return true;   /* something was unlinked, stop walking */
+
+	return false;
+}
 
-	shader->instrs[0]->flags |= IR3_INSTR_SS | IR3_INSTR_SY;
+/* we want to mark points where divergent flow control re-converges
+ * with (jp) flags.  For now, since we don't do any optimization for
+ * things that start out as a 'do {} while()', re-convergence points
+ * will always be a branch or jump target.  Note that this is overly
+ * conservative, since unconditional jump targets are not convergence
+ * points, we are just assuming that the other path to reach the jump
+ * target was divergent.  If we were clever enough to optimize the
+ * jump at end of a loop back to a conditional branch into a single
+ * conditional branch, ie. like:
+ *
+ *    add.f r1.w, r0.x, (neg)(r)c2.x   <= loop start
+ *    mul.f r1.z, r1.z, r0.x
+ *    mul.f r1.y, r1.y, r0.x
+ *    mul.f r0.z, r1.x, r0.x
+ *    mul.f r0.w, r0.y, r0.x
+ *    cmps.f.ge r0.x, (r)c2.y, (r)r1.w
+ *    add.s r0.x, (r)r0.x, (r)-1
+ *    sel.f32 r0.x, (r)c3.y, (r)r0.x, c3.x
+ *    cmps.f.eq p0.x, r0.x, c3.y
+ *    mov.f32f32 r0.x, r1.w
+ *    mov.f32f32 r0.y, r0.w
+ *    mov.f32f32 r1.x, r0.z
+ *    (rpt2)nop
+ *    br !p0.x, #-13
+ *    (jp)mul.f r0.x, c263.y, r1.y
+ *
+ * Then we'd have to be more clever, as the convergence point is no
+ * longer a branch or jump target.
+ */
+static void
+mark_convergence_points(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+			if (is_flow(instr) && instr->cat0.target) {   /* has a target block */
+				struct ir3_instruction *target =   /* NOTE(review): assumes target block is non-empty */
+					list_first_entry(&instr->cat0.target->instr_list,
+							struct ir3_instruction, node);
+				target->flags |= IR3_INSTR_JP;   /* (jp) on target's first instr */
+			}
+		}
+	}
 }
 
-void ir3_block_legalize(struct ir3_block *block,
-		bool *has_samp, int *max_bary)
+void
+ir3_legalize(struct ir3 *ir, bool *has_samp, int *max_bary)
 {
 	struct ir3_legalize_ctx ctx = {
-			.block = block,
 			.max_bary = -1,
 	};
 
-	legalize(&ctx);
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		legalize_block(&ctx, block);
+	}
 
 	*has_samp = ctx.has_samp;
 	*max_bary = ctx.max_bary;
+
+	do {
+		ir3_count_instructions(ir);
+	} while(resolve_jumps(ir));
+
+	mark_convergence_points(ir);
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
index ae36019..dc9e462 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir_lower_if_else.c
@@ -74,14 +74,13 @@ valid_dest(nir_block *block, nir_dest *dest)
 	 * (so this is run iteratively in a loop).  Therefore if
 	 * we get this far, it should not have any if_uses:
 	 */
-	assert(dest->ssa.if_uses->entries == 0);
+	assert(list_empty(&dest->ssa.if_uses));
 
 	/* The only uses of this definition must be phi's in the
 	 * successor or in the current block
 	 */
-	struct set_entry *entry;
-	set_foreach(dest->ssa.uses, entry) {
-		const nir_instr *dest_instr = entry->key;
+	nir_foreach_use(&dest->ssa, use) {
+		nir_instr *dest_instr = use->parent_instr;
 		if (dest_instr->block == block)
 			continue;
 		if ((dest_instr->type == nir_instr_type_phi) &&
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_print.c b/src/gallium/drivers/freedreno/ir3/ir3_print.c
new file mode 100644
index 0000000..f377982
--- /dev/null
+++ b/src/gallium/drivers/freedreno/ir3/ir3_print.c
@@ -0,0 +1,237 @@
+/* -*- mode: C; c-file-style: "k&r"; tab-width: 4; indent-tabs-mode: t; -*- */
+
+/*
+ * Copyright (C) 2014 Rob Clark <robclark@freedesktop.org>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ * Authors:
+ *    Rob Clark <robclark@freedesktop.org>
+ */
+
+#include <stdarg.h>
+#include <stdio.h>
+
+#include "ir3.h"
+
+#define PTRID(x) ((unsigned long)(x))
+
+static void print_instr_name(struct ir3_instruction *instr)
+{
+#ifdef DEBUG
+	printf("%04u:", instr->serialno);
+#endif
+	printf("%03u: ", instr->depth);
+
+	if (instr->flags & IR3_INSTR_SY)
+		printf("(sy)");
+	if (instr->flags & IR3_INSTR_SS)
+		printf("(ss)");
+
+	if (is_meta(instr)) {
+		switch(instr->opc) {
+		case OPC_META_PHI:
+			printf("&#934;");
+			break;
+		default:
+			/* shouldn't hit here.. just for debugging: */
+			switch (instr->opc) {
+			case OPC_META_INPUT:  printf("_meta:in");   break;
+			case OPC_META_FO:     printf("_meta:fo");   break;
+			case OPC_META_FI:     printf("_meta:fi");   break;
+
+			default: printf("_meta:%d", instr->opc); break;
+			}
+			break;
+		}
+	} else if (instr->category == 1) {
+		static const char *type[] = {
+				[TYPE_F16] = "f16",
+				[TYPE_F32] = "f32",
+				[TYPE_U16] = "u16",
+				[TYPE_U32] = "u32",
+				[TYPE_S16] = "s16",
+				[TYPE_S32] = "s32",
+				[TYPE_U8]  = "u8",
+				[TYPE_S8]  = "s8",
+		};
+		if (instr->cat1.src_type == instr->cat1.dst_type)
+			printf("mov");
+		else
+			printf("cov");
+		printf(".%s%s", type[instr->cat1.src_type], type[instr->cat1.dst_type]);
+	} else {
+		printf("%s", ir3_instr_name(instr));
+		if (instr->flags & IR3_INSTR_3D)
+			printf(".3d");
+		if (instr->flags & IR3_INSTR_A)
+			printf(".a");
+		if (instr->flags & IR3_INSTR_O)
+			printf(".o");
+		if (instr->flags & IR3_INSTR_P)
+			printf(".p");
+		if (instr->flags & IR3_INSTR_S)
+			printf(".s");
+		if (instr->flags & IR3_INSTR_S2EN)
+			printf(".s2en");
+	}
+}
+
+static void print_reg_name(struct ir3_register *reg, bool followssa)
+{
+	if ((reg->flags & (IR3_REG_FABS | IR3_REG_SABS)) &&
+			(reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT)))
+		printf("(absneg)");
+	else if (reg->flags & (IR3_REG_FNEG | IR3_REG_SNEG | IR3_REG_BNOT))
+		printf("(neg)");
+	else if (reg->flags & (IR3_REG_FABS | IR3_REG_SABS))
+		printf("(abs)");
+
+	if (reg->flags & IR3_REG_IMMED) {
+		printf("imm[%f,%d,0x%x]", reg->fim_val, reg->iim_val, reg->iim_val);
+	} else if (reg->flags & IR3_REG_SSA) {
+		printf("_");
+		if (followssa) {
+			printf("[");
+			print_instr_name(reg->instr);
+			printf("]");
+		}
+	} else if (reg->flags & IR3_REG_RELATIV) {
+		if (reg->flags & IR3_REG_HALF)
+			printf("h");
+		if (reg->flags & IR3_REG_CONST)
+			printf("c<a0.x + %u>", reg->num);
+		else
+			printf("\x1b[0;31mr<a0.x + %u>\x1b[0m (%u)", reg->num, reg->size);
+	} else {
+		if (reg->flags & IR3_REG_HALF)
+			printf("h");
+		if (reg->flags & IR3_REG_CONST)
+			printf("c%u.%c", reg_num(reg), "xyzw"[reg_comp(reg)]);
+		else
+			printf("\x1b[0;31mr%u.%c\x1b[0m", reg_num(reg), "xyzw"[reg_comp(reg)]);
+	}
+}
+
+static void
+tab(int lvl)
+{
+	while (lvl-- > 0)	/* emit one tab per indent level */
+		printf("\t");
+}
+
+static uint32_t	/* id used to label a block in the dump */
+block_id(struct ir3_block *block)
+{
+#ifdef DEBUG
+	return block->serialno;
+#else
+	return (uint32_t)PTRID(block);	/* no serialno: use (truncated) address */
+#endif
+}
+
+static void
+print_instr(struct ir3_instruction *instr, int lvl)
+{
+	unsigned i;
+
+	tab(lvl);
+
+	print_instr_name(instr);
+	for (i = 0; i < instr->regs_count; i++) {
+		struct ir3_register *reg = instr->regs[i];
+		printf(i ? ", " : " ");
+		print_reg_name(reg, !!i);
+	}
+
+	if (instr->address) {
+		printf(", address=_");
+		printf("[");
+		print_instr_name(instr->address);
+		printf("]");
+	}
+
+	if (instr->fanin) {
+		printf(", fanin=_");
+		printf("[");
+		print_instr_name(instr->fanin);
+		printf("]");
+	}
+
+	if (is_meta(instr)) {
+		if (instr->opc == OPC_META_FO) {
+			printf(", off=%d", instr->fo.off);
+		} else if ((instr->opc == OPC_META_FI) && instr->fi.aid) {
+			printf(", aid=%d", instr->fi.aid);
+		}
+	}
+
+	if (is_flow(instr) && instr->cat0.target) {
+		/* the predicate register src is implied: */
+		if (instr->opc == OPC_BR) {
+			printf(" %sp0.x", instr->cat0.inv ? "!" : "");
+		}
+		printf(", target=block%u", block_id(instr->cat0.target));
+	}
+
+	printf("\n");
+}
+
+void ir3_print_instr(struct ir3_instruction *instr)
+{
+	print_instr(instr, 0);
+}
+
+static void
+print_block(struct ir3_block *block, int lvl)
+{
+	tab(lvl); printf("block%u {\n", block_id(block));
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		print_instr(instr, lvl+1);
+	}
+	if (block->successors[1]) {
+		/* leading into if/else: */
+		tab(lvl+1);
+		printf("/* succs: if _[");
+		print_instr_name(block->condition);
+		printf("] block%u; else block%u; */\n",
+				block_id(block->successors[0]),
+				block_id(block->successors[1]));
+	} else if (block->successors[0]) {
+		tab(lvl+1);
+		printf("/* succs: block%u; */\n",
+				block_id(block->successors[0]));
+	}
+	tab(lvl); printf("}\n");
+}
+
+void	/* print every block of the shader, then its outputs, to stdout */
+ir3_print(struct ir3 *ir)
+{
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node)
+		print_block(block, 0);
+
+	for (unsigned i = 0; i < ir->noutputs; i++) {
+		if (!ir->outputs[i])	/* unused output slot */
+			continue;
+		printf("out%u: ", i);	/* i is unsigned, so %u rather than %d */
+		print_instr(ir->outputs[i], 0);
+	}
+}
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_ra.c b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
index a4235a7..e5aba85 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_ra.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_ra.c
@@ -26,284 +26,702 @@
  *    Rob Clark <robclark@freedesktop.org>
  */
 
-#include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/register_allocate.h"
+#include "util/ralloc.h"
+#include "util/bitset.h"
 
 #include "ir3.h"
+#include "ir3_compiler.h"
 
 /*
  * Register Assignment:
  *
- * NOTE: currently only works on a single basic block.. need to think
- * about how multiple basic blocks are going to get scheduled.  But
- * I think I want to re-arrange how blocks work, ie. get rid of the
- * block nesting thing..
+ * Uses the register_allocate util, which implements graph coloring
+ * algo with interference classes.  To handle the cases where we need
+ * consecutive registers (for example, texture sample instructions),
+ * we model these as larger (double/quad/etc) registers which conflict
+ * with the corresponding registers in other classes.
  *
- * NOTE: we could do register coalescing (eliminate moves) as part of
- * the RA step.. OTOH I think we need to do scheduling before register
- * assignment.  And if we remove a mov that effects scheduling (unless
- * we leave a placeholder nop, which seems lame), so I'm not really
- * sure how practical this is to do both in a single stage.  But OTOH
- * I'm not really sure a sane way for the CP stage to realize when it
- * cannot remove a mov due to multi-register constraints..
+ * Additionally we create additional classes for half-regs, which
+ * do not conflict with the full-reg classes.  We do need at least
+ * sizes 1-4 (to deal w/ texture sample instructions output to half-
+ * reg).  At the moment we don't create the higher order half-reg
+ * classes as half-reg frequently does not have enough precision
+ * for texture coords at higher resolutions.
  *
- * NOTE: http://scopesconf.org/scopes-01/paper/session1_2.ps.gz has
- * some ideas to handle array allocation with a more conventional
- * graph coloring algorithm for register assignment, which might be
- * a good alternative to the current algo.  However afaict it cannot
- * handle overlapping arrays, which is a scenario that we have to
- * deal with
+ * There are some additional cases that we need to handle specially,
+ * as the graph coloring algo doesn't understand "partial writes".
+ * For example, a sequence like:
+ *
+ *   add r0.z, ...
+ *   sam (f32)(xy)r0.x, ...
+ *   ...
+ *   sam (f32)(xyzw)r0.w, r0.x, ...  ; 3d texture, so r0.xyz are coord
+ *
+ * In this scenario, we treat r0.xyz as class size 3, which is written
+ * (from a use/def perspective) at the 'add' instruction and ignore the
+ * subsequent partial writes to r0.xy.  So the 'add r0.z, ...' is the
+ * defining instruction, as it is the first to partially write r0.xyz.
+ *
+ * Note i965 has a similar scenario, which they solve with a virtual
+ * LOAD_PAYLOAD instruction which gets turned into multiple MOV's after
+ * register assignment.  But for us that is horrible from a scheduling
+ * standpoint.  Instead what we do is use the idea of a 'definer' instruction.
+ * Ie. the first instruction (lowest ip) to write to the array is the
+ * one we consider from use/def perspective when building interference
+ * graph.  (Other instructions which write other array elements just
+ * define the variable some more.)
+ */
+
+static const unsigned class_sizes[] = {
+	1, 2, 3, 4,
+	4 + 4, /* txd + 1d/2d */
+	4 + 6, /* txd + 3d */
+	/* temporary: until we can assign arrays, create classes so we
+	 * can round up array to fit.  NOTE with tgsi arrays should
+	 * really all be multiples of four:
+	 */
+	4 * 4,
+	4 * 8,
+	4 * 16,
+	4 * 32,
+
+};
+#define class_count ARRAY_SIZE(class_sizes)
+
+static const unsigned half_class_sizes[] = {
+	1, 2, 3, 4,
+};
+#define half_class_count  ARRAY_SIZE(half_class_sizes)
+#define total_class_count (class_count + half_class_count)
+
+/* Below a0.x are normal regs.  RA doesn't need to assign a0.x/p0.x. */
+#define NUM_REGS             (4 * (REG_A0 - 1))
+/* Number of virtual regs in a given class: */
+#define CLASS_REGS(i)        (NUM_REGS - (class_sizes[i] - 1))
+#define HALF_CLASS_REGS(i)   (NUM_REGS - (half_class_sizes[i] - 1))
+
+/* register-set, created one time, used for all shaders: */
+struct ir3_ra_reg_set {
+	struct ra_regs *regs;
+	unsigned int classes[class_count];
+	unsigned int half_classes[half_class_count];
+	/* maps flat virtual register space to base gpr: */
+	uint16_t *ra_reg_to_gpr;
+	/* maps cls,gpr to flat virtual register space: */
+	uint16_t **gpr_to_ra_reg;
+};
+
+/* One-time setup of RA register-set, which describes all the possible
+ * "virtual" registers and their interferences.  Ie. double register
+ * occupies (and conflicts with) two single registers, and so forth.
+ * Since registers do not need to be aligned to their class size, they
+ * can conflict with other registers in the same class too.  Ie:
+ *
+ *    Single (base) |  Double
+ *    --------------+---------------
+ *       R0         |  D0
+ *       R1         |  D0 D1
+ *       R2         |     D1 D2
+ *       R3         |        D2
+ *           .. and so on..
+ *
+ * (NOTE the disassembler uses notation like r0.x/y/z/w but those are
+ * really just four scalar registers.  Don't let that confuse you.)
  */
+struct ir3_ra_reg_set *
+ir3_ra_alloc_reg_set(void *memctx)
+{
+	struct ir3_ra_reg_set *set = rzalloc(memctx, struct ir3_ra_reg_set);
+	unsigned ra_reg_count, reg, first_half_reg;
+	unsigned int **q_values;
+
+	/* calculate # of regs across all classes: */
+	ra_reg_count = 0;
+	for (unsigned i = 0; i < class_count; i++)
+		ra_reg_count += CLASS_REGS(i);
+	for (unsigned i = 0; i < half_class_count; i++)
+		ra_reg_count += HALF_CLASS_REGS(i);
+
+	/* allocate and populate q_values: */
+	q_values = ralloc_array(set, unsigned *, total_class_count);
+	for (unsigned i = 0; i < class_count; i++) {
+		q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);
+
+		/* From register_allocate.c:
+		 *
+		 * q(B,C) (indexed by C, B is this register class) in
+		 * Runeson/Nyström paper.  This is "how many registers of B could
+		 * the worst choice register from C conflict with".
+		 *
+		 * If we just let the register allocation algorithm compute these
+		 * values, is extremely expensive.  However, since all of our
+		 * registers are laid out, we can very easily compute them
+		 * ourselves.  View the register from C as fixed starting at GRF n
+		 * somewhere in the middle, and the register from B as sliding back
+		 * and forth.  Then the first register to conflict from B is the
+		 * one starting at n - class_size[B] + 1 and the last register to
+		 * conflict will start at n + class_size[B] - 1.  Therefore, the
+		 * number of conflicts from B is class_size[B] + class_size[C] - 1.
+		 *
+		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+		 * B | | | | | |n| --> | | | | | | |
+		 *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+		 *             +-+-+-+-+-+
+		 * C           |n| | | | |
+		 *             +-+-+-+-+-+
+		 *
+		 * (Idea copied from brw_fs_reg_allocate.cpp)
+		 */
+		for (unsigned j = 0; j < class_count; j++)
+			q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
+	}
+
+	for (unsigned i = class_count; i < total_class_count; i++) {
+		q_values[i] = rzalloc_array(q_values, unsigned, total_class_count);
+
+		/* see comment above (the full-class columns are left zeroed): */
+		for (unsigned j = class_count; j < total_class_count; j++) {
+			q_values[i][j] = half_class_sizes[i - class_count] +
+					half_class_sizes[j - class_count] - 1;
+		}
+	}
 
+	/* allocate the reg-set.. */
+	set->regs = ra_alloc_reg_set(set, ra_reg_count);
+	set->ra_reg_to_gpr = ralloc_array(set, uint16_t, ra_reg_count);
+	set->gpr_to_ra_reg = ralloc_array(set, uint16_t *, total_class_count);
+
+	/* .. and classes */
+	reg = 0;
+	for (unsigned i = 0; i < class_count; i++) {
+		set->classes[i] = ra_alloc_reg_class(set->regs);
+
+		set->gpr_to_ra_reg[i] = ralloc_array(set, uint16_t, CLASS_REGS(i));
+
+		for (unsigned j = 0; j < CLASS_REGS(i); j++) {
+			ra_class_add_reg(set->regs, set->classes[i], reg);
+
+			set->ra_reg_to_gpr[reg] = j;
+			set->gpr_to_ra_reg[i][j] = reg;
+
+			for (unsigned br = j; br < j + class_sizes[i]; br++)
+				ra_add_transitive_reg_conflict(set->regs, br, reg);
+
+			reg++;
+		}
+	}
+
+	first_half_reg = reg;
+
+	for (unsigned i = 0; i < half_class_count; i++) {
+		set->half_classes[i] = ra_alloc_reg_class(set->regs);
+
+		set->gpr_to_ra_reg[class_count + i] =
+				ralloc_array(set, uint16_t, HALF_CLASS_REGS(i));
+
+		for (unsigned j = 0; j < HALF_CLASS_REGS(i); j++) {
+			ra_class_add_reg(set->regs, set->half_classes[i], reg);
+
+			set->ra_reg_to_gpr[reg] = j;
+			set->gpr_to_ra_reg[class_count + i][j] = reg;
+
+			for (unsigned br = j; br < j + half_class_sizes[i]; br++)
+				ra_add_transitive_reg_conflict(set->regs, br + first_half_reg, reg);
+
+			reg++;
+		}
+	}
+
+	ra_set_finalize(set->regs, q_values);
+
+	ralloc_free(q_values);
+
+	return set;
+}
+
+/* register-assign context, per-shader */
 struct ir3_ra_ctx {
-	struct ir3_block *block;
+	struct ir3 *ir;
 	enum shader_t type;
-	bool frag_coord;
 	bool frag_face;
-	int cnt;
-	bool error;
-	struct {
-		unsigned base;
-		unsigned size;
-	} arrays[MAX_ARRAYS];
+
+	struct ir3_ra_reg_set *set;
+	struct ra_graph *g;
+	unsigned alloc_count;
+	unsigned class_alloc_count[total_class_count];
+	unsigned class_base[total_class_count];
+	unsigned instr_cnt;
+	unsigned *def, *use;     /* def/use table */
 };
 
-#ifdef DEBUG
-#  include "freedreno_util.h"
-#  define ra_debug (fd_mesa_debug & FD_DBG_OPTMSGS)
-#else
-#  define ra_debug 0
-#endif
-
-#define ra_dump_list(msg, n) do { \
-		if (ra_debug) { \
-			debug_printf("-- " msg); \
-			ir3_dump_instr_list(n); \
-		} \
-	} while (0)
-
-#define ra_dump_instr(msg, n) do { \
-		if (ra_debug) { \
-			debug_printf(">> " msg); \
-			ir3_dump_instr_single(n); \
-		} \
-	} while (0)
-
-#define ra_assert(ctx, x) do { \
-		debug_assert(x); \
-		if (!(x)) { \
-			debug_printf("RA: failed assert: %s\n", #x); \
-			(ctx)->error = true; \
-		}; \
-	} while (0)
-
-
-/* sorta ugly way to retrofit half-precision support.. rather than
- * passing extra param around, just OR in a high bit.  All the low
- * value arithmetic (ie. +/- offset within a contiguous vec4, etc)
- * will continue to work as long as you don't underflow (and that
- * would go badly anyways).
- */
-#define REG_HALF  0x8000
+/* additional block-data (per-block) */
+struct ir3_ra_block_data {
+	BITSET_WORD *def;        /* variables defined before used in block */
+	BITSET_WORD *use;        /* variables used before defined in block */
+	BITSET_WORD *livein;     /* which defs reach entry point of block */
+	BITSET_WORD *liveout;    /* which defs reach exit point of block */
+};
+
+static bool
+is_half(struct ir3_instruction *instr)
+{
+	return !!(instr->regs[0]->flags & IR3_REG_HALF);
+}
 
-#define REG(n, wm, f) (struct ir3_register){ \
-		.flags  = (f), \
-		.num    = (n), \
-		.wrmask = TGSI_WRITEMASK_ ## wm, \
+static int
+size_to_class(unsigned sz, bool half)
+{
+	if (half) {
+		for (unsigned i = 0; i < half_class_count; i++)
+			if (half_class_sizes[i] >= sz)
+				return i + class_count;
+	} else {
+		for (unsigned i = 0; i < class_count; i++)
+			if (class_sizes[i] >= sz)
+				return i;
 	}
+	debug_assert(0);
+	return -1;
+}
 
-/* check that the register exists, is a GPR and is not special (a0/p0) */
-static struct ir3_register * reg_check(struct ir3_instruction *instr, unsigned n)
+static bool
+is_temp(struct ir3_register *reg)
 {
-	if ((n < instr->regs_count) && reg_gpr(instr->regs[n]) &&
-			!(instr->regs[n]->flags & IR3_REG_SSA))
-		return instr->regs[n];
-	return NULL;
+	if (reg->flags & (IR3_REG_CONST | IR3_REG_IMMED))
+		return false;
+	if (reg->flags & IR3_REG_RELATIV) // TODO
+		return false;
+	if ((reg->num == regid(REG_A0, 0)) ||
+			(reg->num == regid(REG_P0, 0)))
+		return false;
+	return true;
 }
 
-/* figure out if an unassigned src register points back to the instr we
- * are assigning:
- */
-static bool instr_used_by(struct ir3_instruction *instr,
-		struct ir3_register *src)
+static bool
+writes_gpr(struct ir3_instruction *instr)
 {
-	struct ir3_instruction *src_instr = ssa(src);
-	unsigned i;
-	if (instr == src_instr)
-		return true;
-	if (src_instr && is_meta(src_instr))
-		for (i = 1; i < src_instr->regs_count; i++)
-			if (instr_used_by(instr, src_instr->regs[i]))
-				return true;
-
-	return false;
+	if (is_store(instr))
+		return false;
+	/* is dest a normal temp register: */
+	return is_temp(instr->regs[0]);
 }
 
-static bool instr_is_output(struct ir3_instruction *instr)
+static struct ir3_instruction *
+get_definer(struct ir3_instruction *instr, int *sz, int *off)
 {
-	struct ir3_block *block = instr->block;
-	unsigned i;
+	struct ir3_instruction *d = NULL;
+	if (is_meta(instr) && (instr->opc == OPC_META_FI)) {
+		/* What about the case where collect is subset of array, we
+		 * need to find the distance between where actual array starts
+		 * and fanin..  that probably doesn't happen currently.
+		 */
+		struct ir3_register *src;
 
-	for (i = 0; i < block->noutputs; i++)
-		if (instr == block->outputs[i])
-			return true;
+		/* note: don't use foreach_ssa_src as this gets called once
+		 * while assigning regs (which clears SSA flag)
+		 */
+		foreach_src(src, instr) {
+			if (!src->instr)
+				continue;
+			if ((!d) || (src->instr->ip < d->ip))
+				d = src->instr;
+		}
 
-	return false;
-}
+		*sz = instr->regs_count - 1;
+		*off = 0;
 
-static void mark_sources(struct ir3_instruction *instr,
-		struct ir3_instruction *n, regmask_t *liveregs, regmask_t *written)
-{
-	unsigned i;
+	} else if (instr->cp.right || instr->cp.left) {
+		/* covers also the meta:fo case, which ends up w/ single
+		 * scalar instructions for each component:
+		 */
+		struct ir3_instruction *f = ir3_neighbor_first(instr);
+
+		/* by definition, the entire sequence forms one linked list
+		 * of single scalar register nodes (even if some of them may
+		 * be fanouts from a texture sample (for example) instr.  We
+		 * just need to walk the list finding the first element of
+		 * the group defined (lowest ip)
+		 */
+		int cnt = 0;
+
+		d = f;
+		while (f) {
+			if (f->ip < d->ip)
+				d = f;
+			if (f == instr)
+				*off = cnt;
+			f = f->cp.right;
+			cnt++;
+		}
+
+		*sz = cnt;
+
+	} else {
+		/* second case is looking directly at the instruction which
+		 * produces multiple values (eg, texture sample), rather
+		 * than the fanout nodes that point back to that instruction.
+		 * This isn't quite right, because it may be part of a larger
+		 * group, such as:
+		 *
+		 *     sam (f32)(xyzw)r0.x, ...
+		 *     add r1.x, ...
+		 *     add r1.y, ...
+		 *     sam (f32)(xyzw)r2.x, r0.w  <-- (r0.w, r1.x, r1.y)
+		 *
+		 * need to come up with a better way to handle that case.
+		 */
+		if (instr->address) {
+			*sz = instr->regs[0]->size;
+		} else {
+			*sz = util_last_bit(instr->regs[0]->wrmask);
+		}
+		*off = 0;
+		d = instr;
+	}
+
+	if (d->regs[0]->flags & IR3_REG_PHI_SRC) {
+		struct ir3_instruction *phi = d->regs[0]->instr;
+		struct ir3_instruction *dd;
+		int dsz, doff;
+
+		dd = get_definer(phi, &dsz, &doff);
+
+		*sz = MAX2(*sz, dsz);
+		*off = doff;
+
+		if (dd->ip < d->ip) {
+			d = dd;
+		}
+	}
 
-	for (i = 1; i < n->regs_count; i++) {
-		struct ir3_register *r = reg_check(n, i);
-		if (r)
-			regmask_set_if_not(liveregs, r, written);
+	if (is_meta(d) && (d->opc == OPC_META_PHI)) {
+		/* we have already inserted parallel-copies into
+		 * the phi, so we don't need to chase definers
+		 */
+		struct ir3_register *src;
 
-		/* if any src points back to the instruction(s) in
-		 * the block of neighbors that we are assigning then
-		 * mark any written (clobbered) registers as live:
+		/* note: don't use foreach_ssa_src as this gets called once
+		 * while assigning regs (which clears SSA flag)
 		 */
-		if (instr_used_by(instr, n->regs[i]))
-			regmask_or(liveregs, liveregs, written);
+		foreach_src(src, d) {
+			if (!src->instr)
+				continue;
+			if (src->instr->ip < d->ip)
+				d = src->instr;
+		}
 	}
 
+	if (is_meta(d) && (d->opc == OPC_META_FO)) {
+		struct ir3_instruction *dd;
+		int dsz, doff;
+
+		dd = get_definer(d->regs[1]->instr, &dsz, &doff);
+
+		/* by definition, should come before: */
+		debug_assert(dd->ip < d->ip);
+
+		*sz = MAX2(*sz, dsz);
+
+		/* Fanouts are grouped, so *off should already be valid */
+
+		d = dd;
+	}
+
+	return d;
 }
 
-/* live means read before written */
-static void compute_liveregs(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, regmask_t *liveregs)
+/* give each instruction a name (and ip), and count up the # of names
+ * of each class
+ */
+static void
+ra_block_name_instructions(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
-	struct ir3_block *block = instr->block;
-	struct ir3_instruction *n;
-	regmask_t written;
-	unsigned i;
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_instruction *defn;
+		int cls, sz, off;
 
-	regmask_init(&written);
+		ctx->instr_cnt++;
 
-	for (n = instr->next; n; n = n->next) {
-		struct ir3_register *r;
-
-		if (is_meta(n))
+		if (instr->regs_count == 0)
 			continue;
 
-		/* check first src's read: */
-		mark_sources(instr, n, liveregs, &written);
+		if (!writes_gpr(instr))
+			continue;
 
-		/* for instructions that write to an array, we need to
-		 * capture the dependency on the array elements:
-		 */
-		if (n->fanin)
-			mark_sources(instr, n->fanin, liveregs, &written);
+		defn = get_definer(instr, &sz, &off);
 
-		/* meta-instructions don't actually get scheduled,
-		 * so don't let it's write confuse us.. what we
-		 * really care about is when the src to the meta
-		 * instr was written:
-		 */
-		if (is_meta(n))
+		if (defn != instr)
 			continue;
 
-		/* then dst written (if assigned already): */
-		r = reg_check(n, 0);
-		if (r) {
-			/* if an instruction *is* an output, then it is live */
-			if (!instr_is_output(n))
-				regmask_set(&written, r);
+		/* arrays which don't fit in one of the pre-defined class
+		 * sizes are pre-colored:
+		 *
+		 * TODO but we still need to allocate names for them, don't we??
+		 */
+		cls = size_to_class(sz, is_half(defn));
+		if (cls >= 0) {
+			instr->name = ctx->class_alloc_count[cls]++;
+			ctx->alloc_count++;
 		}
-
 	}
+}
 
-	/* be sure to account for output registers too: */
-	for (i = 0; i < block->noutputs; i++) {
-		struct ir3_register *r;
-		if (!block->outputs[i])
-			continue;
-		r = reg_check(block->outputs[i], 0);
-		if (r)
-			regmask_set_if_not(liveregs, r, &written);
+static void
+ra_init(struct ir3_ra_ctx *ctx)
+{
+	ir3_clear_mark(ctx->ir);
+	ir3_count_instructions(ctx->ir);
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		ra_block_name_instructions(ctx, block);
 	}
 
-	/* if instruction is output, we need a reg that isn't written
-	 * before the end.. equiv to the instr_used_by() check above
-	 * in the loop body
-	 * TODO maybe should follow fanin/fanout?
+	/* figure out the base register name for each class.  The
+	 * actual ra name is class_base[cls] + instr->name;
 	 */
-	if (instr_is_output(instr))
-		regmask_or(liveregs, liveregs, &written);
+	ctx->class_base[0] = 0;
+	for (unsigned i = 1; i < total_class_count; i++) {
+		ctx->class_base[i] = ctx->class_base[i-1] +
+				ctx->class_alloc_count[i-1];
+	}
+
+	ctx->g = ra_alloc_interference_graph(ctx->set->regs, ctx->alloc_count);
+	ctx->def = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+	ctx->use = rzalloc_array(ctx->g, unsigned, ctx->alloc_count);
+}
+
+static unsigned
+ra_name(struct ir3_ra_ctx *ctx, int cls, struct ir3_instruction *defn)
+{
+	unsigned name;
+	debug_assert(cls >= 0);
+	name = ctx->class_base[cls] + defn->name;
+	debug_assert(name < ctx->alloc_count);
+	return name;
 }
 
-static int find_available(regmask_t *liveregs, int size, bool half)
+static void
+ra_destroy(struct ir3_ra_ctx *ctx)
 {
-	unsigned i;
-	unsigned f = half ? IR3_REG_HALF : 0;
-	for (i = 0; i < MAX_REG - size; i++) {
-		if (!regmask_get(liveregs, &REG(i, X, f))) {
-			unsigned start = i++;
-			for (; (i < MAX_REG) && ((i - start) < size); i++)
-				if (regmask_get(liveregs, &REG(i, X, f)))
-					break;
-			if ((i - start) >= size)
-				return start;
+	ralloc_free(ctx->g);
+}
+
+static void
+ra_block_compute_live_ranges(struct ir3_ra_ctx *ctx, struct ir3_block *block)
+{
+	struct ir3_ra_block_data *bd;
+	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+
+	bd = rzalloc(ctx->g, struct ir3_ra_block_data);
+
+	bd->def     = rzalloc_array(bd, BITSET_WORD, bitset_words);
+	bd->use     = rzalloc_array(bd, BITSET_WORD, bitset_words);
+	bd->livein  = rzalloc_array(bd, BITSET_WORD, bitset_words);
+	bd->liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+	block->bd = bd;
+
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_instruction *src;
+
+		if (instr->regs_count == 0)
+			continue;
+
+		/* There are a couple special cases to deal with here:
+		 *
+		 * fanout: used to split values from a higher class to a lower
+		 *     class, for example split the results of a texture fetch
+		 *     into individual scalar values;  We skip over these from
+		 *     a 'def' perspective, and for a 'use' we walk the chain
+		 *     up to the defining instruction.
+		 *
+		 * fanin: used to collect values from lower class and assemble
+		 *     them together into a higher class, for example arguments
+		 *     to texture sample instructions;  We consider these to be
+		 *     defined at the earliest fanin source.
+		 *
+		 * phi: used to merge values from different flow control paths
+		 *     to the same reg.  Consider defined at earliest phi src,
+		 *     and update all the other phi src's (which may come later
+		 *     in the program) as users to extend the var's live range.
+		 *
+		 * Most of this, other than phi, is completely handled in the
+		 * get_definer() helper.
+		 *
+		 * In either case, we trace the instruction back to the original
+		 * definer and consider that as the def/use ip.
+		 */
+
+		if (writes_gpr(instr)) {
+			struct ir3_instruction *defn;
+			int cls, sz, off;
+
+			defn = get_definer(instr, &sz, &off);
+			if (defn == instr) {
+				/* arrays which don't fit in one of the pre-defined class
+				 * sizes are pre-colored:
+				 */
+				cls = size_to_class(sz, is_half(defn));
+				if (cls >= 0) {
+					unsigned name = ra_name(ctx, cls, defn);
+
+					ctx->def[name] = defn->ip;
+					ctx->use[name] = defn->ip;
+
+					/* since we are in SSA at this point: */
+					debug_assert(!BITSET_TEST(bd->use, name));
+
+					BITSET_SET(bd->def, name);
+
+					if (is_half(defn)) {
+						ra_set_node_class(ctx->g, name,
+								ctx->set->half_classes[cls - class_count]);
+					} else {
+						ra_set_node_class(ctx->g, name,
+								ctx->set->classes[cls]);
+					}
+
+					/* extend the live range for phi srcs, which may come
+					 * from the bottom of the loop
+					 */
+					if (defn->regs[0]->flags & IR3_REG_PHI_SRC) {
+						struct ir3_instruction *phi = defn->regs[0]->instr;
+						foreach_ssa_src(src, phi) {
+							/* if src is after phi, then we need to extend
+							 * the liverange to the end of src's block:
+							 */
+							if (src->ip > phi->ip) {
+								struct ir3_instruction *last =
+									list_last_entry(&src->block->instr_list,
+										struct ir3_instruction, node);
+								ctx->use[name] = MAX2(ctx->use[name], last->ip);
+							}
+						}
+					}
+				}
+			}
+		}
+
+		foreach_ssa_src(src, instr) {
+			if (writes_gpr(src)) {
+				struct ir3_instruction *srcdefn;
+				int cls, sz, off;
+
+				srcdefn = get_definer(src, &sz, &off);
+				cls = size_to_class(sz, is_half(srcdefn));
+				if (cls >= 0) {
+					unsigned name = ra_name(ctx, cls, srcdefn);
+					ctx->use[name] = MAX2(ctx->use[name], instr->ip);
+					if (!BITSET_TEST(bd->def, name))
+						BITSET_SET(bd->use, name);
+				}
+			}
 		}
 	}
-	assert(0);
-	return -1;
 }
 
-static int alloc_block(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, int size)
+static bool
+ra_compute_livein_liveout(struct ir3_ra_ctx *ctx)
 {
-	struct ir3_register *dst = instr->regs[0];
-	struct ir3_instruction *n;
-	regmask_t liveregs;
-	unsigned name;
+	unsigned bitset_words = BITSET_WORDS(ctx->alloc_count);
+	bool progress = false;
 
-	/* should only ever be called w/ head of neighbor list: */
-	debug_assert(!instr->cp.left);
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		struct ir3_ra_block_data *bd = block->bd;
 
-	regmask_init(&liveregs);
+		/* update livein: */
+		for (unsigned i = 0; i < bitset_words; i++) {
+			BITSET_WORD new_livein =
+				(bd->use[i] | (bd->liveout[i] & ~bd->def[i]));
 
-	for (n = instr; n; n = n->cp.right)
-		compute_liveregs(ctx, n, &liveregs);
+			if (new_livein & ~bd->livein[i]) {
+				bd->livein[i] |= new_livein;
+				progress = true;
+			}
+		}
 
-	/* because we do assignment on fanout nodes for wrmask!=0x1, we
-	 * need to handle this special case, where the fanout nodes all
-	 * appear after one or more of the consumers of the src node:
-	 *
-	 *   0098:009: sam _, r2.x
-	 *   0028:010: mul.f r3.z, r4.x, c13.x
-	 *   ; we start assigning here for '0098:009: sam'.. but
-	 *   ; would miss the usage at '0028:010: mul.f'
-	 *   0101:009: _meta:fo _, _[0098:009: sam], off=2
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_FO))
-		compute_liveregs(ctx, instr->regs[1]->instr, &liveregs);
+		/* update liveout: */
+		for (unsigned j = 0; j < ARRAY_SIZE(block->successors); j++) {
+			struct ir3_block *succ = block->successors[j];
+			struct ir3_ra_block_data *succ_bd;
+
+			if (!succ)
+				continue;
 
-	name = find_available(&liveregs, size,
-			!!(dst->flags & IR3_REG_HALF));
+			succ_bd = succ->bd;
 
-	if (dst->flags & IR3_REG_HALF)
-		name |= REG_HALF;
+			for (unsigned i = 0; i < bitset_words; i++) {
+				BITSET_WORD new_liveout =
+					(succ_bd->livein[i] & ~bd->liveout[i]);
 
-	return name;
+				if (new_liveout) {
+					bd->liveout[i] |= new_liveout;
+					progress = true;
+				}
+			}
+		}
+	}
+
+	return progress;
 }
 
-static type_t half_type(type_t type)
+static void
+ra_add_interference(struct ir3_ra_ctx *ctx)
 {
-	switch (type) {
-	case TYPE_F32: return TYPE_F16;
-	case TYPE_U32: return TYPE_U16;
-	case TYPE_S32: return TYPE_S16;
-	/* instructions may already be fixed up: */
-	case TYPE_F16:
-	case TYPE_U16:
-	case TYPE_S16:
-		return type;
-	default:
-		assert(0);
-		return ~0;
+	struct ir3 *ir = ctx->ir;
+
+	/* compute live ranges (use/def) on a block level, also updating
+	 * block's def/use bitmasks (used below to calculate per-block
+	 * livein/liveout):
+	 */
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		ra_block_compute_live_ranges(ctx, block);
+	}
+
+	/* update per-block livein/liveout: */
+	while (ra_compute_livein_liveout(ctx)) {}
+
+	/* extend start/end ranges based on livein/liveout info from cfg: */
+	/* (livein/liveout are tested per name, ie. per bit, not per word) */
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		struct ir3_ra_block_data *bd = block->bd;
+
+		for (unsigned i = 0; i < ctx->alloc_count; i++) {
+			if (BITSET_TEST(bd->livein, i)) {
+				ctx->def[i] = MIN2(ctx->def[i], block->start_ip);
+				ctx->use[i] = MAX2(ctx->use[i], block->start_ip);
+			}
+
+			if (BITSET_TEST(bd->liveout, i)) {
+				ctx->def[i] = MIN2(ctx->def[i], block->end_ip);
+				ctx->use[i] = MAX2(ctx->use[i], block->end_ip);
+			}
+		}
+	}
+
+	/* need to fix things up to keep outputs live: */
+	for (unsigned i = 0; i < ir->noutputs; i++) {
+		struct ir3_instruction *instr = ir->outputs[i];
+		struct ir3_instruction *defn;
+		int cls, sz, off;
+
+		defn = get_definer(instr, &sz, &off);
+		cls = size_to_class(sz, is_half(defn));
+		if (cls >= 0) {
+			unsigned name = ra_name(ctx, cls, defn);
+			ctx->use[name] = ctx->instr_cnt;
+		}
+	}
+
+	for (unsigned i = 0; i < ctx->alloc_count; i++) {
+		for (unsigned j = 0; j < ctx->alloc_count; j++) {
+			if (!((ctx->def[i] >= ctx->use[j]) ||
+					(ctx->def[j] >= ctx->use[i]))) {
+				ra_add_node_interference(ctx->g, i, j);
+			}
+		}
 	}
 }
 
@@ -358,302 +776,124 @@ static void fixup_half_instr_src(struct ir3_instruction *instr)
 	}
 }
 
-static void reg_assign(struct ir3_instruction *instr,
-		unsigned r, unsigned name)
+static void
+reg_assign(struct ir3_ra_ctx *ctx, struct ir3_register *reg,
+		struct ir3_instruction *instr)
 {
-	struct ir3_register *reg = instr->regs[r];
-
-	reg->flags &= ~IR3_REG_SSA;
-	reg->num = name & ~REG_HALF;
-
-	if (name & REG_HALF) {
-		reg->flags |= IR3_REG_HALF;
-		/* if dst reg being assigned, patch up the instr: */
-		if (reg == instr->regs[0])
-			fixup_half_instr_dst(instr);
-		else
-			fixup_half_instr_src(instr);
-	}
-}
-
-static void instr_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, unsigned name);
+	struct ir3_instruction *defn;
+	int cls, sz, off;
 
-static void instr_assign_src(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, unsigned r, unsigned name)
-{
-	struct ir3_register *reg = instr->regs[r];
+	defn = get_definer(instr, &sz, &off);
+	cls = size_to_class(sz, is_half(defn));
+	if (cls >= 0) {
+		unsigned name = ra_name(ctx, cls, defn);
+		unsigned r = ra_get_node_reg(ctx->g, name);
+		unsigned num = ctx->set->ra_reg_to_gpr[r] + off;
 
-	if (reg->flags & IR3_REG_RELATIV)
-		name += reg->offset;
+		if (reg->flags & IR3_REG_RELATIV)
+			num += reg->offset;
 
-	reg_assign(instr, r, name);
+		reg->num = num;
+		reg->flags &= ~(IR3_REG_SSA | IR3_REG_PHI_SRC);
 
-	if (is_meta(instr)) {
-		switch (instr->opc) {
-		case OPC_META_INPUT:
-			/* shader-input does not have a src, only block input: */
-			debug_assert(instr->regs_count == 2);
-			instr_assign(ctx, instr, name);
-			return;
-		case OPC_META_FO:
-			instr_assign(ctx, instr, name + instr->fo.off);
-			return;
-		case OPC_META_FI:
-			instr_assign(ctx, instr, name - (r - 1));
-			return;
-		default:
-			break;
-		}
+		if (is_half(defn))
+			reg->flags |= IR3_REG_HALF;
 	}
 }
 
-static void instr_assign_srcs(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, unsigned name)
+static void
+ra_block_alloc(struct ir3_ra_ctx *ctx, struct ir3_block *block)
 {
-	struct ir3_instruction *n, *src;
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		struct ir3_register *reg;
 
-	for (n = instr->next; n && !ctx->error; n = n->next) {
-		foreach_ssa_src_n(src, i, n) {
-			unsigned r = i + 1;
-
-			/* skip address / etc (non real sources): */
-			if (r >= n->regs_count)
-				continue;
+		if (instr->regs_count == 0)
+			continue;
 
-			if (src == instr)
-				instr_assign_src(ctx, n, r, name);
+		if (writes_gpr(instr)) {
+			reg_assign(ctx, instr->regs[0], instr);
+			if (instr->regs[0]->flags & IR3_REG_HALF)
+				fixup_half_instr_dst(instr);
 		}
-	}
-}
-
-static void instr_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr, unsigned name)
-{
-	struct ir3_register *reg = instr->regs[0];
-
-	if (reg->flags & IR3_REG_RELATIV)
-		return;
-
-	/* check if already assigned: */
-	if (!(reg->flags & IR3_REG_SSA)) {
-		/* ... and if so, sanity check: */
-		ra_assert(ctx, reg->num == (name & ~REG_HALF));
-		return;
-	}
-
-	/* rename this instructions dst register: */
-	reg_assign(instr, 0, name);
-
-	/* and rename any subsequent use of result of this instr: */
-	instr_assign_srcs(ctx, instr, name);
-
-	/* To simplify the neighbor logic, and to "avoid" dealing with
-	 * instructions which write more than one output, we actually
-	 * do register assignment for instructions that produce multiple
-	 * outputs on the fanout nodes and propagate up the assignment
-	 * to the actual instruction:
-	 */
-	if (is_meta(instr) && (instr->opc == OPC_META_FO)) {
-		struct ir3_instruction *src;
 
-		debug_assert(name >= instr->fo.off);
-
-		foreach_ssa_src(src, instr)
-			instr_assign(ctx, src, name - instr->fo.off);
-	}
-}
+		foreach_src_n(reg, n, instr) {
+			struct ir3_instruction *src = reg->instr;
+			if (!src)
+				continue;
 
-/* check neighbor list to see if it is already partially (or completely)
- * assigned, in which case register block is already allocated and we
- * just need to complete the assignment:
- */
-static int check_partial_assignment(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *n;
-	int off = 0;
-
-	debug_assert(!instr->cp.left);
-
-	for (n = instr; n; n = n->cp.right) {
-		struct ir3_register *dst = n->regs[0];
-		if ((n->depth != DEPTH_UNUSED) &&
-				!(dst->flags & IR3_REG_SSA)) {
-			int name = dst->num - off;
-			debug_assert(name >= 0);
-			return name;
+			reg_assign(ctx, instr->regs[n+1], src);
+			if (instr->regs[n+1]->flags & IR3_REG_HALF)
+				fixup_half_instr_src(instr);
 		}
-		off++;
 	}
-
-	return -1;
 }
 
-/* allocate register name(s) for a list of neighboring instructions;
- * instr should point to leftmost neighbor (head of list)
- */
-static void instr_alloc_and_assign(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr)
+static int
+ra_alloc(struct ir3_ra_ctx *ctx)
 {
-	struct ir3_instruction *n;
-	struct ir3_register *dst;
-	int name;
-
-	debug_assert(!instr->cp.left);
-
-	if (instr->regs_count == 0)
-		return;
-
-	dst = instr->regs[0];
-
-	/* For indirect dst, take the register assignment from the
-	 * fanin and propagate it forward.
-	 */
-	if (dst->flags & IR3_REG_RELATIV) {
-		/* NOTE can be grouped, if for example outputs:
-		 * for now disable cp if indirect writes
-		 */
-		instr_alloc_and_assign(ctx, instr->fanin);
-
-		dst->num += instr->fanin->regs[0]->num;
-		dst->flags &= ~IR3_REG_SSA;
-
-		instr_assign_srcs(ctx, instr, instr->fanin->regs[0]->num);
-
-		return;
-	}
-
-	/* for instructions w/ fanouts, do the actual register assignment
-	 * on the group of fanout neighbor nodes and propagate the reg
-	 * name back up to the texture instruction.
-	 */
-	if (dst->wrmask != 0x1)
-		return;
-
-	name = check_partial_assignment(ctx, instr);
-
-	/* allocate register(s): */
-	if (name >= 0) {
-		/* already partially assigned, just finish the job */
-	} else if (reg_gpr(dst)) {
-		int size;
-		/* number of consecutive registers to assign: */
-		size = ir3_neighbor_count(instr);
-		if (dst->wrmask != 0x1)
-			size = MAX2(size, ffs(~dst->wrmask) - 1);
-		name = alloc_block(ctx, instr, size);
-	} else if (dst->flags & IR3_REG_ADDR) {
-		debug_assert(!instr->cp.right);
-		dst->flags &= ~IR3_REG_ADDR;
-		name = regid(REG_A0, 0) | REG_HALF;
-	} else {
-		debug_assert(!instr->cp.right);
-		/* predicate register (p0).. etc */
-		name = regid(REG_P0, 0);
-		debug_assert(dst->num == name);
-	}
-
-	ra_assert(ctx, name >= 0);
-
-	for (n = instr; n && !ctx->error; n = n->cp.right) {
-		instr_assign(ctx, n, name);
-		name++;
-	}
-}
-
-static void instr_assign_array(struct ir3_ra_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	struct ir3_instruction *src;
-	int name, aid = instr->fi.aid;
-
-	if (ctx->arrays[aid].base == ~0) {
-		int size = instr->regs_count - 1;
-		ctx->arrays[aid].base = alloc_block(ctx, instr, size);
-		ctx->arrays[aid].size = size;
-	}
-
-	name = ctx->arrays[aid].base;
-
-	foreach_ssa_src_n(src, i, instr) {
-		unsigned r = i + 1;
-
-		/* skip address / etc (non real sources): */
-		if (r >= instr->regs_count)
-			break;
-
-		instr_assign(ctx, src, name);
-		name++;
-	}
-
-}
-
-static int block_ra(struct ir3_ra_ctx *ctx, struct ir3_block *block)
-{
-	struct ir3_instruction *n;
-
 	/* frag shader inputs get pre-assigned, since we have some
 	 * constraints/unknowns about setup for some of these regs:
 	 */
-	if ((ctx->type == SHADER_FRAGMENT) && !block->parent) {
+	if (ctx->type == SHADER_FRAGMENT) {
+		struct ir3 *ir = ctx->ir;
 		unsigned i = 0, j;
-		if (ctx->frag_face && (i < block->ninputs) && block->inputs[i]) {
+		if (ctx->frag_face && (i < ir->ninputs) && ir->inputs[i]) {
+			struct ir3_instruction *instr = ir->inputs[i];
+			int cls = size_to_class(1, true);
+			unsigned name = ra_name(ctx, cls, instr);
+			unsigned reg = ctx->set->gpr_to_ra_reg[cls][0];
+
 			/* if we have frag_face, it gets hr0.x */
-			instr_assign(ctx, block->inputs[i], REG_HALF | 0);
+			ra_set_node_reg(ctx->g, name, reg);
 			i += 4;
 		}
-		for (j = 0; i < block->ninputs; i++, j++)
-			if (block->inputs[i])
-				instr_assign(ctx, block->inputs[i], j);
-	}
 
-	ra_dump_list("-------\n", block->head);
+		for (j = 0; i < ir->ninputs; i++) {
+			struct ir3_instruction *instr = ir->inputs[i];
+			if (instr) {
+				struct ir3_instruction *defn;
+				int cls, sz, off;
 
-	/* first pass, assign arrays: */
-	for (n = block->head; n && !ctx->error; n = n->next) {
-		if (is_meta(n) && (n->opc == OPC_META_FI) && n->fi.aid) {
-			debug_assert(!n->cp.left);  /* don't think this should happen */
-			ra_dump_instr("ASSIGN ARRAY: ", n);
-			instr_assign_array(ctx, n);
-			ra_dump_list("-------\n", block->head);
+				defn = get_definer(instr, &sz, &off);
+				if (defn == instr) {
+					unsigned name, reg;
+
+					cls = size_to_class(sz, is_half(defn));
+					name = ra_name(ctx, cls, defn);
+					reg = ctx->set->gpr_to_ra_reg[cls][j];
+
+					ra_set_node_reg(ctx->g, name, reg);
+					j += sz;
+				}
+			}
 		}
 	}
 
-	for (n = block->head; n && !ctx->error; n = n->next) {
-		ra_dump_instr("ASSIGN: ", n);
-		instr_alloc_and_assign(ctx, ir3_neighbor_first(n));
-		ra_dump_list("-------\n", block->head);
+	if (!ra_allocate(ctx->g))
+		return -1;
+
+	list_for_each_entry (struct ir3_block, block, &ctx->ir->block_list, node) {
+		ra_block_alloc(ctx, block);
 	}
 
-	return ctx->error ? -1 : 0;
+	return 0;
 }
 
-int ir3_block_ra(struct ir3_block *block, enum shader_t type,
+int ir3_ra(struct ir3 *ir, enum shader_t type,
 		bool frag_coord, bool frag_face)
 {
-	struct ir3_instruction *n;
 	struct ir3_ra_ctx ctx = {
-			.block = block,
+			.ir = ir,
 			.type = type,
-			.frag_coord = frag_coord,
 			.frag_face = frag_face,
+			.set = ir->compiler->set,
 	};
 	int ret;
 
-	memset(&ctx.arrays, ~0, sizeof(ctx.arrays));
-
-	/* mark dst registers w/ SSA flag so we can see which
-	 * have been assigned so far:
-	 * NOTE: we really should set SSA flag consistently on
-	 * every dst register in the frontend.
-	 */
-	for (n = block->head; n; n = n->next)
-		if (n->regs_count > 0)
-			n->regs[0]->flags |= IR3_REG_SSA;
-
-	ir3_clear_mark(block->shader);
-	ret = block_ra(&ctx, block);
+	ra_init(&ctx);
+	ra_add_interference(&ctx);
+	ret = ra_alloc(&ctx);
+	ra_destroy(&ctx);
 
 	return ret;
 }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_sched.c b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
index a790cba..49a4426 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_sched.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_sched.c
@@ -31,23 +31,14 @@
 
 #include "ir3.h"
 
-enum {
-	SCHEDULED = -1,
-	DELAYED = -2,
-};
-
 /*
  * Instruction Scheduling:
  *
- * Using the depth sorted list from depth pass, attempt to recursively
- * schedule deepest unscheduled path.  The first instruction that cannot
- * be scheduled, returns the required delay slots it needs, at which
- * point we return back up to the top and attempt to schedule by next
- * highest depth.  After a sufficient number of instructions have been
- * scheduled, return back to beginning of list and start again.  If you
- * reach the end of depth sorted list without being able to insert any
- * instruction, insert nop's.  Repeat until no more unscheduled
- * instructions.
+ * A priority-queue based scheduling algo.  Add eligible instructions,
+ * ie. ones with all their dependencies scheduled, to the priority
+ * (depth) sorted queue (list).  Pop highest priority instruction off
+ * the queue and schedule it, add newly eligible instructions to the
+ * priority queue, rinse, repeat.
  *
  * There are a few special cases that need to be handled, since sched
  * is currently independent of register allocation.  Usages of address
@@ -60,90 +51,33 @@ enum {
  */
 
 struct ir3_sched_ctx {
-	struct ir3_instruction *scheduled; /* last scheduled instr */
+	struct ir3_block *block;           /* the current block */
+	struct ir3_instruction *scheduled; /* last scheduled instr XXX remove*/
 	struct ir3_instruction *addr;      /* current a0.x user, if any */
 	struct ir3_instruction *pred;      /* current p0.x user, if any */
-	unsigned cnt;
 	bool error;
 };
 
-static struct ir3_instruction *
-deepest(struct ir3_instruction **srcs, unsigned nsrcs)
-{
-	struct ir3_instruction *d = NULL;
-	unsigned i = 0, id = 0;
-
-	while ((i < nsrcs) && !(d = srcs[id = i]))
-		i++;
-
-	if (!d)
-		return NULL;
-
-	for (; i < nsrcs; i++)
-		if (srcs[i] && (srcs[i]->depth > d->depth))
-			d = srcs[id = i];
-
-	srcs[id] = NULL;
-
-	return d;
-}
-
-static unsigned distance(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr, unsigned maxd)
-{
-	struct ir3_instruction *n = ctx->scheduled;
-	unsigned d = 0;
-	while (n && (n != instr) && (d < maxd)) {
-		if (is_alu(n) || is_flow(n))
-			d++;
-		n = n->next;
-	}
-	return d;
-}
-
-/* TODO maybe we want double linked list? */
-static struct ir3_instruction * prev(struct ir3_instruction *instr)
-{
-	struct ir3_instruction *p = instr->block->head;
-	while (p && (p->next != instr))
-		p = p->next;
-	return p;
-}
-
 static bool is_sfu_or_mem(struct ir3_instruction *instr)
 {
 	return is_sfu(instr) || is_mem(instr);
 }
 
-static void schedule(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr, bool remove)
+static void
+schedule(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
-	struct ir3_block *block = instr->block;
+	debug_assert(ctx->block == instr->block);
 
 	/* maybe there is a better way to handle this than just stuffing
 	 * a nop.. ideally we'd know about this constraint in the
 	 * scheduling and depth calculation..
 	 */
 	if (ctx->scheduled && is_sfu_or_mem(ctx->scheduled) && is_sfu_or_mem(instr))
-		schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+		ir3_NOP(ctx->block);
 
 	/* remove from depth list:
 	 */
-	if (remove) {
-		struct ir3_instruction *p = prev(instr);
-
-		/* NOTE: this can happen for inputs which are not
-		 * read.. in that case there is no need to schedule
-		 * the input, so just bail:
-		 */
-		if (instr != (p ? p->next : block->head))
-			return;
-
-		if (p)
-			p->next = instr->next;
-		else
-			block->head = instr->next;
-	}
+	list_delinit(&instr->node);
 
 	if (writes_addr(instr)) {
 		assert(ctx->addr == NULL);
@@ -157,18 +91,30 @@ static void schedule(struct ir3_sched_ctx *ctx,
 
 	instr->flags |= IR3_INSTR_MARK;
 
-	instr->next = ctx->scheduled;
+	list_addtail(&instr->node, &instr->block->instr_list);
 	ctx->scheduled = instr;
-
-	ctx->cnt++;
 }
 
-/*
- * Delay-slot calculation.  Follows fanin/fanout.
- */
+static unsigned
+distance(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr,
+		unsigned maxd)
+{
+	struct list_head *instr_list = &ctx->block->instr_list;
+	unsigned d = 0;
+
+	list_for_each_entry_rev (struct ir3_instruction, n, instr_list, node) {
+		if ((n == instr) || (d >= maxd))
+			break;
+		if (is_alu(n) || is_flow(n))
+			d++;
+	}
+
+	return d;
+}
 
 /* calculate delay for specified src: */
-static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
+static unsigned
+delay_calc_srcn(struct ir3_sched_ctx *ctx,
 		struct ir3_instruction *assigner,
 		struct ir3_instruction *consumer, unsigned srcn)
 {
@@ -177,7 +123,10 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
 	if (is_meta(assigner)) {
 		struct ir3_instruction *src;
 		foreach_ssa_src(src, assigner) {
-			unsigned d = delay_calc_srcn(ctx, src, consumer, srcn);
+			unsigned d;
+			if (src->block != assigner->block)
+				break;
+			d = delay_calc_srcn(ctx, src, consumer, srcn);
 			delay = MAX2(delay, d);
 		}
 	} else {
@@ -189,48 +138,87 @@ static unsigned delay_calc_srcn(struct ir3_sched_ctx *ctx,
 }
 
 /* calculate delay for instruction (maximum of delay for all srcs): */
-static unsigned delay_calc(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
+static unsigned
+delay_calc(struct ir3_sched_ctx *ctx, struct ir3_instruction *instr)
 {
 	unsigned delay = 0;
 	struct ir3_instruction *src;
 
 	foreach_ssa_src_n(src, i, instr) {
-		unsigned d = delay_calc_srcn(ctx, src, instr, i);
+		unsigned d;
+		if (src->block != instr->block)
+			continue;
+		d = delay_calc_srcn(ctx, src, instr, i);
 		delay = MAX2(delay, d);
 	}
 
 	return delay;
 }
 
-/* A negative return value signals that an instruction has been newly
- * SCHEDULED (or DELAYED due to address or predicate register already
- * in use), return back up to the top of the stack (to block_sched())
+struct ir3_sched_notes {
+	/* there is at least one kill which could be scheduled, except
+	 * for unscheduled bary.f's:
+	 */
+	bool blocked_kill;
+	/* there is at least one instruction that could be scheduled,
+	 * except for conflicting address/predicate register usage:
+	 */
+	bool addr_conflict, pred_conflict;
+};
+
+static bool is_scheduled(struct ir3_instruction *instr)
+{
+	return !!(instr->flags & IR3_INSTR_MARK);
+}
+
+static bool
+check_conflict(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct ir3_instruction *instr)
+{
+	/* if this is a write to address/predicate register, and that
+	 * register is currently in use, we need to defer until it is
+	 * free:
+	 */
+	if (writes_addr(instr) && ctx->addr) {
+		assert(ctx->addr != instr);
+		notes->addr_conflict = true;
+		return true;
+	}
+
+	if (writes_pred(instr) && ctx->pred) {
+		assert(ctx->pred != instr);
+		notes->pred_conflict = true;
+		return true;
+	}
+
+	return false;
+}
+
+/* is this instruction ready to be scheduled?  Return negative for not
+ * ready (updating notes if needed), or >= 0 to indicate number of
+ * delay slots needed.
  */
-static int trysched(struct ir3_sched_ctx *ctx,
+static int
+instr_eligibility(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
 		struct ir3_instruction *instr)
 {
-	struct ir3_instruction *srcs[64];
 	struct ir3_instruction *src;
-	unsigned delay, nsrcs = 0;
+	unsigned delay = 0;
 
-	/* if already scheduled: */
-	if (instr->flags & IR3_INSTR_MARK)
+	/* Phi instructions can have a dependency on something not
+	 * scheduled yet (for ex, loops).  But OTOH we don't really
+	 * care.  By definition phis should appear at the top of
+	 * the block, and their sources should be values from the
+	 * previously executing block, so they are always ready to
+	 * be scheduled:
+	 */
+	if (is_meta(instr) && (instr->opc == OPC_META_PHI))
 		return 0;
 
-	/* figure out our src's, copy 'em out into an array for sorting: */
 	foreach_ssa_src(src, instr) {
-		debug_assert(nsrcs < ARRAY_SIZE(srcs));
-		srcs[nsrcs++] = src;
-	}
-
-	/* for each src register in sorted order:
-	 */
-	delay = 0;
-	while ((src = deepest(srcs, nsrcs))) {
-		delay = trysched(ctx, src);
-		if (delay)
-			return delay;
+		/* if dependency not scheduled, we aren't ready yet: */
+		if (!is_scheduled(src))
+			return -1;
 	}
 
 	/* all our dependents are scheduled, figure out if
@@ -255,216 +243,276 @@ static int trysched(struct ir3_sched_ctx *ctx,
 	 */
 	if (is_kill(instr)) {
 		struct ir3 *ir = instr->block->shader;
-		unsigned i;
 
-		for (i = 0; i < ir->baryfs_count; i++) {
+		for (unsigned i = 0; i < ir->baryfs_count; i++) {
 			struct ir3_instruction *baryf = ir->baryfs[i];
 			if (baryf->depth == DEPTH_UNUSED)
 				continue;
-			delay = trysched(ctx, baryf);
-			if (delay)
-				return delay;
+			if (!is_scheduled(baryf)) {
+				notes->blocked_kill = true;
+				return -1;
+			}
 		}
 	}
 
-	/* if this is a write to address/predicate register, and that
-	 * register is currently in use, we need to defer until it is
-	 * free:
-	 */
-	if (writes_addr(instr) && ctx->addr) {
-		assert(ctx->addr != instr);
-		return DELAYED;
-	}
-	if (writes_pred(instr) && ctx->pred) {
-		assert(ctx->pred != instr);
-		return DELAYED;
-	}
+	if (check_conflict(ctx, notes, instr))
+		return -1;
 
-	schedule(ctx, instr, true);
-	return SCHEDULED;
+	return 0;
 }
 
-static struct ir3_instruction * reverse(struct ir3_instruction *instr)
+/* move eligible instructions to the priority list: */
+static unsigned
+add_eligible_instrs(struct ir3_sched_ctx *ctx, struct ir3_sched_notes *notes,
+		struct list_head *prio_queue, struct list_head *unscheduled_list)
 {
-	struct ir3_instruction *reversed = NULL;
-	while (instr) {
-		struct ir3_instruction *next = instr->next;
-		instr->next = reversed;
-		reversed = instr;
-		instr = next;
+	unsigned min_delay = ~0;
+
+	list_for_each_entry_safe (struct ir3_instruction, instr, unscheduled_list, node) {
+		int e = instr_eligibility(ctx, notes, instr);
+		if (e < 0)
+			continue;
+		min_delay = MIN2(min_delay, e);
+		if (e == 0) {
+			/* remove from unscheduled list and into priority queue: */
+			list_delinit(&instr->node);
+			ir3_insert_by_depth(instr, prio_queue);
+		}
 	}
-	return reversed;
-}
 
-static bool uses_current_addr(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
-{
-	return instr->address && (ctx->addr == instr->address);
+	return min_delay;
 }
 
-static bool uses_current_pred(struct ir3_sched_ctx *ctx,
-		struct ir3_instruction *instr)
+/* "spill" the address register by remapping any unscheduled
+ * instructions which depend on the current address register
+ * to a clone of the instruction which wrote the address reg.
+ */
+static void
+split_addr(struct ir3_sched_ctx *ctx)
 {
-	struct ir3_instruction *src;
-	foreach_ssa_src(src, instr)
-		if (ctx->pred == src)
-			return true;
-	return false;
+	struct ir3 *ir = ctx->addr->block->shader;
+	struct ir3_instruction *new_addr = NULL;
+	unsigned i;
+
+	debug_assert(ctx->addr);
+
+	for (i = 0; i < ir->indirects_count; i++) {
+		struct ir3_instruction *indirect = ir->indirects[i];
+
+		/* skip instructions already scheduled: */
+		if (indirect->flags & IR3_INSTR_MARK)
+			continue;
+
+		/* remap remaining instructions using current addr
+		 * to new addr:
+		 */
+		if (indirect->address == ctx->addr) {
+			if (!new_addr) {
+				new_addr = ir3_instr_clone(ctx->addr);
+				/* original addr is scheduled, but new one isn't: */
+				new_addr->flags &= ~IR3_INSTR_MARK;
+			}
+			indirect->address = new_addr;
+		}
+	}
+
+	/* all remaining indirects remapped to new addr: */
+	ctx->addr = NULL;
 }
 
-/* when we encounter an instruction that writes to the address register
- * when it is in use, we delay that instruction and try to schedule all
- * other instructions using the current address register:
+/* "spill" the predicate register by remapping any unscheduled
+ * instructions which depend on the current predicate register
+ * to a clone of the instruction which wrote the predicate reg.
  */
-static int block_sched_undelayed(struct ir3_sched_ctx *ctx,
-		struct ir3_block *block)
+static void
+split_pred(struct ir3_sched_ctx *ctx)
 {
-	struct ir3_instruction *instr = block->head;
-	bool addr_in_use = false;
-	bool pred_in_use = false;
-	bool all_delayed = true;
-	unsigned cnt = ~0, attempted = 0;
-
-	while (instr) {
-		struct ir3_instruction *next = instr->next;
-		bool addr = uses_current_addr(ctx, instr);
-		bool pred = uses_current_pred(ctx, instr);
-
-		if (addr || pred) {
-			int ret = trysched(ctx, instr);
-
-			if (ret != DELAYED)
-				all_delayed = false;
-
-			if (ret == SCHEDULED)
-				cnt = 0;
-			else if (ret > 0)
-				cnt = MIN2(cnt, ret);
-			if (addr)
-				addr_in_use = true;
-			if (pred)
-				pred_in_use = true;
-
-			attempted++;
-		}
+	struct ir3 *ir = ctx->pred->block->shader;
+	struct ir3_instruction *new_pred = NULL;
+	unsigned i;
 
-		instr = next;
-	}
+	debug_assert(ctx->pred);
 
-	if (!addr_in_use)
-		ctx->addr = NULL;
+	for (i = 0; i < ir->predicates_count; i++) {
+		struct ir3_instruction *predicated = ir->predicates[i];
 
-	if (!pred_in_use)
-		ctx->pred = NULL;
+		/* skip instructions already scheduled: */
+		if (predicated->flags & IR3_INSTR_MARK)
+			continue;
 
-	/* detect if we've gotten ourselves into an impossible situation
-	 * and bail if needed
-	 */
-	if (all_delayed && (attempted > 0)) {
-		if (pred_in_use) {
-			/* TODO we probably need to keep a list of instructions
-			 * that reference predicate, similar to indirects
-			 */
-			ctx->error = true;
-			return DELAYED;
-		}
-		if (addr_in_use) {
-			struct ir3 *ir = ctx->addr->block->shader;
-			struct ir3_instruction *new_addr =
-					ir3_instr_clone(ctx->addr);
-			unsigned i;
-
-			/* original addr is scheduled, but new one isn't: */
-			new_addr->flags &= ~IR3_INSTR_MARK;
-
-			for (i = 0; i < ir->indirects_count; i++) {
-				struct ir3_instruction *indirect = ir->indirects[i];
-
-				/* skip instructions already scheduled: */
-				if (indirect->flags & IR3_INSTR_MARK)
-					continue;
-
-				/* remap remaining instructions using current addr
-				 * to new addr:
-				 */
-				if (indirect->address == ctx->addr)
-					indirect->address = new_addr;
+		/* remap remaining instructions using current pred
+		 * to new pred:
+		 *
+		 * TODO is there ever a case when pred isn't first
+		 * (and only) src?
+		 */
+		if (ssa(predicated->regs[1]) == ctx->pred) {
+			if (!new_pred) {
+				new_pred = ir3_instr_clone(ctx->pred);
+				/* original pred is scheduled, but new one isn't: */
+				new_pred->flags &= ~IR3_INSTR_MARK;
 			}
-
-			/* all remaining indirects remapped to new addr: */
-			ctx->addr = NULL;
-
-			/* not really, but this will trigger us to go back to
-			 * main trysched() loop now that we've resolved the
-			 * conflict by duplicating the instr that writes to
-			 * the address register.
-			 */
-			return SCHEDULED;
+			predicated->regs[1]->instr = new_pred;
 		}
 	}
 
-	return cnt;
+	/* all remaining predicated remapped to new pred: */
+	ctx->pred = NULL;
 }
 
-static void block_sched(struct ir3_sched_ctx *ctx, struct ir3_block *block)
+static void
+sched_block(struct ir3_sched_ctx *ctx, struct ir3_block *block)
 {
-	struct ir3_instruction *instr;
+	struct list_head unscheduled_list, prio_queue;
 
-	/* schedule all the shader input's (meta-instr) first so that
-	 * the RA step sees that the input registers contain a value
-	 * from the start of the shader:
+	ctx->block = block;
+
+	/* move all instructions to the unscheduled list, and
+	 * empty the block's instruction list (to which we will
+	 * be inserting):
 	 */
-	if (!block->parent) {
-		unsigned i;
-		for (i = 0; i < block->ninputs; i++) {
-			struct ir3_instruction *in = block->inputs[i];
-			if (in)
-				schedule(ctx, in, true);
+	list_replace(&block->instr_list, &unscheduled_list);
+	list_inithead(&block->instr_list);
+	list_inithead(&prio_queue);
+
+	/* first a pre-pass to schedule all meta:input/phi instructions
+	 * (which need to appear first so that RA knows the register is
+	 * occupied):
+	 */
+	list_for_each_entry_safe (struct ir3_instruction, instr, &unscheduled_list, node) {
+		if (is_meta(instr) && ((instr->opc == OPC_META_INPUT) ||
+				(instr->opc == OPC_META_PHI)))
+			schedule(ctx, instr);
+	}
+
+	while (!(list_empty(&unscheduled_list) &&
+			list_empty(&prio_queue))) {
+		struct ir3_sched_notes notes = {0};
+		unsigned delay;
+
+		delay = add_eligible_instrs(ctx, &notes, &prio_queue, &unscheduled_list);
+
+		if (!list_empty(&prio_queue)) {
+			struct ir3_instruction *instr = list_last_entry(&prio_queue,
+					struct ir3_instruction, node);
+			/* ugg, this is a bit ugly, but between the time when
+			 * the instruction became eligible and now, a new
+			 * conflict may have arisen..
+			 */
+			if (check_conflict(ctx, &notes, instr)) {
+				list_del(&instr->node);
+				list_addtail(&instr->node, &unscheduled_list);
+				continue;
+			}
+
+			schedule(ctx, instr);
+		} else if (delay == ~0) {
+			/* nothing available to schedule.. if we are blocked on
+			 * address/predicate register conflict, then break the
+			 * deadlock by cloning the instruction that wrote that
+			 * reg:
+			 */
+			if (notes.addr_conflict) {
+				split_addr(ctx);
+			} else if (notes.pred_conflict) {
+				split_pred(ctx);
+			} else {
+				debug_assert(0);
+				ctx->error = true;
+				return;
+			}
+		} else {
+			/* and if we run out of instructions that can be scheduled,
+			 * then it is time for nop's:
+			 */
+			debug_assert(delay <= 6);
+			while (delay > 0) {
+				ir3_NOP(block);
+				delay--;
+			}
 		}
 	}
 
-	while ((instr = block->head) && !ctx->error) {
-		/* NOTE: always grab next *before* trysched(), in case the
-		 * instruction is actually scheduled (and therefore moved
-		 * from depth list into scheduled list)
-		 */
-		struct ir3_instruction *next = instr->next;
-		int cnt = trysched(ctx, instr);
+	/* And lastly, insert branch/jump instructions to take us to
+	 * the next block.  Later we'll strip back out the branches
+	 * that simply jump to next instruction.
+	 */
+	if (block->successors[1]) {
+		/* if/else, conditional branches to "then" or "else": */
+		struct ir3_instruction *br;
+		unsigned delay = 6;
 
-		if (cnt == DELAYED)
-			cnt = block_sched_undelayed(ctx, block);
+		debug_assert(ctx->pred);
+		debug_assert(block->condition);
 
-		/* -1 is signal to return up stack, but to us means same as 0: */
-		cnt = MAX2(0, cnt);
-		cnt += ctx->cnt;
-		instr = next;
+		delay -= distance(ctx, ctx->pred, delay);
 
-		/* if deepest remaining instruction cannot be scheduled, try
-		 * the increasingly more shallow instructions until needed
-		 * number of delay slots is filled:
-		 */
-		while (instr && (cnt > ctx->cnt)) {
-			next = instr->next;
-			trysched(ctx, instr);
-			instr = next;
+		while (delay > 0) {
+			ir3_NOP(block);
+			delay--;
 		}
 
-		/* and if we run out of instructions that can be scheduled,
-		 * then it is time for nop's:
+		/* create "else" branch first (since "then" block should
+		 * frequently/always end up being a fall-thru):
+		 */
+		br = ir3_BR(block);
+		br->cat0.inv = true;
+		br->cat0.target = block->successors[1];
+
+		/* NOTE: we have to hard code delay of 6 above, since
+		 * we want to insert the nop's before constructing the
+		 * branch.  Throw in an assert so we notice if this
+		 * ever breaks on future generation:
 		 */
-		while (cnt > ctx->cnt)
-			schedule(ctx, ir3_instr_create(block, 0, OPC_NOP), false);
+		debug_assert(ir3_delayslots(ctx->pred, br, 0) == 6);
+
+		br = ir3_BR(block);
+		br->cat0.target = block->successors[0];
+
+	} else if (block->successors[0]) {
+		/* otherwise unconditional jump to next block: */
+		struct ir3_instruction *jmp;
+
+		jmp = ir3_JUMP(block);
+		jmp->cat0.target = block->successors[0];
 	}
 
-	/* at this point, scheduled list is in reverse order, so fix that: */
-	block->head = reverse(ctx->scheduled);
+	/* NOTE: if we kept track of the predecessors, we could do a better
+	 * job w/ (jp) flags.. every node w/ > 1 predecessor is a join point.
+	 * Note that as we eliminate blocks which contain only an unconditional
+	 * jump we probably need to propagate (jp) flag..
+	 */
 }
 
-int ir3_block_sched(struct ir3_block *block)
+/* this is needed to ensure later RA stage succeeds: */
+static void
+sched_insert_parallel_copies(struct ir3_block *block)
+{
+	list_for_each_entry (struct ir3_instruction, instr, &block->instr_list, node) {
+		if (is_meta(instr) && (instr->opc == OPC_META_PHI)) {
+			struct ir3_register *reg;
+			foreach_src(reg, instr) {
+				struct ir3_instruction *src = reg->instr;
+				struct ir3_instruction *mov =
+					ir3_MOV(src->block, src, TYPE_U32);
+				mov->regs[0]->flags |= IR3_REG_PHI_SRC;
+				mov->regs[0]->instr = instr;
+				reg->instr = mov;
+			}
+		}
+	}
+}
+
+int ir3_sched(struct ir3 *ir)
 {
 	struct ir3_sched_ctx ctx = {0};
-	ir3_clear_mark(block->shader);
-	block_sched(&ctx, block);
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		sched_insert_parallel_copies(block);
+	}
+	ir3_clear_mark(ir);
+	list_for_each_entry (struct ir3_block, block, &ir->block_list, node) {
+		sched_block(&ctx, block);
+	}
 	if (ctx.error)
 		return -1;
 	return 0;
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index 9bf4e64..b5b0381 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -127,7 +127,7 @@ static void
 assemble_variant(struct ir3_shader_variant *v)
 {
 	struct fd_context *ctx = fd_context(v->shader->pctx);
-	uint32_t gpu_id = ir3_shader_gpuid(v->shader);
+	uint32_t gpu_id = v->shader->compiler->gpu_id;
 	uint32_t sz, *bin;
 
 	bin = ir3_shader_assemble(v, gpu_id);
@@ -146,17 +146,6 @@ assemble_variant(struct ir3_shader_variant *v)
 	v->ir = NULL;
 }
 
-/* reset before attempting to compile again.. */
-static void reset_variant(struct ir3_shader_variant *v, const char *msg)
-{
-	debug_error(msg);
-	v->inputs_count = 0;
-	v->outputs_count = 0;
-	v->total_in = 0;
-	v->has_samp = false;
-	v->immediates_count = 0;
-}
-
 static struct ir3_shader_variant *
 create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 {
@@ -177,22 +166,7 @@ create_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 		tgsi_dump(tokens, 0);
 	}
 
-	if (fd_mesa_debug & FD_DBG_NIR) {
-		ret = ir3_compile_shader_nir(v, tokens, key);
-		if (ret)
-			reset_variant(v, "NIR compiler failed, fallback to TGSI!");
-	} else {
-		ret = -1;
-	}
-
-	if (ret) {
-		ret = ir3_compile_shader(v, tokens, key, true);
-		if (ret) {
-			reset_variant(v, "new compiler failed, trying without copy propagation!");
-			ret = ir3_compile_shader(v, tokens, key, false);
-		}
-	}
-
+	ret = ir3_compile_shader_nir(shader->compiler, v, tokens, key);
 	if (ret) {
 		debug_error("compile failed!");
 		goto fail;
@@ -217,13 +191,6 @@ fail:
 	return NULL;
 }
 
-uint32_t
-ir3_shader_gpuid(struct ir3_shader *shader)
-{
-	struct fd_context *ctx = fd_context(shader->pctx);
-	return ctx->screen->gpu_id;
-}
-
 struct ir3_shader_variant *
 ir3_shader_variant(struct ir3_shader *shader, struct ir3_shader_key key)
 {
@@ -286,6 +253,7 @@ ir3_shader_create(struct pipe_context *pctx, const struct tgsi_token *tokens,
 		enum shader_t type)
 {
 	struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader);
+	shader->compiler = fd_context(pctx)->screen->compiler;
 	shader->pctx = pctx;
 	shader->type = type;
 	shader->tokens = tgsi_dup_tokens(tokens);
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index e5410bf..9f1b076 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -86,10 +86,6 @@ struct ir3_shader_key {
 	 * shader:
 	 */
 	uint16_t fsaturate_s, fsaturate_t, fsaturate_r;
-
-	/* bitmask of sampler which produces integer outputs:
-	 */
-	uint16_t vinteger_s, finteger_s;
 };
 
 static inline bool
@@ -196,6 +192,8 @@ struct ir3_shader_variant {
 struct ir3_shader {
 	enum shader_t type;
 
+	struct ir3_compiler *compiler;
+
 	struct pipe_context *pctx;
 	const struct tgsi_token *tokens;
 
@@ -212,7 +210,6 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id);
 struct ir3_shader * ir3_shader_create(struct pipe_context *pctx,
 		const struct tgsi_token *tokens, enum shader_t type);
 void ir3_shader_destroy(struct ir3_shader *shader);
-uint32_t ir3_shader_gpuid(struct ir3_shader *shader);
 struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key);
 
@@ -220,6 +217,8 @@ struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
  * Helper/util:
  */
 
+#include "pipe/p_shader_tokens.h"
+
 static inline int
 ir3_find_output(const struct ir3_shader_variant *so, ir3_semantic semantic)
 {
diff --git a/src/gallium/drivers/i915/i915_fpc_optimize.c b/src/gallium/drivers/i915/i915_fpc_optimize.c
index e0134a7..83bb649 100644
--- a/src/gallium/drivers/i915/i915_fpc_optimize.c
+++ b/src/gallium/drivers/i915/i915_fpc_optimize.c
@@ -552,7 +552,7 @@ static boolean i915_fpc_useless_mov(union tgsi_full_token *tgsi_current)
    if ( current.Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION  &&
         current.FullInstruction.Instruction.Opcode == TGSI_OPCODE_MOV &&
         op_has_dst(current.FullInstruction.Instruction.Opcode) &&
-        current.FullInstruction.Instruction.Saturate == TGSI_SAT_NONE &&
+        !current.FullInstruction.Instruction.Saturate &&
         current.FullInstruction.Src[0].Register.Absolute == 0 &&
         current.FullInstruction.Src[0].Register.Negate == 0 &&
         is_unswizzled(&current.FullInstruction.Src[0], current.FullInstruction.Dst[0].Register.WriteMask) &&
@@ -582,7 +582,7 @@ static void i915_fpc_optimize_useless_mov_after_inst(struct i915_optimize_contex
         next->Token.Type == TGSI_TOKEN_TYPE_INSTRUCTION  &&
         next->FullInstruction.Instruction.Opcode == TGSI_OPCODE_MOV &&
         op_has_dst(current->FullInstruction.Instruction.Opcode) &&
-        next->FullInstruction.Instruction.Saturate == TGSI_SAT_NONE &&
+        !next->FullInstruction.Instruction.Saturate &&
         next->FullInstruction.Src[0].Register.Absolute == 0 &&
         next->FullInstruction.Src[0].Register.Negate == 0 &&
         unused_from(ctx, &current->FullInstruction.Dst[0], index) &&
diff --git a/src/gallium/drivers/i915/i915_fpc_translate.c b/src/gallium/drivers/i915/i915_fpc_translate.c
index b74f823..38a3388 100644
--- a/src/gallium/drivers/i915/i915_fpc_translate.c
+++ b/src/gallium/drivers/i915/i915_fpc_translate.c
@@ -329,7 +329,7 @@ get_result_flags(const struct i915_full_instruction *inst)
       = inst->Dst[0].Register.WriteMask;
    uint flags = 0x0;
 
-   if (inst->Instruction.Saturate == TGSI_SAT_ZERO_ONE)
+   if (inst->Instruction.Saturate)
       flags |= A0_DEST_SATURATE;
 
    if (writeMask & TGSI_WRITEMASK_X)
diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c
index 7216160..0590da0 100644
--- a/src/gallium/drivers/i915/i915_screen.c
+++ b/src/gallium/drivers/i915/i915_screen.c
@@ -165,6 +165,7 @@ i915_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe_sha
       case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+      case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
       default:
          debug_printf("%s: Unknown cap %u.\n", __FUNCTION__, cap);
@@ -241,6 +242,7 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap)
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
       return 0;
 
    case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
diff --git a/src/gallium/drivers/ilo/Makefile.sources b/src/gallium/drivers/ilo/Makefile.sources
index 91a6f65..e1bbb9a 100644
--- a/src/gallium/drivers/ilo/Makefile.sources
+++ b/src/gallium/drivers/ilo/Makefile.sources
@@ -15,14 +15,34 @@ C_SOURCES := \
 	core/ilo_debug.h \
 	core/ilo_dev.c \
 	core/ilo_dev.h \
-	core/ilo_format.c \
-	core/ilo_format.h \
-	core/ilo_fence.h \
 	core/ilo_image.c \
 	core/ilo_image.h \
-	core/ilo_state_3d.h \
-	core/ilo_state_3d_bottom.c \
-	core/ilo_state_3d_top.c \
+	core/ilo_state_cc.c \
+	core/ilo_state_cc.h \
+	core/ilo_state_compute.c \
+	core/ilo_state_compute.h \
+	core/ilo_state_raster.c \
+	core/ilo_state_raster.h \
+	core/ilo_state_sampler.c \
+	core/ilo_state_sampler.h \
+	core/ilo_state_sbe.c \
+	core/ilo_state_sbe.h \
+	core/ilo_state_shader.c \
+	core/ilo_state_shader_ps.c \
+	core/ilo_state_shader.h \
+	core/ilo_state_sol.c \
+	core/ilo_state_sol.h \
+	core/ilo_state_surface.c \
+	core/ilo_state_surface_format.c \
+	core/ilo_state_surface.h \
+	core/ilo_state_urb.c \
+	core/ilo_state_urb.h \
+	core/ilo_state_vf.c \
+	core/ilo_state_vf.h \
+	core/ilo_state_viewport.c \
+	core/ilo_state_viewport.h \
+	core/ilo_state_zs.c \
+	core/ilo_state_zs.h \
 	core/intel_winsys.h \
 	ilo_blit.c \
 	ilo_blit.h \
@@ -38,6 +58,8 @@ C_SOURCES := \
 	ilo_cp.h \
 	ilo_draw.c \
 	ilo_draw.h \
+	ilo_format.c \
+	ilo_format.h \
 	ilo_gpgpu.c \
 	ilo_gpgpu.h \
 	ilo_public.h \
diff --git a/src/gallium/drivers/ilo/core/ilo_buffer.h b/src/gallium/drivers/ilo/core/ilo_buffer.h
index 50f97d1..ca3c61f 100644
--- a/src/gallium/drivers/ilo/core/ilo_buffer.h
+++ b/src/gallium/drivers/ilo/core/ilo_buffer.h
@@ -31,11 +31,13 @@
 #include "intel_winsys.h"
 
 #include "ilo_core.h"
+#include "ilo_debug.h"
 #include "ilo_dev.h"
 
 struct ilo_buffer {
    unsigned bo_size;
 
+   /* managed by users */
    struct intel_bo *bo;
 };
 
@@ -43,6 +45,8 @@ static inline void
 ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev,
                 unsigned size, uint32_t bind, uint32_t flags)
 {
+   assert(ilo_is_zeroed(buf, sizeof(*buf)));
+
    buf->bo_size = size;
 
    /*
@@ -55,36 +59,6 @@ ilo_buffer_init(struct ilo_buffer *buf, const struct ilo_dev *dev,
     */
    if (bind & PIPE_BIND_SAMPLER_VIEW)
       buf->bo_size = align(buf->bo_size, 256) + 16;
-
-   if ((bind & PIPE_BIND_VERTEX_BUFFER) && ilo_dev_gen(dev) < ILO_GEN(7.5)) {
-      /*
-       * As noted in ilo_format_translate(), we treat some 3-component formats
-       * as 4-component formats to work around hardware limitations.  Imagine
-       * the case where the vertex buffer holds a single
-       * PIPE_FORMAT_R16G16B16_FLOAT vertex, and buf->bo_size is 6.  The
-       * hardware would fail to fetch it at boundary check because the vertex
-       * buffer is expected to hold a PIPE_FORMAT_R16G16B16A16_FLOAT vertex
-       * and that takes at least 8 bytes.
-       *
-       * For the workaround to work, we should add 2 to the bo size.  But that
-       * would waste a page when the bo size is already page aligned.  Let's
-       * round it to page size for now and revisit this when needed.
-       */
-      buf->bo_size = align(buf->bo_size, 4096);
-   }
-}
-
-static inline void
-ilo_buffer_cleanup(struct ilo_buffer *buf)
-{
-   intel_bo_unref(buf->bo);
-}
-
-static inline void
-ilo_buffer_set_bo(struct ilo_buffer *buf, struct intel_bo *bo)
-{
-   intel_bo_unref(buf->bo);
-   buf->bo = intel_bo_ref(bo);
 }
 
 #endif /* ILO_BUFFER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_builder.c b/src/gallium/drivers/ilo/core/ilo_builder.c
index 3c5eef9..4e05a3a 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder.c
+++ b/src/gallium/drivers/ilo/core/ilo_builder.c
@@ -333,7 +333,7 @@ ilo_builder_init(struct ilo_builder *builder,
 {
    int i;
 
-   memset(builder, 0, sizeof(*builder));
+   assert(ilo_is_zeroed(builder, sizeof(*builder)));
 
    builder->dev = dev;
    builder->winsys = winsys;
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d.h b/src/gallium/drivers/ilo/core/ilo_builder_3d.h
index 6cf1732..fb8b53c 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d.h
@@ -35,45 +35,45 @@
 #include "ilo_builder_3d_top.h"
 #include "ilo_builder_3d_bottom.h"
 
+struct gen6_3dprimitive_info {
+   enum gen_3dprim_type topology;
+   bool indexed;
+
+   uint32_t vertex_count;
+   uint32_t vertex_start;
+   uint32_t instance_count;
+   uint32_t instance_start;
+   int32_t vertex_base;
+};
+
 static inline void
 gen6_3DPRIMITIVE(struct ilo_builder *builder,
-                 const struct pipe_draw_info *info,
-                 const struct ilo_ib_state *ib)
+                 const struct gen6_3dprimitive_info *info)
 {
    const uint8_t cmd_len = 6;
-   const int prim = gen6_3d_translate_pipe_prim(info->mode);
-   const int vb_access = (info->indexed) ?
-      GEN6_3DPRIM_DW0_ACCESS_RANDOM : GEN6_3DPRIM_DW0_ACCESS_SEQUENTIAL;
-   const uint32_t vb_start = info->start +
-      ((info->indexed) ? ib->draw_start_offset : 0);
    uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 6, 6);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   dw[0] = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) |
-           vb_access |
-           prim << GEN6_3DPRIM_DW0_TYPE__SHIFT |
-           (cmd_len - 2);
-   dw[1] = info->count;
-   dw[2] = vb_start;
+   dw[0] = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) | (cmd_len - 2) |
+           info->topology << GEN6_3DPRIM_DW0_TYPE__SHIFT;
+   if (info->indexed)
+      dw[0] |= GEN6_3DPRIM_DW0_ACCESS_RANDOM;
+
+   dw[1] = info->vertex_count;
+   dw[2] = info->vertex_start;
    dw[3] = info->instance_count;
-   dw[4] = info->start_instance;
-   dw[5] = info->index_bias;
+   dw[4] = info->instance_start;
+   dw[5] = info->vertex_base;
 }
 
 static inline void
 gen7_3DPRIMITIVE(struct ilo_builder *builder,
-                 const struct pipe_draw_info *info,
-                 const struct ilo_ib_state *ib)
+                 const struct gen6_3dprimitive_info *info)
 {
    const uint8_t cmd_len = 7;
-   const int prim = gen6_3d_translate_pipe_prim(info->mode);
-   const int vb_access = (info->indexed) ?
-      GEN7_3DPRIM_DW1_ACCESS_RANDOM : GEN7_3DPRIM_DW1_ACCESS_SEQUENTIAL;
-   const uint32_t vb_start = info->start +
-      ((info->indexed) ? ib->draw_start_offset : 0);
    uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 7, 8);
@@ -81,12 +81,16 @@ gen7_3DPRIMITIVE(struct ilo_builder *builder,
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DPRIMITIVE) | (cmd_len - 2);
-   dw[1] = vb_access | prim;
-   dw[2] = info->count;
-   dw[3] = vb_start;
+
+   dw[1] = info->topology << GEN7_3DPRIM_DW1_TYPE__SHIFT;
+   if (info->indexed)
+      dw[1] |= GEN7_3DPRIM_DW1_ACCESS_RANDOM;
+
+   dw[2] = info->vertex_count;
+   dw[3] = info->vertex_start;
    dw[4] = info->instance_count;
-   dw[5] = info->start_instance;
-   dw[6] = info->index_bias;
+   dw[5] = info->instance_start;
+   dw[6] = info->vertex_base;
 }
 
 #endif /* ILO_BUILDER_3D_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
index 16ec4af..6d9e369 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_bottom.h
@@ -29,335 +29,121 @@
 #define ILO_BUILDER_3D_BOTTOM_H
 
 #include "genhw/genhw.h"
-#include "../ilo_shader.h"
 #include "intel_winsys.h"
 
 #include "ilo_core.h"
 #include "ilo_dev.h"
-#include "ilo_format.h"
+#include "ilo_state_cc.h"
+#include "ilo_state_raster.h"
+#include "ilo_state_sbe.h"
+#include "ilo_state_shader.h"
+#include "ilo_state_viewport.h"
+#include "ilo_state_zs.h"
 #include "ilo_builder.h"
 #include "ilo_builder_3d_top.h"
 
 static inline void
 gen6_3DSTATE_CLIP(struct ilo_builder *builder,
-                  const struct ilo_rasterizer_state *rasterizer,
-                  const struct ilo_shader_state *fs,
-                  bool enable_guardband,
-                  int num_viewports)
-{
-   const uint8_t cmd_len = 4;
-   uint32_t dw1, dw2, dw3, *dw;
-   int interps;
-
-   ILO_DEV_ASSERT(builder->dev, 6, 8);
-
-   dw1 = rasterizer->clip.payload[0];
-   dw2 = rasterizer->clip.payload[1];
-   dw3 = rasterizer->clip.payload[2];
-
-   if (enable_guardband && rasterizer->clip.can_enable_guardband)
-      dw2 |= GEN6_CLIP_DW2_GB_TEST_ENABLE;
-
-   interps = (fs) ?  ilo_shader_get_kernel_param(fs,
-         ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS) : 0;
-
-   if (interps & (GEN6_INTERP_NONPERSPECTIVE_PIXEL |
-                  GEN6_INTERP_NONPERSPECTIVE_CENTROID |
-                  GEN6_INTERP_NONPERSPECTIVE_SAMPLE))
-      dw2 |= GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE;
-
-   dw3 |= GEN6_CLIP_DW3_RTAINDEX_FORCED_ZERO |
-          (num_viewports - 1);
-
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
-   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CLIP) | (cmd_len - 2);
-   dw[1] = dw1;
-   dw[2] = dw2;
-   dw[3] = dw3;
-}
-
-static inline void
-gen6_disable_3DSTATE_CLIP(struct ilo_builder *builder)
+                  const struct ilo_state_raster *rs)
 {
    const uint8_t cmd_len = 4;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 6, 7.5);
+   ILO_DEV_ASSERT(builder->dev, 6, 8);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_CLIP) | (cmd_len - 2);
-   dw[1] = 0;
-   dw[2] = 0;
-   dw[3] = 0;
-}
-
-static inline void
-gen7_internal_3dstate_sf(struct ilo_builder *builder,
-                         uint8_t cmd_len, uint32_t *dw,
-                         const struct ilo_rasterizer_sf *sf,
-                         int num_samples)
-{
-   ILO_DEV_ASSERT(builder->dev, 6, 7.5);
-
-   assert(cmd_len == 7);
-
-   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2);
-
-   if (!sf) {
-      dw[1] = 0;
-      dw[2] = (num_samples > 1) ? GEN7_SF_DW2_MSRASTMODE_ON_PATTERN : 0;
-      dw[3] = 0;
-      dw[4] = 0;
-      dw[5] = 0;
-      dw[6] = 0;
-
-      return;
-   }
-
-   /* see rasterizer_init_sf_gen6() */
-   STATIC_ASSERT(Elements(sf->payload) >= 3);
-   dw[1] = sf->payload[0];
-   dw[2] = sf->payload[1];
-   dw[3] = sf->payload[2];
-
-   if (num_samples > 1)
-      dw[2] |= sf->dw_msaa;
-
-   dw[4] = sf->dw_depth_offset_const;
-   dw[5] = sf->dw_depth_offset_scale;
-   dw[6] = sf->dw_depth_offset_clamp;
-}
-
-static inline void
-gen8_internal_3dstate_sbe(struct ilo_builder *builder,
-                          uint8_t cmd_len, uint32_t *dw,
-                          const struct ilo_shader_state *fs,
-                          int sprite_coord_mode)
-{
-   const struct ilo_kernel_routing *routing;
-   int vue_offset, vue_len, out_count;
-
-   ILO_DEV_ASSERT(builder->dev, 6, 8);
-
-   assert(cmd_len == 4);
-
-   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2);
-
-   if (!fs) {
-      dw[1] = 1 << GEN7_SBE_DW1_URB_READ_LEN__SHIFT;
-      dw[2] = 0;
-      dw[3] = 0;
-      return;
-   }
-
-   routing = ilo_shader_get_kernel_routing(fs);
-
-   vue_offset = routing->source_skip;
-   assert(vue_offset % 2 == 0);
-   vue_offset /= 2;
-
-   vue_len = (routing->source_len + 1) / 2;
-   if (!vue_len)
-      vue_len = 1;
-
-   out_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT);
-   assert(out_count <= 32);
-
-   dw[1] = out_count << GEN7_SBE_DW1_ATTR_COUNT__SHIFT |
-           vue_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT;
-
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-      dw[1] |= GEN8_SBE_DW1_USE_URB_READ_LEN |
-               GEN8_SBE_DW1_USE_URB_READ_OFFSET |
-               vue_offset << GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT;
-   } else {
-      dw[1] |= vue_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT;
-   }
-
-   if (routing->swizzle_enable)
-      dw[1] |= GEN7_SBE_DW1_ATTR_SWIZZLE_ENABLE;
-
-   switch (sprite_coord_mode) {
-   case PIPE_SPRITE_COORD_UPPER_LEFT:
-      dw[1] |= GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_UPPERLEFT;
-      break;
-   case PIPE_SPRITE_COORD_LOWER_LEFT:
-      dw[1] |= GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_LOWERLEFT;
-      break;
-   }
-
-   /*
-    * From the Ivy Bridge PRM, volume 2 part 1, page 268:
-    *
-    *     "This field (Point Sprite Texture Coordinate Enable) must be
-    *      programmed to 0 when non-point primitives are rendered."
-    *
-    * TODO We do not check that yet.
-    */
-   dw[2] = routing->point_sprite_enable;
-
-   dw[3] = routing->const_interp_enable;
-}
-
-static inline void
-gen8_internal_3dstate_sbe_swiz(struct ilo_builder *builder,
-                               uint8_t cmd_len, uint32_t *dw,
-                               const struct ilo_shader_state *fs)
-{
-   const struct ilo_kernel_routing *routing;
-
-   ILO_DEV_ASSERT(builder->dev, 6, 8);
-
-   assert(cmd_len == 11);
-
-   dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_SBE_SWIZ) | (cmd_len - 2);
-
-   if (!fs) {
-      memset(&dw[1], 0, sizeof(*dw) * (cmd_len - 1));
-      return;
-   }
-
-   routing = ilo_shader_get_kernel_routing(fs);
-
-   STATIC_ASSERT(sizeof(routing->swizzles) >= sizeof(*dw) * 8);
-   memcpy(&dw[1], routing->swizzles, sizeof(*dw) * 8);
-
-   /* WrapShortest enables */
-   dw[9] = 0;
-   dw[10] = 0;
+   /* see raster_set_gen6_3DSTATE_CLIP() */
+   dw[1] = rs->clip[0];
+   dw[2] = rs->clip[1];
+   dw[3] = rs->clip[2];
 }
 
 static inline void
 gen6_3DSTATE_SF(struct ilo_builder *builder,
-                const struct ilo_rasterizer_state *rasterizer,
-                const struct ilo_shader_state *fs,
-                int sample_count)
+                const struct ilo_state_raster *rs,
+                const struct ilo_state_sbe *sbe)
 {
    const uint8_t cmd_len = 20;
-   uint32_t gen8_3dstate_sbe[4], gen8_3dstate_sbe_swiz[11];
-   uint32_t gen7_3dstate_sf[7];
-   const struct ilo_rasterizer_sf *sf;
-   int sprite_coord_mode;
    uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 6, 6);
 
-   sf = (rasterizer) ? &rasterizer->sf : NULL;
-   sprite_coord_mode = (rasterizer) ? rasterizer->state.sprite_coord_mode : 0;
-
-   gen8_internal_3dstate_sbe(builder, Elements(gen8_3dstate_sbe),
-         gen8_3dstate_sbe, fs, sprite_coord_mode);
-   gen8_internal_3dstate_sbe_swiz(builder, Elements(gen8_3dstate_sbe_swiz),
-         gen8_3dstate_sbe_swiz, fs);
-   gen7_internal_3dstate_sf(builder, Elements(gen7_3dstate_sf),
-         gen7_3dstate_sf, sf, sample_count);
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2);
-   dw[1] = gen8_3dstate_sbe[1];
-   memcpy(&dw[2], &gen7_3dstate_sf[1], sizeof(*dw) * 6);
-   memcpy(&dw[8], &gen8_3dstate_sbe_swiz[1], sizeof(*dw) * 8);
-   dw[16] = gen8_3dstate_sbe[2];
-   dw[17] = gen8_3dstate_sbe[3];
-   dw[18] = gen8_3dstate_sbe_swiz[9];
-   dw[19] = gen8_3dstate_sbe_swiz[10];
+   /* see sbe_set_gen8_3DSTATE_SBE() */
+   dw[1] = sbe->sbe[0];
+
+   /* see raster_set_gen7_3DSTATE_SF() */
+   dw[2] = rs->sf[0];
+   dw[3] = rs->sf[1];
+   dw[4] = rs->sf[2];
+   dw[5] = rs->raster[1];
+   dw[6] = rs->raster[2];
+   dw[7] = rs->raster[3];
+
+   /* see sbe_set_gen8_3DSTATE_SBE_SWIZ() */
+   memcpy(&dw[8], sbe->swiz, sizeof(*dw) * 8);
+
+   dw[16] = sbe->sbe[1];
+   dw[17] = sbe->sbe[2];
+   /* WrapShortest enables */
+   dw[18] = 0;
+   dw[19] = 0;
 }
 
 static inline void
 gen7_3DSTATE_SF(struct ilo_builder *builder,
-                const struct ilo_rasterizer_sf *sf,
-                enum pipe_format zs_format,
-                int sample_count)
+                const struct ilo_state_raster *rs)
 {
-   const uint8_t cmd_len = 7;
+   const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 4 : 7;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
-
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
-   gen7_internal_3dstate_sf(builder, cmd_len, dw, sf, sample_count);
-
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) {
-      int hw_format;
-
-      /* separate stencil */
-      switch (zs_format) {
-      case PIPE_FORMAT_Z16_UNORM:
-         hw_format = GEN6_ZFORMAT_D16_UNORM;
-         break;
-      case PIPE_FORMAT_Z32_FLOAT:
-      case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-         hw_format = GEN6_ZFORMAT_D32_FLOAT;
-         break;
-      case PIPE_FORMAT_Z24X8_UNORM:
-      case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-         hw_format = GEN6_ZFORMAT_D24_UNORM_X8_UINT;
-         break;
-      default:
-         /* FLOAT surface is assumed when there is no depth buffer */
-         hw_format = GEN6_ZFORMAT_D32_FLOAT;
-         break;
-      }
-
-      dw[1] |= hw_format << GEN7_SF_DW1_DEPTH_FORMAT__SHIFT;
-   }
-}
-
-static inline void
-gen8_3DSTATE_SF(struct ilo_builder *builder,
-                const struct ilo_rasterizer_sf *sf)
-{
-   const uint8_t cmd_len = 4;
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(builder->dev, 8, 8);
+   ILO_DEV_ASSERT(builder->dev, 7, 8);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SF) | (cmd_len - 2);
 
-   /* see rasterizer_init_sf_gen8() */
-   STATIC_ASSERT(Elements(sf->payload) >= 3);
-   dw[1] = sf->payload[0];
-   dw[2] = sf->payload[1];
-   dw[3] = sf->payload[2];
+   /* see raster_set_gen7_3DSTATE_SF() or raster_set_gen8_3DSTATE_SF() */
+   dw[1] = rs->sf[0];
+   dw[2] = rs->sf[1];
+   dw[3] = rs->sf[2];
+   if (ilo_dev_gen(builder->dev) < ILO_GEN(8)) {
+      dw[4] = rs->raster[1];
+      dw[5] = rs->raster[2];
+      dw[6] = rs->raster[3];
+   }
 }
 
 static inline void
 gen7_3DSTATE_SBE(struct ilo_builder *builder,
-                 const struct ilo_shader_state *fs,
-                 int sprite_coord_mode)
+                 const struct ilo_state_sbe *sbe)
 {
    const uint8_t cmd_len = 14;
-   uint32_t gen8_3dstate_sbe[4], gen8_3dstate_sbe_swiz[11];
    uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   gen8_internal_3dstate_sbe(builder, Elements(gen8_3dstate_sbe),
-         gen8_3dstate_sbe, fs, sprite_coord_mode);
-   gen8_internal_3dstate_sbe_swiz(builder, Elements(gen8_3dstate_sbe_swiz),
-         gen8_3dstate_sbe_swiz, fs);
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2);
-   dw[1] = gen8_3dstate_sbe[1];
-   memcpy(&dw[2], &gen8_3dstate_sbe_swiz[1], sizeof(*dw) * 8);
-   dw[10] = gen8_3dstate_sbe[2];
-   dw[11] = gen8_3dstate_sbe[3];
-   dw[12] = gen8_3dstate_sbe_swiz[9];
-   dw[13] = gen8_3dstate_sbe_swiz[10];
+   /* see sbe_set_gen8_3DSTATE_SBE() and sbe_set_gen8_3DSTATE_SBE_SWIZ() */
+   dw[1] = sbe->sbe[0];
+   memcpy(&dw[2], sbe->swiz, sizeof(*dw) * 8);
+   dw[10] = sbe->sbe[1];
+   dw[11] = sbe->sbe[2];
+
+   /* WrapShortest enables */
+   dw[12] = 0;
+   dw[13] = 0;
 }
 
 static inline void
 gen8_3DSTATE_SBE(struct ilo_builder *builder,
-                 const struct ilo_shader_state *fs,
-                 int sprite_coord_mode)
+                 const struct ilo_state_sbe *sbe)
 {
    const uint8_t cmd_len = 4;
    uint32_t *dw;
@@ -366,12 +152,16 @@ gen8_3DSTATE_SBE(struct ilo_builder *builder,
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   gen8_internal_3dstate_sbe(builder, cmd_len, dw, fs, sprite_coord_mode);
+   /* see sbe_set_gen8_3DSTATE_SBE() */
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SBE) | (cmd_len - 2);
+   dw[1] = sbe->sbe[0];
+   dw[2] = sbe->sbe[1];
+   dw[3] = sbe->sbe[2];
 }
 
 static inline void
 gen8_3DSTATE_SBE_SWIZ(struct ilo_builder *builder,
-                      const struct ilo_shader_state *fs)
+                      const struct ilo_state_sbe *sbe)
 {
    const uint8_t cmd_len = 11;
    uint32_t *dw;
@@ -380,12 +170,17 @@ gen8_3DSTATE_SBE_SWIZ(struct ilo_builder *builder,
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   gen8_internal_3dstate_sbe_swiz(builder, cmd_len, dw, fs);
+   dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_SBE_SWIZ) | (cmd_len - 2);
+   /* see sbe_set_gen8_3DSTATE_SBE_SWIZ() */
+   memcpy(&dw[1], sbe->swiz, sizeof(*dw) * 8);
+   /* WrapShortest enables */
+   dw[9] = 0;
+   dw[10] = 0;
 }
 
 static inline void
 gen8_3DSTATE_RASTER(struct ilo_builder *builder,
-                    const struct ilo_rasterizer_sf *sf)
+                    const struct ilo_state_raster *rs)
 {
    const uint8_t cmd_len = 5;
    uint32_t *dw;
@@ -395,232 +190,108 @@ gen8_3DSTATE_RASTER(struct ilo_builder *builder,
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_RASTER) | (cmd_len - 2);
-   dw[1] = sf->dw_raster;
-   dw[2] = sf->dw_depth_offset_const;
-   dw[3] = sf->dw_depth_offset_scale;
-   dw[4] = sf->dw_depth_offset_clamp;
+   /* see raster_set_gen8_3DSTATE_RASTER() */
+   dw[1] = rs->raster[0];
+   dw[2] = rs->raster[1];
+   dw[3] = rs->raster[2];
+   dw[4] = rs->raster[3];
 }
 
 static inline void
 gen6_3DSTATE_WM(struct ilo_builder *builder,
-                const struct ilo_shader_state *fs,
-                const struct ilo_rasterizer_state *rasterizer,
-                bool dual_blend, bool cc_may_kill)
+                const struct ilo_state_raster *rs,
+                const struct ilo_state_ps *ps,
+                uint32_t kernel_offset)
 {
    const uint8_t cmd_len = 9;
-   const int num_samples = 1;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw2, dw4, dw5, dw6, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 6, 6);
 
-   cso = ilo_shader_get_kernel_cso(fs);
-   dw2 = cso->payload[0];
-   dw4 = cso->payload[1];
-   dw5 = cso->payload[2];
-   dw6 = cso->payload[3];
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 248:
-    *
-    *     "This bit (Statistics Enable) must be disabled if either of these
-    *      bits is set: Depth Buffer Clear , Hierarchical Depth Buffer Resolve
-    *      Enable or Depth Buffer Resolve Enable."
-    */
-   dw4 |= GEN6_WM_DW4_STATISTICS;
-
-   if (cc_may_kill)
-      dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL | GEN6_WM_DW5_PS_DISPATCH_ENABLE;
-
-   if (dual_blend)
-      dw5 |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
-
-   dw5 |= rasterizer->wm.payload[0];
-
-   dw6 |= rasterizer->wm.payload[1];
-
-   if (num_samples > 1) {
-      dw6 |= rasterizer->wm.dw_msaa_rast |
-             rasterizer->wm.dw_msaa_disp;
-   }
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
-   dw[1] = ilo_shader_get_kernel_offset(fs);
-   dw[2] = dw2;
-   dw[3] = 0; /* scratch */
-   dw[4] = dw4;
-   dw[5] = dw5;
-   dw[6] = dw6;
+   dw[1] = kernel_offset;
+   /* see raster_set_gen6_3dstate_wm() and ps_set_gen6_3dstate_wm() */
+   dw[2] = ps->ps[0];
+   dw[3] = ps->ps[1];
+   dw[4] = rs->wm[0] | ps->ps[2];
+   dw[5] = rs->wm[1] | ps->ps[3];
+   dw[6] = rs->wm[2] | ps->ps[4];
    dw[7] = 0; /* kernel 1 */
    dw[8] = 0; /* kernel 2 */
 }
 
 static inline void
-gen6_hiz_3DSTATE_WM(struct ilo_builder *builder, uint32_t hiz_op)
-{
-   const uint8_t cmd_len = 9;
-   const int max_threads = (builder->dev->gt == 2) ? 80 : 40;
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(builder->dev, 6, 6);
-
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
-   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
-   dw[1] = 0;
-   dw[2] = 0;
-   dw[3] = 0;
-   dw[4] = hiz_op;
-   /* honor the valid range even if dispatching is disabled */
-   dw[5] = (max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT;
-   dw[6] = 0;
-   dw[7] = 0;
-   dw[8] = 0;
-}
-
-static inline void
 gen7_3DSTATE_WM(struct ilo_builder *builder,
-                const struct ilo_shader_state *fs,
-                const struct ilo_rasterizer_state *rasterizer,
-                bool cc_may_kill)
+                const struct ilo_state_raster *rs,
+                const struct ilo_state_ps *ps)
 {
    const uint8_t cmd_len = 3;
-   const int num_samples = 1;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw1, dw2, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   /* see rasterizer_init_wm_gen7() */
-   dw1 = rasterizer->wm.payload[0];
-   dw2 = rasterizer->wm.payload[1];
-
-   /* see fs_init_cso_gen7() */
-   cso = ilo_shader_get_kernel_cso(fs);
-   dw1 |= cso->payload[3];
-
-   dw1 |= GEN7_WM_DW1_STATISTICS;
-
-   if (cc_may_kill)
-      dw1 |= GEN7_WM_DW1_PS_DISPATCH_ENABLE | GEN7_WM_DW1_PS_KILL_PIXEL;
-
-   if (num_samples > 1) {
-      dw1 |= rasterizer->wm.dw_msaa_rast;
-      dw2 |= rasterizer->wm.dw_msaa_disp;
-   }
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
-   dw[1] = dw1;
-   dw[2] = dw2;
+   /* see raster_set_gen8_3DSTATE_WM() and ps_set_gen7_3dstate_wm() */
+   dw[1] = rs->wm[0] | ps->ps[0];
+   dw[2] = ps->ps[1];
 }
 
 static inline void
 gen8_3DSTATE_WM(struct ilo_builder *builder,
-                const struct ilo_shader_state *fs,
-                const struct ilo_rasterizer_state *rasterizer)
+                const struct ilo_state_raster *rs)
 {
    const uint8_t cmd_len = 2;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw1, interps, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   /* see rasterizer_get_wm_gen8() */
-   dw1 = rasterizer->wm.payload[0];
-   dw1 |= GEN7_WM_DW1_STATISTICS;
-
-   /* see fs_init_cso_gen8() */
-   cso = ilo_shader_get_kernel_cso(fs);
-   interps = cso->payload[4];
-
-   assert(!(dw1 & interps));
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
-   dw[1] = dw1 | interps;
-}
-
-static inline void
-gen7_hiz_3DSTATE_WM(struct ilo_builder *builder, uint32_t hiz_op)
-{
-   const uint8_t cmd_len = 3;
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
-
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
-   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_WM) | (cmd_len - 2);
-   dw[1] = hiz_op;
-   dw[2] = 0;
+   /* see raster_set_gen8_3DSTATE_WM() */
+   dw[1] = rs->wm[0];
 }
 
 static inline void
 gen8_3DSTATE_WM_DEPTH_STENCIL(struct ilo_builder *builder,
-                              const struct ilo_dsa_state *dsa)
+                              const struct ilo_state_cc *cc)
 {
    const uint8_t cmd_len = 3;
-   uint32_t dw1, dw2, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   dw1 = dsa->payload[0];
-   dw2 = dsa->payload[1];
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_WM_DEPTH_STENCIL) | (cmd_len - 2);
-   dw[1] = dw1;
-   dw[2] = dw2;
+   /* see cc_set_gen8_3DSTATE_WM_DEPTH_STENCIL() */
+   dw[1] = cc->ds[0];
+   dw[2] = cc->ds[1];
 }
 
 static inline void
-gen8_3DSTATE_WM_HZ_OP(struct ilo_builder *builder, uint32_t op,
-                      uint16_t width, uint16_t height, int sample_count)
+gen8_3DSTATE_WM_HZ_OP(struct ilo_builder *builder,
+                      const struct ilo_state_raster *rs,
+                      uint16_t width, uint16_t height)
 {
    const uint8_t cmd_len = 5;
-   const uint32_t sample_mask = ((1 << sample_count) - 1) | 0x1;
-   uint32_t dw1, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   dw1 = op;
-
-   switch (sample_count) {
-   case 0:
-   case 1:
-      dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_1;
-      break;
-   case 2:
-      dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_2;
-      break;
-   case 4:
-      dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_4;
-      break;
-   case 8:
-      dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_8;
-      break;
-   case 16:
-      dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_16;
-      break;
-   default:
-      assert(!"unsupported sample count");
-      dw1 |= GEN8_WM_HZ_DW1_NUMSAMPLES_1;
-      break;
-   }
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_WM_HZ_OP) | (cmd_len - 2);
-   dw[1] = dw1;
+   /* see raster_set_gen8_3dstate_wm_hz_op() */
+   dw[1] = rs->wm[1];
    dw[2] = 0;
-   /* exclusive? */
+   /* exclusive */
    dw[3] = height << 16 | width;
-   dw[4] = sample_mask;
+   dw[4] = rs->wm[2];
 }
 
 static inline void
@@ -656,100 +327,48 @@ gen8_3DSTATE_WM_CHROMAKEY(struct ilo_builder *builder)
 
 static inline void
 gen7_3DSTATE_PS(struct ilo_builder *builder,
-                const struct ilo_shader_state *fs,
-                bool dual_blend)
+                const struct ilo_state_ps *ps,
+                uint32_t kernel_offset)
 {
    const uint8_t cmd_len = 8;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw2, dw4, dw5, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   /* see fs_init_cso_gen7() */
-   cso = ilo_shader_get_kernel_cso(fs);
-   dw2 = cso->payload[0];
-   dw4 = cso->payload[1];
-   dw5 = cso->payload[2];
-
-   if (dual_blend)
-      dw4 |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
-   dw[1] = ilo_shader_get_kernel_offset(fs);
-   dw[2] = dw2;
-   dw[3] = 0; /* scratch */
-   dw[4] = dw4;
-   dw[5] = dw5;
+   dw[1] = kernel_offset;
+   /* see ps_set_gen7_3DSTATE_PS() */
+   dw[2] = ps->ps[2];
+   dw[3] = ps->ps[3];
+   dw[4] = ps->ps[4];
+   dw[5] = ps->ps[5];
    dw[6] = 0; /* kernel 1 */
    dw[7] = 0; /* kernel 2 */
 }
 
 static inline void
-gen7_disable_3DSTATE_PS(struct ilo_builder *builder)
-{
-   const uint8_t cmd_len = 8;
-   int max_threads;
-   uint32_t dw4, *dw;
-
-   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
-
-   /* GPU hangs if none of the dispatch enable bits is set */
-   dw4 = GEN6_PS_DISPATCH_8 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
-
-   /* see brwCreateContext() */
-   switch (ilo_dev_gen(builder->dev)) {
-   case ILO_GEN(7.5):
-      max_threads = (builder->dev->gt == 3) ? 408 :
-                    (builder->dev->gt == 2) ? 204 : 102;
-      dw4 |= (max_threads - 1) << GEN75_PS_DW4_MAX_THREADS__SHIFT;
-      break;
-   case ILO_GEN(7):
-   default:
-      max_threads = (builder->dev->gt == 2) ? 172 : 48;
-      dw4 |= (max_threads - 1) << GEN7_PS_DW4_MAX_THREADS__SHIFT;
-      break;
-   }
-
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
-   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
-   dw[1] = 0;
-   dw[2] = 0;
-   dw[3] = 0;
-   dw[4] = dw4;
-   dw[5] = 0;
-   dw[6] = 0;
-   dw[7] = 0;
-}
-
-static inline void
 gen8_3DSTATE_PS(struct ilo_builder *builder,
-                const struct ilo_shader_state *fs)
+                const struct ilo_state_ps *ps,
+                uint32_t kernel_offset)
 {
    const uint8_t cmd_len = 12;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw3, dw6, dw7, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   /* see fs_init_cso_gen8() */
-   cso = ilo_shader_get_kernel_cso(fs);
-   dw3 = cso->payload[0];
-   dw6 = cso->payload[1];
-   dw7 = cso->payload[2];
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PS) | (cmd_len - 2);
-   dw[1] = ilo_shader_get_kernel_offset(fs);
+   dw[1] = kernel_offset;
    dw[2] = 0;
-   dw[3] = dw3;
-   dw[4] = 0; /* scratch */
+   /* see ps_set_gen8_3DSTATE_PS() */
+   dw[3] = ps->ps[0];
+   dw[4] = ps->ps[1];
    dw[5] = 0;
-   dw[6] = dw6;
-   dw[7] = dw7;
+   dw[6] = ps->ps[2];
+   dw[7] = ps->ps[3];
    dw[8] = 0; /* kernel 1 */
    dw[9] = 0;
    dw[10] = 0; /* kernel 2 */
@@ -758,66 +377,34 @@ gen8_3DSTATE_PS(struct ilo_builder *builder,
 
 static inline void
 gen8_3DSTATE_PS_EXTRA(struct ilo_builder *builder,
-                      const struct ilo_shader_state *fs,
-                      bool cc_may_kill, bool per_sample)
+                      const struct ilo_state_ps *ps)
 {
    const uint8_t cmd_len = 2;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw1, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   /* see fs_init_cso_gen8() */
-   cso = ilo_shader_get_kernel_cso(fs);
-   dw1 = cso->payload[3];
-
-   if (cc_may_kill)
-      dw1 |= GEN8_PSX_DW1_DISPATCH_ENABLE | GEN8_PSX_DW1_KILL_PIXEL;
-   if (per_sample)
-      dw1 |= GEN8_PSX_DW1_PER_SAMPLE;
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_PS_EXTRA) | (cmd_len - 2);
-   dw[1] = dw1;
+   /* see ps_set_gen8_3DSTATE_PS_EXTRA() */
+   dw[1] = ps->ps[4];
 }
 
 static inline void
 gen8_3DSTATE_PS_BLEND(struct ilo_builder *builder,
-                      const struct ilo_blend_state *blend,
-                      const struct ilo_fb_state *fb,
-                      const struct ilo_dsa_state *dsa)
+                      const struct ilo_state_cc *cc)
 {
    const uint8_t cmd_len = 2;
-   uint32_t dw1, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   dw1 = 0;
-   if (blend->alpha_to_coverage && fb->num_samples > 1)
-      dw1 |= GEN8_PS_BLEND_DW1_ALPHA_TO_COVERAGE;
-
-   if (fb->state.nr_cbufs && fb->state.cbufs[0]) {
-      const struct ilo_fb_blend_caps *caps = &fb->blend_caps[0];
-
-      dw1 |= GEN8_PS_BLEND_DW1_WRITABLE_RT;
-      if (caps->can_blend) {
-         if (caps->dst_alpha_forced_one)
-            dw1 |= blend->dw_ps_blend_dst_alpha_forced_one;
-         else
-            dw1 |= blend->dw_ps_blend;
-      }
-
-      if (caps->can_alpha_test)
-         dw1 |= dsa->dw_ps_blend_alpha;
-   } else {
-      dw1 |= dsa->dw_ps_blend_alpha;
-   }
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_PS_BLEND) | (cmd_len - 2);
-   dw[1] = dw1;
+   /* see cc_set_gen8_3DSTATE_PS_BLEND() */
+   dw[1] = cc->blend[0];
 }
 
 static inline void
@@ -862,101 +449,49 @@ gen7_3DSTATE_SAMPLER_STATE_POINTERS_PS(struct ilo_builder *builder,
 
 static inline void
 gen6_3DSTATE_MULTISAMPLE(struct ilo_builder *builder,
-                         int num_samples, const uint32_t *pattern,
-                         bool pixel_location_center)
+                         const struct ilo_state_raster *rs,
+                         const struct ilo_state_sample_pattern *pattern,
+                         uint8_t sample_count)
 {
    const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ? 4 : 3;
-   uint32_t dw1, dw2, dw3, *dw;
+   const uint32_t *packed = (const uint32_t *)
+      ilo_state_sample_pattern_get_packed_offsets(pattern,
+            builder->dev, sample_count);
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 6, 7.5);
 
-   dw1 = (pixel_location_center) ? GEN6_MULTISAMPLE_DW1_PIXLOC_CENTER :
-      GEN6_MULTISAMPLE_DW1_PIXLOC_UL_CORNER;
-
-   switch (num_samples) {
-   case 0:
-   case 1:
-      dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1;
-      dw2 = 0;
-      dw3 = 0;
-      break;
-   case 4:
-      dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4;
-      dw2 = pattern[0];
-      dw3 = 0;
-      break;
-   case 8:
-      assert(ilo_dev_gen(builder->dev) >= ILO_GEN(7));
-      dw1 |= GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8;
-      dw2 = pattern[0];
-      dw3 = pattern[1];
-      break;
-   default:
-      assert(!"unsupported sample count");
-      dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1;
-      dw2 = 0;
-      dw3 = 0;
-      break;
-   }
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_MULTISAMPLE) | (cmd_len - 2);
-   dw[1] = dw1;
-   dw[2] = dw2;
+   /* see raster_set_gen8_3DSTATE_MULTISAMPLE() */
+   dw[1] = rs->sample[0];
+
+   /* see sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN() */
+   dw[2] = (sample_count >= 4) ? packed[0] : 0;
    if (ilo_dev_gen(builder->dev) >= ILO_GEN(7))
-      dw[3] = dw3;
+      dw[3] = (sample_count >= 8) ? packed[1] : 0;
 }
 
 static inline void
 gen8_3DSTATE_MULTISAMPLE(struct ilo_builder *builder,
-                         int num_samples,
-                         bool pixel_location_center)
+                         const struct ilo_state_raster *rs)
 {
    const uint8_t cmd_len = 2;
-   uint32_t dw1, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   dw1 = (pixel_location_center) ? GEN6_MULTISAMPLE_DW1_PIXLOC_CENTER :
-      GEN6_MULTISAMPLE_DW1_PIXLOC_UL_CORNER;
-
-   switch (num_samples) {
-   case 0:
-   case 1:
-      dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1;
-      break;
-   case 2:
-      dw1 |= GEN8_MULTISAMPLE_DW1_NUMSAMPLES_2;
-      break;
-   case 4:
-      dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4;
-      break;
-   case 8:
-      dw1 |= GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8;
-      break;
-   case 16:
-      dw1 |= GEN8_MULTISAMPLE_DW1_NUMSAMPLES_16;
-      break;
-   default:
-      assert(!"unsupported sample count");
-      dw1 |= GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1;
-      break;
-   }
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_MULTISAMPLE) | (cmd_len - 2);
-   dw[1] = dw1;
+   /* see raster_set_gen8_3DSTATE_MULTISAMPLE() */
+   dw[1] = rs->sample[0];
 }
 
 static inline void
 gen8_3DSTATE_SAMPLE_PATTERN(struct ilo_builder *builder,
-                            const uint32_t *pattern_1x,
-                            const uint32_t *pattern_2x,
-                            const uint32_t *pattern_4x,
-                            const uint32_t *pattern_8x,
-                            const uint32_t *pattern_16x)
+                            const struct ilo_state_sample_pattern *pattern)
 {
    const uint8_t cmd_len = 9;
    uint32_t *dw;
@@ -966,61 +501,32 @@ gen8_3DSTATE_SAMPLE_PATTERN(struct ilo_builder *builder,
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_SAMPLE_PATTERN) | (cmd_len - 2);
-   dw[1] = pattern_16x[3];
-   dw[2] = pattern_16x[2];
-   dw[3] = pattern_16x[1];
-   dw[4] = pattern_16x[0];
-   dw[5] = pattern_8x[1];
-   dw[6] = pattern_8x[0];
-   dw[7] = pattern_4x[0];
-   dw[8] = pattern_1x[0] << 16 |
-           pattern_2x[0];
+   dw[1] = 0;
+   dw[2] = 0;
+   dw[3] = 0;
+   dw[4] = 0;
+   /* see sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN() */
+   dw[5] = ((const uint32_t *) pattern->pattern_8x)[1];
+   dw[6] = ((const uint32_t *) pattern->pattern_8x)[0];
+   dw[7] = ((const uint32_t *) pattern->pattern_4x)[0];
+   dw[8] = pattern->pattern_1x[0] << 16 |
+           ((const uint16_t *) pattern->pattern_2x)[0];
 }
 
 static inline void
 gen6_3DSTATE_SAMPLE_MASK(struct ilo_builder *builder,
-                         unsigned sample_mask)
+                         const struct ilo_state_raster *rs)
 {
    const uint8_t cmd_len = 2;
-   const unsigned valid_mask = 0xf;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 6, 6);
-
-   sample_mask &= valid_mask;
-
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
-   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SAMPLE_MASK) | (cmd_len - 2);
-   dw[1] = sample_mask;
-}
-
-static inline void
-gen7_3DSTATE_SAMPLE_MASK(struct ilo_builder *builder,
-                         unsigned sample_mask,
-                         int num_samples)
-{
-   const uint8_t cmd_len = 2;
-   const unsigned valid_mask = ((1 << num_samples) - 1) | 0x1;
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(builder->dev, 7, 8);
-
-   /*
-    * From the Ivy Bridge PRM, volume 2 part 1, page 294:
-    *
-    *     "If Number of Multisamples is NUMSAMPLES_1, bits 7:1 of this field
-    *      (Sample Mask) must be zero.
-    *
-    *      If Number of Multisamples is NUMSAMPLES_4, bits 7:4 of this field
-    *      must be zero."
-    */
-   sample_mask &= valid_mask;
+   ILO_DEV_ASSERT(builder->dev, 6, 8);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_SAMPLE_MASK) | (cmd_len - 2);
-   dw[1] = sample_mask;
+   /* see raster_set_gen6_3DSTATE_SAMPLE_MASK() */
+   dw[1] = rs->sample[1];
 }
 
 static inline void
@@ -1070,95 +576,75 @@ gen6_3DSTATE_DRAWING_RECTANGLE(struct ilo_builder *builder,
 
 static inline void
 gen6_3DSTATE_POLY_STIPPLE_OFFSET(struct ilo_builder *builder,
-                                 int x_offset, int y_offset)
+                                 const struct ilo_state_poly_stipple *stipple)
 {
    const uint8_t cmd_len = 2;
    uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
-   assert(x_offset >= 0 && x_offset <= 31);
-   assert(y_offset >= 0 && y_offset <= 31);
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_POLY_STIPPLE_OFFSET) | (cmd_len - 2);
-   dw[1] = x_offset << 8 | y_offset;
+   /* constant */
+   dw[1] = 0;
 }
 
 static inline void
 gen6_3DSTATE_POLY_STIPPLE_PATTERN(struct ilo_builder *builder,
-                                  const struct pipe_poly_stipple *pattern)
+                                  const struct ilo_state_poly_stipple *stipple)
 {
    const uint8_t cmd_len = 33;
    uint32_t *dw;
-   int i;
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_POLY_STIPPLE_PATTERN) | (cmd_len - 2);
-   dw++;
-
-   STATIC_ASSERT(Elements(pattern->stipple) == 32);
-   for (i = 0; i < 32; i++)
-      dw[i] = pattern->stipple[i];
+   /* see poly_stipple_set_gen6_3DSTATE_POLY_STIPPLE_PATTERN() */
+   memcpy(&dw[1], stipple->stipple, sizeof(stipple->stipple));
 }
 
 static inline void
 gen6_3DSTATE_LINE_STIPPLE(struct ilo_builder *builder,
-                          unsigned pattern, unsigned factor)
+                          const struct ilo_state_line_stipple *stipple)
 {
    const uint8_t cmd_len = 3;
-   unsigned inverse;
    uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
-   assert((pattern & 0xffff) == pattern);
-   assert(factor >= 1 && factor <= 256);
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_LINE_STIPPLE) | (cmd_len - 2);
-   dw[1] = pattern;
-
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) {
-      /* in U1.16 */
-      inverse = 65536 / factor;
-
-      dw[2] = inverse << GEN7_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
-              factor;
-   }
-   else {
-      /* in U1.13 */
-      inverse = 8192 / factor;
-
-      dw[2] = inverse << GEN6_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
-              factor;
-   }
+   /* see line_stipple_set_gen6_3DSTATE_LINE_STIPPLE() */
+   dw[1] = stipple->stipple[0];
+   dw[2] = stipple->stipple[1];
 }
 
 static inline void
-gen6_3DSTATE_AA_LINE_PARAMETERS(struct ilo_builder *builder)
+gen6_3DSTATE_AA_LINE_PARAMETERS(struct ilo_builder *builder,
+                                const struct ilo_state_raster *rs)
 {
    const uint8_t cmd_len = 3;
-   const uint32_t dw[3] = {
-      GEN6_RENDER_CMD(3D, 3DSTATE_AA_LINE_PARAMETERS) | (cmd_len - 2),
-      0 << GEN6_AA_LINE_DW1_BIAS__SHIFT | 0,
-      0 << GEN6_AA_LINE_DW2_CAP_BIAS__SHIFT | 0,
-   };
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
-   ilo_builder_batch_write(builder, cmd_len, dw);
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_AA_LINE_PARAMETERS) | (cmd_len - 2);
+   /* constant */
+   dw[1] = 0 << GEN6_AA_LINE_DW1_BIAS__SHIFT |
+           0 << GEN6_AA_LINE_DW1_SLOPE__SHIFT;
+   dw[2] = 0 << GEN6_AA_LINE_DW2_CAP_BIAS__SHIFT |
+           0 << GEN6_AA_LINE_DW2_CAP_SLOPE__SHIFT;
 }
 
 static inline void
 gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder,
-                          const struct ilo_zs_surface *zs,
-                          bool aligned_8x4)
+                          const struct ilo_state_zs *zs)
 {
    const uint32_t cmd = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ?
       GEN7_RENDER_CMD(3D, 3DSTATE_DEPTH_BUFFER) :
@@ -1172,44 +658,49 @@ gen6_3DSTATE_DEPTH_BUFFER(struct ilo_builder *builder,
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = cmd | (cmd_len - 2);
-   dw[1] = zs->payload[0];
-   dw[2] = 0;
 
-   /* see ilo_gpe_init_zs_surface() */
+   /*
+    * see zs_set_gen6_3DSTATE_DEPTH_BUFFER() and
+    * zs_set_gen7_3DSTATE_DEPTH_BUFFER()
+    */
    if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
+      dw[1] = zs->depth[0];
+      dw[2] = 0;
       dw[3] = 0;
-      dw[4] = (aligned_8x4) ? zs->dw_aligned_8x4 : zs->payload[2];
-      dw[5] = zs->payload[3];
-      dw[6] = zs->payload[4];
-      dw[7] = zs->payload[5];
+      dw[4] = zs->depth[2];
+      dw[5] = zs->depth[3];
+      dw[6] = 0;
+      dw[7] = zs->depth[4];
 
       dw[5] |= builder->mocs << GEN8_DEPTH_DW5_MOCS__SHIFT;
 
-      if (zs->bo) {
-         ilo_builder_batch_reloc64(builder, pos + 2, zs->bo,
-               zs->payload[1], INTEL_RELOC_WRITE);
+      if (zs->depth_bo) {
+         ilo_builder_batch_reloc64(builder, pos + 2, zs->depth_bo,
+               zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
-      dw[3] = (aligned_8x4) ? zs->dw_aligned_8x4 : zs->payload[2];
-      dw[4] = zs->payload[3];
-      dw[5] = zs->payload[4];
-      dw[6] = zs->payload[5];
+      dw[1] = zs->depth[0];
+      dw[2] = 0;
+      dw[3] = zs->depth[2];
+      dw[4] = zs->depth[3];
+      dw[5] = 0;
+      dw[6] = zs->depth[4];
 
       if (ilo_dev_gen(builder->dev) >= ILO_GEN(7))
          dw[4] |= builder->mocs << GEN7_DEPTH_DW4_MOCS__SHIFT;
       else
          dw[6] |= builder->mocs << GEN6_DEPTH_DW6_MOCS__SHIFT;
 
-      if (zs->bo) {
-         ilo_builder_batch_reloc(builder, pos + 2, zs->bo,
-               zs->payload[1], INTEL_RELOC_WRITE);
+      if (zs->depth_bo) {
+         ilo_builder_batch_reloc(builder, pos + 2, zs->depth_bo,
+               zs->depth[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 }
 
 static inline void
 gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder,
-                            const struct ilo_zs_surface *zs)
+                            const struct ilo_state_zs *zs)
 {
    const uint32_t cmd = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ?
       GEN7_RENDER_CMD(3D, 3DSTATE_STENCIL_BUFFER) :
@@ -1223,33 +714,36 @@ gen6_3DSTATE_STENCIL_BUFFER(struct ilo_builder *builder,
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = cmd | (cmd_len - 2);
-   /* see ilo_gpe_init_zs_surface() */
-   dw[1] = zs->payload[6];
-   dw[2] = 0;
 
+   /* see zs_set_gen6_3DSTATE_STENCIL_BUFFER() */
    if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-      dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT;
-
+      dw[1] = zs->stencil[0];
+      dw[2] = 0;
       dw[3] = 0;
-      dw[4] = zs->payload[8];
+      dw[4] = zs->stencil[2];
 
-      if (zs->separate_s8_bo) {
-         ilo_builder_batch_reloc64(builder, pos + 2,
-               zs->separate_s8_bo, zs->payload[7], INTEL_RELOC_WRITE);
+      dw[1] |= builder->mocs << GEN8_STENCIL_DW1_MOCS__SHIFT;
+
+      if (zs->stencil_bo) {
+         ilo_builder_batch_reloc64(builder, pos + 2, zs->stencil_bo,
+               zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
+      dw[1] = zs->stencil[0];
+      dw[2] = 0;
+
       dw[1] |= builder->mocs << GEN6_STENCIL_DW1_MOCS__SHIFT;
 
-      if (zs->separate_s8_bo) {
-         ilo_builder_batch_reloc(builder, pos + 2,
-               zs->separate_s8_bo, zs->payload[7], INTEL_RELOC_WRITE);
+      if (zs->stencil_bo) {
+         ilo_builder_batch_reloc(builder, pos + 2, zs->stencil_bo,
+               zs->stencil[1], (zs->s_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 }
 
 static inline void
 gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder,
-                               const struct ilo_zs_surface *zs)
+                               const struct ilo_state_zs *zs)
 {
    const uint32_t cmd = (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) ?
       GEN7_RENDER_CMD(3D, 3DSTATE_HIER_DEPTH_BUFFER) :
@@ -1263,26 +757,29 @@ gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_builder *builder,
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = cmd | (cmd_len - 2);
-   /* see ilo_gpe_init_zs_surface() */
-   dw[1] = zs->payload[9];
-   dw[2] = 0;
 
+   /* see zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER() */
    if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-      dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT;
-
+      dw[1] = zs->hiz[0];
+      dw[2] = 0;
       dw[3] = 0;
-      dw[4] = zs->payload[11];
+      dw[4] = zs->hiz[2];
+
+      dw[1] |= builder->mocs << GEN8_HIZ_DW1_MOCS__SHIFT;
 
       if (zs->hiz_bo) {
-         ilo_builder_batch_reloc64(builder, pos + 2,
-               zs->hiz_bo, zs->payload[10], INTEL_RELOC_WRITE);
+         ilo_builder_batch_reloc64(builder, pos + 2, zs->hiz_bo,
+               zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
+      dw[1] = zs->hiz[0];
+      dw[2] = 0;
+
       dw[1] |= builder->mocs << GEN6_HIZ_DW1_MOCS__SHIFT;
 
       if (zs->hiz_bo) {
-         ilo_builder_batch_reloc(builder, pos + 2,
-               zs->hiz_bo, zs->payload[10], INTEL_RELOC_WRITE);
+         ilo_builder_batch_reloc(builder, pos + 2, zs->hiz_bo,
+               zs->hiz[1], (zs->z_readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 }
@@ -1440,34 +937,24 @@ gen7_3DSTATE_BLEND_STATE_POINTERS(struct ilo_builder *builder,
 
 static inline uint32_t
 gen6_CLIP_VIEWPORT(struct ilo_builder *builder,
-                   const struct ilo_viewport_cso *viewports,
-                   unsigned num_viewports)
+                   const struct ilo_state_viewport *vp)
 {
    const int state_align = 32;
-   const int state_len = 4 * num_viewports;
+   const int state_len = 4 * vp->count;
    uint32_t state_offset, *dw;
-   unsigned i;
+   int i;
 
    ILO_DEV_ASSERT(builder->dev, 6, 6);
 
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 193:
-    *
-    *     "The viewport-related state is stored as an array of up to 16
-    *      elements..."
-    */
-   assert(num_viewports && num_viewports <= 16);
-
    state_offset = ilo_builder_dynamic_pointer(builder,
          ILO_BUILDER_ITEM_CLIP_VIEWPORT, state_align, state_len, &dw);
 
-   for (i = 0; i < num_viewports; i++) {
-      const struct ilo_viewport_cso *vp = &viewports[i];
-
-      dw[0] = fui(vp->min_gbx);
-      dw[1] = fui(vp->max_gbx);
-      dw[2] = fui(vp->min_gby);
-      dw[3] = fui(vp->max_gby);
+   for (i = 0; i < vp->count; i++) {
+      /* see viewport_matrix_set_gen7_SF_CLIP_VIEWPORT() */
+      dw[0] = vp->sf_clip[i][8];
+      dw[1] = vp->sf_clip[i][9];
+      dw[2] = vp->sf_clip[i][10];
+      dw[3] = vp->sf_clip[i][11];
 
       dw += 4;
    }
@@ -1477,38 +964,21 @@ gen6_CLIP_VIEWPORT(struct ilo_builder *builder,
 
 static inline uint32_t
 gen6_SF_VIEWPORT(struct ilo_builder *builder,
-                 const struct ilo_viewport_cso *viewports,
-                 unsigned num_viewports)
+                 const struct ilo_state_viewport *vp)
 {
    const int state_align = 32;
-   const int state_len = 8 * num_viewports;
+   const int state_len = 8 * vp->count;
    uint32_t state_offset, *dw;
-   unsigned i;
+   int i;
 
    ILO_DEV_ASSERT(builder->dev, 6, 6);
 
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 262:
-    *
-    *     "The viewport-specific state used by the SF unit (SF_VIEWPORT) is
-    *      stored as an array of up to 16 elements..."
-    */
-   assert(num_viewports && num_viewports <= 16);
-
    state_offset = ilo_builder_dynamic_pointer(builder,
          ILO_BUILDER_ITEM_SF_VIEWPORT, state_align, state_len, &dw);
 
-   for (i = 0; i < num_viewports; i++) {
-      const struct ilo_viewport_cso *vp = &viewports[i];
-
-      dw[0] = fui(vp->m00);
-      dw[1] = fui(vp->m11);
-      dw[2] = fui(vp->m22);
-      dw[3] = fui(vp->m30);
-      dw[4] = fui(vp->m31);
-      dw[5] = fui(vp->m32);
-      dw[6] = 0;
-      dw[7] = 0;
+   for (i = 0; i < vp->count; i++) {
+      /* see viewport_matrix_set_gen7_SF_CLIP_VIEWPORT() */
+      memcpy(dw, vp->sf_clip[i], sizeof(*dw) * 8);
 
       dw += 8;
    }
@@ -1518,298 +988,103 @@ gen6_SF_VIEWPORT(struct ilo_builder *builder,
 
 static inline uint32_t
 gen7_SF_CLIP_VIEWPORT(struct ilo_builder *builder,
-                      const struct ilo_viewport_cso *viewports,
-                      unsigned num_viewports)
+                      const struct ilo_state_viewport *vp)
 {
    const int state_align = 64;
-   const int state_len = 16 * num_viewports;
-   uint32_t state_offset, *dw;
-   unsigned i;
+   const int state_len = 16 * vp->count;
 
    ILO_DEV_ASSERT(builder->dev, 7, 8);
 
-   /*
-    * From the Ivy Bridge PRM, volume 2 part 1, page 270:
-    *
-    *     "The viewport-specific state used by both the SF and CL units
-    *      (SF_CLIP_VIEWPORT) is stored as an array of up to 16 elements, each
-    *      of which contains the DWords described below. The start of each
-    *      element is spaced 16 DWords apart. The location of first element of
-    *      the array, as specified by both Pointer to SF_VIEWPORT and Pointer
-    *      to CLIP_VIEWPORT, is aligned to a 64-byte boundary."
-    */
-   assert(num_viewports && num_viewports <= 16);
-
-   state_offset = ilo_builder_dynamic_pointer(builder,
-         ILO_BUILDER_ITEM_SF_VIEWPORT, state_align, state_len, &dw);
-
-   for (i = 0; i < num_viewports; i++) {
-      const struct ilo_viewport_cso *vp = &viewports[i];
-
-      dw[0] = fui(vp->m00);
-      dw[1] = fui(vp->m11);
-      dw[2] = fui(vp->m22);
-      dw[3] = fui(vp->m30);
-      dw[4] = fui(vp->m31);
-      dw[5] = fui(vp->m32);
-      dw[6] = 0;
-      dw[7] = 0;
-
-      dw[8] = fui(vp->min_gbx);
-      dw[9] = fui(vp->max_gbx);
-      dw[10] = fui(vp->min_gby);
-      dw[11] = fui(vp->max_gby);
-
-      if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-         dw[12] = fui(vp->min_x);
-         dw[13] = fui(vp->max_x - 1.0f);
-         dw[14] = fui(vp->min_y);
-         dw[15] = fui(vp->max_y - 1.0f);
-      } else {
-         dw[12] = 0;
-         dw[13] = 0;
-         dw[14] = 0;
-         dw[15] = 0;
-      }
-
-      dw += 16;
-   }
-
-   return state_offset;
+   /* see viewport_matrix_set_gen7_SF_CLIP_VIEWPORT() */
+   return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_SF_VIEWPORT,
+         state_align, state_len, (const uint32_t *) vp->sf_clip);
 }
 
 static inline uint32_t
 gen6_CC_VIEWPORT(struct ilo_builder *builder,
-                 const struct ilo_viewport_cso *viewports,
-                 unsigned num_viewports)
+                 const struct ilo_state_viewport *vp)
 {
    const int state_align = 32;
-   const int state_len = 2 * num_viewports;
-   uint32_t state_offset, *dw;
-   unsigned i;
+   const int state_len = 2 * vp->count;
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 385:
-    *
-    *     "The viewport state is stored as an array of up to 16 elements..."
-    */
-   assert(num_viewports && num_viewports <= 16);
-
-   state_offset = ilo_builder_dynamic_pointer(builder,
-         ILO_BUILDER_ITEM_CC_VIEWPORT, state_align, state_len, &dw);
-
-   for (i = 0; i < num_viewports; i++) {
-      const struct ilo_viewport_cso *vp = &viewports[i];
-
-      dw[0] = fui(vp->min_z);
-      dw[1] = fui(vp->max_z);
-
-      dw += 2;
-   }
-
-   return state_offset;
+   /* see viewport_matrix_set_gen6_CC_VIEWPORT() */
+   return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_CC_VIEWPORT,
+         state_align, state_len, (const uint32_t *) vp->cc);
 }
 
 static inline uint32_t
 gen6_SCISSOR_RECT(struct ilo_builder *builder,
-                  const struct ilo_scissor_state *scissor,
-                  unsigned num_viewports)
+                  const struct ilo_state_viewport *vp)
 {
    const int state_align = 32;
-   const int state_len = 2 * num_viewports;
+   const int state_len = 2 * vp->count;
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 263:
-    *
-    *     "The viewport-specific state used by the SF unit (SCISSOR_RECT) is
-    *      stored as an array of up to 16 elements..."
-    */
-   assert(num_viewports && num_viewports <= 16);
-   assert(Elements(scissor->payload) >= state_len);
-
+   /* see viewport_scissor_set_gen6_SCISSOR_RECT() */
    return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_SCISSOR_RECT,
-         state_align, state_len, scissor->payload);
+         state_align, state_len, (const uint32_t *) vp->scissor);
 }
 
 static inline uint32_t
 gen6_COLOR_CALC_STATE(struct ilo_builder *builder,
-                      const struct pipe_stencil_ref *stencil_ref,
-                      ubyte alpha_ref,
-                      const struct pipe_blend_color *blend_color)
+                      const struct ilo_state_cc *cc)
 {
    const int state_align = 64;
    const int state_len = 6;
-   uint32_t state_offset, *dw;
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
-   state_offset = ilo_builder_dynamic_pointer(builder,
-         ILO_BUILDER_ITEM_COLOR_CALC, state_align, state_len, &dw);
-
-   dw[0] = stencil_ref->ref_value[0] << 24 |
-           stencil_ref->ref_value[1] << 16 |
-           GEN6_CC_DW0_ALPHATEST_UNORM8;
-   dw[1] = alpha_ref;
-   dw[2] = fui(blend_color->color[0]);
-   dw[3] = fui(blend_color->color[1]);
-   dw[4] = fui(blend_color->color[2]);
-   dw[5] = fui(blend_color->color[3]);
-
-   return state_offset;
+   /* see cc_params_set_gen6_COLOR_CALC_STATE() */
+   return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_COLOR_CALC,
+         state_align, state_len, cc->cc);
 }
 
 static inline uint32_t
 gen6_DEPTH_STENCIL_STATE(struct ilo_builder *builder,
-                         const struct ilo_dsa_state *dsa)
+                         const struct ilo_state_cc *cc)
 {
    const int state_align = 64;
    const int state_len = 3;
 
    ILO_DEV_ASSERT(builder->dev, 6, 7.5);
 
-   STATIC_ASSERT(Elements(dsa->payload) >= state_len);
-
+   /* see cc_set_gen6_DEPTH_STENCIL_STATE() */
    return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_DEPTH_STENCIL,
-         state_align, state_len, dsa->payload);
+         state_align, state_len, cc->ds);
 }
 
 static inline uint32_t
 gen6_BLEND_STATE(struct ilo_builder *builder,
-                 const struct ilo_blend_state *blend,
-                 const struct ilo_fb_state *fb,
-                 const struct ilo_dsa_state *dsa)
+                 const struct ilo_state_cc *cc)
 {
    const int state_align = 64;
-   int state_len;
-   uint32_t state_offset, *dw;
-   unsigned num_targets, i;
+   const int state_len = 2 * cc->blend_state_count;
 
    ILO_DEV_ASSERT(builder->dev, 6, 7.5);
 
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 376:
-    *
-    *     "The blend state is stored as an array of up to 8 elements..."
-    */
-   num_targets = fb->state.nr_cbufs;
-   assert(num_targets <= 8);
-
-   if (!num_targets) {
-      if (!dsa->dw_blend_alpha)
-         return 0;
-      /* to be able to reference alpha func */
-      num_targets = 1;
-   }
-
-   state_len = 2 * num_targets;
-
-   state_offset = ilo_builder_dynamic_pointer(builder,
-         ILO_BUILDER_ITEM_BLEND, state_align, state_len, &dw);
-
-   for (i = 0; i < num_targets; i++) {
-      const struct ilo_blend_cso *cso = &blend->cso[i];
-
-      dw[0] = cso->payload[0];
-      dw[1] = cso->payload[1] | blend->dw_shared;
-
-      if (i < fb->state.nr_cbufs && fb->state.cbufs[i]) {
-         const struct ilo_fb_blend_caps *caps = &fb->blend_caps[i];
-
-         if (caps->can_blend) {
-            if (caps->dst_alpha_forced_one)
-               dw[0] |= cso->dw_blend_dst_alpha_forced_one;
-            else
-               dw[0] |= cso->dw_blend;
-         }
-
-         if (caps->can_logicop)
-            dw[1] |= blend->dw_logicop;
-
-         if (caps->can_alpha_test)
-            dw[1] |= dsa->dw_blend_alpha;
-      } else {
-         dw[1] |= GEN6_RT_DW1_WRITE_DISABLE_A |
-                  GEN6_RT_DW1_WRITE_DISABLE_R |
-                  GEN6_RT_DW1_WRITE_DISABLE_G |
-                  GEN6_RT_DW1_WRITE_DISABLE_B |
-                  dsa->dw_blend_alpha;
-      }
+   if (!state_len)
+      return 0;
 
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 356:
-       *
-       *     "When NumSamples = 1, AlphaToCoverage and AlphaToCoverage
-       *      Dither both must be disabled."
-       *
-       * There is no such limitation on GEN7, or for AlphaToOne.  But GL
-       * requires that anyway.
-       */
-      if (fb->num_samples > 1)
-         dw[1] |= blend->dw_alpha_mod;
-
-      dw += 2;
-   }
-
-   return state_offset;
+   /* see cc_set_gen6_BLEND_STATE() */
+   return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_BLEND,
+         state_align, state_len, cc->blend);
 }
 
 static inline uint32_t
 gen8_BLEND_STATE(struct ilo_builder *builder,
-                 const struct ilo_blend_state *blend,
-                 const struct ilo_fb_state *fb,
-                 const struct ilo_dsa_state *dsa)
+                 const struct ilo_state_cc *cc)
 {
    const int state_align = 64;
-   const int state_len = 1 + 2 * fb->state.nr_cbufs;
-   uint32_t state_offset, *dw;
-   unsigned i;
+   const int state_len = 1 + 2 * cc->blend_state_count;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   assert(fb->state.nr_cbufs <= 8);
-
-   state_offset = ilo_builder_dynamic_pointer(builder,
-         ILO_BUILDER_ITEM_BLEND, state_align, state_len, &dw);
-
-   dw[0] = blend->dw_shared;
-   if (fb->num_samples > 1)
-      dw[0] |= blend->dw_alpha_mod;
-   if (!fb->state.nr_cbufs || fb->blend_caps[0].can_alpha_test)
-      dw[0] |= dsa->dw_blend_alpha;
-   dw++;
-
-   for (i = 0; i < fb->state.nr_cbufs; i++) {
-      const struct ilo_fb_blend_caps *caps = &fb->blend_caps[i];
-      const struct ilo_blend_cso *cso = &blend->cso[i];
-
-      dw[0] = cso->payload[0];
-      dw[1] = cso->payload[1];
-
-      if (fb->state.cbufs[i]) {
-         if (caps->can_blend) {
-            if (caps->dst_alpha_forced_one)
-               dw[0] |= cso->dw_blend_dst_alpha_forced_one;
-            else
-               dw[0] |= cso->dw_blend;
-         }
-
-         if (caps->can_logicop)
-            dw[1] |= blend->dw_logicop;
-      } else {
-         dw[0] |= GEN8_RT_DW0_WRITE_DISABLE_A |
-                  GEN8_RT_DW0_WRITE_DISABLE_R |
-                  GEN8_RT_DW0_WRITE_DISABLE_G |
-                  GEN8_RT_DW0_WRITE_DISABLE_B;
-      }
-
-      dw += 2;
-   }
-
-   return state_offset;
+   /* see cc_set_gen8_BLEND_STATE() */
+   return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_BLEND,
+         state_align, state_len, &cc->blend[1]);
 }
 
 #endif /* ILO_BUILDER_3D_BOTTOM_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
index 05dbce7..8d30095 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_3d_top.h
@@ -29,303 +29,167 @@
 #define ILO_BUILDER_3D_TOP_H
 
 #include "genhw/genhw.h"
-#include "../ilo_resource.h"
-#include "../ilo_shader.h"
 #include "intel_winsys.h"
 
 #include "ilo_core.h"
 #include "ilo_dev.h"
-#include "ilo_state_3d.h"
+#include "ilo_state_sampler.h"
+#include "ilo_state_shader.h"
+#include "ilo_state_sol.h"
+#include "ilo_state_surface.h"
+#include "ilo_state_urb.h"
+#include "ilo_state_vf.h"
 #include "ilo_builder.h"
 
 static inline void
 gen6_3DSTATE_URB(struct ilo_builder *builder,
-                 int vs_total_size, int gs_total_size,
-                 int vs_entry_size, int gs_entry_size)
+                 const struct ilo_state_urb *urb)
 {
    const uint8_t cmd_len = 3;
-   const int row_size = 128; /* 1024 bits */
-   int vs_alloc_size, gs_alloc_size;
-   int vs_num_entries, gs_num_entries;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 6, 6);
-
-   /* in 1024-bit URB rows */
-   vs_alloc_size = (vs_entry_size + row_size - 1) / row_size;
-   gs_alloc_size = (gs_entry_size + row_size - 1) / row_size;
-
-   /* the valid range is [1, 5] */
-   if (!vs_alloc_size)
-      vs_alloc_size = 1;
-   if (!gs_alloc_size)
-      gs_alloc_size = 1;
-   assert(vs_alloc_size <= 5 && gs_alloc_size <= 5);
-
-   /* the valid range is [24, 256] in multiples of 4 */
-   vs_num_entries = (vs_total_size / row_size / vs_alloc_size) & ~3;
-   if (vs_num_entries > 256)
-      vs_num_entries = 256;
-   assert(vs_num_entries >= 24);
-
-   /* the valid range is [0, 256] in multiples of 4 */
-   gs_num_entries = (gs_total_size / row_size / gs_alloc_size) & ~3;
-   if (gs_num_entries > 256)
-      gs_num_entries = 256;
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_URB) | (cmd_len - 2);
-   dw[1] = (vs_alloc_size - 1) << GEN6_URB_DW1_VS_ENTRY_SIZE__SHIFT |
-           vs_num_entries << GEN6_URB_DW1_VS_ENTRY_COUNT__SHIFT;
-   dw[2] = gs_num_entries << GEN6_URB_DW2_GS_ENTRY_COUNT__SHIFT |
-           (gs_alloc_size - 1) << GEN6_URB_DW2_GS_ENTRY_SIZE__SHIFT;
+   /* see urb_set_gen6_3DSTATE_URB() */
+   dw[1] = urb->urb[0];
+   dw[2] = urb->urb[1];
 }
 
 static inline void
-gen7_3dstate_push_constant_alloc(struct ilo_builder *builder,
-                                 int subop, int offset, int size)
+gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(struct ilo_builder *builder,
+                                    const struct ilo_state_urb *urb)
 {
-   const uint32_t cmd = GEN6_RENDER_TYPE_RENDER |
-                        GEN6_RENDER_SUBTYPE_3D |
-                        subop;
    const uint8_t cmd_len = 2;
-   const int slice_count = ((ilo_dev_gen(builder->dev) == ILO_GEN(7.5) &&
-                             builder->dev->gt == 3) ||
-                            ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 2 : 1;
    uint32_t *dw;
-   int end;
-
-   ILO_DEV_ASSERT(builder->dev, 7, 8);
-
-   /* VS, HS, DS, GS, and PS variants */
-   assert(subop >= GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_VS &&
-          subop <= GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_PS);
-
-   /*
-    * From the Ivy Bridge PRM, volume 2 part 1, page 68:
-    *
-    *     "(A table that says the maximum size of each constant buffer is
-    *      16KB")
-    *
-    * From the Ivy Bridge PRM, volume 2 part 1, page 115:
-    *
-    *     "The sum of the Constant Buffer Offset and the Constant Buffer Size
-    *      may not exceed the maximum value of the Constant Buffer Size."
-    *
-    * Thus, the valid range of buffer end is [0KB, 16KB].
-    */
-   end = (offset + size) / 1024;
-   if (end > 16 * slice_count) {
-      assert(!"invalid constant buffer end");
-      end = 16 * slice_count;
-   }
-
-   /* the valid range of buffer offset is [0KB, 15KB] */
-   offset = (offset + 1023) / 1024;
-   if (offset > 15 * slice_count) {
-      assert(!"invalid constant buffer offset");
-      offset = 15 * slice_count;
-   }
-
-   if (offset > end) {
-      assert(!size);
-      offset = end;
-   }
-
-   /* the valid range of buffer size is [0KB, 15KB] */
-   size = end - offset;
-   if (size > 15 * slice_count) {
-      assert(!"invalid constant buffer size");
-      size = 15 * slice_count;
-   }
-
-   assert(offset % slice_count == 0 && size % slice_count == 0);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   dw[0] = cmd | (cmd_len - 2);
-   dw[1] = offset << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT |
-           size;
-}
-
-static inline void
-gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(struct ilo_builder *builder,
-                                    int offset, int size)
-{
-   gen7_3dstate_push_constant_alloc(builder,
-         GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_VS, offset, size);
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_VS) |
+           (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_push_constant_alloc() */
+   dw[1] = urb->pcb[0];
 }
 
 static inline void
 gen7_3DSTATE_PUSH_CONSTANT_ALLOC_HS(struct ilo_builder *builder,
-                                    int offset, int size)
+                                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_push_constant_alloc(builder,
-         GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_HS, offset, size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_HS) |
+           (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_push_constant_alloc() */
+   dw[1] = urb->pcb[1];
 }
 
 static inline void
 gen7_3DSTATE_PUSH_CONSTANT_ALLOC_DS(struct ilo_builder *builder,
-                                    int offset, int size)
+                                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_push_constant_alloc(builder,
-         GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_DS, offset, size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_DS) |
+           (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_push_constant_alloc() */
+   dw[1] = urb->pcb[2];
 }
 
 static inline void
 gen7_3DSTATE_PUSH_CONSTANT_ALLOC_GS(struct ilo_builder *builder,
-                                    int offset, int size)
+                                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_push_constant_alloc(builder,
-         GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_GS, offset, size);
-}
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
 
-static inline void
-gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(struct ilo_builder *builder,
-                                    int offset, int size)
-{
-   gen7_3dstate_push_constant_alloc(builder,
-         GEN7_RENDER_OPCODE_3DSTATE_PUSH_CONSTANT_ALLOC_PS, offset, size);
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_GS) |
+           (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_push_constant_alloc() */
+   dw[1] = urb->pcb[3];
 }
 
 static inline void
-gen7_3dstate_urb(struct ilo_builder *builder,
-                 int subop, int offset, int size,
-                 int entry_size)
+gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(struct ilo_builder *builder,
+                                    const struct ilo_state_urb *urb)
 {
-   const uint32_t cmd = GEN6_RENDER_TYPE_RENDER |
-                        GEN6_RENDER_SUBTYPE_3D |
-                        subop;
    const uint8_t cmd_len = 2;
-   const int row_size = 64; /* 512 bits */
-   int alloc_size, num_entries, min_entries, max_entries;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 8);
-
-   /* VS, HS, DS, and GS variants */
-   assert(subop >= GEN7_RENDER_OPCODE_3DSTATE_URB_VS &&
-          subop <= GEN7_RENDER_OPCODE_3DSTATE_URB_GS);
-
-   /* in multiples of 8KB */
-   assert(offset % 8192 == 0);
-   offset /= 8192;
-
-   /* in multiple of 512-bit rows */
-   alloc_size = (entry_size + row_size - 1) / row_size;
-   if (!alloc_size)
-      alloc_size = 1;
-
-   /*
-    * From the Ivy Bridge PRM, volume 2 part 1, page 34:
-    *
-    *     "VS URB Entry Allocation Size equal to 4(5 512-bit URB rows) may
-    *      cause performance to decrease due to banking in the URB. Element
-    *      sizes of 16 to 20 should be programmed with six 512-bit URB rows."
-    */
-   if (subop == GEN7_RENDER_OPCODE_3DSTATE_URB_VS && alloc_size == 5)
-      alloc_size = 6;
-
-   /* in multiples of 8 */
-   num_entries = (size / row_size / alloc_size) & ~7;
-
-   switch (subop) {
-   case GEN7_RENDER_OPCODE_3DSTATE_URB_VS:
-      switch (ilo_dev_gen(builder->dev)) {
-      case ILO_GEN(8):
-         max_entries = 2560;
-         min_entries = 64;
-         break;
-      case ILO_GEN(7.5):
-         max_entries = (builder->dev->gt >= 2) ? 1664 : 640;
-         min_entries = (builder->dev->gt >= 2) ? 64 : 32;
-         break;
-      case ILO_GEN(7):
-      default:
-         max_entries = (builder->dev->gt == 2) ? 704 : 512;
-         min_entries = 32;
-         break;
-      }
-
-      assert(num_entries >= min_entries);
-      if (num_entries > max_entries)
-         num_entries = max_entries;
-      break;
-   case GEN7_RENDER_OPCODE_3DSTATE_URB_HS:
-      max_entries = (builder->dev->gt == 2) ? 64 : 32;
-      if (num_entries > max_entries)
-         num_entries = max_entries;
-      break;
-   case GEN7_RENDER_OPCODE_3DSTATE_URB_DS:
-      if (num_entries)
-         assert(num_entries >= 138);
-      break;
-   case GEN7_RENDER_OPCODE_3DSTATE_URB_GS:
-      switch (ilo_dev_gen(builder->dev)) {
-      case ILO_GEN(8):
-         max_entries = 960;
-         break;
-      case ILO_GEN(7.5):
-         max_entries = (builder->dev->gt >= 2) ? 640 : 256;
-         break;
-      case ILO_GEN(7):
-      default:
-         max_entries = (builder->dev->gt == 2) ? 320 : 192;
-         break;
-      }
-
-      if (num_entries > max_entries)
-         num_entries = max_entries;
-      break;
-   default:
-      break;
-   }
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   dw[0] = cmd | (cmd_len - 2);
-   dw[1] = offset << GEN7_URB_DW1_OFFSET__SHIFT |
-           (alloc_size - 1) << GEN7_URB_DW1_ENTRY_SIZE__SHIFT |
-           num_entries;
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_PUSH_CONSTANT_ALLOC_PS) |
+           (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_push_constant_alloc() */
+   dw[1] = urb->pcb[4];
 }
 
 static inline void
 gen7_3DSTATE_URB_VS(struct ilo_builder *builder,
-                    int offset, int size, int entry_size)
+                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_VS,
-         offset, size, entry_size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_VS) | (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_urb() */
+   dw[1] = urb->urb[0];
 }
 
 static inline void
 gen7_3DSTATE_URB_HS(struct ilo_builder *builder,
-                    int offset, int size, int entry_size)
+                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_HS,
-         offset, size, entry_size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_HS) | (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_urb() */
+   dw[1] = urb->urb[1];
 }
 
 static inline void
 gen7_3DSTATE_URB_DS(struct ilo_builder *builder,
-                    int offset, int size, int entry_size)
+                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_DS,
-         offset, size, entry_size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_DS) | (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_urb() */
+   dw[1] = urb->urb[2];
 }
 
 static inline void
 gen7_3DSTATE_URB_GS(struct ilo_builder *builder,
-                    int offset, int size, int entry_size)
+                    const struct ilo_state_urb *urb)
 {
-   gen7_3dstate_urb(builder, GEN7_RENDER_OPCODE_3DSTATE_URB_GS,
-         offset, size, entry_size);
+   const uint8_t cmd_len = 2;
+   uint32_t *dw;
+
+   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_URB_GS) | (cmd_len - 2);
+   /* see urb_set_gen7_3dstate_urb() */
+   dw[1] = urb->urb[3];
 }
 
 static inline void
 gen75_3DSTATE_VF(struct ilo_builder *builder,
-                 bool enable_cut_index,
-                 uint32_t cut_index)
+                 const struct ilo_state_vf *vf)
 {
    const uint8_t cmd_len = 2;
    uint32_t *dw;
@@ -334,11 +198,10 @@ gen75_3DSTATE_VF(struct ilo_builder *builder,
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   dw[0] = GEN75_RENDER_CMD(3D, 3DSTATE_VF) | (cmd_len - 2);
-   if (enable_cut_index)
-      dw[0] |= GEN75_VF_DW0_CUT_INDEX_ENABLE;
-
-   dw[1] = cut_index;
+   /* see vf_params_set_gen75_3DSTATE_VF() */
+   dw[0] = GEN75_RENDER_CMD(3D, 3DSTATE_VF) | (cmd_len - 2) |
+           vf->cut[0];
+   dw[1] = vf->cut[1];
 }
 
 static inline void
@@ -354,40 +217,11 @@ gen6_3DSTATE_VF_STATISTICS(struct ilo_builder *builder,
    ilo_builder_batch_write(builder, cmd_len, &dw0);
 }
 
-/**
- * Translate a pipe primitive type to the matching hardware primitive type.
- */
-static inline int
-gen6_3d_translate_pipe_prim(unsigned prim)
-{
-   static const int prim_mapping[ILO_PRIM_MAX] = {
-      [PIPE_PRIM_POINTS]                     = GEN6_3DPRIM_POINTLIST,
-      [PIPE_PRIM_LINES]                      = GEN6_3DPRIM_LINELIST,
-      [PIPE_PRIM_LINE_LOOP]                  = GEN6_3DPRIM_LINELOOP,
-      [PIPE_PRIM_LINE_STRIP]                 = GEN6_3DPRIM_LINESTRIP,
-      [PIPE_PRIM_TRIANGLES]                  = GEN6_3DPRIM_TRILIST,
-      [PIPE_PRIM_TRIANGLE_STRIP]             = GEN6_3DPRIM_TRISTRIP,
-      [PIPE_PRIM_TRIANGLE_FAN]               = GEN6_3DPRIM_TRIFAN,
-      [PIPE_PRIM_QUADS]                      = GEN6_3DPRIM_QUADLIST,
-      [PIPE_PRIM_QUAD_STRIP]                 = GEN6_3DPRIM_QUADSTRIP,
-      [PIPE_PRIM_POLYGON]                    = GEN6_3DPRIM_POLYGON,
-      [PIPE_PRIM_LINES_ADJACENCY]            = GEN6_3DPRIM_LINELIST_ADJ,
-      [PIPE_PRIM_LINE_STRIP_ADJACENCY]       = GEN6_3DPRIM_LINESTRIP_ADJ,
-      [PIPE_PRIM_TRIANGLES_ADJACENCY]        = GEN6_3DPRIM_TRILIST_ADJ,
-      [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]   = GEN6_3DPRIM_TRISTRIP_ADJ,
-      [ILO_PRIM_RECTANGLES]                  = GEN6_3DPRIM_RECTLIST,
-   };
-
-   assert(prim_mapping[prim]);
-
-   return prim_mapping[prim];
-}
-
 static inline void
-gen8_3DSTATE_VF_TOPOLOGY(struct ilo_builder *builder, unsigned pipe_prim)
+gen8_3DSTATE_VF_TOPOLOGY(struct ilo_builder *builder,
+                         enum gen_3dprim_type topology)
 {
    const uint8_t cmd_len = 2;
-   const int prim = gen6_3d_translate_pipe_prim(pipe_prim);
    uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
@@ -395,12 +229,13 @@ gen8_3DSTATE_VF_TOPOLOGY(struct ilo_builder *builder, unsigned pipe_prim)
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_VF_TOPOLOGY) | (cmd_len - 2);
-   dw[1] = prim;
+   dw[1] = topology << GEN8_TOPOLOGY_DW1_TYPE__SHIFT;
 }
 
 static inline void
 gen8_3DSTATE_VF_INSTANCING(struct ilo_builder *builder,
-                           int vb_index, uint32_t step_rate)
+                           const struct ilo_state_vf *vf,
+                           uint32_t attr)
 {
    const uint8_t cmd_len = 3;
    uint32_t *dw;
@@ -410,16 +245,20 @@ gen8_3DSTATE_VF_INSTANCING(struct ilo_builder *builder,
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_VF_INSTANCING) | (cmd_len - 2);
-   dw[1] = vb_index;
-   if (step_rate)
-      dw[1] |= GEN8_INSTANCING_DW1_ENABLE;
-   dw[2] = step_rate;
+   dw[1] = attr << GEN8_INSTANCING_DW1_VE_INDEX__SHIFT;
+   dw[2] = 0;
+   /* see vf_set_gen8_3DSTATE_VF_INSTANCING() */
+   if (attr >= vf->internal_ve_count) {
+      attr -= vf->internal_ve_count;
+
+      dw[1] |= vf->user_instancing[attr][0];
+      dw[2] |= vf->user_instancing[attr][1];
+   }
 }
 
 static inline void
 gen8_3DSTATE_VF_SGVS(struct ilo_builder *builder,
-                     bool vid_enable, int vid_ve, int vid_comp,
-                     bool iid_enable, int iid_ve, int iid_comp)
+                     const struct ilo_state_vf *vf)
 {
    const uint8_t cmd_len = 2;
    uint32_t *dw;
@@ -429,29 +268,19 @@ gen8_3DSTATE_VF_SGVS(struct ilo_builder *builder,
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN8_RENDER_CMD(3D, 3DSTATE_VF_SGVS) | (cmd_len - 2);
-   dw[1] = 0;
-
-   if (iid_enable) {
-      dw[1] |= GEN8_SGVS_DW1_IID_ENABLE |
-               vid_comp << GEN8_SGVS_DW1_IID_VE_COMP__SHIFT |
-               vid_ve << GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT;
-   }
-
-   if (vid_enable) {
-      dw[1] |= GEN8_SGVS_DW1_VID_ENABLE |
-               vid_comp << GEN8_SGVS_DW1_VID_VE_COMP__SHIFT |
-               vid_ve << GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT;
-   }
+   /* see vf_params_set_gen8_3DSTATE_VF_SGVS() */
+   dw[1] = vf->sgvs[0];
 }
 
 static inline void
 gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
-                            const struct ilo_ve_state *ve,
-                            const struct ilo_vb_state *vb)
+                            const struct ilo_state_vf *vf,
+                            const struct ilo_state_vertex_buffer *vb,
+                            unsigned vb_count)
 {
    uint8_t cmd_len;
    uint32_t *dw;
-   unsigned pos, hw_idx;
+   unsigned pos, i;
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
@@ -460,67 +289,52 @@ gen6_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
     *
     *     "From 1 to 33 VBs can be specified..."
     */
-   assert(ve->vb_count <= 33);
+   assert(vb_count <= 33);
 
-   if (!ve->vb_count)
+   if (!vb_count)
       return;
 
-   cmd_len = 1 + 4 * ve->vb_count;
+   cmd_len = 1 + 4 * vb_count;
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_BUFFERS) | (cmd_len - 2);
    dw++;
    pos++;
 
-   for (hw_idx = 0; hw_idx < ve->vb_count; hw_idx++) {
-      const unsigned instance_divisor = ve->instance_divisors[hw_idx];
-      const unsigned pipe_idx = ve->vb_mapping[hw_idx];
-      const struct pipe_vertex_buffer *cso = &vb->states[pipe_idx];
+   for (i = 0; i < vb_count; i++) {
+      const struct ilo_state_vertex_buffer *b = &vb[i];
 
-      dw[0] = hw_idx << GEN6_VB_DW0_INDEX__SHIFT;
+      /* see vertex_buffer_set_gen8_vertex_buffer_state() */
+      dw[0] = b->vb[0] |
+              i << GEN6_VB_DW0_INDEX__SHIFT;
 
       if (ilo_dev_gen(builder->dev) >= ILO_GEN(8))
          dw[0] |= builder->mocs << GEN8_VB_DW0_MOCS__SHIFT;
       else
          dw[0] |= builder->mocs << GEN6_VB_DW0_MOCS__SHIFT;
 
-      if (ilo_dev_gen(builder->dev) >= ILO_GEN(7))
-         dw[0] |= GEN7_VB_DW0_ADDR_MODIFIED;
-
-      if (instance_divisor)
-         dw[0] |= GEN6_VB_DW0_ACCESS_INSTANCEDATA;
-      else
-         dw[0] |= GEN6_VB_DW0_ACCESS_VERTEXDATA;
-
-      /* use null vb if there is no buffer or the stride is out of range */
-      if (!cso->buffer || cso->stride > 2048) {
-         dw[0] |= GEN6_VB_DW0_IS_NULL;
-         dw[1] = 0;
-         dw[2] = 0;
-         dw[3] = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ?
-            0 : instance_divisor;
-
-         continue;
-      }
-
-      dw[0] |= cso->stride << GEN6_VB_DW0_PITCH__SHIFT;
+      dw[1] = 0;
+      dw[2] = 0;
+      dw[3] = 0;
 
       if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-         const struct ilo_buffer *buf = ilo_buffer(cso->buffer);
-         const uint32_t start_offset = cso->buffer_offset;
+         if (b->need_bo)
+            ilo_builder_batch_reloc64(builder, pos + 1, b->bo, b->vb[1], 0);
 
-         ilo_builder_batch_reloc64(builder, pos + 1,
-               buf->bo, start_offset, 0);
-         dw[3] = buf->bo_size;
+         dw[3] |= b->vb[2];
       } else {
-         const struct ilo_buffer *buf = ilo_buffer(cso->buffer);
-         const uint32_t start_offset = cso->buffer_offset;
-         const uint32_t end_offset = buf->bo_size - 1;
+         const int8_t elem = vf->vb_to_first_elem[i];
 
-         dw[3] = instance_divisor;
+         /* see vf_set_gen6_vertex_buffer_state() */
+         if (elem >= 0) {
+            dw[0] |= vf->user_instancing[elem][0];
+            dw[3] |= vf->user_instancing[elem][1];
+         }
 
-         ilo_builder_batch_reloc(builder, pos + 1, buf->bo, start_offset, 0);
-         ilo_builder_batch_reloc(builder, pos + 2, buf->bo, end_offset, 0);
+         if (b->need_bo) {
+            ilo_builder_batch_reloc(builder, pos + 1, b->bo, b->vb[1], 0);
+            ilo_builder_batch_reloc(builder, pos + 2, b->bo, b->vb[2], 0);
+         }
       }
 
       dw += 4;
@@ -563,248 +377,189 @@ gen6_user_3DSTATE_VERTEX_BUFFERS(struct ilo_builder *builder,
 
 static inline void
 gen6_3DSTATE_VERTEX_ELEMENTS(struct ilo_builder *builder,
-                             const struct ilo_ve_state *ve)
+                             const struct ilo_state_vf *vf)
 {
    uint8_t cmd_len;
    uint32_t *dw;
-   unsigned i;
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 92:
-    *
-    *    "At least one VERTEX_ELEMENT_STATE structure must be included."
-    *
-    * From the Sandy Bridge PRM, volume 2 part 1, page 93:
-    *
-    *     "Up to 34 (DevSNB+) vertex elements are supported."
-    */
-   assert(ve->count + ve->prepend_nosrc_cso >= 1);
-   assert(ve->count + ve->prepend_nosrc_cso <= 34);
-
-   STATIC_ASSERT(Elements(ve->cso[0].payload) == 2);
+   cmd_len = 1 + 2 * (vf->internal_ve_count + vf->user_ve_count);
 
-   cmd_len = 1 + 2 * (ve->count + ve->prepend_nosrc_cso);
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VERTEX_ELEMENTS) | (cmd_len - 2);
    dw++;
 
-   if (ve->prepend_nosrc_cso) {
-      memcpy(dw, ve->nosrc_cso.payload, sizeof(ve->nosrc_cso.payload));
-      dw += 2;
-   }
-
-   for (i = 0; i < ve->count - ve->last_cso_edgeflag; i++) {
-      memcpy(dw, ve->cso[i].payload, sizeof(ve->cso[i].payload));
-      dw += 2;
+   /*
+    * see vf_params_set_gen6_internal_ve() and
+    * vf_set_gen6_3DSTATE_VERTEX_ELEMENTS()
+    */
+   if (vf->internal_ve_count) {
+      memcpy(dw, vf->internal_ve,
+            sizeof(vf->internal_ve[0]) * vf->internal_ve_count);
+      dw += 2 * vf->internal_ve_count;
    }
 
-   if (ve->last_cso_edgeflag)
-      memcpy(dw, ve->edgeflag_cso.payload, sizeof(ve->edgeflag_cso.payload));
+   memcpy(dw, vf->user_ve, sizeof(vf->user_ve[0]) * vf->user_ve_count);
 }
 
 static inline void
 gen6_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
-                          const struct ilo_ib_state *ib,
-                          bool enable_cut_index)
+                          const struct ilo_state_vf *vf,
+                          const struct ilo_state_index_buffer *ib)
 {
    const uint8_t cmd_len = 3;
-   struct ilo_buffer *buf = ilo_buffer(ib->hw_resource);
-   uint32_t start_offset, end_offset;
-   int format;
-   uint32_t *dw;
+   uint32_t dw0, *dw;
    unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 6, 7.5);
 
-   if (!buf)
-      return;
-
-   /* this is moved to the new 3DSTATE_VF */
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(7.5))
-      assert(!enable_cut_index);
-
-   switch (ib->hw_index_size) {
-   case 4:
-      format = GEN6_IB_DW0_FORMAT_DWORD;
-      break;
-   case 2:
-      format = GEN6_IB_DW0_FORMAT_WORD;
-      break;
-   case 1:
-      format = GEN6_IB_DW0_FORMAT_BYTE;
-      break;
-   default:
-      assert(!"unknown index size");
-      format = GEN6_IB_DW0_FORMAT_BYTE;
-      break;
-   }
+   dw0 = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2) |
+         builder->mocs << GEN6_IB_DW0_MOCS__SHIFT;
 
    /*
-    * set start_offset to 0 here and adjust pipe_draw_info::start with
-    * ib->draw_start_offset in 3DPRIMITIVE
+    * see index_buffer_set_gen8_3DSTATE_INDEX_BUFFER() and
+    * vf_params_set_gen6_3dstate_index_buffer()
     */
-   start_offset = 0;
-   end_offset = buf->bo_size;
-
-   /* end_offset must also be aligned and is inclusive */
-   end_offset -= (end_offset % ib->hw_index_size);
-   end_offset--;
+   dw0 |= ib->ib[0];
+   if (ilo_dev_gen(builder->dev) <= ILO_GEN(7))
+      dw0 |= vf->cut[0];
 
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2) |
-           builder->mocs << GEN6_IB_DW0_MOCS__SHIFT |
-           format;
-   if (enable_cut_index)
-      dw[0] |= GEN6_IB_DW0_CUT_INDEX_ENABLE;
-
-   ilo_builder_batch_reloc(builder, pos + 1, buf->bo, start_offset, 0);
-   ilo_builder_batch_reloc(builder, pos + 2, buf->bo, end_offset, 0);
+   dw[0] = dw0;
+   if (ib->need_bo) {
+      ilo_builder_batch_reloc(builder, pos + 1, ib->bo, ib->ib[1], 0);
+      ilo_builder_batch_reloc(builder, pos + 2, ib->bo, ib->ib[2], 0);
+   } else {
+      dw[1] = 0;
+      dw[2] = 0;
+   }
 }
 
 static inline void
 gen8_3DSTATE_INDEX_BUFFER(struct ilo_builder *builder,
-                          const struct ilo_ib_state *ib)
+                          const struct ilo_state_vf *vf,
+                          const struct ilo_state_index_buffer *ib)
 {
    const uint8_t cmd_len = 5;
-   struct ilo_buffer *buf = ilo_buffer(ib->hw_resource);
-   int format;
    uint32_t *dw;
    unsigned pos;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   if (!buf)
-      return;
-
-   switch (ib->hw_index_size) {
-   case 4:
-      format = GEN8_IB_DW1_FORMAT_DWORD;
-      break;
-   case 2:
-      format = GEN8_IB_DW1_FORMAT_WORD;
-      break;
-   case 1:
-      format = GEN8_IB_DW1_FORMAT_BYTE;
-      break;
-   default:
-      assert(!"unknown index size");
-      format = GEN8_IB_DW1_FORMAT_BYTE;
-      break;
-   }
-
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_INDEX_BUFFER) | (cmd_len - 2);
-   dw[1] = format |
+   /* see index_buffer_set_gen8_3DSTATE_INDEX_BUFFER() */
+   dw[1] = ib->ib[0] |
            builder->mocs << GEN8_IB_DW1_MOCS__SHIFT;
-   dw[4] = buf->bo_size;
 
-   /* ignore ib->offset here in favor of adjusting 3DPRIMITIVE */
-   ilo_builder_batch_reloc64(builder, pos + 2, buf->bo, 0, 0);
+   if (ib->need_bo) {
+      ilo_builder_batch_reloc64(builder, pos + 2, ib->bo, ib->ib[1], 0);
+   } else {
+      dw[2] = 0;
+      dw[3] = 0;
+   }
+
+   dw[4] = ib->ib[2];
 }
 
 static inline void
 gen6_3DSTATE_VS(struct ilo_builder *builder,
-                const struct ilo_shader_state *vs)
+                const struct ilo_state_vs *vs,
+                uint32_t kernel_offset)
 {
    const uint8_t cmd_len = 6;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw2, dw4, dw5, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 6, 7.5);
 
-   cso = ilo_shader_get_kernel_cso(vs);
-   dw2 = cso->payload[0];
-   dw4 = cso->payload[1];
-   dw5 = cso->payload[2];
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
-   dw[1] = ilo_shader_get_kernel_offset(vs);
-   dw[2] = dw2;
-   dw[3] = 0; /* scratch */
-   dw[4] = dw4;
-   dw[5] = dw5;
+   dw[1] = kernel_offset;
+   /* see vs_set_gen6_3DSTATE_VS() */
+   dw[2] = vs->vs[0];
+   dw[3] = vs->vs[1];
+   dw[4] = vs->vs[2];
+   dw[5] = vs->vs[3];
 }
 
 static inline void
 gen8_3DSTATE_VS(struct ilo_builder *builder,
-                const struct ilo_shader_state *vs,
-                uint32_t clip_plane_enable)
+                const struct ilo_state_vs *vs,
+                uint32_t kernel_offset)
 {
    const uint8_t cmd_len = 9;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw3, dw6, dw7, dw8, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   cso = ilo_shader_get_kernel_cso(vs);
-   dw3 = cso->payload[0];
-   dw6 = cso->payload[1];
-   dw7 = cso->payload[2];
-   dw8 = clip_plane_enable << GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT;
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
-   dw[1] = ilo_shader_get_kernel_offset(vs);
+   dw[1] = kernel_offset;
    dw[2] = 0;
-   dw[3] = dw3;
-   dw[4] = 0; /* scratch */
+   /* see vs_set_gen6_3DSTATE_VS() */
+   dw[3] = vs->vs[0];
+   dw[4] = vs->vs[1];
    dw[5] = 0;
-   dw[6] = dw6;
-   dw[7] = dw7;
-   dw[8] = dw8;
+   dw[6] = vs->vs[2];
+   dw[7] = vs->vs[3];
+   dw[8] = vs->vs[4];
 }
 
 static inline void
-gen6_disable_3DSTATE_VS(struct ilo_builder *builder)
+gen7_3DSTATE_HS(struct ilo_builder *builder,
+                const struct ilo_state_hs *hs,
+                uint32_t kernel_offset)
 {
-   const uint8_t cmd_len = 6;
+   const uint8_t cmd_len = 7;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 6, 7.5);
+   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_VS) | (cmd_len - 2);
-   dw[1] = 0;
-   dw[2] = 0;
-   dw[3] = 0;
-   dw[4] = 0;
-   dw[5] = 0;
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2);
+   /* see hs_set_gen7_3DSTATE_HS() */
+   dw[1] = hs->hs[0];
+   dw[2] = hs->hs[1];
+   dw[3] = kernel_offset;
+   dw[4] = hs->hs[2];
+   dw[5] = hs->hs[3];
+   dw[6] = 0;
 }
 
 static inline void
-gen7_disable_3DSTATE_HS(struct ilo_builder *builder)
+gen8_3DSTATE_HS(struct ilo_builder *builder,
+                const struct ilo_state_hs *hs,
+                uint32_t kernel_offset)
 {
-   const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 9 : 7;
+   const uint8_t cmd_len = 9;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 8);
+   ILO_DEV_ASSERT(builder->dev, 8, 8);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_HS) | (cmd_len - 2);
-   dw[1] = 0;
-   dw[2] = 0;
-   dw[3] = 0;
+   /* see hs_set_gen7_3DSTATE_HS() */
+   dw[1] = hs->hs[0];
+   dw[2] = hs->hs[1];
+   dw[3] = kernel_offset;
    dw[4] = 0;
-   dw[5] = 0;
+   dw[5] = hs->hs[2];
    dw[6] = 0;
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-      dw[7] = 0;
-      dw[8] = 0;
-   }
+   dw[7] = hs->hs[3];
+   dw[8] = 0;
 }
 
 static inline void
-gen7_3DSTATE_TE(struct ilo_builder *builder)
+gen7_3DSTATE_TE(struct ilo_builder *builder,
+                const struct ilo_state_ds *ds)
 {
    const uint8_t cmd_len = 4;
    uint32_t *dw;
@@ -814,108 +569,61 @@ gen7_3DSTATE_TE(struct ilo_builder *builder)
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_TE) | (cmd_len - 2);
-   dw[1] = 0;
-   dw[2] = 0;
-   dw[3] = 0;
+   /* see ds_set_gen7_3DSTATE_TE() */
+   dw[1] = ds->te[0];
+   dw[2] = ds->te[1];
+   dw[3] = ds->te[2];
 }
 
 static inline void
-gen7_disable_3DSTATE_DS(struct ilo_builder *builder)
+gen7_3DSTATE_DS(struct ilo_builder *builder,
+                const struct ilo_state_ds *ds,
+                uint32_t kernel_offset)
 {
-   const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 9 : 6;
+   const uint8_t cmd_len = 6;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 8);
+   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2);
-   dw[1] = 0;
-   dw[2] = 0;
-   dw[3] = 0;
-   dw[4] = 0;
-   dw[5] = 0;
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-      dw[6] = 0;
-      dw[7] = 0;
-      dw[8] = 0;
-   }
-}
-
-static inline void
-gen6_3DSTATE_GS(struct ilo_builder *builder,
-                const struct ilo_shader_state *gs)
-{
-   const uint8_t cmd_len = 7;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw2, dw4, dw5, dw6, *dw;
-
-   ILO_DEV_ASSERT(builder->dev, 6, 6);
-
-   cso = ilo_shader_get_kernel_cso(gs);
-   dw2 = cso->payload[0];
-   dw4 = cso->payload[1];
-   dw5 = cso->payload[2];
-   dw6 = cso->payload[3];
-
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
-
-   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
-   dw[1] = ilo_shader_get_kernel_offset(gs);
-   dw[2] = dw2;
-   dw[3] = 0; /* scratch */
-   dw[4] = dw4;
-   dw[5] = dw5;
-   dw[6] = dw6;
+   /* see ds_set_gen7_3DSTATE_DS() */
+   dw[1] = kernel_offset;
+   dw[2] = ds->ds[0];
+   dw[3] = ds->ds[1];
+   dw[4] = ds->ds[2];
+   dw[5] = ds->ds[3];
 }
 
 static inline void
-gen6_so_3DSTATE_GS(struct ilo_builder *builder,
-                   const struct ilo_shader_state *vs,
-                   int verts_per_prim)
+gen8_3DSTATE_DS(struct ilo_builder *builder,
+                const struct ilo_state_ds *ds,
+                uint32_t kernel_offset)
 {
-   const uint8_t cmd_len = 7;
-   struct ilo_shader_cso cso;
-   enum ilo_kernel_param param;
-   uint32_t dw2, dw4, dw5, dw6, *dw;
-
-   ILO_DEV_ASSERT(builder->dev, 6, 6);
-
-   assert(ilo_shader_get_kernel_param(vs, ILO_KERNEL_VS_GEN6_SO));
-
-   switch (verts_per_prim) {
-   case 1:
-      param = ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET;
-      break;
-   case 2:
-      param = ILO_KERNEL_VS_GEN6_SO_LINE_OFFSET;
-      break;
-   default:
-      param = ILO_KERNEL_VS_GEN6_SO_TRI_OFFSET;
-      break;
-   }
+   const uint8_t cmd_len = 9;
+   uint32_t *dw;
 
-   /* cannot use VS's CSO */
-   ilo_gpe_init_gs_cso(builder->dev, vs, &cso);
-   dw2 = cso.payload[0];
-   dw4 = cso.payload[1];
-   dw5 = cso.payload[2];
-   dw6 = cso.payload[3];
+   ILO_DEV_ASSERT(builder->dev, 8, 8);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
-   dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
-   dw[1] = ilo_shader_get_kernel_offset(vs) +
-           ilo_shader_get_kernel_param(vs, param);
-   dw[2] = dw2;
-   dw[3] = 0;
-   dw[4] = dw4;
-   dw[5] = dw5;
-   dw[6] = dw6;
+   dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_DS) | (cmd_len - 2);
+   /* see ds_set_gen7_3DSTATE_DS() */
+   dw[1] = kernel_offset;
+   dw[2] = 0;
+   dw[3] = ds->ds[0];
+   dw[4] = ds->ds[1];
+   dw[5] = 0;
+   dw[6] = ds->ds[2];
+   dw[7] = ds->ds[3];
+   dw[8] = ds->ds[4];
 }
 
 static inline void
-gen6_disable_3DSTATE_GS(struct ilo_builder *builder)
+gen6_3DSTATE_GS(struct ilo_builder *builder,
+                const struct ilo_state_gs *gs,
+                uint32_t kernel_offset)
 {
    const uint8_t cmd_len = 7;
    uint32_t *dw;
@@ -925,13 +633,13 @@ gen6_disable_3DSTATE_GS(struct ilo_builder *builder)
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
-   dw[1] = 0;
-   dw[2] = 0;
-   dw[3] = 0;
-   /* honor the valid range of URB read length */
-   dw[4] = 1 << GEN6_GS_DW4_URB_READ_LEN__SHIFT;
-   dw[5] = GEN6_GS_DW5_STATISTICS;
-   dw[6] = 0;
+   dw[1] = kernel_offset;
+   /* see gs_set_gen6_3DSTATE_GS() */
+   dw[2] = gs->gs[0];
+   dw[3] = gs->gs[1];
+   dw[4] = gs->gs[2];
+   dw[5] = gs->gs[3];
+   dw[6] = gs->gs[4];
 }
 
 static inline void
@@ -960,183 +668,90 @@ gen6_3DSTATE_GS_SVB_INDEX(struct ilo_builder *builder,
 
 static inline void
 gen7_3DSTATE_GS(struct ilo_builder *builder,
-                const struct ilo_shader_state *gs)
+                const struct ilo_state_gs *gs,
+                uint32_t kernel_offset)
 {
    const uint8_t cmd_len = 7;
-   const struct ilo_shader_cso *cso;
-   uint32_t dw2, dw4, dw5, *dw;
+   uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   cso = ilo_shader_get_kernel_cso(gs);
-   dw2 = cso->payload[0];
-   dw4 = cso->payload[1];
-   dw5 = cso->payload[2];
-
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
-   dw[1] = ilo_shader_get_kernel_offset(gs);
-   dw[2] = dw2;
-   dw[3] = 0; /* scratch */
-   dw[4] = dw4;
-   dw[5] = dw5;
+   dw[1] = kernel_offset;
+   /* see gs_set_gen7_3DSTATE_GS() */
+   dw[2] = gs->gs[0];
+   dw[3] = gs->gs[1];
+   dw[4] = gs->gs[2];
+   dw[5] = gs->gs[3];
    dw[6] = 0;
 }
 
 static inline void
-gen7_disable_3DSTATE_GS(struct ilo_builder *builder)
+gen8_3DSTATE_GS(struct ilo_builder *builder,
+                const struct ilo_state_gs *gs,
+                uint32_t kernel_offset)
 {
-   const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 10 : 7;
+   const uint8_t cmd_len = 10;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 8);
+   ILO_DEV_ASSERT(builder->dev, 8, 8);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(3D, 3DSTATE_GS) | (cmd_len - 2);
-   dw[1] = 0;
+   dw[1] = kernel_offset;
    dw[2] = 0;
-   dw[3] = 0;
-   dw[4] = 0;
-
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-      dw[7] = GEN8_GS_DW7_STATISTICS;
-      dw[8] = 0;
-      dw[9] = 0;
-   } else {
-      dw[5] = GEN7_GS_DW5_STATISTICS;
-      dw[6] = 0;
-   }
+   /* see gs_set_gen7_3DSTATE_GS() */
+   dw[3] = gs->gs[0];
+   dw[4] = gs->gs[1];
+   dw[5] = 0;
+   dw[6] = gs->gs[2];
+   dw[7] = gs->gs[3];
+   dw[8] = 0;
+   dw[9] = gs->gs[4];
 }
 
 static inline void
 gen7_3DSTATE_STREAMOUT(struct ilo_builder *builder,
-                       int render_stream,
-                       bool render_disable,
-                       int vertex_attrib_count,
-                       const int *buf_strides)
+                       const struct ilo_state_sol *sol)
 {
    const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 5 : 3;
    uint32_t *dw;
-   int buf_mask;
 
    ILO_DEV_ASSERT(builder->dev, 7, 8);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_STREAMOUT) | (cmd_len - 2);
-
-   dw[1] = render_stream << GEN7_SO_DW1_RENDER_STREAM_SELECT__SHIFT;
-   if (render_disable)
-      dw[1] |= GEN7_SO_DW1_RENDER_DISABLE;
-
-   if (buf_strides) {
-      buf_mask = ((bool) buf_strides[3]) << 3 |
-                 ((bool) buf_strides[2]) << 2 |
-                 ((bool) buf_strides[1]) << 1 |
-                 ((bool) buf_strides[0]);
-      if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-         dw[3] = buf_strides[1] << 16 | buf_strides[0];
-         dw[4] = buf_strides[3] << 16 | buf_strides[1];
-      }
-   } else {
-      buf_mask = 0;
-   }
-
-   if (buf_mask) {
-      int read_len;
-
-      dw[1] |= GEN7_SO_DW1_SO_ENABLE |
-               GEN7_SO_DW1_STATISTICS;
-      /* API_OPENGL */
-      if (true)
-         dw[1] |= GEN7_SO_DW1_REORDER_TRAILING;
-      if (ilo_dev_gen(builder->dev) < ILO_GEN(8))
-         dw[1] |= buf_mask << GEN7_SO_DW1_BUFFER_ENABLES__SHIFT;
-
-      read_len = (vertex_attrib_count + 1) / 2;
-      if (!read_len)
-         read_len = 1;
-
-      dw[2] = 0 << GEN7_SO_DW2_STREAM3_READ_OFFSET__SHIFT |
-              (read_len - 1) << GEN7_SO_DW2_STREAM3_READ_LEN__SHIFT |
-              0 << GEN7_SO_DW2_STREAM2_READ_OFFSET__SHIFT |
-              (read_len - 1) << GEN7_SO_DW2_STREAM2_READ_LEN__SHIFT |
-              0 << GEN7_SO_DW2_STREAM1_READ_OFFSET__SHIFT |
-              (read_len - 1) << GEN7_SO_DW2_STREAM1_READ_LEN__SHIFT |
-              0 << GEN7_SO_DW2_STREAM0_READ_OFFSET__SHIFT |
-              (read_len - 1) << GEN7_SO_DW2_STREAM0_READ_LEN__SHIFT;
-   } else {
-      dw[2] = 0;
+   /* see sol_set_gen7_3DSTATE_STREAMOUT() */
+   dw[1] = sol->streamout[0];
+   dw[2] = sol->streamout[1];
+   if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
+      dw[3] = sol->strides[1] << GEN8_SO_DW3_BUFFER1_PITCH__SHIFT |
+              sol->strides[0] << GEN8_SO_DW3_BUFFER0_PITCH__SHIFT;
+      dw[4] = sol->strides[3] << GEN8_SO_DW4_BUFFER3_PITCH__SHIFT |
+              sol->strides[2] << GEN8_SO_DW4_BUFFER2_PITCH__SHIFT;
    }
 }
 
 static inline void
 gen7_3DSTATE_SO_DECL_LIST(struct ilo_builder *builder,
-                          const struct pipe_stream_output_info *so_info)
+                          const struct ilo_state_sol *sol)
 {
    /*
     * Note that "DWord Length" has 9 bits for this command and the type of
     * cmd_len cannot be uint8_t.
     */
    uint16_t cmd_len;
-   struct {
-      int buf_selects;
-      int decl_count;
-      uint16_t decls[128];
-   } streams[4];
-   unsigned buf_offsets[PIPE_MAX_SO_BUFFERS];
-   int hw_decl_count, i;
+   int cmd_decl_count;
    uint32_t *dw;
 
    ILO_DEV_ASSERT(builder->dev, 7, 8);
 
-   memset(streams, 0, sizeof(streams));
-   memset(buf_offsets, 0, sizeof(buf_offsets));
-
-   for (i = 0; i < so_info->num_outputs; i++) {
-      unsigned decl, st, buf, reg, mask;
-
-      st = so_info->output[i].stream;
-      buf = so_info->output[i].output_buffer;
-
-      /* pad with holes */
-      while (buf_offsets[buf] < so_info->output[i].dst_offset) {
-         int num_dwords;
-
-         num_dwords = so_info->output[i].dst_offset - buf_offsets[buf];
-         if (num_dwords > 4)
-            num_dwords = 4;
-
-         decl = buf << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT |
-                GEN7_SO_DECL_HOLE_FLAG |
-                ((1 << num_dwords) - 1) << GEN7_SO_DECL_COMPONENT_MASK__SHIFT;
-
-         assert(streams[st].decl_count < Elements(streams[st].decls));
-         streams[st].decls[streams[st].decl_count++] = decl;
-         buf_offsets[buf] += num_dwords;
-      }
-      assert(buf_offsets[buf] == so_info->output[i].dst_offset);
-
-      reg = so_info->output[i].register_index;
-      mask = ((1 << so_info->output[i].num_components) - 1) <<
-         so_info->output[i].start_component;
-
-      decl = buf << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT |
-             reg << GEN7_SO_DECL_REG_INDEX__SHIFT |
-             mask << GEN7_SO_DECL_COMPONENT_MASK__SHIFT;
-
-      assert(streams[st].decl_count < Elements(streams[st].decls));
-
-      streams[st].buf_selects |= 1 << buf;
-      streams[st].decls[streams[st].decl_count++] = decl;
-      buf_offsets[buf] += so_info->output[i].num_components;
-   }
-
    if (ilo_dev_gen(builder->dev) >= ILO_GEN(7.5)) {
-      hw_decl_count = MAX4(streams[0].decl_count, streams[1].decl_count,
-                           streams[2].decl_count, streams[3].decl_count);
+      cmd_decl_count = sol->decl_count;
    } else {
       /*
        * From the Ivy Bridge PRM, volume 2 part 1, page 201:
@@ -1145,100 +760,97 @@ gen7_3DSTATE_SO_DECL_LIST(struct ilo_builder *builder,
        *      whenever this command is issued. The "Num Entries [n]" fields
        *      still contain the actual numbers of valid decls."
        */
-      hw_decl_count = 128;
+      cmd_decl_count = 128;
    }
 
-   cmd_len = 3 + 2 * hw_decl_count;
+   cmd_len = 3 + 2 * cmd_decl_count;
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SO_DECL_LIST) | (cmd_len - 2);
-   dw[1] = streams[3].buf_selects << GEN7_SO_DECL_DW1_STREAM3_BUFFER_SELECTS__SHIFT |
-           streams[2].buf_selects << GEN7_SO_DECL_DW1_STREAM2_BUFFER_SELECTS__SHIFT |
-           streams[1].buf_selects << GEN7_SO_DECL_DW1_STREAM1_BUFFER_SELECTS__SHIFT |
-           streams[0].buf_selects << GEN7_SO_DECL_DW1_STREAM0_BUFFER_SELECTS__SHIFT;
-   dw[2] = streams[3].decl_count << GEN7_SO_DECL_DW2_STREAM3_ENTRY_COUNT__SHIFT |
-           streams[2].decl_count << GEN7_SO_DECL_DW2_STREAM2_ENTRY_COUNT__SHIFT |
-           streams[1].decl_count << GEN7_SO_DECL_DW2_STREAM1_ENTRY_COUNT__SHIFT |
-           streams[0].decl_count << GEN7_SO_DECL_DW2_STREAM0_ENTRY_COUNT__SHIFT;
-   dw += 3;
-
-   for (i = 0; i < hw_decl_count; i++) {
-      dw[0] = streams[1].decls[i] << 16 | streams[0].decls[i];
-      dw[1] = streams[3].decls[i] << 16 | streams[2].decls[i];
-      dw += 2;
+   /* see sol_set_gen7_3DSTATE_SO_DECL_LIST() */
+   dw[1] = sol->so_decl[0];
+   dw[2] = sol->so_decl[1];
+   memcpy(&dw[3], sol->decl, sizeof(sol->decl[0]) * sol->decl_count);
+   /* zero only the unused trailing entries so we stay within cmd_len */
+   if (sol->decl_count < cmd_decl_count) {
+      memset(&dw[3 + 2 * sol->decl_count], 0, sizeof(sol->decl[0]) *
+            (cmd_decl_count - sol->decl_count));
    }
 }
 
 static inline void
-gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder, int index, int stride,
-                       const struct pipe_stream_output_target *so_target)
+gen7_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
+                       const struct ilo_state_sol *sol,
+                       const struct ilo_state_sol_buffer *sb,
+                       uint8_t buffer)
 {
-   const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 8 : 4;
-   struct ilo_buffer *buf;
-   int start, end;
+   const uint8_t cmd_len = 4;
    uint32_t *dw;
    unsigned pos;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 8);
-
-   buf = ilo_buffer(so_target->buffer);
-
-   /* DWord-aligned */
-   assert(stride % 4 == 0);
-   assert(so_target->buffer_offset % 4 == 0);
+   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
 
-   stride &= ~3;
-   start = so_target->buffer_offset & ~3;
-   end = (start + so_target->buffer_size) & ~3;
+   assert(buffer < ILO_STATE_SOL_MAX_BUFFER_COUNT);
 
    pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SO_BUFFER) | (cmd_len - 2);
-   dw[1] = index << GEN7_SO_BUF_DW1_INDEX__SHIFT |
-           stride;
-
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-      dw[1] |= builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT;
-
-      dw[4] = end - start;
-      dw[5] = 0;
-      dw[6] = 0;
-      dw[7] = 0;
-
-      ilo_builder_batch_reloc64(builder, pos + 2,
-            buf->bo, start, INTEL_RELOC_WRITE);
+   /* see sol_buffer_set_gen7_3dstate_so_buffer() */
+   dw[1] = buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT |
+           builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT |
+           sol->strides[buffer] << GEN7_SO_BUF_DW1_PITCH__SHIFT;
+
+   if (sb->need_bo) {
+      ilo_builder_batch_reloc(builder, pos + 2, sb->bo,
+            sb->so_buf[0], INTEL_RELOC_WRITE);
+      ilo_builder_batch_reloc(builder, pos + 3, sb->bo,
+            sb->so_buf[1], INTEL_RELOC_WRITE);
    } else {
-      dw[1] |= builder->mocs << GEN7_SO_BUF_DW1_MOCS__SHIFT;
-
-      ilo_builder_batch_reloc(builder, pos + 2,
-            buf->bo, start, INTEL_RELOC_WRITE);
-      ilo_builder_batch_reloc(builder, pos + 3,
-            buf->bo, end, INTEL_RELOC_WRITE);
+      dw[2] = 0;
+      dw[3] = 0;
    }
 }
 
 static inline void
-gen7_disable_3DSTATE_SO_BUFFER(struct ilo_builder *builder, int index)
+gen8_3DSTATE_SO_BUFFER(struct ilo_builder *builder,
+                       const struct ilo_state_sol *sol,
+                       const struct ilo_state_sol_buffer *sb,
+                       uint8_t buffer)
 {
-   const uint8_t cmd_len = (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 8 : 4;
+   const uint8_t cmd_len = 8;
    uint32_t *dw;
+   unsigned pos;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 8);
+   ILO_DEV_ASSERT(builder->dev, 8, 8);
 
-   ilo_builder_batch_pointer(builder, cmd_len, &dw);
+   pos = ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN7_RENDER_CMD(3D, 3DSTATE_SO_BUFFER) | (cmd_len - 2);
-   dw[1] = index << GEN7_SO_BUF_DW1_INDEX__SHIFT;
-   dw[2] = 0;
-   dw[3] = 0;
+   /* see sol_buffer_set_gen8_3dstate_so_buffer() */
+   dw[1] = sb->so_buf[0] |
+           buffer << GEN7_SO_BUF_DW1_INDEX__SHIFT |
+           builder->mocs << GEN8_SO_BUF_DW1_MOCS__SHIFT;
+
+   if (sb->need_bo) {
+      ilo_builder_batch_reloc64(builder, pos + 2, sb->bo,
+            sb->so_buf[1], INTEL_RELOC_WRITE);
+   } else {
+      dw[2] = 0;
+      dw[3] = 0;
+   }
 
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) {
-      dw[4] = 0;
+   dw[4] = sb->so_buf[2];
+
+   if (sb->need_write_offset_bo) {
+      ilo_builder_batch_reloc64(builder, pos + 5, sb->write_offset_bo,
+            sizeof(uint32_t) * buffer, INTEL_RELOC_WRITE);
+   } else {
       dw[5] = 0;
       dw[6] = 0;
-      dw[7] = 0;
    }
+
+   dw[7] = sb->so_buf[3];
 }
 
 static inline void
@@ -1627,8 +1239,7 @@ gen6_BINDING_TABLE_STATE(struct ilo_builder *builder,
 
 static inline uint32_t
 gen6_SURFACE_STATE(struct ilo_builder *builder,
-                   const struct ilo_view_surface *surf,
-                   bool for_render)
+                   const struct ilo_state_surface *surf)
 {
    int state_align, state_len;
    uint32_t state_offset, *dw;
@@ -1641,7 +1252,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
 
       state_offset = ilo_builder_surface_pointer(builder,
             ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw);
-      memcpy(dw, surf->payload, state_len << 2);
+      memcpy(dw, surf->surface, state_len << 2);
 
       if (surf->bo) {
          const uint32_t mocs = (surf->scanout) ?
@@ -1650,7 +1261,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
          dw[1] |= mocs << GEN8_SURFACE_DW1_MOCS__SHIFT;
 
          ilo_builder_surface_reloc64(builder, state_offset, 8, surf->bo,
-               surf->payload[8], (for_render) ? INTEL_RELOC_WRITE : 0);
+               surf->surface[8], (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    } else {
       state_align = 32;
@@ -1658,7 +1269,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
 
       state_offset = ilo_builder_surface_pointer(builder,
             ILO_BUILDER_ITEM_SURFACE, state_align, state_len, &dw);
-      memcpy(dw, surf->payload, state_len << 2);
+      memcpy(dw, surf->surface, state_len << 2);
 
       if (surf->bo) {
          /*
@@ -1668,7 +1279,7 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
          dw[5] |= builder->mocs << GEN6_SURFACE_DW5_MOCS__SHIFT;
 
          ilo_builder_surface_reloc(builder, state_offset, 1, surf->bo,
-               surf->payload[1], (for_render) ? INTEL_RELOC_WRITE : 0);
+               surf->surface[1], (surf->readonly) ? 0 : INTEL_RELOC_WRITE);
       }
    }
 
@@ -1676,55 +1287,13 @@ gen6_SURFACE_STATE(struct ilo_builder *builder,
 }
 
 static inline uint32_t
-gen6_so_SURFACE_STATE(struct ilo_builder *builder,
-                      const struct pipe_stream_output_target *so,
-                      const struct pipe_stream_output_info *so_info,
-                      int so_index)
-{
-   struct ilo_buffer *buf = ilo_buffer(so->buffer);
-   unsigned bo_offset, struct_size;
-   enum pipe_format elem_format;
-   struct ilo_view_surface surf;
-
-   ILO_DEV_ASSERT(builder->dev, 6, 6);
-
-   bo_offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4;
-   struct_size = so_info->stride[so_info->output[so_index].output_buffer] * 4;
-
-   switch (so_info->output[so_index].num_components) {
-   case 1:
-      elem_format = PIPE_FORMAT_R32_FLOAT;
-      break;
-   case 2:
-      elem_format = PIPE_FORMAT_R32G32_FLOAT;
-      break;
-   case 3:
-      elem_format = PIPE_FORMAT_R32G32B32_FLOAT;
-      break;
-   case 4:
-      elem_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
-      break;
-   default:
-      assert(!"unexpected SO components length");
-      elem_format = PIPE_FORMAT_R32_FLOAT;
-      break;
-   }
-
-   ilo_gpe_init_view_surface_for_buffer(builder->dev, buf, bo_offset,
-         so->buffer_size, struct_size, elem_format, false, true, &surf);
-
-   return gen6_SURFACE_STATE(builder, &surf, false);
-}
-
-static inline uint32_t
 gen6_SAMPLER_STATE(struct ilo_builder *builder,
-                   const struct ilo_sampler_cso * const *samplers,
-                   const struct pipe_sampler_view * const *views,
+                   const struct ilo_state_sampler *samplers,
                    const uint32_t *sampler_border_colors,
-                   int num_samplers)
+                   int sampler_count)
 {
    const int state_align = 32;
-   const int state_len = 4 * num_samplers;
+   const int state_len = 4 * sampler_count;
    uint32_t state_offset, *dw;
    int i;
 
@@ -1735,9 +1304,9 @@ gen6_SAMPLER_STATE(struct ilo_builder *builder,
     *
     *     "The sampler state is stored as an array of up to 16 elements..."
     */
-   assert(num_samplers <= 16);
+   assert(sampler_count <= 16);
 
-   if (!num_samplers)
+   if (!sampler_count)
       return 0;
 
    /*
@@ -1749,86 +1318,19 @@ gen6_SAMPLER_STATE(struct ilo_builder *builder,
     *
     * It also applies to other shader stages.
     */
-   ilo_builder_dynamic_pad_top(builder, 4 * (4 - (num_samplers % 4)));
+   ilo_builder_dynamic_pad_top(builder, 4 * (4 - (sampler_count % 4)));
 
    state_offset = ilo_builder_dynamic_pointer(builder,
          ILO_BUILDER_ITEM_SAMPLER, state_align, state_len, &dw);
 
-   for (i = 0; i < num_samplers; i++) {
-      const struct ilo_sampler_cso *sampler = samplers[i];
-      const struct pipe_sampler_view *view = views[i];
-      const uint32_t border_color = sampler_border_colors[i];
-      uint32_t dw_filter, dw_wrap;
-
-      /* there may be holes */
-      if (!sampler || !view) {
-         /* disabled sampler */
-         dw[0] = 1 << 31;
-         dw[1] = 0;
-         dw[2] = 0;
-         dw[3] = 0;
-         dw += 4;
-
-         continue;
-      }
-
-      /* determine filter and wrap modes */
-      switch (view->texture->target) {
-      case PIPE_TEXTURE_1D:
-         dw_filter = (sampler->anisotropic) ?
-            sampler->dw_filter_aniso : sampler->dw_filter;
-         dw_wrap = sampler->dw_wrap_1d;
-         break;
-      case PIPE_TEXTURE_3D:
-         /*
-          * From the Sandy Bridge PRM, volume 4 part 1, page 103:
-          *
-          *     "Only MAPFILTER_NEAREST and MAPFILTER_LINEAR are supported for
-          *      surfaces of type SURFTYPE_3D."
-          */
-         dw_filter = sampler->dw_filter;
-         dw_wrap = sampler->dw_wrap;
-         break;
-      case PIPE_TEXTURE_CUBE:
-         dw_filter = (sampler->anisotropic) ?
-            sampler->dw_filter_aniso : sampler->dw_filter;
-         dw_wrap = sampler->dw_wrap_cube;
-         break;
-      default:
-         dw_filter = (sampler->anisotropic) ?
-            sampler->dw_filter_aniso : sampler->dw_filter;
-         dw_wrap = sampler->dw_wrap;
-         break;
-      }
+   for (i = 0; i < sampler_count; i++) {
+      /* see sampler_set_gen6_SAMPLER_STATE() */
+      dw[0] = samplers[i].sampler[0];
+      dw[1] = samplers[i].sampler[1];
+      dw[3] = samplers[i].sampler[2];
 
-      dw[0] = sampler->payload[0];
-      dw[1] = sampler->payload[1];
-      assert(!(border_color & 0x1f));
-      dw[2] = border_color;
-      dw[3] = sampler->payload[2];
-
-      dw[0] |= dw_filter;
-
-      if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) {
-         dw[3] |= dw_wrap;
-      }
-      else {
-         /*
-          * From the Sandy Bridge PRM, volume 4 part 1, page 21:
-          *
-          *     "[DevSNB] Errata: Incorrect behavior is observed in cases
-          *      where the min and mag mode filters are different and
-          *      SurfMinLOD is nonzero. The determination of MagMode uses the
-          *      following equation instead of the one in the above
-          *      pseudocode: MagMode = (LOD + SurfMinLOD - Base <= 0)"
-          *
-          * As a way to work around that, we set Base to
-          * view->u.tex.first_level.
-          */
-         dw[0] |= view->u.tex.first_level << 22;
-
-         dw[1] |= dw_wrap;
-      }
+      assert(!(sampler_border_colors[i] & 0x1f));
+      dw[2] = sampler_border_colors[i];
 
       dw += 4;
    }
@@ -1838,7 +1340,7 @@ gen6_SAMPLER_STATE(struct ilo_builder *builder,
 
 static inline uint32_t
 gen6_SAMPLER_BORDER_COLOR_STATE(struct ilo_builder *builder,
-                                const struct ilo_sampler_cso *sampler)
+                                const struct ilo_state_sampler_border *border)
 {
    const int state_align =
       (ilo_dev_gen(builder->dev) >= ILO_GEN(8)) ? 64 : 32;
@@ -1846,11 +1348,12 @@ gen6_SAMPLER_BORDER_COLOR_STATE(struct ilo_builder *builder,
 
    ILO_DEV_ASSERT(builder->dev, 6, 8);
 
-   assert(Elements(sampler->payload) >= 3 + state_len);
-
-   /* see ilo_gpe_init_sampler_cso() */
+   /*
+    * see border_set_gen6_SAMPLER_BORDER_COLOR_STATE() and
+    * border_set_gen7_SAMPLER_BORDER_COLOR_STATE()
+    */
    return ilo_builder_dynamic_write(builder, ILO_BUILDER_ITEM_BLOB,
-         state_align, state_len, &sampler->payload[3]);
+         state_align, state_len, border->color);
 }
 
 static inline uint32_t
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_decode.c b/src/gallium/drivers/ilo/core/ilo_builder_decode.c
index cedaab1..c5a98c9 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_decode.c
+++ b/src/gallium/drivers/ilo/core/ilo_builder_decode.c
@@ -319,7 +319,7 @@ writer_decode_color_calc(const struct ilo_builder *builder,
               "stencil ref %d, bf stencil ref %d\n",
 	      GEN_EXTRACT(dw, GEN6_CC_DW0_ALPHATEST) ? "FLOAT32" : "UNORM8",
 	      (bool) (dw & GEN6_CC_DW0_ROUND_DISABLE_DISABLE),
-	      GEN_EXTRACT(dw, GEN6_CC_DW0_STENCIL0_REF),
+	      GEN_EXTRACT(dw, GEN6_CC_DW0_STENCIL_REF),
 	      GEN_EXTRACT(dw, GEN6_CC_DW0_STENCIL1_REF));
 
    writer_dw(builder, which, item->offset, 1, "CC\n");
@@ -347,13 +347,13 @@ writer_decode_depth_stencil(const struct ilo_builder *builder,
    dw = writer_dw(builder, which, item->offset, 0, "D_S");
    ilo_printf("stencil %sable, func %d, write %sable\n",
          (dw & GEN6_ZS_DW0_STENCIL_TEST_ENABLE) ? "en" : "dis",
-         GEN_EXTRACT(dw, GEN6_ZS_DW0_STENCIL0_FUNC),
+         GEN_EXTRACT(dw, GEN6_ZS_DW0_STENCIL_FUNC),
          (dw & GEN6_ZS_DW0_STENCIL_WRITE_ENABLE) ? "en" : "dis");
 
    dw = writer_dw(builder, which, item->offset, 1, "D_S");
    ilo_printf("stencil test mask 0x%x, write mask 0x%x\n",
-         GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL0_VALUEMASK),
-         GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL0_WRITEMASK));
+         GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL_TEST_MASK),
+         GEN_EXTRACT(dw, GEN6_ZS_DW1_STENCIL_WRITE_MASK));
 
    dw = writer_dw(builder, which, item->offset, 2, "D_S");
    ilo_printf("depth test %sable, func %d, write %sable\n",
diff --git a/src/gallium/drivers/ilo/core/ilo_builder_media.h b/src/gallium/drivers/ilo/core/ilo_builder_media.h
index 7fbe6d4..7197104 100644
--- a/src/gallium/drivers/ilo/core/ilo_builder_media.h
+++ b/src/gallium/drivers/ilo/core/ilo_builder_media.h
@@ -29,57 +29,30 @@
 #define ILO_BUILDER_MEDIA_H
 
 #include "genhw/genhw.h"
-#include "../ilo_shader.h"
 #include "intel_winsys.h"
 
 #include "ilo_core.h"
 #include "ilo_dev.h"
+#include "ilo_state_compute.h"
 #include "ilo_builder.h"
 
-struct gen6_idrt_data {
-   const struct ilo_shader_state *cs;
-
-   uint32_t sampler_offset;
-   uint32_t binding_table_offset;
-
-   unsigned curbe_size;
-   unsigned thread_group_size;
-};
-
 static inline void
 gen6_MEDIA_VFE_STATE(struct ilo_builder *builder,
-                     unsigned curbe_alloc, bool use_slm)
+                     const struct ilo_state_compute *compute)
 {
    const uint8_t cmd_len = 8;
-   const unsigned idrt_alloc =
-      ((ilo_dev_gen(builder->dev) >= ILO_GEN(7.5)) ? 64 : 32) * 32;
-   int max_threads;
    uint32_t *dw;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
-
-   max_threads = builder->dev->thread_count;
-
-   curbe_alloc = align(curbe_alloc, 32);
-   assert(idrt_alloc + curbe_alloc <= builder->dev->urb_size / (use_slm + 1));
+   ILO_DEV_ASSERT(builder->dev, 6, 7.5);
 
    ilo_builder_batch_pointer(builder, cmd_len, &dw);
 
    dw[0] = GEN6_RENDER_CMD(MEDIA, MEDIA_VFE_STATE) | (cmd_len - 2);
-   dw[1] = 0; /* scratch */
-
-   dw[2] = (max_threads - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
-           0 << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
-           GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
-           GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL;
-   if (ilo_dev_gen(builder->dev) >= ILO_GEN(7))
-      dw[2] |= GEN7_VFE_DW2_GPGPU_MODE;
-
+   /* see compute_set_gen6_MEDIA_VFE_STATE() */
+   dw[1] = compute->vfe[0];
+   dw[2] = compute->vfe[1];
    dw[3] = 0;
-
-   dw[4] = 0 << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT |
-           (curbe_alloc / 32);
-
+   dw[4] = compute->vfe[2];
    dw[5] = 0;
    dw[6] = 0;
    dw[7] = 0;
@@ -194,8 +167,10 @@ gen7_GPGPU_WALKER(struct ilo_builder *builder,
 
 static inline uint32_t
 gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_builder *builder,
-                               const struct gen6_idrt_data *data,
-                               int idrt_count)
+                               const struct ilo_state_compute *compute,
+                               const uint32_t *kernel_offsets,
+                               const uint32_t *sampler_offsets,
+                               const uint32_t *binding_table_offsets)
 {
    /*
     * From the Sandy Bridge PRM, volume 2 part 2, page 34:
@@ -211,61 +186,26 @@ gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_builder *builder,
     *      aligned address of the Interface Descriptor data."
     */
    const int state_align = 32;
-   const int state_len = (32 / 4) * idrt_count;
+   const int state_len = (32 / 4) * compute->idrt_count;
    uint32_t state_offset, *dw;
    int i;
 
-   ILO_DEV_ASSERT(builder->dev, 7, 7.5);
+   ILO_DEV_ASSERT(builder->dev, 6, 7.5);
 
    state_offset = ilo_builder_dynamic_pointer(builder,
          ILO_BUILDER_ITEM_INTERFACE_DESCRIPTOR, state_align, state_len, &dw);
 
-   for (i = 0; i < idrt_count; i++) {
-      const struct gen6_idrt_data *idrt = &data[i];
-      const struct ilo_shader_state *cs = idrt->cs;
-      unsigned sampler_count, bt_size, slm_size;
-
-      sampler_count =
-         ilo_shader_get_kernel_param(cs, ILO_KERNEL_SAMPLER_COUNT);
-      assert(sampler_count <= 16);
-      sampler_count = (sampler_count + 3) / 4;
-
-      bt_size =
-         ilo_shader_get_kernel_param(cs, ILO_KERNEL_SURFACE_TOTAL_COUNT);
-      if (bt_size > 31)
-         bt_size = 31;
-
-      slm_size = ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_LOCAL_SIZE);
-
-      assert(idrt->curbe_size / 32 <= 63);
-
-      dw[0] = ilo_shader_get_kernel_offset(idrt->cs);
+   for (i = 0; i < compute->idrt_count; i++) {
+      /* see compute_set_gen6_INTERFACE_DESCRIPTOR_DATA() */
+      dw[0] = compute->idrt[i][0] + kernel_offsets[i];
       dw[1] = 0;
-      dw[2] = idrt->sampler_offset |
-              sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT;
-      dw[3] = idrt->binding_table_offset |
-              bt_size << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT;
-
-      dw[4] = (idrt->curbe_size / 32) << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT |
-              0 << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT;
-
-      if (ilo_dev_gen(builder->dev) >= ILO_GEN(7)) {
-         dw[5] = GEN7_IDRT_DW5_ROUNDING_MODE_RTNE;
-
-         if (slm_size) {
-            assert(slm_size <= 64 * 1024);
-            slm_size = util_next_power_of_two((slm_size + 4095) / 4096);
-
-            dw[5] |= GEN7_IDRT_DW5_BARRIER_ENABLE |
-                     slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT |
-                     idrt->thread_group_size <<
-                        GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT;
-         }
-      } else {
-         dw[5] = 0;
-      }
-
-      dw[6] = 0;
+      dw[2] = compute->idrt[i][1] |
+              sampler_offsets[i];
+      dw[3] = compute->idrt[i][2] |
+              binding_table_offsets[i];
+      dw[4] = compute->idrt[i][3];
+      dw[5] = compute->idrt[i][4];
+      dw[6] = compute->idrt[i][5];
       dw[7] = 0;
 
       dw += 8;
diff --git a/src/gallium/drivers/ilo/core/ilo_core.h b/src/gallium/drivers/ilo/core/ilo_core.h
index 3587d39..0a7f7d9 100644
--- a/src/gallium/drivers/ilo/core/ilo_core.h
+++ b/src/gallium/drivers/ilo/core/ilo_core.h
@@ -40,7 +40,4 @@
 #include "util/u_memory.h"
 #include "util/u_pointer.h"
 
-#define ILO_PRIM_RECTANGLES PIPE_PRIM_MAX
-#define ILO_PRIM_MAX (PIPE_PRIM_MAX + 1)
-
 #endif /* ILO_CORE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_debug.h b/src/gallium/drivers/ilo/core/ilo_debug.h
index d9c4604..9833233 100644
--- a/src/gallium/drivers/ilo/core/ilo_debug.h
+++ b/src/gallium/drivers/ilo/core/ilo_debug.h
@@ -100,4 +100,30 @@ ilo_warn(const char *format, ...)
 #endif
 }
 
+/**
+ * Return true when all \p size bytes starting at \p ptr are zero.
+ *
+ * The scan is only performed when DEBUG is defined; otherwise this
+ * unconditionally returns true, so it is suitable for use inside assert()
+ * but not for making decisions at runtime.
+ */
+static inline bool
+ilo_is_zeroed(const void *ptr, size_t size)
+{
+#ifdef DEBUG
+   const char *bytes = (const char *) ptr;
+   size_t i;
+
+   for (i = 0; i < size; i++) {
+      /* index with i: every byte must be inspected, not just the first */
+      if (bytes[i] != 0)
+         return false;
+   }
+
+   return true;
+#else
+   return true;
+#endif
+}
+
 #endif /* ILO_DEBUG_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_dev.c b/src/gallium/drivers/ilo/core/ilo_dev.c
index 7a774fa..925322a 100644
--- a/src/gallium/drivers/ilo/core/ilo_dev.c
+++ b/src/gallium/drivers/ilo/core/ilo_dev.c
@@ -32,14 +32,15 @@
 #include "ilo_dev.h"
 
 /**
- * Initialize the \p dev from \p winsys.  \p winsys is considered owned by \p
- * dev and will be destroyed in \p ilo_dev_cleanup().
+ * Initialize the \p dev from \p winsys.
  */
 bool
 ilo_dev_init(struct ilo_dev *dev, struct intel_winsys *winsys)
 {
    const struct intel_winsys_info *info;
 
+   assert(ilo_is_zeroed(dev, sizeof(*dev)));
+
    info = intel_winsys_get_info(winsys);
 
    dev->winsys = winsys;
@@ -178,9 +179,3 @@ ilo_dev_init(struct ilo_dev *dev, struct intel_winsys *winsys)
 
    return true;
 }
-
-void
-ilo_dev_cleanup(struct ilo_dev *dev)
-{
-   intel_winsys_destroy(dev->winsys);
-}
diff --git a/src/gallium/drivers/ilo/core/ilo_dev.h b/src/gallium/drivers/ilo/core/ilo_dev.h
index 4eb5d59..a9f9b17 100644
--- a/src/gallium/drivers/ilo/core/ilo_dev.h
+++ b/src/gallium/drivers/ilo/core/ilo_dev.h
@@ -63,9 +63,6 @@ struct ilo_dev {
 bool
 ilo_dev_init(struct ilo_dev *dev, struct intel_winsys *winsys);
 
-void
-ilo_dev_cleanup(struct ilo_dev *dev);
-
 static inline int
 ilo_dev_gen(const struct ilo_dev *dev)
 {
diff --git a/src/gallium/drivers/ilo/core/ilo_fence.h b/src/gallium/drivers/ilo/core/ilo_fence.h
deleted file mode 100644
index 00d555a..0000000
--- a/src/gallium/drivers/ilo/core/ilo_fence.h
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2013 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Chia-I Wu <olv@lunarg.com>
- */
-
-#ifndef ILO_FENCE_H
-#define ILO_FENCE_H
-
-#include "intel_winsys.h"
-
-#include "ilo_core.h"
-#include "ilo_dev.h"
-
-struct ilo_fence {
-   struct intel_bo *seq_bo;
-};
-
-static inline void
-ilo_fence_init(struct ilo_fence *fence, const struct ilo_dev *dev)
-{
-   /* no-op */
-}
-
-static inline void
-ilo_fence_cleanup(struct ilo_fence *fence)
-{
-   intel_bo_unref(fence->seq_bo);
-}
-
-/**
- * Set the sequence bo for waiting.  The fence is considered signaled when
- * there is no sequence bo.
- */
-static inline void
-ilo_fence_set_seq_bo(struct ilo_fence *fence, struct intel_bo *seq_bo)
-{
-   intel_bo_unref(fence->seq_bo);
-   fence->seq_bo = intel_bo_ref(seq_bo);
-}
-
-/**
- * Wait for the fence to be signaled or until \p timeout nanoseconds has
- * passed.  It will wait indefinitely when \p timeout is negative.
- */
-static inline bool
-ilo_fence_wait(struct ilo_fence *fence, int64_t timeout)
-{
-   return (!fence->seq_bo || intel_bo_wait(fence->seq_bo, timeout) == 0);
-}
-
-#endif /* ILO_FENCE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_format.c b/src/gallium/drivers/ilo/core/ilo_format.c
deleted file mode 100644
index 280e499..0000000
--- a/src/gallium/drivers/ilo/core/ilo_format.c
+++ /dev/null
@@ -1,755 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2013 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Chia-I Wu <olv@lunarg.com>
- */
-
-#include "genhw/genhw.h"
-#include "ilo_format.h"
-
-struct ilo_vf_cap {
-   int vertex_element;
-};
-
-struct ilo_sol_cap {
-   int buffer;
-};
-
-struct ilo_sampler_cap {
-   int sampling;
-   int filtering;
-   int shadow_map;
-   int chroma_key;
-};
-
-struct ilo_dp_cap {
-   int rt_write;
-   int rt_write_blending;
-   int typed_write;
-   int media_color_processing;
-};
-
-/*
- * This table is based on:
- *
- *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
- *  - the Ivy Bridge PRM, volume 2 part 1, page 97-99
- *  - the Haswell PRM, volume 7, page 467-470
- */
-static const struct ilo_vf_cap ilo_vf_caps[] = {
-#define CAP(vertex_element) { ILO_GEN(vertex_element) }
-   [GEN6_FORMAT_R32G32B32A32_FLOAT]       = CAP(  1),
-   [GEN6_FORMAT_R32G32B32A32_SINT]        = CAP(  1),
-   [GEN6_FORMAT_R32G32B32A32_UINT]        = CAP(  1),
-   [GEN6_FORMAT_R32G32B32A32_UNORM]       = CAP(  1),
-   [GEN6_FORMAT_R32G32B32A32_SNORM]       = CAP(  1),
-   [GEN6_FORMAT_R64G64_FLOAT]             = CAP(  1),
-   [GEN6_FORMAT_R32G32B32A32_SSCALED]     = CAP(  1),
-   [GEN6_FORMAT_R32G32B32A32_USCALED]     = CAP(  1),
-   [GEN6_FORMAT_R32G32B32A32_SFIXED]      = CAP(7.5),
-   [GEN6_FORMAT_R32G32B32_FLOAT]          = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_SINT]           = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_UINT]           = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_UNORM]          = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_SNORM]          = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_SSCALED]        = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_USCALED]        = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_SFIXED]         = CAP(7.5),
-   [GEN6_FORMAT_R16G16B16A16_UNORM]       = CAP(  1),
-   [GEN6_FORMAT_R16G16B16A16_SNORM]       = CAP(  1),
-   [GEN6_FORMAT_R16G16B16A16_SINT]        = CAP(  1),
-   [GEN6_FORMAT_R16G16B16A16_UINT]        = CAP(  1),
-   [GEN6_FORMAT_R16G16B16A16_FLOAT]       = CAP(  1),
-   [GEN6_FORMAT_R32G32_FLOAT]             = CAP(  1),
-   [GEN6_FORMAT_R32G32_SINT]              = CAP(  1),
-   [GEN6_FORMAT_R32G32_UINT]              = CAP(  1),
-   [GEN6_FORMAT_R32G32_UNORM]             = CAP(  1),
-   [GEN6_FORMAT_R32G32_SNORM]             = CAP(  1),
-   [GEN6_FORMAT_R64_FLOAT]                = CAP(  1),
-   [GEN6_FORMAT_R16G16B16A16_SSCALED]     = CAP(  1),
-   [GEN6_FORMAT_R16G16B16A16_USCALED]     = CAP(  1),
-   [GEN6_FORMAT_R32G32_SSCALED]           = CAP(  1),
-   [GEN6_FORMAT_R32G32_USCALED]           = CAP(  1),
-   [GEN6_FORMAT_R32G32_SFIXED]            = CAP(7.5),
-   [GEN6_FORMAT_B8G8R8A8_UNORM]           = CAP(  1),
-   [GEN6_FORMAT_R10G10B10A2_UNORM]        = CAP(  1),
-   [GEN6_FORMAT_R10G10B10A2_UINT]         = CAP(  1),
-   [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = CAP(  1),
-   [GEN6_FORMAT_R8G8B8A8_UNORM]           = CAP(  1),
-   [GEN6_FORMAT_R8G8B8A8_SNORM]           = CAP(  1),
-   [GEN6_FORMAT_R8G8B8A8_SINT]            = CAP(  1),
-   [GEN6_FORMAT_R8G8B8A8_UINT]            = CAP(  1),
-   [GEN6_FORMAT_R16G16_UNORM]             = CAP(  1),
-   [GEN6_FORMAT_R16G16_SNORM]             = CAP(  1),
-   [GEN6_FORMAT_R16G16_SINT]              = CAP(  1),
-   [GEN6_FORMAT_R16G16_UINT]              = CAP(  1),
-   [GEN6_FORMAT_R16G16_FLOAT]             = CAP(  1),
-   [GEN6_FORMAT_B10G10R10A2_UNORM]        = CAP(7.5),
-   [GEN6_FORMAT_R11G11B10_FLOAT]          = CAP(  1),
-   [GEN6_FORMAT_R32_SINT]                 = CAP(  1),
-   [GEN6_FORMAT_R32_UINT]                 = CAP(  1),
-   [GEN6_FORMAT_R32_FLOAT]                = CAP(  1),
-   [GEN6_FORMAT_R32_UNORM]                = CAP(  1),
-   [GEN6_FORMAT_R32_SNORM]                = CAP(  1),
-   [GEN6_FORMAT_R10G10B10X2_USCALED]      = CAP(  1),
-   [GEN6_FORMAT_R8G8B8A8_SSCALED]         = CAP(  1),
-   [GEN6_FORMAT_R8G8B8A8_USCALED]         = CAP(  1),
-   [GEN6_FORMAT_R16G16_SSCALED]           = CAP(  1),
-   [GEN6_FORMAT_R16G16_USCALED]           = CAP(  1),
-   [GEN6_FORMAT_R32_SSCALED]              = CAP(  1),
-   [GEN6_FORMAT_R32_USCALED]              = CAP(  1),
-   [GEN6_FORMAT_R8G8_UNORM]               = CAP(  1),
-   [GEN6_FORMAT_R8G8_SNORM]               = CAP(  1),
-   [GEN6_FORMAT_R8G8_SINT]                = CAP(  1),
-   [GEN6_FORMAT_R8G8_UINT]                = CAP(  1),
-   [GEN6_FORMAT_R16_UNORM]                = CAP(  1),
-   [GEN6_FORMAT_R16_SNORM]                = CAP(  1),
-   [GEN6_FORMAT_R16_SINT]                 = CAP(  1),
-   [GEN6_FORMAT_R16_UINT]                 = CAP(  1),
-   [GEN6_FORMAT_R16_FLOAT]                = CAP(  1),
-   [GEN6_FORMAT_R8G8_SSCALED]             = CAP(  1),
-   [GEN6_FORMAT_R8G8_USCALED]             = CAP(  1),
-   [GEN6_FORMAT_R16_SSCALED]              = CAP(  1),
-   [GEN6_FORMAT_R16_USCALED]              = CAP(  1),
-   [GEN6_FORMAT_R8_UNORM]                 = CAP(  1),
-   [GEN6_FORMAT_R8_SNORM]                 = CAP(  1),
-   [GEN6_FORMAT_R8_SINT]                  = CAP(  1),
-   [GEN6_FORMAT_R8_UINT]                  = CAP(  1),
-   [GEN6_FORMAT_R8_SSCALED]               = CAP(  1),
-   [GEN6_FORMAT_R8_USCALED]               = CAP(  1),
-   [GEN6_FORMAT_R8G8B8_UNORM]             = CAP(  1),
-   [GEN6_FORMAT_R8G8B8_SNORM]             = CAP(  1),
-   [GEN6_FORMAT_R8G8B8_SSCALED]           = CAP(  1),
-   [GEN6_FORMAT_R8G8B8_USCALED]           = CAP(  1),
-   [GEN6_FORMAT_R64G64B64A64_FLOAT]       = CAP(  1),
-   [GEN6_FORMAT_R64G64B64_FLOAT]          = CAP(  1),
-   [GEN6_FORMAT_R16G16B16_FLOAT]          = CAP(  6),
-   [GEN6_FORMAT_R16G16B16_UNORM]          = CAP(  1),
-   [GEN6_FORMAT_R16G16B16_SNORM]          = CAP(  1),
-   [GEN6_FORMAT_R16G16B16_SSCALED]        = CAP(  1),
-   [GEN6_FORMAT_R16G16B16_USCALED]        = CAP(  1),
-   [GEN6_FORMAT_R16G16B16_UINT]           = CAP(7.5),
-   [GEN6_FORMAT_R16G16B16_SINT]           = CAP(7.5),
-   [GEN6_FORMAT_R32_SFIXED]               = CAP(7.5),
-   [GEN6_FORMAT_R10G10B10A2_SNORM]        = CAP(7.5),
-   [GEN6_FORMAT_R10G10B10A2_USCALED]      = CAP(7.5),
-   [GEN6_FORMAT_R10G10B10A2_SSCALED]      = CAP(7.5),
-   [GEN6_FORMAT_R10G10B10A2_SINT]         = CAP(7.5),
-   [GEN6_FORMAT_B10G10R10A2_SNORM]        = CAP(7.5),
-   [GEN6_FORMAT_B10G10R10A2_USCALED]      = CAP(7.5),
-   [GEN6_FORMAT_B10G10R10A2_SSCALED]      = CAP(7.5),
-   [GEN6_FORMAT_B10G10R10A2_UINT]         = CAP(7.5),
-   [GEN6_FORMAT_B10G10R10A2_SINT]         = CAP(7.5),
-   [GEN6_FORMAT_R8G8B8_UINT]              = CAP(7.5),
-   [GEN6_FORMAT_R8G8B8_SINT]              = CAP(7.5),
-#undef CAP
-};
-
-/*
- * This table is based on:
- *
- *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
- *  - the Ivy Bridge PRM, volume 2 part 1, page 195
- *  - the Haswell PRM, volume 7, page 535
- */
-static const struct ilo_sol_cap ilo_sol_caps[] = {
-#define CAP(buffer) { ILO_GEN(buffer) }
-   [GEN6_FORMAT_R32G32B32A32_FLOAT]       = CAP(  1),
-   [GEN6_FORMAT_R32G32B32A32_SINT]        = CAP(  1),
-   [GEN6_FORMAT_R32G32B32A32_UINT]        = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_FLOAT]          = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_SINT]           = CAP(  1),
-   [GEN6_FORMAT_R32G32B32_UINT]           = CAP(  1),
-   [GEN6_FORMAT_R32G32_FLOAT]             = CAP(  1),
-   [GEN6_FORMAT_R32G32_SINT]              = CAP(  1),
-   [GEN6_FORMAT_R32G32_UINT]              = CAP(  1),
-   [GEN6_FORMAT_R32_SINT]                 = CAP(  1),
-   [GEN6_FORMAT_R32_UINT]                 = CAP(  1),
-   [GEN6_FORMAT_R32_FLOAT]                = CAP(  1),
-#undef CAP
-};
-
-/*
- * This table is based on:
- *
- *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
- *  - the Ivy Bridge PRM, volume 4 part 1, page 84-87
- */
-static const struct ilo_sampler_cap ilo_sampler_caps[] = {
-#define CAP(sampling, filtering, shadow_map, chroma_key) \
-   { ILO_GEN(sampling), ILO_GEN(filtering), ILO_GEN(shadow_map), ILO_GEN(chroma_key) }
-   [GEN6_FORMAT_R32G32B32A32_FLOAT]       = CAP(  1,   5,   0,   0),
-   [GEN6_FORMAT_R32G32B32A32_SINT]        = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R32G32B32A32_UINT]        = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R32G32B32X32_FLOAT]       = CAP(  1,   5,   0,   0),
-   [GEN6_FORMAT_R32G32B32_FLOAT]          = CAP(  1,   5,   0,   0),
-   [GEN6_FORMAT_R32G32B32_SINT]           = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R32G32B32_UINT]           = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R16G16B16A16_UNORM]       = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R16G16B16A16_SNORM]       = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R16G16B16A16_SINT]        = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R16G16B16A16_UINT]        = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R16G16B16A16_FLOAT]       = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R32G32_FLOAT]             = CAP(  1,   5,   0,   0),
-   [GEN6_FORMAT_R32G32_SINT]              = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R32G32_UINT]              = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS] = CAP(  1,   5,   1,   0),
-   [GEN6_FORMAT_X32_TYPELESS_G8X24_UINT]  = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_L32A32_FLOAT]             = CAP(  1,   5,   0,   0),
-   [GEN6_FORMAT_R16G16B16X16_UNORM]       = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R16G16B16X16_FLOAT]       = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_A32X32_FLOAT]             = CAP(  1,   5,   0,   0),
-   [GEN6_FORMAT_L32X32_FLOAT]             = CAP(  1,   5,   0,   0),
-   [GEN6_FORMAT_I32X32_FLOAT]             = CAP(  1,   5,   0,   0),
-   [GEN6_FORMAT_B8G8R8A8_UNORM]           = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R10G10B10A2_UNORM]        = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB]   = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R10G10B10A2_UINT]         = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8G8B8A8_UNORM]           = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8G8B8A8_SNORM]           = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8G8B8A8_SINT]            = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R8G8B8A8_UINT]            = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R16G16_UNORM]             = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R16G16_SNORM]             = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R16G16_SINT]              = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R16G16_UINT]              = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R16G16_FLOAT]             = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_B10G10R10A2_UNORM]        = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB]   = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R11G11B10_FLOAT]          = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R32_SINT]                 = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R32_UINT]                 = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R32_FLOAT]                = CAP(  1,   5,   1,   0),
-   [GEN6_FORMAT_R24_UNORM_X8_TYPELESS]    = CAP(  1,   5,   1,   0),
-   [GEN6_FORMAT_X24_TYPELESS_G8_UINT]     = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_L16A16_UNORM]             = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_I24X8_UNORM]              = CAP(  1,   5,   1,   0),
-   [GEN6_FORMAT_L24X8_UNORM]              = CAP(  1,   5,   1,   0),
-   [GEN6_FORMAT_A24X8_UNORM]              = CAP(  1,   5,   1,   0),
-   [GEN6_FORMAT_I32_FLOAT]                = CAP(  1,   5,   1,   0),
-   [GEN6_FORMAT_L32_FLOAT]                = CAP(  1,   5,   1,   0),
-   [GEN6_FORMAT_A32_FLOAT]                = CAP(  1,   5,   1,   0),
-   [GEN6_FORMAT_B8G8R8X8_UNORM]           = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_B8G8R8X8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8G8B8X8_UNORM]           = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8G8B8X8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R9G9B9E5_SHAREDEXP]       = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_B10G10R10X2_UNORM]        = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_L16A16_FLOAT]             = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_B5G6R5_UNORM]             = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_B5G6R5_UNORM_SRGB]        = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_B5G5R5A1_UNORM]           = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_B4G4R4A4_UNORM]           = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8G8_UNORM]               = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8G8_SNORM]               = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_R8G8_SINT]                = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R8G8_UINT]                = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R16_UNORM]                = CAP(  1,   1,   1,   0),
-   [GEN6_FORMAT_R16_SNORM]                = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R16_SINT]                 = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R16_UINT]                 = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R16_FLOAT]                = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_A8P8_UNORM_PALETTE0]      = CAP(  5,   5,   0,   0),
-   [GEN6_FORMAT_A8P8_UNORM_PALETTE1]      = CAP(  5,   5,   0,   0),
-   [GEN6_FORMAT_I16_UNORM]                = CAP(  1,   1,   1,   0),
-   [GEN6_FORMAT_L16_UNORM]                = CAP(  1,   1,   1,   0),
-   [GEN6_FORMAT_A16_UNORM]                = CAP(  1,   1,   1,   0),
-   [GEN6_FORMAT_L8A8_UNORM]               = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_I16_FLOAT]                = CAP(  1,   1,   1,   0),
-   [GEN6_FORMAT_L16_FLOAT]                = CAP(  1,   1,   1,   0),
-   [GEN6_FORMAT_A16_FLOAT]                = CAP(  1,   1,   1,   0),
-   [GEN6_FORMAT_L8A8_UNORM_SRGB]          = CAP(4.5, 4.5,   0,   0),
-   [GEN6_FORMAT_R5G5_SNORM_B6_UNORM]      = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_P8A8_UNORM_PALETTE0]      = CAP(  5,   5,   0,   0),
-   [GEN6_FORMAT_P8A8_UNORM_PALETTE1]      = CAP(  5,   5,   0,   0),
-   [GEN6_FORMAT_R8_UNORM]                 = CAP(  1,   1,   0, 4.5),
-   [GEN6_FORMAT_R8_SNORM]                 = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8_SINT]                  = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_R8_UINT]                  = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_A8_UNORM]                 = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_I8_UNORM]                 = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_L8_UNORM]                 = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_P4A4_UNORM_PALETTE0]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_A4P4_UNORM_PALETTE0]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_P8_UNORM_PALETTE0]        = CAP(4.5, 4.5,   0,   0),
-   [GEN6_FORMAT_L8_UNORM_SRGB]            = CAP(4.5, 4.5,   0,   0),
-   [GEN6_FORMAT_P8_UNORM_PALETTE1]        = CAP(4.5, 4.5,   0,   0),
-   [GEN6_FORMAT_P4A4_UNORM_PALETTE1]      = CAP(4.5, 4.5,   0,   0),
-   [GEN6_FORMAT_A4P4_UNORM_PALETTE1]      = CAP(4.5, 4.5,   0,   0),
-   [GEN6_FORMAT_DXT1_RGB_SRGB]            = CAP(4.5, 4.5,   0,   0),
-   [GEN6_FORMAT_R1_UNORM]                 = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_YCRCB_NORMAL]             = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_YCRCB_SWAPUVY]            = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_P2_UNORM_PALETTE0]        = CAP(4.5, 4.5,   0,   0),
-   [GEN6_FORMAT_P2_UNORM_PALETTE1]        = CAP(4.5, 4.5,   0,   0),
-   [GEN6_FORMAT_BC1_UNORM]                = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_BC2_UNORM]                = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_BC3_UNORM]                = CAP(  1,   1,   0,   1),
-   [GEN6_FORMAT_BC4_UNORM]                = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_BC5_UNORM]                = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_BC1_UNORM_SRGB]           = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_BC2_UNORM_SRGB]           = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_BC3_UNORM_SRGB]           = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_MONO8]                    = CAP(  1,   0,   0,   0),
-   [GEN6_FORMAT_YCRCB_SWAPUV]             = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_YCRCB_SWAPY]              = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_DXT1_RGB]                 = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_FXT1]                     = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_BC4_SNORM]                = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_BC5_SNORM]                = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R16G16B16_FLOAT]          = CAP(  5,   5,   0,   0),
-   [GEN6_FORMAT_BC6H_SF16]                = CAP(  7,   7,   0,   0),
-   [GEN6_FORMAT_BC7_UNORM]                = CAP(  7,   7,   0,   0),
-   [GEN6_FORMAT_BC7_UNORM_SRGB]           = CAP(  7,   7,   0,   0),
-   [GEN6_FORMAT_BC6H_UF16]                = CAP(  7,   7,   0,   0),
-#undef CAP
-};
-
-/*
- * This table is based on:
- *
- *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
- *  - the Ivy Bridge PRM, volume 4 part 1, page 172, 252-253, and 277-278
- *  - the Haswell PRM, volume 7, page 262-264
- */
-static const struct ilo_dp_cap ilo_dp_caps[] = {
-#define CAP(rt_write, rt_write_blending, typed_write, media_color_processing) \
-   { ILO_GEN(rt_write), ILO_GEN(rt_write_blending), ILO_GEN(typed_write), ILO_GEN(media_color_processing) }
-   [GEN6_FORMAT_R32G32B32A32_FLOAT]       = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_R32G32B32A32_SINT]        = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R32G32B32A32_UINT]        = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R16G16B16A16_UNORM]       = CAP(  1, 4.5,   7,   6),
-   [GEN6_FORMAT_R16G16B16A16_SNORM]       = CAP(  1,   6,   7,   0),
-   [GEN6_FORMAT_R16G16B16A16_SINT]        = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R16G16B16A16_UINT]        = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R16G16B16A16_FLOAT]       = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_R32G32_FLOAT]             = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_R32G32_SINT]              = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R32G32_UINT]              = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_B8G8R8A8_UNORM]           = CAP(  1,   1,   7,   6),
-   [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R10G10B10A2_UNORM]        = CAP(  1,   1,   7,   6),
-   [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB]   = CAP(  0,   0,   0,   6),
-   [GEN6_FORMAT_R10G10B10A2_UINT]         = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R8G8B8A8_UNORM]           = CAP(  1,   1,   7,   6),
-   [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB]      = CAP(  1,   1,   0,   6),
-   [GEN6_FORMAT_R8G8B8A8_SNORM]           = CAP(  1,   6,   7,   0),
-   [GEN6_FORMAT_R8G8B8A8_SINT]            = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R8G8B8A8_UINT]            = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R16G16_UNORM]             = CAP(  1, 4.5,   7,   0),
-   [GEN6_FORMAT_R16G16_SNORM]             = CAP(  1,   6,   7,   0),
-   [GEN6_FORMAT_R16G16_SINT]              = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R16G16_UINT]              = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R16G16_FLOAT]             = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_B10G10R10A2_UNORM]        = CAP(  1,   1,   7,   6),
-   [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB]   = CAP(  1,   1,   0,   6),
-   [GEN6_FORMAT_R11G11B10_FLOAT]          = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_R32_SINT]                 = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R32_UINT]                 = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R32_FLOAT]                = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_B8G8R8X8_UNORM]           = CAP(  0,   0,   0,   6),
-   [GEN6_FORMAT_B5G6R5_UNORM]             = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_B5G6R5_UNORM_SRGB]        = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_B5G5R5A1_UNORM]           = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_B4G4R4A4_UNORM]           = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8G8_UNORM]               = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_R8G8_SNORM]               = CAP(  1,   6,   7,   0),
-   [GEN6_FORMAT_R8G8_SINT]                = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R8G8_UINT]                = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R16_UNORM]                = CAP(  1, 4.5,   7,   7),
-   [GEN6_FORMAT_R16_SNORM]                = CAP(  1,   6,   7,   0),
-   [GEN6_FORMAT_R16_SINT]                 = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R16_UINT]                 = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R16_FLOAT]                = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_B5G5R5X1_UNORM]           = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_B5G5R5X1_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
-   [GEN6_FORMAT_R8_UNORM]                 = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_R8_SNORM]                 = CAP(  1,   6,   7,   0),
-   [GEN6_FORMAT_R8_SINT]                  = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_R8_UINT]                  = CAP(  1,   0,   7,   0),
-   [GEN6_FORMAT_A8_UNORM]                 = CAP(  1,   1,   7,   0),
-   [GEN6_FORMAT_YCRCB_NORMAL]             = CAP(  1,   0,   0,   6),
-   [GEN6_FORMAT_YCRCB_SWAPUVY]            = CAP(  1,   0,   0,   6),
-   [GEN6_FORMAT_YCRCB_SWAPUV]             = CAP(  1,   0,   0,   6),
-   [GEN6_FORMAT_YCRCB_SWAPY]              = CAP(  1,   0,   0,   6),
-#undef CAP
-};
-
-bool
-ilo_format_support_vb(const struct ilo_dev *dev,
-                      enum pipe_format format)
-{
-   const int idx = ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER);
-   const struct ilo_vf_cap *cap = (idx >= 0 && idx < Elements(ilo_vf_caps)) ?
-      &ilo_vf_caps[idx] : NULL;
-
-   return (cap && cap->vertex_element &&
-         ilo_dev_gen(dev) >= cap->vertex_element);
-}
-
-bool
-ilo_format_support_sol(const struct ilo_dev *dev,
-                       enum pipe_format format)
-{
-   const int idx = ilo_format_translate(dev, format, PIPE_BIND_STREAM_OUTPUT);
-   const struct ilo_sol_cap *cap = (idx >= 0 && idx < Elements(ilo_sol_caps)) ?
-      &ilo_sol_caps[idx] : NULL;
-
-   return (cap && cap->buffer && ilo_dev_gen(dev) >= cap->buffer);
-}
-
-bool
-ilo_format_support_sampler(const struct ilo_dev *dev,
-                           enum pipe_format format)
-{
-   const int idx = ilo_format_translate(dev, format, PIPE_BIND_SAMPLER_VIEW);
-   const struct ilo_sampler_cap *cap = (idx >= 0 &&
-         idx < Elements(ilo_sampler_caps)) ? &ilo_sampler_caps[idx] : NULL;
-
-   if (!cap || !cap->sampling)
-      return false;
-
-   assert(!cap->filtering || cap->filtering >= cap->sampling);
-
-   if (util_format_is_pure_integer(format))
-      return (ilo_dev_gen(dev) >= cap->sampling);
-   else if (cap->filtering)
-      return (ilo_dev_gen(dev) >= cap->filtering);
-   else
-      return false;
-}
-
-bool
-ilo_format_support_rt(const struct ilo_dev *dev,
-                      enum pipe_format format)
-{
-   const int idx = ilo_format_translate(dev, format, PIPE_BIND_RENDER_TARGET);
-   const struct ilo_dp_cap *cap = (idx >= 0 && idx < Elements(ilo_dp_caps)) ?
-      &ilo_dp_caps[idx] : NULL;
-
-   if (!cap || !cap->rt_write)
-      return false;
-
-   assert(!cap->rt_write_blending || cap->rt_write_blending >= cap->rt_write);
-
-   if (util_format_is_pure_integer(format))
-      return (ilo_dev_gen(dev) >= cap->rt_write);
-   else if (cap->rt_write_blending)
-      return (ilo_dev_gen(dev) >= cap->rt_write_blending);
-   else
-      return false;
-}
-
-bool
-ilo_format_support_zs(const struct ilo_dev *dev,
-                      enum pipe_format format)
-{
-   switch (format) {
-   case PIPE_FORMAT_Z16_UNORM:
-   case PIPE_FORMAT_Z24X8_UNORM:
-   case PIPE_FORMAT_Z32_FLOAT:
-   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-      return true;
-   case PIPE_FORMAT_S8_UINT:
-      /* TODO separate stencil */
-   default:
-      return false;
-   }
-}
-
-/**
- * Translate a color (non-depth/stencil) pipe format to the matching hardware
- * format.  Return -1 on errors.
- */
-int
-ilo_format_translate_color(const struct ilo_dev *dev,
-                           enum pipe_format format)
-{
-   static const int format_mapping[PIPE_FORMAT_COUNT] = {
-      [PIPE_FORMAT_NONE]                  = 0,
-      [PIPE_FORMAT_B8G8R8A8_UNORM]        = GEN6_FORMAT_B8G8R8A8_UNORM,
-      [PIPE_FORMAT_B8G8R8X8_UNORM]        = GEN6_FORMAT_B8G8R8X8_UNORM,
-      [PIPE_FORMAT_A8R8G8B8_UNORM]        = 0,
-      [PIPE_FORMAT_X8R8G8B8_UNORM]        = 0,
-      [PIPE_FORMAT_B5G5R5A1_UNORM]        = GEN6_FORMAT_B5G5R5A1_UNORM,
-      [PIPE_FORMAT_B4G4R4A4_UNORM]        = GEN6_FORMAT_B4G4R4A4_UNORM,
-      [PIPE_FORMAT_B5G6R5_UNORM]          = GEN6_FORMAT_B5G6R5_UNORM,
-      [PIPE_FORMAT_R10G10B10A2_UNORM]     = GEN6_FORMAT_R10G10B10A2_UNORM,
-      [PIPE_FORMAT_L8_UNORM]              = GEN6_FORMAT_L8_UNORM,
-      [PIPE_FORMAT_A8_UNORM]              = GEN6_FORMAT_A8_UNORM,
-      [PIPE_FORMAT_I8_UNORM]              = GEN6_FORMAT_I8_UNORM,
-      [PIPE_FORMAT_L8A8_UNORM]            = GEN6_FORMAT_L8A8_UNORM,
-      [PIPE_FORMAT_L16_UNORM]             = GEN6_FORMAT_L16_UNORM,
-      [PIPE_FORMAT_UYVY]                  = GEN6_FORMAT_YCRCB_SWAPUVY,
-      [PIPE_FORMAT_YUYV]                  = GEN6_FORMAT_YCRCB_NORMAL,
-      [PIPE_FORMAT_Z16_UNORM]             = 0,
-      [PIPE_FORMAT_Z32_UNORM]             = 0,
-      [PIPE_FORMAT_Z32_FLOAT]             = 0,
-      [PIPE_FORMAT_Z24_UNORM_S8_UINT]     = 0,
-      [PIPE_FORMAT_S8_UINT_Z24_UNORM]     = 0,
-      [PIPE_FORMAT_Z24X8_UNORM]           = 0,
-      [PIPE_FORMAT_X8Z24_UNORM]           = 0,
-      [PIPE_FORMAT_S8_UINT]               = 0,
-      [PIPE_FORMAT_R64_FLOAT]             = GEN6_FORMAT_R64_FLOAT,
-      [PIPE_FORMAT_R64G64_FLOAT]          = GEN6_FORMAT_R64G64_FLOAT,
-      [PIPE_FORMAT_R64G64B64_FLOAT]       = GEN6_FORMAT_R64G64B64_FLOAT,
-      [PIPE_FORMAT_R64G64B64A64_FLOAT]    = GEN6_FORMAT_R64G64B64A64_FLOAT,
-      [PIPE_FORMAT_R32_FLOAT]             = GEN6_FORMAT_R32_FLOAT,
-      [PIPE_FORMAT_R32G32_FLOAT]          = GEN6_FORMAT_R32G32_FLOAT,
-      [PIPE_FORMAT_R32G32B32_FLOAT]       = GEN6_FORMAT_R32G32B32_FLOAT,
-      [PIPE_FORMAT_R32G32B32A32_FLOAT]    = GEN6_FORMAT_R32G32B32A32_FLOAT,
-      [PIPE_FORMAT_R32_UNORM]             = GEN6_FORMAT_R32_UNORM,
-      [PIPE_FORMAT_R32G32_UNORM]          = GEN6_FORMAT_R32G32_UNORM,
-      [PIPE_FORMAT_R32G32B32_UNORM]       = GEN6_FORMAT_R32G32B32_UNORM,
-      [PIPE_FORMAT_R32G32B32A32_UNORM]    = GEN6_FORMAT_R32G32B32A32_UNORM,
-      [PIPE_FORMAT_R32_USCALED]           = GEN6_FORMAT_R32_USCALED,
-      [PIPE_FORMAT_R32G32_USCALED]        = GEN6_FORMAT_R32G32_USCALED,
-      [PIPE_FORMAT_R32G32B32_USCALED]     = GEN6_FORMAT_R32G32B32_USCALED,
-      [PIPE_FORMAT_R32G32B32A32_USCALED]  = GEN6_FORMAT_R32G32B32A32_USCALED,
-      [PIPE_FORMAT_R32_SNORM]             = GEN6_FORMAT_R32_SNORM,
-      [PIPE_FORMAT_R32G32_SNORM]          = GEN6_FORMAT_R32G32_SNORM,
-      [PIPE_FORMAT_R32G32B32_SNORM]       = GEN6_FORMAT_R32G32B32_SNORM,
-      [PIPE_FORMAT_R32G32B32A32_SNORM]    = GEN6_FORMAT_R32G32B32A32_SNORM,
-      [PIPE_FORMAT_R32_SSCALED]           = GEN6_FORMAT_R32_SSCALED,
-      [PIPE_FORMAT_R32G32_SSCALED]        = GEN6_FORMAT_R32G32_SSCALED,
-      [PIPE_FORMAT_R32G32B32_SSCALED]     = GEN6_FORMAT_R32G32B32_SSCALED,
-      [PIPE_FORMAT_R32G32B32A32_SSCALED]  = GEN6_FORMAT_R32G32B32A32_SSCALED,
-      [PIPE_FORMAT_R16_UNORM]             = GEN6_FORMAT_R16_UNORM,
-      [PIPE_FORMAT_R16G16_UNORM]          = GEN6_FORMAT_R16G16_UNORM,
-      [PIPE_FORMAT_R16G16B16_UNORM]       = GEN6_FORMAT_R16G16B16_UNORM,
-      [PIPE_FORMAT_R16G16B16A16_UNORM]    = GEN6_FORMAT_R16G16B16A16_UNORM,
-      [PIPE_FORMAT_R16_USCALED]           = GEN6_FORMAT_R16_USCALED,
-      [PIPE_FORMAT_R16G16_USCALED]        = GEN6_FORMAT_R16G16_USCALED,
-      [PIPE_FORMAT_R16G16B16_USCALED]     = GEN6_FORMAT_R16G16B16_USCALED,
-      [PIPE_FORMAT_R16G16B16A16_USCALED]  = GEN6_FORMAT_R16G16B16A16_USCALED,
-      [PIPE_FORMAT_R16_SNORM]             = GEN6_FORMAT_R16_SNORM,
-      [PIPE_FORMAT_R16G16_SNORM]          = GEN6_FORMAT_R16G16_SNORM,
-      [PIPE_FORMAT_R16G16B16_SNORM]       = GEN6_FORMAT_R16G16B16_SNORM,
-      [PIPE_FORMAT_R16G16B16A16_SNORM]    = GEN6_FORMAT_R16G16B16A16_SNORM,
-      [PIPE_FORMAT_R16_SSCALED]           = GEN6_FORMAT_R16_SSCALED,
-      [PIPE_FORMAT_R16G16_SSCALED]        = GEN6_FORMAT_R16G16_SSCALED,
-      [PIPE_FORMAT_R16G16B16_SSCALED]     = GEN6_FORMAT_R16G16B16_SSCALED,
-      [PIPE_FORMAT_R16G16B16A16_SSCALED]  = GEN6_FORMAT_R16G16B16A16_SSCALED,
-      [PIPE_FORMAT_R8_UNORM]              = GEN6_FORMAT_R8_UNORM,
-      [PIPE_FORMAT_R8G8_UNORM]            = GEN6_FORMAT_R8G8_UNORM,
-      [PIPE_FORMAT_R8G8B8_UNORM]          = GEN6_FORMAT_R8G8B8_UNORM,
-      [PIPE_FORMAT_R8G8B8A8_UNORM]        = GEN6_FORMAT_R8G8B8A8_UNORM,
-      [PIPE_FORMAT_X8B8G8R8_UNORM]        = 0,
-      [PIPE_FORMAT_R8_USCALED]            = GEN6_FORMAT_R8_USCALED,
-      [PIPE_FORMAT_R8G8_USCALED]          = GEN6_FORMAT_R8G8_USCALED,
-      [PIPE_FORMAT_R8G8B8_USCALED]        = GEN6_FORMAT_R8G8B8_USCALED,
-      [PIPE_FORMAT_R8G8B8A8_USCALED]      = GEN6_FORMAT_R8G8B8A8_USCALED,
-      [PIPE_FORMAT_R8_SNORM]              = GEN6_FORMAT_R8_SNORM,
-      [PIPE_FORMAT_R8G8_SNORM]            = GEN6_FORMAT_R8G8_SNORM,
-      [PIPE_FORMAT_R8G8B8_SNORM]          = GEN6_FORMAT_R8G8B8_SNORM,
-      [PIPE_FORMAT_R8G8B8A8_SNORM]        = GEN6_FORMAT_R8G8B8A8_SNORM,
-      [PIPE_FORMAT_R8_SSCALED]            = GEN6_FORMAT_R8_SSCALED,
-      [PIPE_FORMAT_R8G8_SSCALED]          = GEN6_FORMAT_R8G8_SSCALED,
-      [PIPE_FORMAT_R8G8B8_SSCALED]        = GEN6_FORMAT_R8G8B8_SSCALED,
-      [PIPE_FORMAT_R8G8B8A8_SSCALED]      = GEN6_FORMAT_R8G8B8A8_SSCALED,
-      [PIPE_FORMAT_R32_FIXED]             = GEN6_FORMAT_R32_SFIXED,
-      [PIPE_FORMAT_R32G32_FIXED]          = GEN6_FORMAT_R32G32_SFIXED,
-      [PIPE_FORMAT_R32G32B32_FIXED]       = GEN6_FORMAT_R32G32B32_SFIXED,
-      [PIPE_FORMAT_R32G32B32A32_FIXED]    = GEN6_FORMAT_R32G32B32A32_SFIXED,
-      [PIPE_FORMAT_R16_FLOAT]             = GEN6_FORMAT_R16_FLOAT,
-      [PIPE_FORMAT_R16G16_FLOAT]          = GEN6_FORMAT_R16G16_FLOAT,
-      [PIPE_FORMAT_R16G16B16_FLOAT]       = GEN6_FORMAT_R16G16B16_FLOAT,
-      [PIPE_FORMAT_R16G16B16A16_FLOAT]    = GEN6_FORMAT_R16G16B16A16_FLOAT,
-      [PIPE_FORMAT_L8_SRGB]               = GEN6_FORMAT_L8_UNORM_SRGB,
-      [PIPE_FORMAT_L8A8_SRGB]             = GEN6_FORMAT_L8A8_UNORM_SRGB,
-      [PIPE_FORMAT_R8G8B8_SRGB]           = GEN6_FORMAT_R8G8B8_UNORM_SRGB,
-      [PIPE_FORMAT_A8B8G8R8_SRGB]         = 0,
-      [PIPE_FORMAT_X8B8G8R8_SRGB]         = 0,
-      [PIPE_FORMAT_B8G8R8A8_SRGB]         = GEN6_FORMAT_B8G8R8A8_UNORM_SRGB,
-      [PIPE_FORMAT_B8G8R8X8_SRGB]         = GEN6_FORMAT_B8G8R8X8_UNORM_SRGB,
-      [PIPE_FORMAT_A8R8G8B8_SRGB]         = 0,
-      [PIPE_FORMAT_X8R8G8B8_SRGB]         = 0,
-      [PIPE_FORMAT_R8G8B8A8_SRGB]         = GEN6_FORMAT_R8G8B8A8_UNORM_SRGB,
-      [PIPE_FORMAT_DXT1_RGB]              = GEN6_FORMAT_DXT1_RGB,
-      [PIPE_FORMAT_DXT1_RGBA]             = GEN6_FORMAT_BC1_UNORM,
-      [PIPE_FORMAT_DXT3_RGBA]             = GEN6_FORMAT_BC2_UNORM,
-      [PIPE_FORMAT_DXT5_RGBA]             = GEN6_FORMAT_BC3_UNORM,
-      [PIPE_FORMAT_DXT1_SRGB]             = GEN6_FORMAT_DXT1_RGB_SRGB,
-      [PIPE_FORMAT_DXT1_SRGBA]            = GEN6_FORMAT_BC1_UNORM_SRGB,
-      [PIPE_FORMAT_DXT3_SRGBA]            = GEN6_FORMAT_BC2_UNORM_SRGB,
-      [PIPE_FORMAT_DXT5_SRGBA]            = GEN6_FORMAT_BC3_UNORM_SRGB,
-      [PIPE_FORMAT_RGTC1_UNORM]           = GEN6_FORMAT_BC4_UNORM,
-      [PIPE_FORMAT_RGTC1_SNORM]           = GEN6_FORMAT_BC4_SNORM,
-      [PIPE_FORMAT_RGTC2_UNORM]           = GEN6_FORMAT_BC5_UNORM,
-      [PIPE_FORMAT_RGTC2_SNORM]           = GEN6_FORMAT_BC5_SNORM,
-      [PIPE_FORMAT_R8G8_B8G8_UNORM]       = 0,
-      [PIPE_FORMAT_G8R8_G8B8_UNORM]       = 0,
-      [PIPE_FORMAT_R8SG8SB8UX8U_NORM]     = 0,
-      [PIPE_FORMAT_R5SG5SB6U_NORM]        = 0,
-      [PIPE_FORMAT_A8B8G8R8_UNORM]        = 0,
-      [PIPE_FORMAT_B5G5R5X1_UNORM]        = GEN6_FORMAT_B5G5R5X1_UNORM,
-      [PIPE_FORMAT_R10G10B10A2_USCALED]   = GEN6_FORMAT_R10G10B10A2_USCALED,
-      [PIPE_FORMAT_R11G11B10_FLOAT]       = GEN6_FORMAT_R11G11B10_FLOAT,
-      [PIPE_FORMAT_R9G9B9E5_FLOAT]        = GEN6_FORMAT_R9G9B9E5_SHAREDEXP,
-      [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT]  = 0,
-      [PIPE_FORMAT_R1_UNORM]              = GEN6_FORMAT_R1_UNORM,
-      [PIPE_FORMAT_R10G10B10X2_USCALED]   = GEN6_FORMAT_R10G10B10X2_USCALED,
-      [PIPE_FORMAT_R10G10B10X2_SNORM]     = 0,
-      [PIPE_FORMAT_L4A4_UNORM]            = 0,
-      [PIPE_FORMAT_B10G10R10A2_UNORM]     = GEN6_FORMAT_B10G10R10A2_UNORM,
-      [PIPE_FORMAT_R10SG10SB10SA2U_NORM]  = 0,
-      [PIPE_FORMAT_R8G8Bx_SNORM]          = 0,
-      [PIPE_FORMAT_R8G8B8X8_UNORM]        = GEN6_FORMAT_R8G8B8X8_UNORM,
-      [PIPE_FORMAT_B4G4R4X4_UNORM]        = 0,
-      [PIPE_FORMAT_X24S8_UINT]            = 0,
-      [PIPE_FORMAT_S8X24_UINT]            = 0,
-      [PIPE_FORMAT_X32_S8X24_UINT]        = 0,
-      [PIPE_FORMAT_B2G3R3_UNORM]          = 0,
-      [PIPE_FORMAT_L16A16_UNORM]          = GEN6_FORMAT_L16A16_UNORM,
-      [PIPE_FORMAT_A16_UNORM]             = GEN6_FORMAT_A16_UNORM,
-      [PIPE_FORMAT_I16_UNORM]             = GEN6_FORMAT_I16_UNORM,
-      [PIPE_FORMAT_LATC1_UNORM]           = 0,
-      [PIPE_FORMAT_LATC1_SNORM]           = 0,
-      [PIPE_FORMAT_LATC2_UNORM]           = 0,
-      [PIPE_FORMAT_LATC2_SNORM]           = 0,
-      [PIPE_FORMAT_A8_SNORM]              = 0,
-      [PIPE_FORMAT_L8_SNORM]              = 0,
-      [PIPE_FORMAT_L8A8_SNORM]            = 0,
-      [PIPE_FORMAT_I8_SNORM]              = 0,
-      [PIPE_FORMAT_A16_SNORM]             = 0,
-      [PIPE_FORMAT_L16_SNORM]             = 0,
-      [PIPE_FORMAT_L16A16_SNORM]          = 0,
-      [PIPE_FORMAT_I16_SNORM]             = 0,
-      [PIPE_FORMAT_A16_FLOAT]             = GEN6_FORMAT_A16_FLOAT,
-      [PIPE_FORMAT_L16_FLOAT]             = GEN6_FORMAT_L16_FLOAT,
-      [PIPE_FORMAT_L16A16_FLOAT]          = GEN6_FORMAT_L16A16_FLOAT,
-      [PIPE_FORMAT_I16_FLOAT]             = GEN6_FORMAT_I16_FLOAT,
-      [PIPE_FORMAT_A32_FLOAT]             = GEN6_FORMAT_A32_FLOAT,
-      [PIPE_FORMAT_L32_FLOAT]             = GEN6_FORMAT_L32_FLOAT,
-      [PIPE_FORMAT_L32A32_FLOAT]          = GEN6_FORMAT_L32A32_FLOAT,
-      [PIPE_FORMAT_I32_FLOAT]             = GEN6_FORMAT_I32_FLOAT,
-      [PIPE_FORMAT_YV12]                  = 0,
-      [PIPE_FORMAT_YV16]                  = 0,
-      [PIPE_FORMAT_IYUV]                  = 0,
-      [PIPE_FORMAT_NV12]                  = 0,
-      [PIPE_FORMAT_NV21]                  = 0,
-      [PIPE_FORMAT_A4R4_UNORM]            = 0,
-      [PIPE_FORMAT_R4A4_UNORM]            = 0,
-      [PIPE_FORMAT_R8A8_UNORM]            = 0,
-      [PIPE_FORMAT_A8R8_UNORM]            = 0,
-      [PIPE_FORMAT_R10G10B10A2_SSCALED]   = GEN6_FORMAT_R10G10B10A2_SSCALED,
-      [PIPE_FORMAT_R10G10B10A2_SNORM]     = GEN6_FORMAT_R10G10B10A2_SNORM,
-      [PIPE_FORMAT_B10G10R10A2_USCALED]   = GEN6_FORMAT_B10G10R10A2_USCALED,
-      [PIPE_FORMAT_B10G10R10A2_SSCALED]   = GEN6_FORMAT_B10G10R10A2_SSCALED,
-      [PIPE_FORMAT_B10G10R10A2_SNORM]     = GEN6_FORMAT_B10G10R10A2_SNORM,
-      [PIPE_FORMAT_R8_UINT]               = GEN6_FORMAT_R8_UINT,
-      [PIPE_FORMAT_R8G8_UINT]             = GEN6_FORMAT_R8G8_UINT,
-      [PIPE_FORMAT_R8G8B8_UINT]           = GEN6_FORMAT_R8G8B8_UINT,
-      [PIPE_FORMAT_R8G8B8A8_UINT]         = GEN6_FORMAT_R8G8B8A8_UINT,
-      [PIPE_FORMAT_R8_SINT]               = GEN6_FORMAT_R8_SINT,
-      [PIPE_FORMAT_R8G8_SINT]             = GEN6_FORMAT_R8G8_SINT,
-      [PIPE_FORMAT_R8G8B8_SINT]           = GEN6_FORMAT_R8G8B8_SINT,
-      [PIPE_FORMAT_R8G8B8A8_SINT]         = GEN6_FORMAT_R8G8B8A8_SINT,
-      [PIPE_FORMAT_R16_UINT]              = GEN6_FORMAT_R16_UINT,
-      [PIPE_FORMAT_R16G16_UINT]           = GEN6_FORMAT_R16G16_UINT,
-      [PIPE_FORMAT_R16G16B16_UINT]        = GEN6_FORMAT_R16G16B16_UINT,
-      [PIPE_FORMAT_R16G16B16A16_UINT]     = GEN6_FORMAT_R16G16B16A16_UINT,
-      [PIPE_FORMAT_R16_SINT]              = GEN6_FORMAT_R16_SINT,
-      [PIPE_FORMAT_R16G16_SINT]           = GEN6_FORMAT_R16G16_SINT,
-      [PIPE_FORMAT_R16G16B16_SINT]        = GEN6_FORMAT_R16G16B16_SINT,
-      [PIPE_FORMAT_R16G16B16A16_SINT]     = GEN6_FORMAT_R16G16B16A16_SINT,
-      [PIPE_FORMAT_R32_UINT]              = GEN6_FORMAT_R32_UINT,
-      [PIPE_FORMAT_R32G32_UINT]           = GEN6_FORMAT_R32G32_UINT,
-      [PIPE_FORMAT_R32G32B32_UINT]        = GEN6_FORMAT_R32G32B32_UINT,
-      [PIPE_FORMAT_R32G32B32A32_UINT]     = GEN6_FORMAT_R32G32B32A32_UINT,
-      [PIPE_FORMAT_R32_SINT]              = GEN6_FORMAT_R32_SINT,
-      [PIPE_FORMAT_R32G32_SINT]           = GEN6_FORMAT_R32G32_SINT,
-      [PIPE_FORMAT_R32G32B32_SINT]        = GEN6_FORMAT_R32G32B32_SINT,
-      [PIPE_FORMAT_R32G32B32A32_SINT]     = GEN6_FORMAT_R32G32B32A32_SINT,
-      [PIPE_FORMAT_A8_UINT]               = 0,
-      [PIPE_FORMAT_I8_UINT]               = GEN6_FORMAT_I8_UINT,
-      [PIPE_FORMAT_L8_UINT]               = GEN6_FORMAT_L8_UINT,
-      [PIPE_FORMAT_L8A8_UINT]             = GEN6_FORMAT_L8A8_UINT,
-      [PIPE_FORMAT_A8_SINT]               = 0,
-      [PIPE_FORMAT_I8_SINT]               = GEN6_FORMAT_I8_SINT,
-      [PIPE_FORMAT_L8_SINT]               = GEN6_FORMAT_L8_SINT,
-      [PIPE_FORMAT_L8A8_SINT]             = GEN6_FORMAT_L8A8_SINT,
-      [PIPE_FORMAT_A16_UINT]              = 0,
-      [PIPE_FORMAT_I16_UINT]              = 0,
-      [PIPE_FORMAT_L16_UINT]              = 0,
-      [PIPE_FORMAT_L16A16_UINT]           = 0,
-      [PIPE_FORMAT_A16_SINT]              = 0,
-      [PIPE_FORMAT_I16_SINT]              = 0,
-      [PIPE_FORMAT_L16_SINT]              = 0,
-      [PIPE_FORMAT_L16A16_SINT]           = 0,
-      [PIPE_FORMAT_A32_UINT]              = 0,
-      [PIPE_FORMAT_I32_UINT]              = 0,
-      [PIPE_FORMAT_L32_UINT]              = 0,
-      [PIPE_FORMAT_L32A32_UINT]           = 0,
-      [PIPE_FORMAT_A32_SINT]              = 0,
-      [PIPE_FORMAT_I32_SINT]              = 0,
-      [PIPE_FORMAT_L32_SINT]              = 0,
-      [PIPE_FORMAT_L32A32_SINT]           = 0,
-      [PIPE_FORMAT_B10G10R10A2_UINT]      = GEN6_FORMAT_B10G10R10A2_UINT,
-      [PIPE_FORMAT_ETC1_RGB8]             = GEN6_FORMAT_ETC1_RGB8,
-      [PIPE_FORMAT_R8G8_R8B8_UNORM]       = 0,
-      [PIPE_FORMAT_G8R8_B8R8_UNORM]       = 0,
-      [PIPE_FORMAT_R8G8B8X8_SNORM]        = 0,
-      [PIPE_FORMAT_R8G8B8X8_SRGB]         = 0,
-      [PIPE_FORMAT_R8G8B8X8_UINT]         = 0,
-      [PIPE_FORMAT_R8G8B8X8_SINT]         = 0,
-      [PIPE_FORMAT_B10G10R10X2_UNORM]     = GEN6_FORMAT_B10G10R10X2_UNORM,
-      [PIPE_FORMAT_R16G16B16X16_UNORM]    = GEN6_FORMAT_R16G16B16X16_UNORM,
-      [PIPE_FORMAT_R16G16B16X16_SNORM]    = 0,
-      [PIPE_FORMAT_R16G16B16X16_FLOAT]    = GEN6_FORMAT_R16G16B16X16_FLOAT,
-      [PIPE_FORMAT_R16G16B16X16_UINT]     = 0,
-      [PIPE_FORMAT_R16G16B16X16_SINT]     = 0,
-      [PIPE_FORMAT_R32G32B32X32_FLOAT]    = GEN6_FORMAT_R32G32B32X32_FLOAT,
-      [PIPE_FORMAT_R32G32B32X32_UINT]     = 0,
-      [PIPE_FORMAT_R32G32B32X32_SINT]     = 0,
-      [PIPE_FORMAT_R8A8_SNORM]            = 0,
-      [PIPE_FORMAT_R16A16_UNORM]          = 0,
-      [PIPE_FORMAT_R16A16_SNORM]          = 0,
-      [PIPE_FORMAT_R16A16_FLOAT]          = 0,
-      [PIPE_FORMAT_R32A32_FLOAT]          = 0,
-      [PIPE_FORMAT_R8A8_UINT]             = 0,
-      [PIPE_FORMAT_R8A8_SINT]             = 0,
-      [PIPE_FORMAT_R16A16_UINT]           = 0,
-      [PIPE_FORMAT_R16A16_SINT]           = 0,
-      [PIPE_FORMAT_R32A32_UINT]           = 0,
-      [PIPE_FORMAT_R32A32_SINT]           = 0,
-      [PIPE_FORMAT_R10G10B10A2_UINT]      = GEN6_FORMAT_R10G10B10A2_UINT,
-      [PIPE_FORMAT_B5G6R5_SRGB]           = GEN6_FORMAT_B5G6R5_UNORM_SRGB,
-   };
-   int sfmt = format_mapping[format];
-
-   /* GEN6_FORMAT_R32G32B32A32_FLOAT happens to be 0 */
-   if (!sfmt && format != PIPE_FORMAT_R32G32B32A32_FLOAT)
-      sfmt = -1;
-
-   return sfmt;
-}
diff --git a/src/gallium/drivers/ilo/core/ilo_image.c b/src/gallium/drivers/ilo/core/ilo_image.c
index 22c8ef2..0d837d8 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.c
+++ b/src/gallium/drivers/ilo/core/ilo_image.c
@@ -675,9 +675,12 @@ img_init_size_and_format(struct ilo_image *img,
    enum pipe_format format = templ->format;
    bool require_separate_stencil = false;
 
+   img->target = templ->target;
    img->width0 = templ->width0;
    img->height0 = templ->height0;
    img->depth0 = templ->depth0;
+   img->array_size = templ->array_size;
+   img->level_count = templ->last_level + 1;
    img->sample_count = (templ->nr_samples) ? templ->nr_samples : 1;
 
    /*
@@ -794,6 +797,10 @@ img_want_hiz(const struct ilo_image *img,
    if (ilo_debug & ILO_DEBUG_NOHIZ)
       return false;
 
+   /* we want 8x4 aligned levels */
+   if (templ->target == PIPE_TEXTURE_1D)
+      return false;
+
    if (!(templ->bind & PIPE_BIND_DEPTH_STENCIL))
       return false;
 
@@ -1343,9 +1350,12 @@ img_init_for_transfer(struct ilo_image *img,
 
    img->aux.type = ILO_IMAGE_AUX_NONE;
 
+   img->target = templ->target;
    img->width0 = templ->width0;
    img->height0 = templ->height0;
    img->depth0 = templ->depth0;
+   img->array_size = templ->array_size;
+   img->level_count = 1;
    img->sample_count = 1;
 
    img->format = templ->format;
@@ -1386,6 +1396,8 @@ void ilo_image_init(struct ilo_image *img,
    struct ilo_image_params params;
    bool transfer_only;
 
+   assert(ilo_is_zeroed(img, sizeof(*img)));
+
    /* use transfer layout when the texture is never bound to GPU */
    transfer_only = !(templ->bind & ~(PIPE_BIND_TRANSFER_WRITE |
                                      PIPE_BIND_TRANSFER_READ));
@@ -1411,6 +1423,8 @@ ilo_image_init_for_imported(struct ilo_image *img,
 {
    struct ilo_image_params params;
 
+   assert(ilo_is_zeroed(img, sizeof(*img)));
+
    if ((tiling == GEN6_TILING_X && bo_stride % 512) ||
        (tiling == GEN6_TILING_Y && bo_stride % 128) ||
        (tiling == GEN8_TILING_W && bo_stride % 64))
@@ -1435,3 +1449,22 @@ ilo_image_init_for_imported(struct ilo_image *img,
 
    return true;
 }
+
+bool
+ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev)
+{
+   /* Gen6 separate stencil depends on HiZ, so HiZ has to stay enabled */
+   const bool hiz_required = (ilo_dev_gen(dev) == ILO_GEN(6) &&
+                              img->aux.type == ILO_IMAGE_AUX_HIZ &&
+                              img->separate_stencil);
+   /* multisample images depend on MCS, so MCS has to stay enabled */
+   const bool mcs_required = (img->aux.type == ILO_IMAGE_AUX_MCS &&
+                              img->sample_count > 1);
+
+   if (hiz_required || mcs_required)
+      return false;
+
+   /* nothing depends on the aux buffer: clear the per-level enable mask */
+   img->aux.enables = 0x0;
+   return true;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_image.h b/src/gallium/drivers/ilo/core/ilo_image.h
index 4956bda..af15e85 100644
--- a/src/gallium/drivers/ilo/core/ilo_image.h
+++ b/src/gallium/drivers/ilo/core/ilo_image.h
@@ -88,10 +88,14 @@ struct ilo_image_lod {
  * Texture layout.
  */
 struct ilo_image {
+   enum pipe_texture_target target;
+
    /* size, format, etc for programming hardware states */
    unsigned width0;
    unsigned height0;
    unsigned depth0;
+   unsigned array_size;
+   unsigned level_count;
    unsigned sample_count;
    enum pipe_format format;
    bool separate_stencil;
@@ -125,8 +129,6 @@ struct ilo_image {
 
    bool scanout;
 
-   struct intel_bo *bo;
-
    struct {
       enum ilo_image_aux_type type;
 
@@ -140,8 +142,12 @@ struct ilo_image {
       unsigned bo_stride;
       unsigned bo_height;
 
+      /* managed by users */
       struct intel_bo *bo;
    } aux;
+
+   /* managed by users */
+   struct intel_bo *bo;
 };
 
 struct pipe_resource;
@@ -158,31 +164,13 @@ ilo_image_init_for_imported(struct ilo_image *img,
                             enum gen_surface_tiling tiling,
                             unsigned bo_stride);
 
-static inline void
-ilo_image_cleanup(struct ilo_image *img)
-{
-   intel_bo_unref(img->bo);
-   intel_bo_unref(img->aux.bo);
-}
-
-static inline void
-ilo_image_set_bo(struct ilo_image *img, struct intel_bo *bo)
-{
-   intel_bo_unref(img->bo);
-   img->bo = intel_bo_ref(bo);
-}
-
-static inline void
-ilo_image_set_aux_bo(struct ilo_image *img, struct intel_bo *bo)
-{
-   intel_bo_unref(img->aux.bo);
-   img->aux.bo = intel_bo_ref(bo);
-}
+bool
+ilo_image_disable_aux(struct ilo_image *img, const struct ilo_dev *dev);
 
 static inline bool
 ilo_image_can_enable_aux(const struct ilo_image *img, unsigned level)
 {
-   return (img->aux.bo && (img->aux.enables & (1 << level)));
+   return (img->aux.enables & (1 << level)); /* one enable bit per level */
 }
 
 /**
diff --git a/src/gallium/drivers/ilo/core/ilo_state_3d.h b/src/gallium/drivers/ilo/core/ilo_state_3d.h
deleted file mode 100644
index fdce445..0000000
--- a/src/gallium/drivers/ilo/core/ilo_state_3d.h
+++ /dev/null
@@ -1,427 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2014 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Chia-I Wu <olv@lunarg.com>
- */
-
-#ifndef ILO_STATE_3D_H
-#define ILO_STATE_3D_H
-
-#include "genhw/genhw.h"
-#include "pipe/p_state.h"
-
-#include "ilo_core.h"
-#include "ilo_dev.h"
-
-/**
- * \see brw_context.h
- */
-#define ILO_MAX_DRAW_BUFFERS    8
-#define ILO_MAX_CONST_BUFFERS   (1 + 12)
-#define ILO_MAX_SAMPLER_VIEWS   16
-#define ILO_MAX_SAMPLERS        16
-#define ILO_MAX_SO_BINDINGS     64
-#define ILO_MAX_SO_BUFFERS      4
-#define ILO_MAX_VIEWPORTS       1
-
-#define ILO_MAX_SURFACES        256
-
-struct intel_bo;
-struct ilo_buffer;
-struct ilo_image;
-struct ilo_shader_state;
-
-struct ilo_vb_state {
-   struct pipe_vertex_buffer states[PIPE_MAX_ATTRIBS];
-   uint32_t enabled_mask;
-};
-
-struct ilo_ib_state {
-   struct pipe_resource *buffer;
-   const void *user_buffer;
-   unsigned offset;
-   unsigned index_size;
-
-   /* these are not valid until the state is finalized */
-   struct pipe_resource *hw_resource;
-   unsigned hw_index_size;
-   /* an offset to be added to pipe_draw_info::start */
-   int64_t draw_start_offset;
-};
-
-struct ilo_ve_cso {
-   /* VERTEX_ELEMENT_STATE */
-   uint32_t payload[2];
-};
-
-struct ilo_ve_state {
-   struct ilo_ve_cso cso[PIPE_MAX_ATTRIBS];
-   unsigned count;
-
-   unsigned instance_divisors[PIPE_MAX_ATTRIBS];
-   unsigned vb_mapping[PIPE_MAX_ATTRIBS];
-   unsigned vb_count;
-
-   /* these are not valid until the state is finalized */
-   struct ilo_ve_cso edgeflag_cso;
-   bool last_cso_edgeflag;
-
-   struct ilo_ve_cso nosrc_cso;
-   bool prepend_nosrc_cso;
-};
-
-struct ilo_so_state {
-   struct pipe_stream_output_target *states[ILO_MAX_SO_BUFFERS];
-   unsigned count;
-   unsigned append_bitmask;
-
-   bool enabled;
-};
-
-struct ilo_viewport_cso {
-   /* matrix form */
-   float m00, m11, m22, m30, m31, m32;
-
-   /* guardband in NDC space */
-   float min_gbx, min_gby, max_gbx, max_gby;
-
-   /* viewport in screen space */
-   float min_x, min_y, min_z;
-   float max_x, max_y, max_z;
-};
-
-struct ilo_viewport_state {
-   struct ilo_viewport_cso cso[ILO_MAX_VIEWPORTS];
-   unsigned count;
-
-   struct pipe_viewport_state viewport0;
-};
-
-struct ilo_scissor_state {
-   /* SCISSOR_RECT */
-   uint32_t payload[ILO_MAX_VIEWPORTS * 2];
-
-   struct pipe_scissor_state scissor0;
-};
-
-struct ilo_rasterizer_clip {
-   /* 3DSTATE_CLIP */
-   uint32_t payload[3];
-
-   uint32_t can_enable_guardband;
-};
-
-struct ilo_rasterizer_sf {
-   /* 3DSTATE_SF */
-   uint32_t payload[3];
-   uint32_t dw_msaa;
-
-   /* Global Depth Offset Constant/Scale/Clamp */
-   uint32_t dw_depth_offset_const;
-   uint32_t dw_depth_offset_scale;
-   uint32_t dw_depth_offset_clamp;
-
-   /* Gen8+ 3DSTATE_RASTER */
-   uint32_t dw_raster;
-};
-
-struct ilo_rasterizer_wm {
-   /* 3DSTATE_WM */
-   uint32_t payload[2];
-   uint32_t dw_msaa_rast;
-   uint32_t dw_msaa_disp;
-};
-
-struct ilo_rasterizer_state {
-   struct pipe_rasterizer_state state;
-
-   struct ilo_rasterizer_clip clip;
-   struct ilo_rasterizer_sf sf;
-   struct ilo_rasterizer_wm wm;
-};
-
-struct ilo_dsa_state {
-   /* DEPTH_STENCIL_STATE or Gen8+ 3DSTATE_WM_DEPTH_STENCIL */
-   uint32_t payload[3];
-
-   uint32_t dw_blend_alpha;
-   uint32_t dw_ps_blend_alpha;
-   ubyte alpha_ref;
-};
-
-struct ilo_blend_cso {
-   /* BLEND_STATE */
-   uint32_t payload[2];
-
-   uint32_t dw_blend;
-   uint32_t dw_blend_dst_alpha_forced_one;
-};
-
-struct ilo_blend_state {
-   struct ilo_blend_cso cso[ILO_MAX_DRAW_BUFFERS];
-
-   bool dual_blend;
-   bool alpha_to_coverage;
-
-   uint32_t dw_shared;
-   uint32_t dw_alpha_mod;
-   uint32_t dw_logicop;
-
-   /* a part of 3DSTATE_PS_BLEND */
-   uint32_t dw_ps_blend;
-   uint32_t dw_ps_blend_dst_alpha_forced_one;
-};
-
-struct ilo_sampler_cso {
-   /* SAMPLER_STATE and SAMPLER_BORDER_COLOR_STATE */
-   uint32_t payload[15];
-
-   uint32_t dw_filter;
-   uint32_t dw_filter_aniso;
-   uint32_t dw_wrap;
-   uint32_t dw_wrap_1d;
-   uint32_t dw_wrap_cube;
-
-   bool anisotropic;
-   bool saturate_r;
-   bool saturate_s;
-   bool saturate_t;
-};
-
-struct ilo_sampler_state {
-   const struct ilo_sampler_cso *cso[ILO_MAX_SAMPLERS];
-};
-
-struct ilo_view_surface {
-   /* SURFACE_STATE */
-   uint32_t payload[13];
-   struct intel_bo *bo;
-
-   uint32_t scanout;
-};
-
-struct ilo_view_cso {
-   struct pipe_sampler_view base;
-
-   struct ilo_view_surface surface;
-};
-
-struct ilo_view_state {
-   struct pipe_sampler_view *states[ILO_MAX_SAMPLER_VIEWS];
-   unsigned count;
-};
-
-struct ilo_cbuf_cso {
-   struct pipe_resource *resource;
-   struct ilo_view_surface surface;
-
-   /*
-    * this CSO is not so constant because user buffer needs to be uploaded in
-    * finalize_constant_buffers()
-    */
-   const void *user_buffer;
-   unsigned user_buffer_size;
-};
-
-struct ilo_cbuf_state {
-   struct ilo_cbuf_cso cso[ILO_MAX_CONST_BUFFERS];
-   uint32_t enabled_mask;
-};
-
-struct ilo_resource_state {
-   struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES];
-   unsigned count;
-};
-
-struct ilo_surface_cso {
-   struct pipe_surface base;
-
-   bool is_rt;
-   union {
-      struct ilo_view_surface rt;
-      struct ilo_zs_surface {
-         uint32_t payload[12];
-         uint32_t dw_aligned_8x4;
-
-         struct intel_bo *bo;
-         struct intel_bo *hiz_bo;
-         struct intel_bo *separate_s8_bo;
-      } zs;
-   } u;
-};
-
-struct ilo_fb_state {
-   struct pipe_framebuffer_state state;
-
-   struct ilo_view_surface null_rt;
-   struct ilo_zs_surface null_zs;
-
-   struct ilo_fb_blend_caps {
-      bool can_logicop;
-      bool can_blend;
-      bool can_alpha_test;
-      bool dst_alpha_forced_one;
-   } blend_caps[PIPE_MAX_COLOR_BUFS];
-
-   unsigned num_samples;
-};
-
-struct ilo_shader_cso {
-   uint32_t payload[5];
-};
-
-/**
- * Translate a pipe texture target to the matching hardware surface type.
- */
-static inline int
-ilo_gpe_gen6_translate_texture(enum pipe_texture_target target)
-{
-   switch (target) {
-   case PIPE_BUFFER:
-      return GEN6_SURFTYPE_BUFFER;
-   case PIPE_TEXTURE_1D:
-   case PIPE_TEXTURE_1D_ARRAY:
-      return GEN6_SURFTYPE_1D;
-   case PIPE_TEXTURE_2D:
-   case PIPE_TEXTURE_RECT:
-   case PIPE_TEXTURE_2D_ARRAY:
-      return GEN6_SURFTYPE_2D;
-   case PIPE_TEXTURE_3D:
-      return GEN6_SURFTYPE_3D;
-   case PIPE_TEXTURE_CUBE:
-   case PIPE_TEXTURE_CUBE_ARRAY:
-      return GEN6_SURFTYPE_CUBE;
-   default:
-      assert(!"unknown texture target");
-      return GEN6_SURFTYPE_BUFFER;
-   }
-}
-
-void
-ilo_gpe_init_ve(const struct ilo_dev *dev,
-                unsigned num_states,
-                const struct pipe_vertex_element *states,
-                struct ilo_ve_state *ve);
-
-void
-ilo_gpe_set_ve_edgeflag(const struct ilo_dev *dev,
-                        struct ilo_ve_cso *cso);
-
-void
-ilo_gpe_init_ve_nosrc(const struct ilo_dev *dev,
-                      int comp0, int comp1, int comp2, int comp3,
-                      struct ilo_ve_cso *cso);
-
-void
-ilo_gpe_set_viewport_cso(const struct ilo_dev *dev,
-                         const struct pipe_viewport_state *state,
-                         struct ilo_viewport_cso *vp);
-
-void
-ilo_gpe_set_scissor(const struct ilo_dev *dev,
-                    unsigned start_slot,
-                    unsigned num_states,
-                    const struct pipe_scissor_state *states,
-                    struct ilo_scissor_state *scissor);
-
-void
-ilo_gpe_set_scissor_null(const struct ilo_dev *dev,
-                         struct ilo_scissor_state *scissor);
-
-void
-ilo_gpe_init_rasterizer(const struct ilo_dev *dev,
-                        const struct pipe_rasterizer_state *state,
-                        struct ilo_rasterizer_state *rasterizer);
-void
-ilo_gpe_init_dsa(const struct ilo_dev *dev,
-                 const struct pipe_depth_stencil_alpha_state *state,
-                 struct ilo_dsa_state *dsa);
-
-void
-ilo_gpe_init_blend(const struct ilo_dev *dev,
-                   const struct pipe_blend_state *state,
-                   struct ilo_blend_state *blend);
-
-void
-ilo_gpe_init_sampler_cso(const struct ilo_dev *dev,
-                         const struct pipe_sampler_state *state,
-                         struct ilo_sampler_cso *sampler);
-
-void
-ilo_gpe_init_view_surface_null(const struct ilo_dev *dev,
-                               unsigned width, unsigned height,
-                               unsigned depth, unsigned level,
-                               struct ilo_view_surface *surf);
-
-void
-ilo_gpe_init_view_surface_for_buffer(const struct ilo_dev *dev,
-                                     const struct ilo_buffer *buf,
-                                     unsigned offset, unsigned size,
-                                     unsigned struct_size,
-                                     enum pipe_format elem_format,
-                                     bool is_rt, bool render_cache_rw,
-                                     struct ilo_view_surface *surf);
-
-void
-ilo_gpe_init_view_surface_for_image(const struct ilo_dev *dev,
-                                    const struct ilo_image *img,
-                                    enum pipe_texture_target target,
-                                    enum pipe_format format,
-                                    unsigned first_level,
-                                    unsigned num_levels,
-                                    unsigned first_layer,
-                                    unsigned num_layers,
-                                    bool is_rt,
-                                    struct ilo_view_surface *surf);
-
-void
-ilo_gpe_init_zs_surface(const struct ilo_dev *dev,
-                        const struct ilo_image *img,
-                        const struct ilo_image *s8_img,
-                        enum pipe_texture_target target,
-                        enum pipe_format format, unsigned level,
-                        unsigned first_layer, unsigned num_layers,
-                        struct ilo_zs_surface *zs);
-
-void
-ilo_gpe_init_vs_cso(const struct ilo_dev *dev,
-                    const struct ilo_shader_state *vs,
-                    struct ilo_shader_cso *cso);
-
-void
-ilo_gpe_init_gs_cso(const struct ilo_dev *dev,
-                    const struct ilo_shader_state *gs,
-                    struct ilo_shader_cso *cso);
-
-void
-ilo_gpe_init_fs_cso(const struct ilo_dev *dev,
-                    const struct ilo_shader_state *fs,
-                    struct ilo_shader_cso *cso);
-
-void
-ilo_gpe_set_fb(const struct ilo_dev *dev,
-               const struct pipe_framebuffer_state *state,
-               struct ilo_fb_state *fb);
-
-#endif /* ILO_STATE_3D_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c b/src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c
deleted file mode 100644
index 5a4c5dd..0000000
--- a/src/gallium/drivers/ilo/core/ilo_state_3d_bottom.c
+++ /dev/null
@@ -1,2222 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2014 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Chia-I Wu <olv@lunarg.com>
- */
-
-#include "genhw/genhw.h"
-#include "util/u_dual_blend.h"
-#include "util/u_framebuffer.h"
-#include "util/u_half.h"
-
-#include "ilo_format.h"
-#include "ilo_image.h"
-#include "ilo_state_3d.h"
-#include "../ilo_shader.h"
-
-static void
-rasterizer_init_clip(const struct ilo_dev *dev,
-                     const struct pipe_rasterizer_state *state,
-                     struct ilo_rasterizer_clip *clip)
-{
-   uint32_t dw1, dw2, dw3;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   dw1 = GEN6_CLIP_DW1_STATISTICS;
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      /*
-       * From the Ivy Bridge PRM, volume 2 part 1, page 219:
-       *
-       *     "Workaround : Due to Hardware issue "EarlyCull" needs to be
-       *      enabled only for the cases where the incoming primitive topology
-       *      into the clipper guaranteed to be Trilist."
-       *
-       * What does this mean?
-       */
-      dw1 |= 0 << 19 |
-             GEN7_CLIP_DW1_EARLY_CULL_ENABLE;
-
-      if (ilo_dev_gen(dev) < ILO_GEN(8)) {
-         if (state->front_ccw)
-            dw1 |= GEN7_CLIP_DW1_FRONTWINDING_CCW;
-
-         switch (state->cull_face) {
-         case PIPE_FACE_NONE:
-            dw1 |= GEN7_CLIP_DW1_CULLMODE_NONE;
-            break;
-         case PIPE_FACE_FRONT:
-            dw1 |= GEN7_CLIP_DW1_CULLMODE_FRONT;
-            break;
-         case PIPE_FACE_BACK:
-            dw1 |= GEN7_CLIP_DW1_CULLMODE_BACK;
-            break;
-         case PIPE_FACE_FRONT_AND_BACK:
-            dw1 |= GEN7_CLIP_DW1_CULLMODE_BOTH;
-            break;
-         }
-      }
-   }
-
-   dw2 = GEN6_CLIP_DW2_CLIP_ENABLE |
-         GEN6_CLIP_DW2_XY_TEST_ENABLE |
-         state->clip_plane_enable << GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT |
-         GEN6_CLIP_DW2_CLIPMODE_NORMAL;
-
-   if (state->clip_halfz)
-      dw2 |= GEN6_CLIP_DW2_APIMODE_D3D;
-   else
-      dw2 |= GEN6_CLIP_DW2_APIMODE_OGL;
-
-   if (ilo_dev_gen(dev) < ILO_GEN(8) && state->depth_clip)
-      dw2 |= GEN6_CLIP_DW2_Z_TEST_ENABLE;
-
-   if (state->flatshade_first) {
-      dw2 |= 0 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
-             0 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
-             1 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
-   }
-   else {
-      dw2 |= 2 << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
-             1 << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
-             2 << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
-   }
-
-   dw3 = 0x1 << GEN6_CLIP_DW3_MIN_POINT_WIDTH__SHIFT |
-         0x7ff << GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT;
-
-   clip->payload[0] = dw1;
-   clip->payload[1] = dw2;
-   clip->payload[2] = dw3;
-
-   clip->can_enable_guardband = true;
-
-   /*
-    * There are several reasons that guard band test should be disabled
-    *
-    *  - GL wide points (to avoid partially visibie object)
-    *  - GL wide or AA lines (to avoid partially visibie object)
-    */
-   if (state->point_size_per_vertex || state->point_size > 1.0f)
-      clip->can_enable_guardband = false;
-   if (state->line_smooth || state->line_width > 1.0f)
-      clip->can_enable_guardband = false;
-}
-
-static void
-rasterizer_init_sf_depth_offset_gen6(const struct ilo_dev *dev,
-                                     const struct pipe_rasterizer_state *state,
-                                     struct ilo_rasterizer_sf *sf)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   /*
-    * Scale the constant term.  The minimum representable value used by the HW
-    * is not large enouch to be the minimum resolvable difference.
-    */
-   sf->dw_depth_offset_const = fui(state->offset_units * 2.0f);
-   sf->dw_depth_offset_scale = fui(state->offset_scale);
-   sf->dw_depth_offset_clamp = fui(state->offset_clamp);
-}
-
-static void
-rasterizer_init_sf_gen6(const struct ilo_dev *dev,
-                        const struct pipe_rasterizer_state *state,
-                        struct ilo_rasterizer_sf *sf)
-{
-   int line_width, point_width;
-   uint32_t dw1, dw2, dw3;
-
-   ILO_DEV_ASSERT(dev, 6, 7.5);
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 248:
-    *
-    *     "This bit (Statistics Enable) should be set whenever clipping is
-    *      enabled and the Statistics Enable bit is set in CLIP_STATE. It
-    *      should be cleared if clipping is disabled or Statistics Enable in
-    *      CLIP_STATE is clear."
-    */
-   dw1 = GEN7_SF_DW1_STATISTICS |
-         GEN7_SF_DW1_VIEWPORT_ENABLE;
-
-   /* XXX GEN6 path seems to work fine for GEN7 */
-   if (false && ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      /*
-       * From the Ivy Bridge PRM, volume 2 part 1, page 258:
-       *
-       *     "This bit (Legacy Global Depth Bias Enable, Global Depth Offset
-       *      Enable Solid , Global Depth Offset Enable Wireframe, and Global
-       *      Depth Offset Enable Point) should be set whenever non zero depth
-       *      bias (Slope, Bias) values are used. Setting this bit may have
-       *      some degradation of performance for some workloads."
-       */
-      if (state->offset_tri || state->offset_line || state->offset_point) {
-         /* XXX need to scale offset_const according to the depth format */
-         dw1 |= GEN7_SF_DW1_LEGACY_DEPTH_OFFSET;
-
-         dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_SOLID |
-                GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME |
-                GEN7_SF_DW1_DEPTH_OFFSET_POINT;
-      }
-   } else {
-      if (state->offset_tri)
-         dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_SOLID;
-      if (state->offset_line)
-         dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME;
-      if (state->offset_point)
-         dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_POINT;
-   }
-
-   switch (state->fill_front) {
-   case PIPE_POLYGON_MODE_FILL:
-      dw1 |= GEN7_SF_DW1_FRONTFACE_SOLID;
-      break;
-   case PIPE_POLYGON_MODE_LINE:
-      dw1 |= GEN7_SF_DW1_FRONTFACE_WIREFRAME;
-      break;
-   case PIPE_POLYGON_MODE_POINT:
-      dw1 |= GEN7_SF_DW1_FRONTFACE_POINT;
-      break;
-   }
-
-   switch (state->fill_back) {
-   case PIPE_POLYGON_MODE_FILL:
-      dw1 |= GEN7_SF_DW1_BACKFACE_SOLID;
-      break;
-   case PIPE_POLYGON_MODE_LINE:
-      dw1 |= GEN7_SF_DW1_BACKFACE_WIREFRAME;
-      break;
-   case PIPE_POLYGON_MODE_POINT:
-      dw1 |= GEN7_SF_DW1_BACKFACE_POINT;
-      break;
-   }
-
-   if (state->front_ccw)
-      dw1 |= GEN7_SF_DW1_FRONTWINDING_CCW;
-
-   dw2 = 0;
-
-   if (state->line_smooth) {
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 251:
-       *
-       *     "This field (Anti-aliasing Enable) must be disabled if any of the
-       *      render targets have integer (UINT or SINT) surface format."
-       *
-       * From the Sandy Bridge PRM, volume 2 part 1, page 317:
-       *
-       *     "This field (Hierarchical Depth Buffer Enable) must be disabled
-       *      if Anti-aliasing Enable in 3DSTATE_SF is enabled.
-       *
-       * TODO We do not check those yet.
-       */
-      dw2 |= GEN7_SF_DW2_AA_LINE_ENABLE |
-             GEN7_SF_DW2_AA_LINE_CAP_1_0;
-   }
-
-   switch (state->cull_face) {
-   case PIPE_FACE_NONE:
-      dw2 |= GEN7_SF_DW2_CULLMODE_NONE;
-      break;
-   case PIPE_FACE_FRONT:
-      dw2 |= GEN7_SF_DW2_CULLMODE_FRONT;
-      break;
-   case PIPE_FACE_BACK:
-      dw2 |= GEN7_SF_DW2_CULLMODE_BACK;
-      break;
-   case PIPE_FACE_FRONT_AND_BACK:
-      dw2 |= GEN7_SF_DW2_CULLMODE_BOTH;
-      break;
-   }
-
-   /*
-    * Smooth lines should intersect ceil(line_width) or (ceil(line_width) + 1)
-    * pixels in the minor direction.  We have to make the lines slightly
-    * thicker, 0.5 pixel on both sides, so that they intersect that many
-    * pixels are considered into the lines.
-    *
-    * Line width is in U3.7.
-    */
-   line_width = (int)
-      ((state->line_width + (float) state->line_smooth) * 128.0f + 0.5f);
-   line_width = CLAMP(line_width, 0, 1023);
-
-   /* use GIQ rules */
-   if (line_width == 128 && !state->line_smooth)
-      line_width = 0;
-
-   dw2 |= line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT;
-
-   if (ilo_dev_gen(dev) == ILO_GEN(7.5) && state->line_stipple_enable)
-      dw2 |= GEN75_SF_DW2_LINE_STIPPLE_ENABLE;
-
-   if (state->scissor)
-      dw2 |= GEN7_SF_DW2_SCISSOR_ENABLE;
-
-   dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE |
-         GEN7_SF_DW3_SUBPIXEL_8BITS;
-
-   if (state->line_last_pixel)
-      dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE;
-
-   if (state->flatshade_first) {
-      dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
-             0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
-             1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
-   } else {
-      dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
-             1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
-             2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
-   }
-
-   if (!state->point_size_per_vertex)
-      dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH;
-
-   /* in U8.3 */
-   point_width = (int) (state->point_size * 8.0f + 0.5f);
-   point_width = CLAMP(point_width, 1, 2047);
-
-   dw3 |= point_width;
-
-   STATIC_ASSERT(Elements(sf->payload) >= 3);
-   sf->payload[0] = dw1;
-   sf->payload[1] = dw2;
-   sf->payload[2] = dw3;
-
-   if (state->multisample) {
-      sf->dw_msaa = GEN7_SF_DW2_MSRASTMODE_ON_PATTERN;
-
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 251:
-       *
-       *     "Software must not program a value of 0.0 when running in
-       *      MSRASTMODE_ON_xxx modes - zero-width lines are not available
-       *      when multisampling rasterization is enabled."
-       */
-      if (!line_width) {
-         line_width = 128; /* 1.0f */
-
-         sf->dw_msaa |= line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT;
-      }
-   } else {
-      sf->dw_msaa = 0;
-   }
-
-   rasterizer_init_sf_depth_offset_gen6(dev, state, sf);
-   /* 3DSTATE_RASTER is Gen8+ only */
-   sf->dw_raster = 0;
-}
-
-static uint32_t
-rasterizer_get_sf_raster_gen8(const struct ilo_dev *dev,
-                              const struct pipe_rasterizer_state *state)
-{
-   uint32_t dw = 0;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   if (state->front_ccw)
-      dw |= GEN8_RASTER_DW1_FRONTWINDING_CCW;
-
-   switch (state->cull_face) {
-   case PIPE_FACE_NONE:
-      dw |= GEN8_RASTER_DW1_CULLMODE_NONE;
-      break;
-   case PIPE_FACE_FRONT:
-      dw |= GEN8_RASTER_DW1_CULLMODE_FRONT;
-      break;
-   case PIPE_FACE_BACK:
-      dw |= GEN8_RASTER_DW1_CULLMODE_BACK;
-      break;
-   case PIPE_FACE_FRONT_AND_BACK:
-      dw |= GEN8_RASTER_DW1_CULLMODE_BOTH;
-      break;
-   }
-
-   if (state->point_smooth)
-      dw |= GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE;
-
-   if (state->multisample)
-      dw |= GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE;
-
-   if (state->offset_tri)
-      dw|= GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID;
-   if (state->offset_line)
-      dw|= GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME;
-   if (state->offset_point)
-      dw|= GEN8_RASTER_DW1_DEPTH_OFFSET_POINT;
-
-   switch (state->fill_front) {
-   case PIPE_POLYGON_MODE_FILL:
-      dw |= GEN8_RASTER_DW1_FRONTFACE_SOLID;
-      break;
-   case PIPE_POLYGON_MODE_LINE:
-      dw |= GEN8_RASTER_DW1_FRONTFACE_WIREFRAME;
-      break;
-   case PIPE_POLYGON_MODE_POINT:
-      dw |= GEN8_RASTER_DW1_FRONTFACE_POINT;
-      break;
-   }
-
-   switch (state->fill_back) {
-   case PIPE_POLYGON_MODE_FILL:
-      dw |= GEN8_RASTER_DW1_BACKFACE_SOLID;
-      break;
-   case PIPE_POLYGON_MODE_LINE:
-      dw |= GEN8_RASTER_DW1_BACKFACE_WIREFRAME;
-      break;
-   case PIPE_POLYGON_MODE_POINT:
-      dw |= GEN8_RASTER_DW1_BACKFACE_POINT;
-      break;
-   }
-
-   if (state->line_smooth)
-      dw |= GEN8_RASTER_DW1_AA_LINE_ENABLE;
-
-   if (state->scissor)
-      dw |= GEN8_RASTER_DW1_SCISSOR_ENABLE;
-
-   if (state->depth_clip)
-      dw |= GEN8_RASTER_DW1_Z_TEST_ENABLE;
-
-   return dw;
-}
-
-static void
-rasterizer_init_sf_gen8(const struct ilo_dev *dev,
-                        const struct pipe_rasterizer_state *state,
-                        struct ilo_rasterizer_sf *sf)
-{
-   int line_width, point_width;
-   uint32_t dw1, dw2, dw3;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   /* in U3.7 */
-   line_width = (int)
-      ((state->line_width + (float) state->line_smooth) * 128.0f + 0.5f);
-   line_width = CLAMP(line_width, 0, 1023);
-
-   /* use GIQ rules */
-   if (line_width == 128 && !state->line_smooth)
-      line_width = 0;
-
-   /* in U8.3 */
-   point_width = (int) (state->point_size * 8.0f + 0.5f);
-   point_width = CLAMP(point_width, 1, 2047);
-
-   dw1 = GEN7_SF_DW1_STATISTICS |
-         GEN7_SF_DW1_VIEWPORT_ENABLE;
-
-   dw2 = line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT;
-   if (state->line_smooth)
-      dw2 |= GEN7_SF_DW2_AA_LINE_CAP_1_0;
-
-   dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE |
-         GEN7_SF_DW3_SUBPIXEL_8BITS |
-         point_width;
-
-   if (state->line_last_pixel)
-      dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE;
-
-   if (state->flatshade_first) {
-      dw3 |= 0 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
-             0 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
-             1 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
-   } else {
-      dw3 |= 2 << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
-             1 << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
-             2 << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
-   }
-
-   if (!state->point_size_per_vertex)
-      dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH;
-
-   dw3 |= point_width;
-
-   STATIC_ASSERT(Elements(sf->payload) >= 3);
-   sf->payload[0] = dw1;
-   sf->payload[1] = dw2;
-   sf->payload[2] = dw3;
-
-   rasterizer_init_sf_depth_offset_gen6(dev, state, sf);
-
-   sf->dw_msaa = 0;
-   sf->dw_raster = rasterizer_get_sf_raster_gen8(dev, state);
-}
-
-static void
-rasterizer_init_wm_gen6(const struct ilo_dev *dev,
-                        const struct pipe_rasterizer_state *state,
-                        struct ilo_rasterizer_wm *wm)
-{
-   uint32_t dw5, dw6;
-
-   ILO_DEV_ASSERT(dev, 6, 6);
-
-   /* only the FF unit states are set, as in GEN7 */
-
-   dw5 = GEN6_WM_DW5_AA_LINE_WIDTH_2_0;
-
-   /* same value as in 3DSTATE_SF */
-   if (state->line_smooth)
-      dw5 |= GEN6_WM_DW5_AA_LINE_CAP_1_0;
-
-   if (state->poly_stipple_enable)
-      dw5 |= GEN6_WM_DW5_POLY_STIPPLE_ENABLE;
-   if (state->line_stipple_enable)
-      dw5 |= GEN6_WM_DW5_LINE_STIPPLE_ENABLE;
-
-   /*
-    * assertion that makes sure
-    *
-    *   dw6 |= wm->dw_msaa_rast | wm->dw_msaa_disp;
-    *
-    * is valid
-    */
-   STATIC_ASSERT(GEN6_WM_DW6_MSRASTMODE_OFF_PIXEL == 0 &&
-                 GEN6_WM_DW6_MSDISPMODE_PERSAMPLE == 0);
-   dw6 = GEN6_WM_DW6_ZW_INTERP_PIXEL;
-
-   if (state->bottom_edge_rule)
-      dw6 |= GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT;
-
-   wm->dw_msaa_rast =
-      (state->multisample) ? GEN6_WM_DW6_MSRASTMODE_ON_PATTERN : 0;
-   wm->dw_msaa_disp = GEN6_WM_DW6_MSDISPMODE_PERPIXEL;
-
-   STATIC_ASSERT(Elements(wm->payload) >= 2);
-   wm->payload[0] = dw5;
-   wm->payload[1] = dw6;
-}
-
-static void
-rasterizer_init_wm_gen7(const struct ilo_dev *dev,
-                        const struct pipe_rasterizer_state *state,
-                        struct ilo_rasterizer_wm *wm)
-{
-   uint32_t dw1, dw2;
-
-   ILO_DEV_ASSERT(dev, 7, 7.5);
-
-   /*
-    * assertion that makes sure
-    *
-    *   dw1 |= wm->dw_msaa_rast;
-    *   dw2 |= wm->dw_msaa_disp;
-    *
-    * is valid
-    */
-   STATIC_ASSERT(GEN7_WM_DW1_MSRASTMODE_OFF_PIXEL == 0 &&
-                 GEN7_WM_DW2_MSDISPMODE_PERSAMPLE == 0);
-   dw1 = GEN7_WM_DW1_ZW_INTERP_PIXEL |
-         GEN7_WM_DW1_AA_LINE_WIDTH_2_0;
-   dw2 = 0;
-
-   /* same value as in 3DSTATE_SF */
-   if (state->line_smooth)
-      dw1 |= GEN7_WM_DW1_AA_LINE_CAP_1_0;
-
-   if (state->poly_stipple_enable)
-      dw1 |= GEN7_WM_DW1_POLY_STIPPLE_ENABLE;
-   if (state->line_stipple_enable)
-      dw1 |= GEN7_WM_DW1_LINE_STIPPLE_ENABLE;
-
-   if (state->bottom_edge_rule)
-      dw1 |= GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT;
-
-   wm->dw_msaa_rast =
-      (state->multisample) ? GEN7_WM_DW1_MSRASTMODE_ON_PATTERN : 0;
-   wm->dw_msaa_disp = GEN7_WM_DW2_MSDISPMODE_PERPIXEL;
-
-   STATIC_ASSERT(Elements(wm->payload) >= 2);
-   wm->payload[0] = dw1;
-   wm->payload[1] = dw2;
-}
-
-static uint32_t
-rasterizer_get_wm_gen8(const struct ilo_dev *dev,
-                       const struct pipe_rasterizer_state *state)
-{
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   dw = GEN7_WM_DW1_ZW_INTERP_PIXEL |
-        GEN7_WM_DW1_AA_LINE_WIDTH_2_0;
-
-   /* same value as in 3DSTATE_SF */
-   if (state->line_smooth)
-      dw |= GEN7_WM_DW1_AA_LINE_CAP_1_0;
-
-   if (state->poly_stipple_enable)
-      dw |= GEN7_WM_DW1_POLY_STIPPLE_ENABLE;
-   if (state->line_stipple_enable)
-      dw |= GEN7_WM_DW1_LINE_STIPPLE_ENABLE;
-
-   if (state->bottom_edge_rule)
-      dw |= GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT;
-
-   return dw;
-}
-
-void
-ilo_gpe_init_rasterizer(const struct ilo_dev *dev,
-                        const struct pipe_rasterizer_state *state,
-                        struct ilo_rasterizer_state *rasterizer)
-{
-   rasterizer_init_clip(dev, state, &rasterizer->clip);
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
-      memset(&rasterizer->wm, 0, sizeof(rasterizer->wm));
-      rasterizer->wm.payload[0] = rasterizer_get_wm_gen8(dev, state);
-
-      rasterizer_init_sf_gen8(dev, state, &rasterizer->sf);
-   } else if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      rasterizer_init_wm_gen7(dev, state, &rasterizer->wm);
-      rasterizer_init_sf_gen6(dev, state, &rasterizer->sf);
-   } else {
-      rasterizer_init_wm_gen6(dev, state, &rasterizer->wm);
-      rasterizer_init_sf_gen6(dev, state, &rasterizer->sf);
-   }
-}
-
-static void
-fs_init_cso_gen6(const struct ilo_dev *dev,
-                 const struct ilo_shader_state *fs,
-                 struct ilo_shader_cso *cso)
-{
-   int start_grf, input_count, sampler_count, interps, max_threads;
-   uint32_t dw2, dw4, dw5, dw6;
-
-   ILO_DEV_ASSERT(dev, 6, 6);
-
-   start_grf = ilo_shader_get_kernel_param(fs, ILO_KERNEL_URB_DATA_START_REG);
-   input_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT);
-   sampler_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_SAMPLER_COUNT);
-   interps = ilo_shader_get_kernel_param(fs,
-         ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS);
-
-   /* see brwCreateContext() */
-   max_threads = (dev->gt == 2) ? 80 : 40;
-
-   dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
-   dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
-   dw4 = start_grf << GEN6_WM_DW4_URB_GRF_START0__SHIFT |
-         0 << GEN6_WM_DW4_URB_GRF_START1__SHIFT |
-         0 << GEN6_WM_DW4_URB_GRF_START2__SHIFT;
-
-   dw5 = (max_threads - 1) << GEN6_WM_DW5_MAX_THREADS__SHIFT;
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 275:
-    *
-    *     "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that the
-    *      PS kernel or color calculator has the ability to kill (discard)
-    *      pixels or samples, other than due to depth or stencil testing.
-    *      This bit is required to be ENABLED in the following situations:
-    *
-    *      The API pixel shader program contains "killpix" or "discard"
-    *      instructions, or other code in the pixel shader kernel that can
-    *      cause the final pixel mask to differ from the pixel mask received
-    *      on dispatch.
-    *
-    *      A sampler with chroma key enabled with kill pixel mode is used by
-    *      the pixel shader.
-    *
-    *      Any render target has Alpha Test Enable or AlphaToCoverage Enable
-    *      enabled.
-    *
-    *      The pixel shader kernel generates and outputs oMask.
-    *
-    *      Note: As ClipDistance clipping is fully supported in hardware and
-    *      therefore not via PS instructions, there should be no need to
-    *      ENABLE this bit due to ClipDistance clipping."
-    */
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_USE_KILL))
-      dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL;
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 275:
-    *
-    *     "If a NULL Depth Buffer is selected, the Pixel Shader Computed Depth
-    *      field must be set to disabled."
-    *
-    * TODO This is not checked yet.
-    */
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_OUTPUT_Z))
-      dw5 |= GEN6_WM_DW5_PS_COMPUTE_DEPTH;
-
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_Z))
-      dw5 |= GEN6_WM_DW5_PS_USE_DEPTH;
-
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_W))
-      dw5 |= GEN6_WM_DW5_PS_USE_W;
-
-   /*
-    * TODO set this bit only when
-    *
-    *  a) fs writes colors and color is not masked, or
-    *  b) fs writes depth, or
-    *  c) fs or cc kills
-    */
-   if (true)
-      dw5 |= GEN6_WM_DW5_PS_DISPATCH_ENABLE;
-
-   assert(!ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_DISPATCH_16_OFFSET));
-   dw5 |= GEN6_PS_DISPATCH_8 << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT;
-
-   dw6 = input_count << GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT |
-         GEN6_WM_DW6_PS_POSOFFSET_NONE |
-         interps << GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT;
-
-   STATIC_ASSERT(Elements(cso->payload) >= 4);
-   cso->payload[0] = dw2;
-   cso->payload[1] = dw4;
-   cso->payload[2] = dw5;
-   cso->payload[3] = dw6;
-}
-
-static uint32_t
-fs_get_wm_gen7(const struct ilo_dev *dev,
-               const struct ilo_shader_state *fs)
-{
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 7, 7.5);
-
-   dw = ilo_shader_get_kernel_param(fs,
-         ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS) <<
-      GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT;
-
-   /*
-    * TODO set this bit only when
-    *
-    *  a) fs writes colors and color is not masked, or
-    *  b) fs writes depth, or
-    *  c) fs or cc kills
-    */
-   dw |= GEN7_WM_DW1_PS_DISPATCH_ENABLE;
-
-   /*
-    * From the Ivy Bridge PRM, volume 2 part 1, page 278:
-    *
-    *     "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that
-    *      the PS kernel or color calculator has the ability to kill
-    *      (discard) pixels or samples, other than due to depth or stencil
-    *      testing. This bit is required to be ENABLED in the following
-    *      situations:
-    *
-    *      - The API pixel shader program contains "killpix" or "discard"
-    *        instructions, or other code in the pixel shader kernel that
-    *        can cause the final pixel mask to differ from the pixel mask
-    *        received on dispatch.
-    *
-    *      - A sampler with chroma key enabled with kill pixel mode is used
-    *        by the pixel shader.
-    *
-    *      - Any render target has Alpha Test Enable or AlphaToCoverage
-    *        Enable enabled.
-    *
-    *      - The pixel shader kernel generates and outputs oMask.
-    *
-    *      Note: As ClipDistance clipping is fully supported in hardware
-    *      and therefore not via PS instructions, there should be no need
-    *      to ENABLE this bit due to ClipDistance clipping."
-    */
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_USE_KILL))
-      dw |= GEN7_WM_DW1_PS_KILL_PIXEL;
-
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_OUTPUT_Z))
-      dw |= GEN7_WM_DW1_PSCDEPTH_ON;
-
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_Z))
-      dw |= GEN7_WM_DW1_PS_USE_DEPTH;
-
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_W))
-      dw |= GEN7_WM_DW1_PS_USE_W;
-
-   return dw;
-}
-
-static void
-fs_init_cso_gen7(const struct ilo_dev *dev,
-                 const struct ilo_shader_state *fs,
-                 struct ilo_shader_cso *cso)
-{
-   int start_grf, sampler_count, max_threads;
-   uint32_t dw2, dw4, dw5;
-
-   ILO_DEV_ASSERT(dev, 7, 7.5);
-
-   start_grf = ilo_shader_get_kernel_param(fs, ILO_KERNEL_URB_DATA_START_REG);
-   sampler_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_SAMPLER_COUNT);
-
-   dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
-   dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
-   dw4 = GEN7_PS_DW4_POSOFFSET_NONE;
-
-   /* see brwCreateContext() */
-   switch (ilo_dev_gen(dev)) {
-   case ILO_GEN(7.5):
-      max_threads = (dev->gt == 3) ? 408 : (dev->gt == 2) ? 204 : 102;
-      dw4 |= (max_threads - 1) << GEN75_PS_DW4_MAX_THREADS__SHIFT;
-      dw4 |= 1 << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
-      break;
-   case ILO_GEN(7):
-   default:
-      max_threads = (dev->gt == 2) ? 172 : 48;
-      dw4 |= (max_threads - 1) << GEN7_PS_DW4_MAX_THREADS__SHIFT;
-      break;
-   }
-
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_PCB_CBUF0_SIZE))
-      dw4 |= GEN7_PS_DW4_PUSH_CONSTANT_ENABLE;
-
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT))
-      dw4 |= GEN7_PS_DW4_ATTR_ENABLE;
-
-   assert(!ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_DISPATCH_16_OFFSET));
-   dw4 |= GEN6_PS_DISPATCH_8 << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
-
-   dw5 = start_grf << GEN7_PS_DW5_URB_GRF_START0__SHIFT |
-         0 << GEN7_PS_DW5_URB_GRF_START1__SHIFT |
-         0 << GEN7_PS_DW5_URB_GRF_START2__SHIFT;
-
-   STATIC_ASSERT(Elements(cso->payload) >= 4);
-   cso->payload[0] = dw2;
-   cso->payload[1] = dw4;
-   cso->payload[2] = dw5;
-   cso->payload[3] = fs_get_wm_gen7(dev, fs);
-}
-
-static uint32_t
-fs_get_psx_gen8(const struct ilo_dev *dev,
-                const struct ilo_shader_state *fs)
-{
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   dw = GEN8_PSX_DW1_DISPATCH_ENABLE;
-
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_USE_KILL))
-      dw |= GEN8_PSX_DW1_KILL_PIXEL;
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_OUTPUT_Z))
-      dw |= GEN8_PSX_DW1_PSCDEPTH_ON;
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_Z))
-      dw |= GEN8_PSX_DW1_USE_DEPTH;
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_INPUT_W))
-      dw |= GEN8_PSX_DW1_USE_W;
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_INPUT_COUNT))
-      dw |= GEN8_PSX_DW1_ATTR_ENABLE;
-
-   return dw;
-}
-
-static uint32_t
-fs_get_wm_gen8(const struct ilo_dev *dev,
-               const struct ilo_shader_state *fs)
-{
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   return ilo_shader_get_kernel_param(fs,
-         ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS) <<
-      GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT;
-}
-
-static void
-fs_init_cso_gen8(const struct ilo_dev *dev,
-                 const struct ilo_shader_state *fs,
-                 struct ilo_shader_cso *cso)
-{
-   int start_grf, sampler_count;
-   uint32_t dw3, dw6, dw7;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   start_grf = ilo_shader_get_kernel_param(fs, ILO_KERNEL_URB_DATA_START_REG);
-   sampler_count = ilo_shader_get_kernel_param(fs, ILO_KERNEL_SAMPLER_COUNT);
-
-   dw3 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
-   dw3 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
-   /* always 64? */
-   dw6 = (64 - 2) << GEN8_PS_DW6_MAX_THREADS__SHIFT |
-         GEN8_PS_DW6_POSOFFSET_NONE;
-   if (ilo_shader_get_kernel_param(fs, ILO_KERNEL_PCB_CBUF0_SIZE))
-      dw6 |= GEN8_PS_DW6_PUSH_CONSTANT_ENABLE;
-
-   assert(!ilo_shader_get_kernel_param(fs, ILO_KERNEL_FS_DISPATCH_16_OFFSET));
-   dw6 |= GEN6_PS_DISPATCH_8 << GEN8_PS_DW6_DISPATCH_MODE__SHIFT;
-
-   dw7 = start_grf << GEN8_PS_DW7_URB_GRF_START0__SHIFT |
-         0 << GEN8_PS_DW7_URB_GRF_START1__SHIFT |
-         0 << GEN8_PS_DW7_URB_GRF_START2__SHIFT;
-
-   STATIC_ASSERT(Elements(cso->payload) >= 5);
-   cso->payload[0] = dw3;
-   cso->payload[1] = dw6;
-   cso->payload[2] = dw7;
-   cso->payload[3] = fs_get_psx_gen8(dev, fs);
-   cso->payload[4] = fs_get_wm_gen8(dev, fs);
-}
-
-void
-ilo_gpe_init_fs_cso(const struct ilo_dev *dev,
-                    const struct ilo_shader_state *fs,
-                    struct ilo_shader_cso *cso)
-{
-   if (ilo_dev_gen(dev) >= ILO_GEN(8))
-      fs_init_cso_gen8(dev, fs, cso);
-   else if (ilo_dev_gen(dev) >= ILO_GEN(7))
-      fs_init_cso_gen7(dev, fs, cso);
-   else
-      fs_init_cso_gen6(dev, fs, cso);
-}
-
-struct ilo_zs_surface_info {
-   int surface_type;
-   int format;
-
-   struct {
-      struct intel_bo *bo;
-      unsigned stride;
-      unsigned qpitch;
-      enum gen_surface_tiling tiling;
-      uint32_t offset;
-   } zs, stencil, hiz;
-
-   unsigned width, height, depth;
-   unsigned lod, first_layer, num_layers;
-};
-
-static void
-zs_init_info_null(const struct ilo_dev *dev,
-                  struct ilo_zs_surface_info *info)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   memset(info, 0, sizeof(*info));
-
-   info->surface_type = GEN6_SURFTYPE_NULL;
-   info->format = GEN6_ZFORMAT_D32_FLOAT;
-   info->width = 1;
-   info->height = 1;
-   info->depth = 1;
-   info->num_layers = 1;
-}
-
-static void
-zs_init_info(const struct ilo_dev *dev,
-             const struct ilo_image *img,
-             const struct ilo_image *s8_img,
-             enum pipe_texture_target target,
-             enum pipe_format format, unsigned level,
-             unsigned first_layer, unsigned num_layers,
-             struct ilo_zs_surface_info *info)
-{
-   bool separate_stencil;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   memset(info, 0, sizeof(*info));
-
-   info->surface_type = ilo_gpe_gen6_translate_texture(target);
-
-   if (info->surface_type == GEN6_SURFTYPE_CUBE) {
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 325-326:
-       *
-       *     "For Other Surfaces (Cube Surfaces):
-       *      This field (Minimum Array Element) is ignored."
-       *
-       *     "For Other Surfaces (Cube Surfaces):
-       *      This field (Render Target View Extent) is ignored."
-       *
-       * As such, we cannot set first_layer and num_layers on cube surfaces.
-       * To work around that, treat it as a 2D surface.
-       */
-      info->surface_type = GEN6_SURFTYPE_2D;
-   }
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      separate_stencil = true;
-   } else {
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 317:
-       *
-       *     "This field (Separate Stencil Buffer Enable) must be set to the
-       *      same value (enabled or disabled) as Hierarchical Depth Buffer
-       *      Enable."
-       */
-      separate_stencil = ilo_image_can_enable_aux(img, level);
-   }
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 317:
-    *
-    *     "If this field (Hierarchical Depth Buffer Enable) is enabled, the
-    *      Surface Format of the depth buffer cannot be
-    *      D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT. Use of stencil
-    *      requires the separate stencil buffer."
-    *
-    * From the Ironlake PRM, volume 2 part 1, page 330:
-    *
-    *     "If this field (Separate Stencil Buffer Enable) is disabled, the
-    *      Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT."
-    *
-    * There is no similar restriction for GEN6.  But when D24_UNORM_X8_UINT
-    * is indeed used, the depth values output by the fragment shaders will
-    * be different when read back.
-    *
-    * As for GEN7+, separate_stencil is always true.
-    */
-   switch (format) {
-   case PIPE_FORMAT_Z16_UNORM:
-      info->format = GEN6_ZFORMAT_D16_UNORM;
-      break;
-   case PIPE_FORMAT_Z32_FLOAT:
-      info->format = GEN6_ZFORMAT_D32_FLOAT;
-      break;
-   case PIPE_FORMAT_Z24X8_UNORM:
-   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
-      info->format = (separate_stencil) ?
-         GEN6_ZFORMAT_D24_UNORM_X8_UINT :
-         GEN6_ZFORMAT_D24_UNORM_S8_UINT;
-      break;
-   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
-      info->format = (separate_stencil) ?
-         GEN6_ZFORMAT_D32_FLOAT :
-         GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
-      break;
-   case PIPE_FORMAT_S8_UINT:
-      if (separate_stencil) {
-         info->format = GEN6_ZFORMAT_D32_FLOAT;
-         break;
-      }
-      /* fall through */
-   default:
-      assert(!"unsupported depth/stencil format");
-      zs_init_info_null(dev, info);
-      return;
-      break;
-   }
-
-   if (format != PIPE_FORMAT_S8_UINT) {
-      info->zs.bo = img->bo;
-      info->zs.stride = img->bo_stride;
-
-      assert(img->walk_layer_height % 4 == 0);
-      info->zs.qpitch = img->walk_layer_height / 4;
-
-      info->zs.tiling = img->tiling;
-      info->zs.offset = 0;
-   }
-
-   if (s8_img || format == PIPE_FORMAT_S8_UINT) {
-      info->stencil.bo = s8_img->bo;
-
-      /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 329:
-       *
-       *     "The pitch must be set to 2x the value computed based on width,
-       *       as the stencil buffer is stored with two rows interleaved."
-       *
-       * For GEN7, we still dobule the stride because we did not double the
-       * slice widths when initializing the layout.
-       */
-      info->stencil.stride = s8_img->bo_stride * 2;
-
-      assert(s8_img->walk_layer_height % 4 == 0);
-      info->stencil.qpitch = s8_img->walk_layer_height / 4;
-
-      info->stencil.tiling = s8_img->tiling;
-
-      if (ilo_dev_gen(dev) == ILO_GEN(6)) {
-         unsigned x, y;
-
-         assert(s8_img->walk == ILO_IMAGE_WALK_LOD);
-
-         /* offset to the level */
-         ilo_image_get_slice_pos(s8_img, level, 0, &x, &y);
-         ilo_image_pos_to_mem(s8_img, x, y, &x, &y);
-         info->stencil.offset = ilo_image_mem_to_raw(s8_img, x, y);
-      }
-   }
-
-   if (ilo_image_can_enable_aux(img, level)) {
-      info->hiz.bo = img->aux.bo;
-      info->hiz.stride = img->aux.bo_stride;
-
-      assert(img->aux.walk_layer_height % 4 == 0);
-      info->hiz.qpitch = img->aux.walk_layer_height / 4;
-
-      info->hiz.tiling = GEN6_TILING_Y;
-
-      /* offset to the level */
-      if (ilo_dev_gen(dev) == ILO_GEN(6))
-         info->hiz.offset = img->aux.walk_lod_offsets[level];
-   }
-
-   info->width = img->width0;
-   info->height = img->height0;
-   info->depth = (target == PIPE_TEXTURE_3D) ? img->depth0 : num_layers;
-
-   info->lod = level;
-   info->first_layer = first_layer;
-   info->num_layers = num_layers;
-}
-
-void
-ilo_gpe_init_zs_surface(const struct ilo_dev *dev,
-                        const struct ilo_image *img,
-                        const struct ilo_image *s8_img,
-                        enum pipe_texture_target target,
-                        enum pipe_format format, unsigned level,
-                        unsigned first_layer, unsigned num_layers,
-                        struct ilo_zs_surface *zs)
-{
-   const int max_2d_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
-   const int max_array_size = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 2048 : 512;
-   struct ilo_zs_surface_info info;
-   uint32_t dw1, dw2, dw3, dw4, dw5, dw6;
-   int align_w = 8, align_h = 4;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   if (img) {
-      zs_init_info(dev, img, s8_img, target, format,
-            level, first_layer, num_layers, &info);
-
-      switch (img->sample_count) {
-      case 2:
-         align_w /= 2;
-         break;
-      case 4:
-         align_w /= 2;
-         align_h /= 2;
-         break;
-      case 8:
-         align_w /= 4;
-         align_h /= 2;
-         break;
-      case 16:
-         align_w /= 4;
-         align_h /= 4;
-         break;
-      default:
-         break;
-      }
-   } else {
-      zs_init_info_null(dev, &info);
-   }
-
-   switch (info.surface_type) {
-   case GEN6_SURFTYPE_NULL:
-      break;
-   case GEN6_SURFTYPE_1D:
-      assert(info.width <= max_2d_size && info.height == 1 &&
-             info.depth <= max_array_size);
-      assert(info.first_layer < max_array_size - 1 &&
-             info.num_layers <= max_array_size);
-      break;
-   case GEN6_SURFTYPE_2D:
-      assert(info.width <= max_2d_size && info.height <= max_2d_size &&
-             info.depth <= max_array_size);
-      assert(info.first_layer < max_array_size - 1 &&
-             info.num_layers <= max_array_size);
-      break;
-   case GEN6_SURFTYPE_3D:
-      assert(info.width <= 2048 && info.height <= 2048 && info.depth <= 2048);
-      assert(info.first_layer < 2048 && info.num_layers <= max_array_size);
-      break;
-   case GEN6_SURFTYPE_CUBE:
-      assert(info.width <= max_2d_size && info.height <= max_2d_size &&
-             info.depth == 1);
-      assert(info.first_layer == 0 && info.num_layers == 1);
-      assert(info.width == info.height);
-      break;
-   default:
-      assert(!"unexpected depth surface type");
-      break;
-   }
-
-   dw1 = info.surface_type << GEN6_DEPTH_DW1_TYPE__SHIFT |
-         info.format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
-
-   if (info.zs.bo) {
-      /* required for GEN6+ */
-      assert(info.zs.tiling == GEN6_TILING_Y);
-      assert(info.zs.stride > 0 && info.zs.stride < 128 * 1024 &&
-            info.zs.stride % 128 == 0);
-      assert(info.width <= info.zs.stride);
-
-      dw1 |= (info.zs.stride - 1);
-      dw2 = info.zs.offset;
-   } else {
-      dw2 = 0;
-   }
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      if (info.zs.bo)
-         dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE;
-
-      if (info.stencil.bo)
-         dw1 |= GEN7_DEPTH_DW1_STENCIL_WRITE_ENABLE;
-
-      if (info.hiz.bo)
-         dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE;
-
-      dw3 = (info.height - 1) << GEN7_DEPTH_DW3_HEIGHT__SHIFT |
-            (info.width - 1) << GEN7_DEPTH_DW3_WIDTH__SHIFT |
-            info.lod << GEN7_DEPTH_DW3_LOD__SHIFT;
-
-      zs->dw_aligned_8x4 =
-         (align(info.height, align_h) - 1) << GEN7_DEPTH_DW3_HEIGHT__SHIFT |
-         (align(info.width, align_w) - 1) << GEN7_DEPTH_DW3_WIDTH__SHIFT |
-         info.lod << GEN7_DEPTH_DW3_LOD__SHIFT;
-
-      dw4 = (info.depth - 1) << GEN7_DEPTH_DW4_DEPTH__SHIFT |
-            info.first_layer << GEN7_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT;
-
-      dw5 = 0;
-
-      dw6 = (info.num_layers - 1) << GEN7_DEPTH_DW6_RT_VIEW_EXTENT__SHIFT;
-
-      if (ilo_dev_gen(dev) >= ILO_GEN(8))
-         dw6 |= info.zs.qpitch;
-   } else {
-      /* always Y-tiled */
-      dw1 |= GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT;
-
-      if (info.hiz.bo) {
-         dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE |
-                GEN6_DEPTH_DW1_SEPARATE_STENCIL;
-      }
-
-      dw3 = (info.height - 1) << GEN6_DEPTH_DW3_HEIGHT__SHIFT |
-            (info.width - 1) << GEN6_DEPTH_DW3_WIDTH__SHIFT |
-            info.lod << GEN6_DEPTH_DW3_LOD__SHIFT |
-            GEN6_DEPTH_DW3_MIPLAYOUT_BELOW;
-
-      zs->dw_aligned_8x4 =
-         (align(info.height, align_h) - 1) << GEN6_DEPTH_DW3_HEIGHT__SHIFT |
-         (align(info.width, align_w) - 1) << GEN6_DEPTH_DW3_WIDTH__SHIFT |
-         info.lod << GEN6_DEPTH_DW3_LOD__SHIFT |
-         GEN6_DEPTH_DW3_MIPLAYOUT_BELOW;
-
-      dw4 = (info.depth - 1) << GEN6_DEPTH_DW4_DEPTH__SHIFT |
-            info.first_layer << GEN6_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT |
-            (info.num_layers - 1) << GEN6_DEPTH_DW4_RT_VIEW_EXTENT__SHIFT;
-
-      dw5 = 0;
-
-      dw6 = 0;
-   }
-
-   STATIC_ASSERT(Elements(zs->payload) >= 12);
-
-   zs->payload[0] = dw1;
-   zs->payload[1] = dw2;
-   zs->payload[2] = dw3;
-   zs->payload[3] = dw4;
-   zs->payload[4] = dw5;
-   zs->payload[5] = dw6;
-
-   /* do not increment reference count */
-   zs->bo = info.zs.bo;
-
-   /* separate stencil */
-   if (info.stencil.bo) {
-      assert(info.stencil.stride > 0 && info.stencil.stride < 128 * 1024 &&
-             info.stencil.stride % 128 == 0);
-
-      dw1 = (info.stencil.stride - 1) << GEN6_STENCIL_DW1_PITCH__SHIFT;
-      if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
-         dw1 |= GEN75_STENCIL_DW1_STENCIL_BUFFER_ENABLE;
-
-      dw2 = info.stencil.offset;
-      dw4 = info.stencil.qpitch;
-   } else {
-      dw1 = 0;
-      dw2 = 0;
-      dw4 = 0;
-   }
-
-   zs->payload[6] = dw1;
-   zs->payload[7] = dw2;
-   zs->payload[8] = dw4;
-   /* do not increment reference count */
-   zs->separate_s8_bo = info.stencil.bo;
-
-   /* hiz */
-   if (info.hiz.bo) {
-      dw1 = (info.hiz.stride - 1) << GEN6_HIZ_DW1_PITCH__SHIFT;
-      dw2 = info.hiz.offset;
-      dw4 = info.hiz.qpitch;
-   } else {
-      dw1 = 0;
-      dw2 = 0;
-      dw4 = 0;
-   }
-
-   zs->payload[9] = dw1;
-   zs->payload[10] = dw2;
-   zs->payload[11] = dw4;
-   /* do not increment reference count */
-   zs->hiz_bo = info.hiz.bo;
-}
-
-static void
-viewport_get_guardband(const struct ilo_dev *dev,
-                       int center_x, int center_y,
-                       int *min_gbx, int *max_gbx,
-                       int *min_gby, int *max_gby)
-{
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 234:
-    *
-    *     "Per-Device Guardband Extents
-    *
-    *       - Supported X,Y ScreenSpace "Guardband" Extent: [-16K,16K-1]
-    *       - Maximum Post-Clamp Delta (X or Y): 16K"
-    *
-    *     "In addition, in order to be correctly rendered, objects must have a
-    *      screenspace bounding box not exceeding 8K in the X or Y direction.
-    *      This additional restriction must also be comprehended by software,
-    *      i.e., enforced by use of clipping."
-    *
-    * From the Ivy Bridge PRM, volume 2 part 1, page 248:
-    *
-    *     "Per-Device Guardband Extents
-    *
-    *       - Supported X,Y ScreenSpace "Guardband" Extent: [-32K,32K-1]
-    *       - Maximum Post-Clamp Delta (X or Y): N/A"
-    *
-    *     "In addition, in order to be correctly rendered, objects must have a
-    *      screenspace bounding box not exceeding 8K in the X or Y direction.
-    *      This additional restriction must also be comprehended by software,
-    *      i.e., enforced by use of clipping."
-    *
-    * Combined, the bounding box of any object can not exceed 8K in both
-    * width and height.
-    *
-    * Below we set the guardband as a squre of length 8K, centered at where
-    * the viewport is.  This makes sure all objects passing the GB test are
-    * valid to the renderer, and those failing the XY clipping have a
-    * better chance of passing the GB test.
-    */
-   const int max_extent = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 32768 : 16384;
-   const int half_len = 8192 / 2;
-
-   /* make sure the guardband is within the valid range */
-   if (center_x - half_len < -max_extent)
-      center_x = -max_extent + half_len;
-   else if (center_x + half_len > max_extent - 1)
-      center_x = max_extent - half_len;
-
-   if (center_y - half_len < -max_extent)
-      center_y = -max_extent + half_len;
-   else if (center_y + half_len > max_extent - 1)
-      center_y = max_extent - half_len;
-
-   *min_gbx = (float) (center_x - half_len);
-   *max_gbx = (float) (center_x + half_len);
-   *min_gby = (float) (center_y - half_len);
-   *max_gby = (float) (center_y + half_len);
-}
-
-void
-ilo_gpe_set_viewport_cso(const struct ilo_dev *dev,
-                         const struct pipe_viewport_state *state,
-                         struct ilo_viewport_cso *vp)
-{
-   const float scale_x = fabs(state->scale[0]);
-   const float scale_y = fabs(state->scale[1]);
-   const float scale_z = fabs(state->scale[2]);
-   int min_gbx, max_gbx, min_gby, max_gby;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   viewport_get_guardband(dev,
-         (int) state->translate[0],
-         (int) state->translate[1],
-         &min_gbx, &max_gbx, &min_gby, &max_gby);
-
-   /* matrix form */
-   vp->m00 = state->scale[0];
-   vp->m11 = state->scale[1];
-   vp->m22 = state->scale[2];
-   vp->m30 = state->translate[0];
-   vp->m31 = state->translate[1];
-   vp->m32 = state->translate[2];
-
-   /* guardband in NDC space */
-   vp->min_gbx = ((float) min_gbx - state->translate[0]) / scale_x;
-   vp->max_gbx = ((float) max_gbx - state->translate[0]) / scale_x;
-   vp->min_gby = ((float) min_gby - state->translate[1]) / scale_y;
-   vp->max_gby = ((float) max_gby - state->translate[1]) / scale_y;
-
-   /* viewport in screen space */
-   vp->min_x = scale_x * -1.0f + state->translate[0];
-   vp->max_x = scale_x *  1.0f + state->translate[0];
-   vp->min_y = scale_y * -1.0f + state->translate[1];
-   vp->max_y = scale_y *  1.0f + state->translate[1];
-   vp->min_z = scale_z * -1.0f + state->translate[2];
-   vp->max_z = scale_z *  1.0f + state->translate[2];
-}
-
-/**
- * Translate a pipe logicop to the matching hardware logicop.
- */
-static int
-gen6_translate_pipe_logicop(unsigned logicop)
-{
-   switch (logicop) {
-   case PIPE_LOGICOP_CLEAR:         return GEN6_LOGICOP_CLEAR;
-   case PIPE_LOGICOP_NOR:           return GEN6_LOGICOP_NOR;
-   case PIPE_LOGICOP_AND_INVERTED:  return GEN6_LOGICOP_AND_INVERTED;
-   case PIPE_LOGICOP_COPY_INVERTED: return GEN6_LOGICOP_COPY_INVERTED;
-   case PIPE_LOGICOP_AND_REVERSE:   return GEN6_LOGICOP_AND_REVERSE;
-   case PIPE_LOGICOP_INVERT:        return GEN6_LOGICOP_INVERT;
-   case PIPE_LOGICOP_XOR:           return GEN6_LOGICOP_XOR;
-   case PIPE_LOGICOP_NAND:          return GEN6_LOGICOP_NAND;
-   case PIPE_LOGICOP_AND:           return GEN6_LOGICOP_AND;
-   case PIPE_LOGICOP_EQUIV:         return GEN6_LOGICOP_EQUIV;
-   case PIPE_LOGICOP_NOOP:          return GEN6_LOGICOP_NOOP;
-   case PIPE_LOGICOP_OR_INVERTED:   return GEN6_LOGICOP_OR_INVERTED;
-   case PIPE_LOGICOP_COPY:          return GEN6_LOGICOP_COPY;
-   case PIPE_LOGICOP_OR_REVERSE:    return GEN6_LOGICOP_OR_REVERSE;
-   case PIPE_LOGICOP_OR:            return GEN6_LOGICOP_OR;
-   case PIPE_LOGICOP_SET:           return GEN6_LOGICOP_SET;
-   default:
-      assert(!"unknown logicop function");
-      return GEN6_LOGICOP_CLEAR;
-   }
-}
-
-/**
- * Translate a pipe blend function to the matching hardware blend function.
- */
-static int
-gen6_translate_pipe_blend(unsigned blend)
-{
-   switch (blend) {
-   case PIPE_BLEND_ADD:                return GEN6_BLENDFUNCTION_ADD;
-   case PIPE_BLEND_SUBTRACT:           return GEN6_BLENDFUNCTION_SUBTRACT;
-   case PIPE_BLEND_REVERSE_SUBTRACT:   return GEN6_BLENDFUNCTION_REVERSE_SUBTRACT;
-   case PIPE_BLEND_MIN:                return GEN6_BLENDFUNCTION_MIN;
-   case PIPE_BLEND_MAX:                return GEN6_BLENDFUNCTION_MAX;
-   default:
-      assert(!"unknown blend function");
-      return GEN6_BLENDFUNCTION_ADD;
-   };
-}
-
-/**
- * Translate a pipe blend factor to the matching hardware blend factor.
- */
-static int
-gen6_translate_pipe_blendfactor(unsigned blendfactor)
-{
-   switch (blendfactor) {
-   case PIPE_BLENDFACTOR_ONE:                return GEN6_BLENDFACTOR_ONE;
-   case PIPE_BLENDFACTOR_SRC_COLOR:          return GEN6_BLENDFACTOR_SRC_COLOR;
-   case PIPE_BLENDFACTOR_SRC_ALPHA:          return GEN6_BLENDFACTOR_SRC_ALPHA;
-   case PIPE_BLENDFACTOR_DST_ALPHA:          return GEN6_BLENDFACTOR_DST_ALPHA;
-   case PIPE_BLENDFACTOR_DST_COLOR:          return GEN6_BLENDFACTOR_DST_COLOR;
-   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE;
-   case PIPE_BLENDFACTOR_CONST_COLOR:        return GEN6_BLENDFACTOR_CONST_COLOR;
-   case PIPE_BLENDFACTOR_CONST_ALPHA:        return GEN6_BLENDFACTOR_CONST_ALPHA;
-   case PIPE_BLENDFACTOR_SRC1_COLOR:         return GEN6_BLENDFACTOR_SRC1_COLOR;
-   case PIPE_BLENDFACTOR_SRC1_ALPHA:         return GEN6_BLENDFACTOR_SRC1_ALPHA;
-   case PIPE_BLENDFACTOR_ZERO:               return GEN6_BLENDFACTOR_ZERO;
-   case PIPE_BLENDFACTOR_INV_SRC_COLOR:      return GEN6_BLENDFACTOR_INV_SRC_COLOR;
-   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:      return GEN6_BLENDFACTOR_INV_SRC_ALPHA;
-   case PIPE_BLENDFACTOR_INV_DST_ALPHA:      return GEN6_BLENDFACTOR_INV_DST_ALPHA;
-   case PIPE_BLENDFACTOR_INV_DST_COLOR:      return GEN6_BLENDFACTOR_INV_DST_COLOR;
-   case PIPE_BLENDFACTOR_INV_CONST_COLOR:    return GEN6_BLENDFACTOR_INV_CONST_COLOR;
-   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:    return GEN6_BLENDFACTOR_INV_CONST_ALPHA;
-   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:     return GEN6_BLENDFACTOR_INV_SRC1_COLOR;
-   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:     return GEN6_BLENDFACTOR_INV_SRC1_ALPHA;
-   default:
-      assert(!"unknown blend factor");
-      return GEN6_BLENDFACTOR_ONE;
-   };
-}
-
-/**
- * Translate a pipe stencil op to the matching hardware stencil op.
- */
-static int
-gen6_translate_pipe_stencil_op(unsigned stencil_op)
-{
-   switch (stencil_op) {
-   case PIPE_STENCIL_OP_KEEP:       return GEN6_STENCILOP_KEEP;
-   case PIPE_STENCIL_OP_ZERO:       return GEN6_STENCILOP_ZERO;
-   case PIPE_STENCIL_OP_REPLACE:    return GEN6_STENCILOP_REPLACE;
-   case PIPE_STENCIL_OP_INCR:       return GEN6_STENCILOP_INCRSAT;
-   case PIPE_STENCIL_OP_DECR:       return GEN6_STENCILOP_DECRSAT;
-   case PIPE_STENCIL_OP_INCR_WRAP:  return GEN6_STENCILOP_INCR;
-   case PIPE_STENCIL_OP_DECR_WRAP:  return GEN6_STENCILOP_DECR;
-   case PIPE_STENCIL_OP_INVERT:     return GEN6_STENCILOP_INVERT;
-   default:
-      assert(!"unknown stencil op");
-      return GEN6_STENCILOP_KEEP;
-   }
-}
-
-static int
-gen6_blend_factor_dst_alpha_forced_one(int factor)
-{
-   switch (factor) {
-   case GEN6_BLENDFACTOR_DST_ALPHA:
-      return GEN6_BLENDFACTOR_ONE;
-   case GEN6_BLENDFACTOR_INV_DST_ALPHA:
-   case GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE:
-      return GEN6_BLENDFACTOR_ZERO;
-   default:
-      return factor;
-   }
-}
-
-static uint32_t
-blend_get_rt_blend_enable_gen6(const struct ilo_dev *dev,
-                               const struct pipe_rt_blend_state *rt,
-                               bool dst_alpha_forced_one)
-{
-   int rgb_src, rgb_dst, a_src, a_dst;
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 6, 7.5);
-
-   if (!rt->blend_enable)
-      return 0;
-
-   rgb_src = gen6_translate_pipe_blendfactor(rt->rgb_src_factor);
-   rgb_dst = gen6_translate_pipe_blendfactor(rt->rgb_dst_factor);
-   a_src = gen6_translate_pipe_blendfactor(rt->alpha_src_factor);
-   a_dst = gen6_translate_pipe_blendfactor(rt->alpha_dst_factor);
-
-   if (dst_alpha_forced_one) {
-      rgb_src = gen6_blend_factor_dst_alpha_forced_one(rgb_src);
-      rgb_dst = gen6_blend_factor_dst_alpha_forced_one(rgb_dst);
-      a_src = gen6_blend_factor_dst_alpha_forced_one(a_src);
-      a_dst = gen6_blend_factor_dst_alpha_forced_one(a_dst);
-   }
-
-   dw = GEN6_RT_DW0_BLEND_ENABLE |
-        gen6_translate_pipe_blend(rt->alpha_func) << 26 |
-        a_src << 20 |
-        a_dst << 15 |
-        gen6_translate_pipe_blend(rt->rgb_func) << 11 |
-        rgb_src << 5 |
-        rgb_dst;
-
-   if (rt->rgb_func != rt->alpha_func ||
-       rgb_src != a_src || rgb_dst != a_dst)
-      dw |= GEN6_RT_DW0_INDEPENDENT_ALPHA_ENABLE;
-
-   return dw;
-}
-
-static uint32_t
-blend_get_rt_blend_enable_gen8(const struct ilo_dev *dev,
-                               const struct pipe_rt_blend_state *rt,
-                               bool dst_alpha_forced_one,
-                               bool *independent_alpha)
-{
-   int rgb_src, rgb_dst, a_src, a_dst;
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   if (!rt->blend_enable) {
-      *independent_alpha = false;
-      return 0;
-   }
-
-   rgb_src = gen6_translate_pipe_blendfactor(rt->rgb_src_factor);
-   rgb_dst = gen6_translate_pipe_blendfactor(rt->rgb_dst_factor);
-   a_src = gen6_translate_pipe_blendfactor(rt->alpha_src_factor);
-   a_dst = gen6_translate_pipe_blendfactor(rt->alpha_dst_factor);
-
-   if (dst_alpha_forced_one) {
-      rgb_src = gen6_blend_factor_dst_alpha_forced_one(rgb_src);
-      rgb_dst = gen6_blend_factor_dst_alpha_forced_one(rgb_dst);
-      a_src = gen6_blend_factor_dst_alpha_forced_one(a_src);
-      a_dst = gen6_blend_factor_dst_alpha_forced_one(a_dst);
-   }
-
-   dw = GEN8_RT_DW0_BLEND_ENABLE |
-        rgb_src << 26 |
-        rgb_dst << 21 |
-        gen6_translate_pipe_blend(rt->rgb_func) << 18 |
-        a_src << 13 |
-        a_dst << 8 |
-        gen6_translate_pipe_blend(rt->alpha_func) << 5;
-
-   *independent_alpha = (rt->rgb_func != rt->alpha_func ||
-                         rgb_src != a_src ||
-                         rgb_dst != a_dst);
-
-   return dw;
-}
-
-static void
-blend_init_cso_gen6(const struct ilo_dev *dev,
-                    const struct pipe_blend_state *state,
-                    struct ilo_blend_state *blend,
-                    unsigned index)
-{
-   const struct pipe_rt_blend_state *rt = &state->rt[index];
-   struct ilo_blend_cso *cso = &blend->cso[index];
-
-   ILO_DEV_ASSERT(dev, 6, 7.5);
-
-   cso->payload[0] = 0;
-   cso->payload[1] = GEN6_RT_DW1_COLORCLAMP_RTFORMAT |
-                     GEN6_RT_DW1_PRE_BLEND_CLAMP |
-                     GEN6_RT_DW1_POST_BLEND_CLAMP;
-
-   if (!(rt->colormask & PIPE_MASK_A))
-      cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_A;
-   if (!(rt->colormask & PIPE_MASK_R))
-      cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_R;
-   if (!(rt->colormask & PIPE_MASK_G))
-      cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_G;
-   if (!(rt->colormask & PIPE_MASK_B))
-      cso->payload[1] |= GEN6_RT_DW1_WRITE_DISABLE_B;
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 365:
-    *
-    *     "Color Buffer Blending and Logic Ops must not be enabled
-    *      simultaneously, or behavior is UNDEFINED."
-    *
-    * Since state->logicop_enable takes precedence over rt->blend_enable,
-    * no special care is needed.
-    */
-   if (state->logicop_enable) {
-      cso->dw_blend = 0;
-      cso->dw_blend_dst_alpha_forced_one = 0;
-   } else {
-      cso->dw_blend = blend_get_rt_blend_enable_gen6(dev, rt, false);
-      cso->dw_blend_dst_alpha_forced_one =
-         blend_get_rt_blend_enable_gen6(dev, rt, true);
-   }
-}
-
-static bool
-blend_init_cso_gen8(const struct ilo_dev *dev,
-                    const struct pipe_blend_state *state,
-                    struct ilo_blend_state *blend,
-                    unsigned index)
-{
-   const struct pipe_rt_blend_state *rt = &state->rt[index];
-   struct ilo_blend_cso *cso = &blend->cso[index];
-   bool independent_alpha = false;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   cso->payload[0] = 0;
-   cso->payload[1] = GEN8_RT_DW1_COLORCLAMP_RTFORMAT |
-                     GEN8_RT_DW1_PRE_BLEND_CLAMP |
-                     GEN8_RT_DW1_POST_BLEND_CLAMP;
-
-   if (!(rt->colormask & PIPE_MASK_A))
-      cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_A;
-   if (!(rt->colormask & PIPE_MASK_R))
-      cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_R;
-   if (!(rt->colormask & PIPE_MASK_G))
-      cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_G;
-   if (!(rt->colormask & PIPE_MASK_B))
-      cso->payload[0] |= GEN8_RT_DW0_WRITE_DISABLE_B;
-
-   if (state->logicop_enable) {
-      cso->dw_blend = 0;
-      cso->dw_blend_dst_alpha_forced_one = 0;
-   } else {
-      bool tmp[2];
-
-      cso->dw_blend = blend_get_rt_blend_enable_gen8(dev, rt, false, &tmp[0]);
-      cso->dw_blend_dst_alpha_forced_one =
-         blend_get_rt_blend_enable_gen8(dev, rt, true, &tmp[1]);
-
-      if (tmp[0] || tmp[1])
-         independent_alpha = true;
-   }
-
-   return independent_alpha;
-}
-
-static uint32_t
-blend_get_logicop_enable_gen6(const struct ilo_dev *dev,
-                              const struct pipe_blend_state *state)
-{
-   ILO_DEV_ASSERT(dev, 6, 7.5);
-
-   if (!state->logicop_enable)
-      return 0;
-
-   return GEN6_RT_DW1_LOGICOP_ENABLE |
-          gen6_translate_pipe_logicop(state->logicop_func) << 18;
-}
-
-static uint32_t
-blend_get_logicop_enable_gen8(const struct ilo_dev *dev,
-                              const struct pipe_blend_state *state)
-{
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   if (!state->logicop_enable)
-      return 0;
-
-   return GEN8_RT_DW1_LOGICOP_ENABLE |
-          gen6_translate_pipe_logicop(state->logicop_func) << 27;
-}
-
-static uint32_t
-blend_get_alpha_mod_gen6(const struct ilo_dev *dev,
-                         const struct pipe_blend_state *state,
-                         bool dual_blend)
-{
-   uint32_t dw = 0;
-
-   ILO_DEV_ASSERT(dev, 6, 7.5);
-
-   if (state->alpha_to_coverage) {
-      dw |= GEN6_RT_DW1_ALPHA_TO_COVERAGE;
-      if (ilo_dev_gen(dev) >= ILO_GEN(7))
-         dw |= GEN6_RT_DW1_ALPHA_TO_COVERAGE_DITHER;
-   }
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 378:
-    *
-    *     "If Dual Source Blending is enabled, this bit (AlphaToOne Enable)
-    *      must be disabled."
-    */
-   if (state->alpha_to_one && !dual_blend)
-      dw |= GEN6_RT_DW1_ALPHA_TO_ONE;
-
-   return dw;
-}
-
-static uint32_t
-blend_get_alpha_mod_gen8(const struct ilo_dev *dev,
-                         const struct pipe_blend_state *state,
-                         bool dual_blend)
-{
-   uint32_t dw = 0;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   if (state->alpha_to_coverage) {
-      dw |= GEN8_BLEND_DW0_ALPHA_TO_COVERAGE |
-            GEN8_BLEND_DW0_ALPHA_TO_COVERAGE_DITHER;
-   }
-
-   if (state->alpha_to_one && !dual_blend)
-      dw |= GEN8_BLEND_DW0_ALPHA_TO_ONE;
-
-   return dw;
-}
-
-static uint32_t
-blend_get_ps_blend_gen8(const struct ilo_dev *dev, uint32_t rt_dw0)
-{
-   int rgb_src, rgb_dst, a_src, a_dst;
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   if (!(rt_dw0 & GEN8_RT_DW0_BLEND_ENABLE))
-      return 0;
-
-   a_src = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_SRC_ALPHA_FACTOR);
-   a_dst = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_DST_ALPHA_FACTOR);
-   rgb_src = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_SRC_COLOR_FACTOR);
-   rgb_dst = GEN_EXTRACT(rt_dw0, GEN8_RT_DW0_DST_COLOR_FACTOR);
-
-   dw = GEN8_PS_BLEND_DW1_BLEND_ENABLE;
-   dw |= GEN_SHIFT32(a_src, GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR);
-   dw |= GEN_SHIFT32(a_dst, GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR);
-   dw |= GEN_SHIFT32(rgb_src, GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR);
-   dw |= GEN_SHIFT32(rgb_dst, GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR);
-
-   if (a_src != rgb_src || a_dst != rgb_dst)
-      dw |= GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE;
-
-   return dw;
-}
-
-void
-ilo_gpe_init_blend(const struct ilo_dev *dev,
-                   const struct pipe_blend_state *state,
-                   struct ilo_blend_state *blend)
-{
-   unsigned i;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   blend->dual_blend = (util_blend_state_is_dual(state, 0) &&
-                        state->rt[0].blend_enable &&
-                        !state->logicop_enable);
-   blend->alpha_to_coverage = state->alpha_to_coverage;
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
-      bool independent_alpha;
-
-      blend->dw_alpha_mod =
-         blend_get_alpha_mod_gen8(dev, state, blend->dual_blend);
-      blend->dw_logicop = blend_get_logicop_enable_gen8(dev, state);
-      blend->dw_shared = (state->dither) ? GEN8_BLEND_DW0_DITHER_ENABLE : 0;
-
-      independent_alpha = blend_init_cso_gen8(dev, state, blend, 0);
-      if (independent_alpha)
-         blend->dw_shared |= GEN8_BLEND_DW0_INDEPENDENT_ALPHA_ENABLE;
-
-      blend->dw_ps_blend = blend_get_ps_blend_gen8(dev,
-            blend->cso[0].dw_blend);
-      blend->dw_ps_blend_dst_alpha_forced_one = blend_get_ps_blend_gen8(dev,
-            blend->cso[0].dw_blend_dst_alpha_forced_one);
-
-      if (state->independent_blend_enable) {
-         for (i = 1; i < Elements(blend->cso); i++) {
-            independent_alpha = blend_init_cso_gen8(dev, state, blend, i);
-            if (independent_alpha)
-               blend->dw_shared |= GEN8_BLEND_DW0_INDEPENDENT_ALPHA_ENABLE;
-         }
-      } else {
-         for (i = 1; i < Elements(blend->cso); i++)
-            blend->cso[i] = blend->cso[0];
-      }
-   } else {
-      blend->dw_alpha_mod =
-         blend_get_alpha_mod_gen6(dev, state, blend->dual_blend);
-      blend->dw_logicop = blend_get_logicop_enable_gen6(dev, state);
-      blend->dw_shared = (state->dither) ? GEN6_RT_DW1_DITHER_ENABLE : 0;
-
-      blend->dw_ps_blend = 0;
-      blend->dw_ps_blend_dst_alpha_forced_one = 0;
-
-      blend_init_cso_gen6(dev, state, blend, 0);
-      if (state->independent_blend_enable) {
-         for (i = 1; i < Elements(blend->cso); i++)
-            blend_init_cso_gen6(dev, state, blend, i);
-      } else {
-         for (i = 1; i < Elements(blend->cso); i++)
-            blend->cso[i] = blend->cso[0];
-      }
-   }
-}
-
-/**
- * Translate a pipe DSA test function to the matching hardware compare
- * function.
- */
-static int
-gen6_translate_dsa_func(unsigned func)
-{
-   switch (func) {
-   case PIPE_FUNC_NEVER:      return GEN6_COMPAREFUNCTION_NEVER;
-   case PIPE_FUNC_LESS:       return GEN6_COMPAREFUNCTION_LESS;
-   case PIPE_FUNC_EQUAL:      return GEN6_COMPAREFUNCTION_EQUAL;
-   case PIPE_FUNC_LEQUAL:     return GEN6_COMPAREFUNCTION_LEQUAL;
-   case PIPE_FUNC_GREATER:    return GEN6_COMPAREFUNCTION_GREATER;
-   case PIPE_FUNC_NOTEQUAL:   return GEN6_COMPAREFUNCTION_NOTEQUAL;
-   case PIPE_FUNC_GEQUAL:     return GEN6_COMPAREFUNCTION_GEQUAL;
-   case PIPE_FUNC_ALWAYS:     return GEN6_COMPAREFUNCTION_ALWAYS;
-   default:
-      assert(!"unknown depth/stencil/alpha test function");
-      return GEN6_COMPAREFUNCTION_NEVER;
-   }
-}
-
-static uint32_t
-dsa_get_stencil_enable_gen6(const struct ilo_dev *dev,
-                            const struct pipe_stencil_state *stencil0,
-                            const struct pipe_stencil_state *stencil1)
-{
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 6, 7.5);
-
-   if (!stencil0->enabled)
-      return 0;
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 359:
-    *
-    *     "If the Depth Buffer is either undefined or does not have a surface
-    *      format of D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT and separate
-    *      stencil buffer is disabled, Stencil Test Enable must be DISABLED"
-    *
-    * From the Sandy Bridge PRM, volume 2 part 1, page 370:
-    *
-    *     "This field (Stencil Test Enable) cannot be enabled if
-    *      Surface Format in 3DSTATE_DEPTH_BUFFER is set to D16_UNORM."
-    *
-    * TODO We do not check these yet.
-    */
-   dw = GEN6_ZS_DW0_STENCIL_TEST_ENABLE |
-        gen6_translate_dsa_func(stencil0->func) << 28 |
-        gen6_translate_pipe_stencil_op(stencil0->fail_op) << 25 |
-        gen6_translate_pipe_stencil_op(stencil0->zfail_op) << 22 |
-        gen6_translate_pipe_stencil_op(stencil0->zpass_op) << 19;
-   if (stencil0->writemask)
-      dw |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
-
-   if (stencil1->enabled) {
-      dw |= GEN6_ZS_DW0_STENCIL1_ENABLE |
-            gen6_translate_dsa_func(stencil1->func) << 12 |
-            gen6_translate_pipe_stencil_op(stencil1->fail_op) << 9 |
-            gen6_translate_pipe_stencil_op(stencil1->zfail_op) << 6 |
-            gen6_translate_pipe_stencil_op(stencil1->zpass_op) << 3;
-      if (stencil1->writemask)
-         dw |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
-   }
-
-   return dw;
-}
-
-static uint32_t
-dsa_get_stencil_enable_gen8(const struct ilo_dev *dev,
-                            const struct pipe_stencil_state *stencil0,
-                            const struct pipe_stencil_state *stencil1)
-{
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   if (!stencil0->enabled)
-      return 0;
-
-   dw = gen6_translate_pipe_stencil_op(stencil0->fail_op) << 29 |
-        gen6_translate_pipe_stencil_op(stencil0->zfail_op) << 26 |
-        gen6_translate_pipe_stencil_op(stencil0->zpass_op) << 23 |
-        gen6_translate_dsa_func(stencil0->func) << 8 |
-        GEN8_ZS_DW1_STENCIL_TEST_ENABLE;
-   if (stencil0->writemask)
-      dw |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
-
-   if (stencil1->enabled) {
-      dw |= gen6_translate_dsa_func(stencil1->func) << 20 |
-            gen6_translate_pipe_stencil_op(stencil1->fail_op) << 17 |
-            gen6_translate_pipe_stencil_op(stencil1->zfail_op) << 14 |
-            gen6_translate_pipe_stencil_op(stencil1->zpass_op) << 11 |
-            GEN8_ZS_DW1_STENCIL1_ENABLE;
-      if (stencil1->writemask)
-         dw |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
-   }
-
-   return dw;
-}
-
-static uint32_t
-dsa_get_depth_enable_gen6(const struct ilo_dev *dev,
-                          const struct pipe_depth_state *state)
-{
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 6, 7.5);
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 360:
-    *
-    *     "Enabling the Depth Test function without defining a Depth Buffer is
-    *      UNDEFINED."
-    *
-    * From the Sandy Bridge PRM, volume 2 part 1, page 375:
-    *
-    *     "A Depth Buffer must be defined before enabling writes to it, or
-    *      operation is UNDEFINED."
-    *
-    * TODO We do not check these yet.
-    */
-   if (state->enabled) {
-      dw = GEN6_ZS_DW2_DEPTH_TEST_ENABLE |
-           gen6_translate_dsa_func(state->func) << 27;
-   } else {
-      dw = GEN6_COMPAREFUNCTION_ALWAYS << 27;
-   }
-
-   if (state->writemask)
-      dw |= GEN6_ZS_DW2_DEPTH_WRITE_ENABLE;
-
-   return dw;
-}
-
-static uint32_t
-dsa_get_depth_enable_gen8(const struct ilo_dev *dev,
-                          const struct pipe_depth_state *state)
-{
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   if (state->enabled) {
-      dw = GEN8_ZS_DW1_DEPTH_TEST_ENABLE |
-           gen6_translate_dsa_func(state->func) << 5;
-   } else {
-      dw = GEN6_COMPAREFUNCTION_ALWAYS << 5;
-   }
-
-   if (state->writemask)
-      dw |= GEN8_ZS_DW1_DEPTH_WRITE_ENABLE;
-
-   return dw;
-}
-
-static uint32_t
-dsa_get_alpha_enable_gen6(const struct ilo_dev *dev,
-                          const struct pipe_alpha_state *state)
-{
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 6, 7.5);
-
-   if (!state->enabled)
-      return 0;
-
-   /* this will be ORed to BLEND_STATE */
-   dw = GEN6_RT_DW1_ALPHA_TEST_ENABLE |
-        gen6_translate_dsa_func(state->func) << 13;
-
-   return dw;
-}
-
-static uint32_t
-dsa_get_alpha_enable_gen8(const struct ilo_dev *dev,
-                          const struct pipe_alpha_state *state)
-{
-   uint32_t dw;
-
-   ILO_DEV_ASSERT(dev, 8, 8);
-
-   if (!state->enabled)
-      return 0;
-
-   /* this will be ORed to BLEND_STATE */
-   dw = GEN8_BLEND_DW0_ALPHA_TEST_ENABLE |
-        gen6_translate_dsa_func(state->func) << 24;
-
-   return dw;
-}
-
-void
-ilo_gpe_init_dsa(const struct ilo_dev *dev,
-                 const struct pipe_depth_stencil_alpha_state *state,
-                 struct ilo_dsa_state *dsa)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   STATIC_ASSERT(Elements(dsa->payload) >= 3);
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
-      const uint32_t dw_stencil = dsa_get_stencil_enable_gen8(dev,
-            &state->stencil[0], &state->stencil[1]);
-      const uint32_t dw_depth = dsa_get_depth_enable_gen8(dev, &state->depth);
-
-      assert(!(dw_stencil & dw_depth));
-      dsa->payload[0] = dw_stencil | dw_depth;
-
-      dsa->dw_blend_alpha = dsa_get_alpha_enable_gen8(dev, &state->alpha);
-      dsa->dw_ps_blend_alpha = (state->alpha.enabled) ?
-         GEN8_PS_BLEND_DW1_ALPHA_TEST_ENABLE : 0;
-   } else {
-      dsa->payload[0] = dsa_get_stencil_enable_gen6(dev,
-            &state->stencil[0], &state->stencil[1]);
-      dsa->payload[2] = dsa_get_depth_enable_gen6(dev, &state->depth);
-
-      dsa->dw_blend_alpha = dsa_get_alpha_enable_gen6(dev, &state->alpha);
-      dsa->dw_ps_blend_alpha = 0;
-   }
-
-   dsa->payload[1] = state->stencil[0].valuemask << 24 |
-                     state->stencil[0].writemask << 16 |
-                     state->stencil[1].valuemask << 8 |
-                     state->stencil[1].writemask;
-
-   dsa->alpha_ref = float_to_ubyte(state->alpha.ref_value);
-}
-
-void
-ilo_gpe_set_scissor(const struct ilo_dev *dev,
-                    unsigned start_slot,
-                    unsigned num_states,
-                    const struct pipe_scissor_state *states,
-                    struct ilo_scissor_state *scissor)
-{
-   unsigned i;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   for (i = 0; i < num_states; i++) {
-      uint16_t min_x, min_y, max_x, max_y;
-
-      /* both max and min are inclusive in SCISSOR_RECT */
-      if (states[i].minx < states[i].maxx &&
-          states[i].miny < states[i].maxy) {
-         min_x = states[i].minx;
-         min_y = states[i].miny;
-         max_x = states[i].maxx - 1;
-         max_y = states[i].maxy - 1;
-      }
-      else {
-         /* we have to make min greater than max */
-         min_x = 1;
-         min_y = 1;
-         max_x = 0;
-         max_y = 0;
-      }
-
-      scissor->payload[(start_slot + i) * 2 + 0] = min_y << 16 | min_x;
-      scissor->payload[(start_slot + i) * 2 + 1] = max_y << 16 | max_x;
-   }
-
-   if (!start_slot && num_states)
-      scissor->scissor0 = states[0];
-}
-
-void
-ilo_gpe_set_scissor_null(const struct ilo_dev *dev,
-                         struct ilo_scissor_state *scissor)
-{
-   unsigned i;
-
-   for (i = 0; i < Elements(scissor->payload); i += 2) {
-      scissor->payload[i + 0] = 1 << 16 | 1;
-      scissor->payload[i + 1] = 0;
-   }
-}
-
-static void
-fb_set_blend_caps(const struct ilo_dev *dev,
-                  enum pipe_format format,
-                  struct ilo_fb_blend_caps *caps)
-{
-   const struct util_format_description *desc =
-      util_format_description(format);
-   const int ch = util_format_get_first_non_void_channel(format);
-
-   memset(caps, 0, sizeof(*caps));
-
-   if (format == PIPE_FORMAT_NONE || desc->is_mixed)
-      return;
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 365:
-    *
-    *     "Logic Ops are only supported on *_UNORM surfaces (excluding _SRGB
-    *      variants), otherwise Logic Ops must be DISABLED."
-    *
-    * According to the classic driver, this is lifted on Gen8+.
-    */
-   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
-      caps->can_logicop = true;
-   } else {
-      caps->can_logicop = (ch >= 0 && desc->channel[ch].normalized &&
-            desc->channel[ch].type == UTIL_FORMAT_TYPE_UNSIGNED &&
-            desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
-   }
-
-   /* no blending for pure integer formats */
-   caps->can_blend = !util_format_is_pure_integer(format);
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 382:
-    *
-    *     "Alpha Test can only be enabled if Pixel Shader outputs a float
-    *      alpha value."
-    */
-   caps->can_alpha_test = !util_format_is_pure_integer(format);
-
-   caps->dst_alpha_forced_one =
-      (ilo_format_translate_render(dev, format) !=
-       ilo_format_translate_color(dev, format));
-
-   /* sanity check */
-   if (caps->dst_alpha_forced_one) {
-      enum pipe_format render_format;
-
-      switch (format) {
-      case PIPE_FORMAT_B8G8R8X8_UNORM:
-         render_format = PIPE_FORMAT_B8G8R8A8_UNORM;
-         break;
-      default:
-         render_format = PIPE_FORMAT_NONE;
-         break;
-      }
-
-      assert(ilo_format_translate_render(dev, format) ==
-             ilo_format_translate_color(dev, render_format));
-   }
-}
-
-void
-ilo_gpe_set_fb(const struct ilo_dev *dev,
-               const struct pipe_framebuffer_state *state,
-               struct ilo_fb_state *fb)
-{
-   const struct pipe_surface *first_surf = NULL;
-   int i;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   util_copy_framebuffer_state(&fb->state, state);
-
-   ilo_gpe_init_view_surface_null(dev,
-         (state->width) ? state->width : 1,
-         (state->height) ? state->height : 1,
-         1, 0, &fb->null_rt);
-
-   for (i = 0; i < state->nr_cbufs; i++) {
-      if (state->cbufs[i]) {
-         fb_set_blend_caps(dev, state->cbufs[i]->format, &fb->blend_caps[i]);
-
-         if (!first_surf)
-            first_surf = state->cbufs[i];
-      } else {
-         fb_set_blend_caps(dev, PIPE_FORMAT_NONE, &fb->blend_caps[i]);
-      }
-   }
-
-   if (!first_surf && state->zsbuf)
-      first_surf = state->zsbuf;
-
-   fb->num_samples = (first_surf) ? first_surf->texture->nr_samples : 1;
-   if (!fb->num_samples)
-      fb->num_samples = 1;
-
-   /*
-    * The PRMs list several restrictions when the framebuffer has more than
-    * one surface.  It seems they are actually lifted on GEN6+.
-    */
-}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_3d_top.c b/src/gallium/drivers/ilo/core/ilo_state_3d_top.c
deleted file mode 100644
index c17957f..0000000
--- a/src/gallium/drivers/ilo/core/ilo_state_3d_top.c
+++ /dev/null
@@ -1,1716 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2012-2014 LunarG, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Chia-I Wu <olv@lunarg.com>
- */
-
-#include "genhw/genhw.h"
-#include "util/u_dual_blend.h"
-#include "util/u_framebuffer.h"
-#include "util/u_half.h"
-#include "util/u_resource.h"
-
-#include "ilo_buffer.h"
-#include "ilo_format.h"
-#include "ilo_image.h"
-#include "ilo_state_3d.h"
-#include "../ilo_shader.h"
-
-static void
-ve_init_cso(const struct ilo_dev *dev,
-            const struct pipe_vertex_element *state,
-            unsigned vb_index,
-            struct ilo_ve_cso *cso)
-{
-   int comp[4] = {
-      GEN6_VFCOMP_STORE_SRC,
-      GEN6_VFCOMP_STORE_SRC,
-      GEN6_VFCOMP_STORE_SRC,
-      GEN6_VFCOMP_STORE_SRC,
-   };
-   int format;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   switch (util_format_get_nr_components(state->src_format)) {
-   case 1: comp[1] = GEN6_VFCOMP_STORE_0;
-   case 2: comp[2] = GEN6_VFCOMP_STORE_0;
-   case 3: comp[3] = (util_format_is_pure_integer(state->src_format)) ?
-                     GEN6_VFCOMP_STORE_1_INT :
-                     GEN6_VFCOMP_STORE_1_FP;
-   }
-
-   format = ilo_format_translate_vertex(dev, state->src_format);
-
-   STATIC_ASSERT(Elements(cso->payload) >= 2);
-   cso->payload[0] =
-      vb_index << GEN6_VE_DW0_VB_INDEX__SHIFT |
-      GEN6_VE_DW0_VALID |
-      format << GEN6_VE_DW0_FORMAT__SHIFT |
-      state->src_offset << GEN6_VE_DW0_VB_OFFSET__SHIFT;
-
-   cso->payload[1] =
-         comp[0] << GEN6_VE_DW1_COMP0__SHIFT |
-         comp[1] << GEN6_VE_DW1_COMP1__SHIFT |
-         comp[2] << GEN6_VE_DW1_COMP2__SHIFT |
-         comp[3] << GEN6_VE_DW1_COMP3__SHIFT;
-}
-
-void
-ilo_gpe_init_ve(const struct ilo_dev *dev,
-                unsigned num_states,
-                const struct pipe_vertex_element *states,
-                struct ilo_ve_state *ve)
-{
-   unsigned i;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   ve->count = num_states;
-   ve->vb_count = 0;
-
-   for (i = 0; i < num_states; i++) {
-      const unsigned pipe_idx = states[i].vertex_buffer_index;
-      const unsigned instance_divisor = states[i].instance_divisor;
-      unsigned hw_idx;
-
-      /*
-       * map the pipe vb to the hardware vb, which has a fixed instance
-       * divisor
-       */
-      for (hw_idx = 0; hw_idx < ve->vb_count; hw_idx++) {
-         if (ve->vb_mapping[hw_idx] == pipe_idx &&
-             ve->instance_divisors[hw_idx] == instance_divisor)
-            break;
-      }
-
-      /* create one if there is no matching hardware vb */
-      if (hw_idx >= ve->vb_count) {
-         hw_idx = ve->vb_count++;
-
-         ve->vb_mapping[hw_idx] = pipe_idx;
-         ve->instance_divisors[hw_idx] = instance_divisor;
-      }
-
-      ve_init_cso(dev, &states[i], hw_idx, &ve->cso[i]);
-   }
-}
-
-void
-ilo_gpe_set_ve_edgeflag(const struct ilo_dev *dev,
-                        struct ilo_ve_cso *cso)
-{
-   int format;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 94:
-    *
-    *     "- This bit (Edge Flag Enable) must only be ENABLED on the last
-    *        valid VERTEX_ELEMENT structure.
-    *
-    *      - When set, Component 0 Control must be set to VFCOMP_STORE_SRC,
-    *        and Component 1-3 Control must be set to VFCOMP_NOSTORE.
-    *
-    *      - The Source Element Format must be set to the UINT format.
-    *
-    *      - [DevSNB]: Edge Flags are not supported for QUADLIST
-    *        primitives.  Software may elect to convert QUADLIST primitives
-    *        to some set of corresponding edge-flag-supported primitive
-    *        types (e.g., POLYGONs) prior to submission to the 3D pipeline."
-    */
-   cso->payload[0] |= GEN6_VE_DW0_EDGE_FLAG_ENABLE;
-
-   /*
-    * Edge flags have format GEN6_FORMAT_R8_USCALED when defined via
-    * glEdgeFlagPointer(), and format GEN6_FORMAT_R32_FLOAT when defined
-    * via glEdgeFlag(), as can be seen in vbo_attrib_tmp.h.
-    *
-    * Since all the hardware cares about is whether the flags are zero or not,
-    * we can treat them as the corresponding _UINT formats.
-    */
-   format = GEN_EXTRACT(cso->payload[0], GEN6_VE_DW0_FORMAT);
-   cso->payload[0] &= ~GEN6_VE_DW0_FORMAT__MASK;
-
-   switch (format) {
-   case GEN6_FORMAT_R32_FLOAT:
-      format = GEN6_FORMAT_R32_UINT;
-      break;
-   case GEN6_FORMAT_R8_USCALED:
-      format = GEN6_FORMAT_R8_UINT;
-      break;
-   default:
-      break;
-   }
-
-   cso->payload[0] |= GEN_SHIFT32(format, GEN6_VE_DW0_FORMAT);
-
-   cso->payload[1] =
-         GEN6_VFCOMP_STORE_SRC << GEN6_VE_DW1_COMP0__SHIFT |
-         GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP1__SHIFT |
-         GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP2__SHIFT |
-         GEN6_VFCOMP_NOSTORE << GEN6_VE_DW1_COMP3__SHIFT;
-}
-
-void
-ilo_gpe_init_ve_nosrc(const struct ilo_dev *dev,
-                          int comp0, int comp1, int comp2, int comp3,
-                          struct ilo_ve_cso *cso)
-{
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   STATIC_ASSERT(Elements(cso->payload) >= 2);
-
-   assert(comp0 != GEN6_VFCOMP_STORE_SRC &&
-          comp1 != GEN6_VFCOMP_STORE_SRC &&
-          comp2 != GEN6_VFCOMP_STORE_SRC &&
-          comp3 != GEN6_VFCOMP_STORE_SRC);
-
-   cso->payload[0] = GEN6_VE_DW0_VALID;
-   cso->payload[1] =
-         comp0 << GEN6_VE_DW1_COMP0__SHIFT |
-         comp1 << GEN6_VE_DW1_COMP1__SHIFT |
-         comp2 << GEN6_VE_DW1_COMP2__SHIFT |
-         comp3 << GEN6_VE_DW1_COMP3__SHIFT;
-}
-
-void
-ilo_gpe_init_vs_cso(const struct ilo_dev *dev,
-                    const struct ilo_shader_state *vs,
-                    struct ilo_shader_cso *cso)
-{
-   int start_grf, vue_read_len, sampler_count, max_threads;
-   uint32_t dw2, dw4, dw5;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   start_grf = ilo_shader_get_kernel_param(vs, ILO_KERNEL_URB_DATA_START_REG);
-   vue_read_len = ilo_shader_get_kernel_param(vs, ILO_KERNEL_INPUT_COUNT);
-   sampler_count = ilo_shader_get_kernel_param(vs, ILO_KERNEL_SAMPLER_COUNT);
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 135:
-    *
-    *     "(Vertex URB Entry Read Length) Specifies the number of pairs of
-    *      128-bit vertex elements to be passed into the payload for each
-    *      vertex."
-    *
-    *     "It is UNDEFINED to set this field to 0 indicating no Vertex URB
-    *      data to be read and passed to the thread."
-    */
-   vue_read_len = (vue_read_len + 1) / 2;
-   if (!vue_read_len)
-      vue_read_len = 1;
-
-   max_threads = dev->thread_count;
-   if (ilo_dev_gen(dev) == ILO_GEN(7.5) && dev->gt == 2)
-      max_threads *= 2;
-
-   dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
-   dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
-   dw4 = start_grf << GEN6_VS_DW4_URB_GRF_START__SHIFT |
-         vue_read_len << GEN6_VS_DW4_URB_READ_LEN__SHIFT |
-         0 << GEN6_VS_DW4_URB_READ_OFFSET__SHIFT;
-
-   dw5 = GEN6_VS_DW5_STATISTICS |
-         GEN6_VS_DW5_VS_ENABLE;
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
-      dw5 |= (max_threads - 1) << GEN75_VS_DW5_MAX_THREADS__SHIFT;
-   else
-      dw5 |= (max_threads - 1) << GEN6_VS_DW5_MAX_THREADS__SHIFT;
-
-   STATIC_ASSERT(Elements(cso->payload) >= 3);
-   cso->payload[0] = dw2;
-   cso->payload[1] = dw4;
-   cso->payload[2] = dw5;
-}
-
-static void
-gs_init_cso_gen6(const struct ilo_dev *dev,
-                 const struct ilo_shader_state *gs,
-                 struct ilo_shader_cso *cso)
-{
-   int start_grf, vue_read_len, max_threads;
-   uint32_t dw2, dw4, dw5, dw6;
-
-   ILO_DEV_ASSERT(dev, 6, 6);
-
-   if (ilo_shader_get_type(gs) == PIPE_SHADER_GEOMETRY) {
-      start_grf = ilo_shader_get_kernel_param(gs,
-            ILO_KERNEL_URB_DATA_START_REG);
-
-      vue_read_len = ilo_shader_get_kernel_param(gs, ILO_KERNEL_INPUT_COUNT);
-   }
-   else {
-      start_grf = ilo_shader_get_kernel_param(gs,
-            ILO_KERNEL_VS_GEN6_SO_START_REG);
-
-      vue_read_len = ilo_shader_get_kernel_param(gs, ILO_KERNEL_OUTPUT_COUNT);
-   }
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 153:
-    *
-    *     "Specifies the amount of URB data read and passed in the thread
-    *      payload for each Vertex URB entry, in 256-bit register increments.
-    *
-    *      It is UNDEFINED to set this field (Vertex URB Entry Read Length) to
-    *      0 indicating no Vertex URB data to be read and passed to the
-    *      thread."
-    */
-   vue_read_len = (vue_read_len + 1) / 2;
-   if (!vue_read_len)
-      vue_read_len = 1;
-
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 154:
-    *
-    *     "Maximum Number of Threads valid range is [0,27] when Rendering
-    *      Enabled bit is set."
-    *
-    * From the Sandy Bridge PRM, volume 2 part 1, page 173:
-    *
-    *     "Programming Note: If the GS stage is enabled, software must always
-    *      allocate at least one GS URB Entry. This is true even if the GS
-    *      thread never needs to output vertices to the pipeline, e.g., when
-    *      only performing stream output. This is an artifact of the need to
-    *      pass the GS thread an initial destination URB handle."
-    *
-    * As such, we always enable rendering, and limit the number of threads.
-    */
-   if (dev->gt == 2) {
-      /* maximum is 60, but limited to 28 */
-      max_threads = 28;
-   }
-   else {
-      /* maximum is 24, but limited to 21 (see brwCreateContext()) */
-      max_threads = 21;
-   }
-
-   dw2 = GEN6_THREADDISP_SPF;
-
-   dw4 = vue_read_len << GEN6_GS_DW4_URB_READ_LEN__SHIFT |
-         0 << GEN6_GS_DW4_URB_READ_OFFSET__SHIFT |
-         start_grf << GEN6_GS_DW4_URB_GRF_START__SHIFT;
-
-   dw5 = (max_threads - 1) << GEN6_GS_DW5_MAX_THREADS__SHIFT |
-         GEN6_GS_DW5_STATISTICS |
-         GEN6_GS_DW5_SO_STATISTICS |
-         GEN6_GS_DW5_RENDER_ENABLE;
-
-   /*
-    * we cannot make use of GEN6_GS_REORDER because it will reorder
-    * triangle strips according to D3D rules (triangle 2N+1 uses vertices
-    * (2N+1, 2N+3, 2N+2)), instead of GL rules (triangle 2N+1 uses vertices
-    * (2N+2, 2N+1, 2N+3)).
-    */
-   dw6 = GEN6_GS_DW6_GS_ENABLE;
-
-   if (ilo_shader_get_kernel_param(gs, ILO_KERNEL_GS_DISCARD_ADJACENCY))
-      dw6 |= GEN6_GS_DW6_DISCARD_ADJACENCY;
-
-   if (ilo_shader_get_kernel_param(gs, ILO_KERNEL_VS_GEN6_SO)) {
-      const uint32_t svbi_post_inc =
-         ilo_shader_get_kernel_param(gs, ILO_KERNEL_GS_GEN6_SVBI_POST_INC);
-
-      dw6 |= GEN6_GS_DW6_SVBI_PAYLOAD_ENABLE;
-      if (svbi_post_inc) {
-         dw6 |= GEN6_GS_DW6_SVBI_POST_INC_ENABLE |
-                svbi_post_inc << GEN6_GS_DW6_SVBI_POST_INC_VAL__SHIFT;
-      }
-   }
-
-   STATIC_ASSERT(Elements(cso->payload) >= 4);
-   cso->payload[0] = dw2;
-   cso->payload[1] = dw4;
-   cso->payload[2] = dw5;
-   cso->payload[3] = dw6;
-}
-
-static void
-gs_init_cso_gen7(const struct ilo_dev *dev,
-                 const struct ilo_shader_state *gs,
-                 struct ilo_shader_cso *cso)
-{
-   int start_grf, vue_read_len, sampler_count, max_threads;
-   uint32_t dw2, dw4, dw5;
-
-   ILO_DEV_ASSERT(dev, 7, 7.5);
-
-   start_grf = ilo_shader_get_kernel_param(gs, ILO_KERNEL_URB_DATA_START_REG);
-   vue_read_len = ilo_shader_get_kernel_param(gs, ILO_KERNEL_INPUT_COUNT);
-   sampler_count = ilo_shader_get_kernel_param(gs, ILO_KERNEL_SAMPLER_COUNT);
-
-   /* in pairs */
-   vue_read_len = (vue_read_len + 1) / 2;
-
-   switch (ilo_dev_gen(dev)) {
-   case ILO_GEN(7.5):
-      max_threads = (dev->gt >= 2) ? 256 : 70;
-      break;
-   case ILO_GEN(7):
-      max_threads = (dev->gt == 2) ? 128 : 36;
-      break;
-   default:
-      max_threads = 1;
-      break;
-   }
-
-   dw2 = (true) ? 0 : GEN6_THREADDISP_FP_MODE_ALT;
-   dw2 |= ((sampler_count + 3) / 4) << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT;
-
-   dw4 = vue_read_len << GEN7_GS_DW4_URB_READ_LEN__SHIFT |
-         GEN7_GS_DW4_INCLUDE_VERTEX_HANDLES |
-         0 << GEN7_GS_DW4_URB_READ_OFFSET__SHIFT |
-         start_grf << GEN7_GS_DW4_URB_GRF_START__SHIFT;
-
-   dw5 = (max_threads - 1) << GEN7_GS_DW5_MAX_THREADS__SHIFT |
-         GEN7_GS_DW5_STATISTICS |
-         GEN7_GS_DW5_GS_ENABLE;
-
-   STATIC_ASSERT(Elements(cso->payload) >= 3);
-   cso->payload[0] = dw2;
-   cso->payload[1] = dw4;
-   cso->payload[2] = dw5;
-}
-
-void
-ilo_gpe_init_gs_cso(const struct ilo_dev *dev,
-                    const struct ilo_shader_state *gs,
-                    struct ilo_shader_cso *cso)
-{
-   if (ilo_dev_gen(dev) >= ILO_GEN(7))
-      gs_init_cso_gen7(dev, gs, cso);
-   else
-      gs_init_cso_gen6(dev, gs, cso);
-}
-
-static void
-view_init_null_gen6(const struct ilo_dev *dev,
-                    unsigned width, unsigned height,
-                    unsigned depth, unsigned level,
-                    struct ilo_view_surface *surf)
-{
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(dev, 6, 6);
-
-   assert(width >= 1 && height >= 1 && depth >= 1);
-
-   /*
-    * From the Sandy Bridge PRM, volume 4 part 1, page 71:
-    *
-    *     "A null surface will be used in instances where an actual surface is
-    *      not bound. When a write message is generated to a null surface, no
-    *      actual surface is written to. When a read message (including any
-    *      sampling engine message) is generated to a null surface, the result
-    *      is all zeros. Note that a null surface type is allowed to be used
-    *      with all messages, even if it is not specificially indicated as
-    *      supported. All of the remaining fields in surface state are ignored
-    *      for null surfaces, with the following exceptions:
-    *
-    *        * [DevSNB+]: Width, Height, Depth, and LOD fields must match the
-    *          depth buffer's corresponding state for all render target
-    *          surfaces, including null.
-    *        * Surface Format must be R8G8B8A8_UNORM."
-    *
-    * From the Sandy Bridge PRM, volume 4 part 1, page 82:
-    *
-    *     "If Surface Type is SURFTYPE_NULL, this field (Tiled Surface) must be
-    *      true"
-    */
-
-   STATIC_ASSERT(Elements(surf->payload) >= 6);
-   dw = surf->payload;
-
-   dw[0] = GEN6_SURFTYPE_NULL << GEN6_SURFACE_DW0_TYPE__SHIFT |
-           GEN6_FORMAT_B8G8R8A8_UNORM << GEN6_SURFACE_DW0_FORMAT__SHIFT;
-
-   dw[1] = 0;
-
-   dw[2] = (height - 1) << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
-           (width  - 1) << GEN6_SURFACE_DW2_WIDTH__SHIFT |
-           level << GEN6_SURFACE_DW2_MIP_COUNT_LOD__SHIFT;
-
-   dw[3] = (depth - 1) << GEN6_SURFACE_DW3_DEPTH__SHIFT |
-           GEN6_TILING_X;
-
-   dw[4] = 0;
-   dw[5] = 0;
-}
-
-static void
-view_init_for_buffer_gen6(const struct ilo_dev *dev,
-                          const struct ilo_buffer *buf,
-                          unsigned offset, unsigned size,
-                          unsigned struct_size,
-                          enum pipe_format elem_format,
-                          bool is_rt, bool render_cache_rw,
-                          struct ilo_view_surface *surf)
-{
-   const int elem_size = util_format_get_blocksize(elem_format);
-   int width, height, depth, pitch;
-   int surface_format, num_entries;
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(dev, 6, 6);
-
-   /*
-    * For SURFTYPE_BUFFER, a SURFACE_STATE specifies an element of a
-    * structure in a buffer.
-    */
-
-   surface_format = ilo_format_translate_color(dev, elem_format);
-
-   num_entries = size / struct_size;
-   /* see if there is enough space to fit another element */
-   if (size % struct_size >= elem_size)
-      num_entries++;
-
-   /*
-    * From the Sandy Bridge PRM, volume 4 part 1, page 76:
-    *
-    *     "For SURFTYPE_BUFFER render targets, this field (Surface Base
-    *      Address) specifies the base address of first element of the
-    *      surface. The surface is interpreted as a simple array of that
-    *      single element type. The address must be naturally-aligned to the
-    *      element size (e.g., a buffer containing R32G32B32A32_FLOAT elements
-    *      must be 16-byte aligned).
-    *
-    *      For SURFTYPE_BUFFER non-rendertarget surfaces, this field specifies
-    *      the base address of the first element of the surface, computed in
-    *      software by adding the surface base address to the byte offset of
-    *      the element in the buffer."
-    */
-   if (is_rt)
-      assert(offset % elem_size == 0);
-
-   /*
-    * From the Sandy Bridge PRM, volume 4 part 1, page 77:
-    *
-    *     "For buffer surfaces, the number of entries in the buffer ranges
-    *      from 1 to 2^27."
-    */
-   assert(num_entries >= 1 && num_entries <= 1 << 27);
-
-   /*
-    * From the Sandy Bridge PRM, volume 4 part 1, page 81:
-    *
-    *     "For surfaces of type SURFTYPE_BUFFER, this field (Surface Pitch)
-    *      indicates the size of the structure."
-    */
-   pitch = struct_size;
-
-   pitch--;
-   num_entries--;
-   /* bits [6:0] */
-   width  = (num_entries & 0x0000007f);
-   /* bits [19:7] */
-   height = (num_entries & 0x000fff80) >> 7;
-   /* bits [26:20] */
-   depth  = (num_entries & 0x07f00000) >> 20;
-
-   STATIC_ASSERT(Elements(surf->payload) >= 6);
-   dw = surf->payload;
-
-   dw[0] = GEN6_SURFTYPE_BUFFER << GEN6_SURFACE_DW0_TYPE__SHIFT |
-           surface_format << GEN6_SURFACE_DW0_FORMAT__SHIFT;
-   if (render_cache_rw)
-      dw[0] |= GEN6_SURFACE_DW0_RENDER_CACHE_RW;
-
-   dw[1] = offset;
-
-   dw[2] = height << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
-           width << GEN6_SURFACE_DW2_WIDTH__SHIFT;
-
-   dw[3] = depth << GEN6_SURFACE_DW3_DEPTH__SHIFT |
-           pitch << GEN6_SURFACE_DW3_PITCH__SHIFT;
-
-   dw[4] = 0;
-   dw[5] = 0;
-}
-
-static void
-view_init_for_image_gen6(const struct ilo_dev *dev,
-                         const struct ilo_image *img,
-                         enum pipe_texture_target target,
-                         enum pipe_format format,
-                         unsigned first_level,
-                         unsigned num_levels,
-                         unsigned first_layer,
-                         unsigned num_layers,
-                         bool is_rt,
-                         struct ilo_view_surface *surf)
-{
-   int surface_type, surface_format;
-   int width, height, depth, pitch, lod;
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(dev, 6, 6);
-
-   surface_type = ilo_gpe_gen6_translate_texture(target);
-   assert(surface_type != GEN6_SURFTYPE_BUFFER);
-
-   if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && img->separate_stencil)
-      format = PIPE_FORMAT_Z32_FLOAT;
-
-   if (is_rt)
-      surface_format = ilo_format_translate_render(dev, format);
-   else
-      surface_format = ilo_format_translate_texture(dev, format);
-   assert(surface_format >= 0);
-
-   width = img->width0;
-   height = img->height0;
-   depth = (target == PIPE_TEXTURE_3D) ? img->depth0 : num_layers;
-   pitch = img->bo_stride;
-
-   if (surface_type == GEN6_SURFTYPE_CUBE) {
-      /*
-       * From the Sandy Bridge PRM, volume 4 part 1, page 81:
-       *
-       *     "For SURFTYPE_CUBE: [DevSNB+]: for Sampling Engine Surfaces, the
-       *      range of this field (Depth) is [0,84], indicating the number of
-       *      cube array elements (equal to the number of underlying 2D array
-       *      elements divided by 6). For other surfaces, this field must be
-       *      zero."
-       *
-       * When is_rt is true, we treat the texture as a 2D one to avoid the
-       * restriction.
-       */
-      if (is_rt) {
-         surface_type = GEN6_SURFTYPE_2D;
-      }
-      else {
-         assert(num_layers % 6 == 0);
-         depth = num_layers / 6;
-      }
-   }
-
-   /* sanity check the size */
-   assert(width >= 1 && height >= 1 && depth >= 1 && pitch >= 1);
-   switch (surface_type) {
-   case GEN6_SURFTYPE_1D:
-      assert(width <= 8192 && height == 1 && depth <= 512);
-      assert(first_layer < 512 && num_layers <= 512);
-      break;
-   case GEN6_SURFTYPE_2D:
-      assert(width <= 8192 && height <= 8192 && depth <= 512);
-      assert(first_layer < 512 && num_layers <= 512);
-      break;
-   case GEN6_SURFTYPE_3D:
-      assert(width <= 2048 && height <= 2048 && depth <= 2048);
-      assert(first_layer < 2048 && num_layers <= 512);
-      if (!is_rt)
-         assert(first_layer == 0);
-      break;
-   case GEN6_SURFTYPE_CUBE:
-      assert(width <= 8192 && height <= 8192 && depth <= 85);
-      assert(width == height);
-      assert(first_layer < 512 && num_layers <= 512);
-      if (is_rt)
-         assert(first_layer == 0);
-      break;
-   default:
-      assert(!"unexpected surface type");
-      break;
-   }
-
-   /* non-full array spacing is supported only on GEN7+ */
-   assert(img->walk != ILO_IMAGE_WALK_LOD);
-   /* non-interleaved samples are supported only on GEN7+ */
-   if (img->sample_count > 1)
-      assert(img->interleaved_samples);
-
-   if (is_rt) {
-      assert(num_levels == 1);
-      lod = first_level;
-   }
-   else {
-      lod = num_levels - 1;
-   }
-
-   /*
-    * From the Sandy Bridge PRM, volume 4 part 1, page 76:
-    *
-    *     "Linear render target surface base addresses must be element-size
-    *      aligned, for non-YUV surface formats, or a multiple of 2
-    *      element-sizes for YUV surface formats. Other linear surfaces have
-    *      no alignment requirements (byte alignment is sufficient.)"
-    *
-    * From the Sandy Bridge PRM, volume 4 part 1, page 81:
-    *
-    *     "For linear render target surfaces, the pitch must be a multiple
-    *      of the element size for non-YUV surface formats. Pitch must be a
-    *      multiple of 2 * element size for YUV surface formats."
-    *
-    * From the Sandy Bridge PRM, volume 4 part 1, page 86:
-    *
-    *     "For linear surfaces, this field (X Offset) must be zero"
-    */
-   if (img->tiling == GEN6_TILING_NONE) {
-      if (is_rt) {
-         const int elem_size = util_format_get_blocksize(format);
-         assert(pitch % elem_size == 0);
-      }
-   }
-
-   STATIC_ASSERT(Elements(surf->payload) >= 6);
-   dw = surf->payload;
-
-   dw[0] = surface_type << GEN6_SURFACE_DW0_TYPE__SHIFT |
-           surface_format << GEN6_SURFACE_DW0_FORMAT__SHIFT |
-           GEN6_SURFACE_DW0_MIPLAYOUT_BELOW;
-
-   if (surface_type == GEN6_SURFTYPE_CUBE && !is_rt) {
-      dw[0] |= 1 << 9 |
-               GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
-   }
-
-   if (is_rt)
-      dw[0] |= GEN6_SURFACE_DW0_RENDER_CACHE_RW;
-
-   dw[1] = 0;
-
-   dw[2] = (height - 1) << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
-           (width - 1) << GEN6_SURFACE_DW2_WIDTH__SHIFT |
-           lod << GEN6_SURFACE_DW2_MIP_COUNT_LOD__SHIFT;
-
-   assert(img->tiling != GEN8_TILING_W);
-   dw[3] = (depth - 1) << GEN6_SURFACE_DW3_DEPTH__SHIFT |
-           (pitch - 1) << GEN6_SURFACE_DW3_PITCH__SHIFT |
-           img->tiling;
-
-   dw[4] = first_level << GEN6_SURFACE_DW4_MIN_LOD__SHIFT |
-           first_layer << 17 |
-           (num_layers - 1) << 8 |
-           ((img->sample_count > 1) ? GEN6_SURFACE_DW4_MULTISAMPLECOUNT_4 :
-                                      GEN6_SURFACE_DW4_MULTISAMPLECOUNT_1);
-
-   dw[5] = 0;
-
-   assert(img->align_j == 2 || img->align_j == 4);
-   if (img->align_j == 4)
-      dw[5] |= GEN6_SURFACE_DW5_VALIGN_4;
-}
-
-static void
-view_init_null_gen7(const struct ilo_dev *dev,
-                    unsigned width, unsigned height,
-                    unsigned depth, unsigned level,
-                    struct ilo_view_surface *surf)
-{
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(dev, 7, 8);
-
-   assert(width >= 1 && height >= 1 && depth >= 1);
-
-   /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 62:
-    *
-    *     "A null surface is used in instances where an actual surface is not
-    *      bound. When a write message is generated to a null surface, no
-    *      actual surface is written to. When a read message (including any
-    *      sampling engine message) is generated to a null surface, the result
-    *      is all zeros.  Note that a null surface type is allowed to be used
-    *      with all messages, even if it is not specificially indicated as
-    *      supported. All of the remaining fields in surface state are ignored
-    *      for null surfaces, with the following exceptions:
-    *
-    *      * Width, Height, Depth, LOD, and Render Target View Extent fields
-    *        must match the depth buffer's corresponding state for all render
-    *        target surfaces, including null.
-    *      * All sampling engine and data port messages support null surfaces
-    *        with the above behavior, even if not mentioned as specifically
-    *        supported, except for the following:
-    *        * Data Port Media Block Read/Write messages.
-    *      * The Surface Type of a surface used as a render target (accessed
-    *        via the Data Port's Render Target Write message) must be the same
-    *        as the Surface Type of all other render targets and of the depth
-    *        buffer (defined in 3DSTATE_DEPTH_BUFFER), unless either the depth
-    *        buffer or render targets are SURFTYPE_NULL."
-    *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 65:
-    *
-    *     "If Surface Type is SURFTYPE_NULL, this field (Tiled Surface) must be
-    *      true"
-    */
-
-   STATIC_ASSERT(Elements(surf->payload) >= 13);
-   dw = surf->payload;
-
-   dw[0] = GEN6_SURFTYPE_NULL << GEN7_SURFACE_DW0_TYPE__SHIFT |
-           GEN6_FORMAT_B8G8R8A8_UNORM << GEN7_SURFACE_DW0_FORMAT__SHIFT;
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(8))
-      dw[0] |= GEN6_TILING_X << GEN8_SURFACE_DW0_TILING__SHIFT;
-   else
-      dw[0] |= GEN6_TILING_X << GEN7_SURFACE_DW0_TILING__SHIFT;
-
-   dw[1] = 0;
-
-   dw[2] = GEN_SHIFT32(height - 1, GEN7_SURFACE_DW2_HEIGHT) |
-           GEN_SHIFT32(width  - 1, GEN7_SURFACE_DW2_WIDTH);
-
-   dw[3] = GEN_SHIFT32(depth - 1, GEN7_SURFACE_DW3_DEPTH);
-
-   dw[4] = 0;
-   dw[5] = level;
-
-   dw[6] = 0;
-   dw[7] = 0;
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(8))
-      memset(&dw[8], 0, sizeof(*dw) * (13 - 8));
-}
-
-static void
-view_init_for_buffer_gen7(const struct ilo_dev *dev,
-                          const struct ilo_buffer *buf,
-                          unsigned offset, unsigned size,
-                          unsigned struct_size,
-                          enum pipe_format elem_format,
-                          bool is_rt, bool render_cache_rw,
-                          struct ilo_view_surface *surf)
-{
-   const bool typed = (elem_format != PIPE_FORMAT_NONE);
-   const bool structured = (!typed && struct_size > 1);
-   const int elem_size = (typed) ?
-      util_format_get_blocksize(elem_format) : 1;
-   int width, height, depth, pitch;
-   int surface_type, surface_format, num_entries;
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(dev, 7, 8);
-
-   surface_type = (structured) ? GEN7_SURFTYPE_STRBUF : GEN6_SURFTYPE_BUFFER;
-
-   surface_format = (typed) ?
-      ilo_format_translate_color(dev, elem_format) : GEN6_FORMAT_RAW;
-
-   num_entries = size / struct_size;
-   /* see if there is enough space to fit another element */
-   if (size % struct_size >= elem_size && !structured)
-      num_entries++;
-
-   /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 67:
-    *
-    *     "For SURFTYPE_BUFFER render targets, this field (Surface Base
-    *      Address) specifies the base address of first element of the
-    *      surface. The surface is interpreted as a simple array of that
-    *      single element type. The address must be naturally-aligned to the
-    *      element size (e.g., a buffer containing R32G32B32A32_FLOAT elements
-    *      must be 16-byte aligned)
-    *
-    *      For SURFTYPE_BUFFER non-rendertarget surfaces, this field specifies
-    *      the base address of the first element of the surface, computed in
-    *      software by adding the surface base address to the byte offset of
-    *      the element in the buffer."
-    */
-   if (is_rt)
-      assert(offset % elem_size == 0);
-
-   /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
-    *
-    *     "For typed buffer and structured buffer surfaces, the number of
-    *      entries in the buffer ranges from 1 to 2^27.  For raw buffer
-    *      surfaces, the number of entries in the buffer is the number of
-    *      bytes which can range from 1 to 2^30."
-    */
-   assert(num_entries >= 1 &&
-          num_entries <= 1 << ((typed || structured) ? 27 : 30));
-
-   /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 69:
-    *
-    *     "For SURFTYPE_BUFFER: The low two bits of this field (Width) must be
-    *      11 if the Surface Format is RAW (the size of the buffer must be a
-    *      multiple of 4 bytes)."
-    *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 70:
-    *
-    *     "For surfaces of type SURFTYPE_BUFFER and SURFTYPE_STRBUF, this
-    *      field (Surface Pitch) indicates the size of the structure."
-    *
-    *     "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the pitch
-    *      must be a multiple of 4 bytes."
-    */
-   if (structured)
-      assert(struct_size % 4 == 0);
-   else if (!typed)
-      assert(num_entries % 4 == 0);
-
-   pitch = struct_size;
-
-   pitch--;
-   num_entries--;
-   /* bits [6:0] */
-   width  = (num_entries & 0x0000007f);
-   /* bits [20:7] */
-   height = (num_entries & 0x001fff80) >> 7;
-   /* bits [30:21] */
-   depth  = (num_entries & 0x7fe00000) >> 21;
-   /* limit to [26:21] */
-   if (typed || structured)
-      depth &= 0x3f;
-
-   STATIC_ASSERT(Elements(surf->payload) >= 13);
-   dw = surf->payload;
-
-   dw[0] = surface_type << GEN7_SURFACE_DW0_TYPE__SHIFT |
-           surface_format << GEN7_SURFACE_DW0_FORMAT__SHIFT;
-   if (render_cache_rw)
-      dw[0] |= GEN7_SURFACE_DW0_RENDER_CACHE_RW;
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
-      dw[8] = offset;
-      memset(&dw[9], 0, sizeof(*dw) * (13 - 9));
-   } else {
-      dw[1] = offset;
-   }
-
-   dw[2] = GEN_SHIFT32(height, GEN7_SURFACE_DW2_HEIGHT) |
-           GEN_SHIFT32(width, GEN7_SURFACE_DW2_WIDTH);
-
-   dw[3] = GEN_SHIFT32(depth, GEN7_SURFACE_DW3_DEPTH) |
-           pitch;
-
-   dw[4] = 0;
-   dw[5] = 0;
-
-   dw[6] = 0;
-   dw[7] = 0;
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
-      dw[7] |= GEN_SHIFT32(GEN75_SCS_RED,   GEN75_SURFACE_DW7_SCS_R) |
-               GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) |
-               GEN_SHIFT32(GEN75_SCS_BLUE,  GEN75_SURFACE_DW7_SCS_B) |
-               GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A);
-   }
-}
-
-static void
-view_init_for_image_gen7(const struct ilo_dev *dev,
-                         const struct ilo_image *img,
-                         enum pipe_texture_target target,
-                         enum pipe_format format,
-                         unsigned first_level,
-                         unsigned num_levels,
-                         unsigned first_layer,
-                         unsigned num_layers,
-                         bool is_rt,
-                         struct ilo_view_surface *surf)
-{
-   int surface_type, surface_format;
-   int width, height, depth, pitch, lod;
-   uint32_t *dw;
-
-   ILO_DEV_ASSERT(dev, 7, 8);
-
-   surface_type = ilo_gpe_gen6_translate_texture(target);
-   assert(surface_type != GEN6_SURFTYPE_BUFFER);
-
-   if (format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT && img->separate_stencil)
-      format = PIPE_FORMAT_Z32_FLOAT;
-
-   if (is_rt)
-      surface_format = ilo_format_translate_render(dev, format);
-   else
-      surface_format = ilo_format_translate_texture(dev, format);
-   assert(surface_format >= 0);
-
-   width = img->width0;
-   height = img->height0;
-   depth = (target == PIPE_TEXTURE_3D) ? img->depth0 : num_layers;
-   pitch = img->bo_stride;
-
-   if (surface_type == GEN6_SURFTYPE_CUBE) {
-      /*
-       * From the Ivy Bridge PRM, volume 4 part 1, page 70:
-       *
-       *     "For SURFTYPE_CUBE:For Sampling Engine Surfaces, the range of
-       *      this field is [0,340], indicating the number of cube array
-       *      elements (equal to the number of underlying 2D array elements
-       *      divided by 6). For other surfaces, this field must be zero."
-       *
-       * When is_rt is true, we treat the texture as a 2D one to avoid the
-       * restriction.
-       */
-      if (is_rt) {
-         surface_type = GEN6_SURFTYPE_2D;
-      }
-      else {
-         assert(num_layers % 6 == 0);
-         depth = num_layers / 6;
-      }
-   }
-
-   /* sanity check the size */
-   assert(width >= 1 && height >= 1 && depth >= 1 && pitch >= 1);
-   assert(first_layer < 2048 && num_layers <= 2048);
-   switch (surface_type) {
-   case GEN6_SURFTYPE_1D:
-      assert(width <= 16384 && height == 1 && depth <= 2048);
-      break;
-   case GEN6_SURFTYPE_2D:
-      assert(width <= 16384 && height <= 16384 && depth <= 2048);
-      break;
-   case GEN6_SURFTYPE_3D:
-      assert(width <= 2048 && height <= 2048 && depth <= 2048);
-      if (!is_rt)
-         assert(first_layer == 0);
-      break;
-   case GEN6_SURFTYPE_CUBE:
-      assert(width <= 16384 && height <= 16384 && depth <= 86);
-      assert(width == height);
-      if (is_rt)
-         assert(first_layer == 0);
-      break;
-   default:
-      assert(!"unexpected surface type");
-      break;
-   }
-
-   if (is_rt) {
-      assert(num_levels == 1);
-      lod = first_level;
-   }
-   else {
-      lod = num_levels - 1;
-   }
-
-   /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
-    *
-    *     "The Base Address for linear render target surfaces and surfaces
-    *      accessed with the typed surface read/write data port messages must
-    *      be element-size aligned, for non-YUV surface formats, or a multiple
-    *      of 2 element-sizes for YUV surface formats.  Other linear surfaces
-    *      have no alignment requirements (byte alignment is sufficient)."
-    *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 70:
-    *
-    *     "For linear render target surfaces and surfaces accessed with the
-    *      typed data port messages, the pitch must be a multiple of the
-    *      element size for non-YUV surface formats. Pitch must be a multiple
-    *      of 2 * element size for YUV surface formats. For linear surfaces
-    *      with Surface Type of SURFTYPE_STRBUF, the pitch must be a multiple
-    *      of 4 bytes.For other linear surfaces, the pitch can be any multiple
-    *      of bytes."
-    *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 74:
-    *
-    *     "For linear surfaces, this field (X Offset) must be zero."
-    */
-   if (img->tiling == GEN6_TILING_NONE) {
-      if (is_rt) {
-         const int elem_size = util_format_get_blocksize(format);
-         assert(pitch % elem_size == 0);
-      }
-   }
-
-   STATIC_ASSERT(Elements(surf->payload) >= 13);
-   dw = surf->payload;
-
-   dw[0] = surface_type << GEN7_SURFACE_DW0_TYPE__SHIFT |
-           surface_format << GEN7_SURFACE_DW0_FORMAT__SHIFT;
-
-   /*
-    * From the Ivy Bridge PRM, volume 4 part 1, page 63:
-    *
-    *     "If this field (Surface Array) is enabled, the Surface Type must be
-    *      SURFTYPE_1D, SURFTYPE_2D, or SURFTYPE_CUBE. If this field is
-    *      disabled and Surface Type is SURFTYPE_1D, SURFTYPE_2D, or
-    *      SURFTYPE_CUBE, the Depth field must be set to zero."
-    *
-    * For non-3D sampler surfaces, resinfo (the sampler message) always
-    * returns zero for the number of layers when this field is not set.
-    */
-   if (surface_type != GEN6_SURFTYPE_3D) {
-      switch (target) {
-      case PIPE_TEXTURE_1D_ARRAY:
-      case PIPE_TEXTURE_2D_ARRAY:
-      case PIPE_TEXTURE_CUBE_ARRAY:
-         dw[0] |= GEN7_SURFACE_DW0_IS_ARRAY;
-         break;
-      default:
-         assert(depth == 1);
-         break;
-      }
-   }
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
-      switch (img->align_j) {
-      case 4:
-         dw[0] |= GEN7_SURFACE_DW0_VALIGN_4;
-         break;
-      case 8:
-         dw[0] |= GEN8_SURFACE_DW0_VALIGN_8;
-         break;
-      case 16:
-         dw[0] |= GEN8_SURFACE_DW0_VALIGN_16;
-         break;
-      default:
-         assert(!"unsupported valign");
-         break;
-      }
-
-      switch (img->align_i) {
-      case 4:
-         dw[0] |= GEN8_SURFACE_DW0_HALIGN_4;
-         break;
-      case 8:
-         dw[0] |= GEN8_SURFACE_DW0_HALIGN_8;
-         break;
-      case 16:
-         dw[0] |= GEN8_SURFACE_DW0_HALIGN_16;
-         break;
-      default:
-         assert(!"unsupported halign");
-         break;
-      }
-
-      dw[0] |= img->tiling << GEN8_SURFACE_DW0_TILING__SHIFT;
-   } else {
-      assert(img->align_i == 4 || img->align_i == 8);
-      assert(img->align_j == 2 || img->align_j == 4);
-
-      if (img->align_j == 4)
-         dw[0] |= GEN7_SURFACE_DW0_VALIGN_4;
-
-      if (img->align_i == 8)
-         dw[0] |= GEN7_SURFACE_DW0_HALIGN_8;
-
-      assert(img->tiling != GEN8_TILING_W);
-      dw[0] |= img->tiling << GEN7_SURFACE_DW0_TILING__SHIFT;
-
-      if (img->walk == ILO_IMAGE_WALK_LOD)
-         dw[0] |= GEN7_SURFACE_DW0_ARYSPC_LOD0;
-      else
-         dw[0] |= GEN7_SURFACE_DW0_ARYSPC_FULL;
-   }
-
-   if (is_rt)
-      dw[0] |= GEN7_SURFACE_DW0_RENDER_CACHE_RW;
-
-   if (surface_type == GEN6_SURFTYPE_CUBE && !is_rt)
-      dw[0] |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
-      assert(img->walk_layer_height % 4 == 0);
-      dw[1] = img->walk_layer_height / 4;
-   } else {
-      dw[1] = 0;
-   }
-
-   dw[2] = GEN_SHIFT32(height - 1, GEN7_SURFACE_DW2_HEIGHT) |
-           GEN_SHIFT32(width - 1, GEN7_SURFACE_DW2_WIDTH);
-
-   dw[3] = GEN_SHIFT32(depth - 1, GEN7_SURFACE_DW3_DEPTH) |
-           (pitch - 1);
-
-   dw[4] = first_layer << 18 |
-           (num_layers - 1) << 7;
-
-   /*
-    * MSFMT_MSS means the samples are not interleaved and MSFMT_DEPTH_STENCIL
-    * means the samples are interleaved.  The layouts are the same when the
-    * number of samples is 1.
-    */
-   if (img->interleaved_samples && img->sample_count > 1) {
-      assert(!is_rt);
-      dw[4] |= GEN7_SURFACE_DW4_MSFMT_DEPTH_STENCIL;
-   }
-   else {
-      dw[4] |= GEN7_SURFACE_DW4_MSFMT_MSS;
-   }
-
-   switch (img->sample_count) {
-   case 0:
-   case 1:
-   default:
-      dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_1;
-      break;
-   case 2:
-      dw[4] |= GEN8_SURFACE_DW4_MULTISAMPLECOUNT_2;
-      break;
-   case 4:
-      dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_4;
-      break;
-   case 8:
-      dw[4] |= GEN7_SURFACE_DW4_MULTISAMPLECOUNT_8;
-      break;
-   case 16:
-      dw[4] |= GEN8_SURFACE_DW4_MULTISAMPLECOUNT_16;
-      break;
-   }
-
-   dw[5] = GEN_SHIFT32(first_level, GEN7_SURFACE_DW5_MIN_LOD) |
-           lod;
-
-   dw[6] = 0;
-   dw[7] = 0;
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
-      dw[7] |= GEN_SHIFT32(GEN75_SCS_RED,   GEN75_SURFACE_DW7_SCS_R) |
-               GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) |
-               GEN_SHIFT32(GEN75_SCS_BLUE,  GEN75_SURFACE_DW7_SCS_B) |
-               GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A);
-   }
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(8))
-      memset(&dw[8], 0, sizeof(*dw) * (13 - 8));
-}
-
-void
-ilo_gpe_init_view_surface_null(const struct ilo_dev *dev,
-                               unsigned width, unsigned height,
-                               unsigned depth, unsigned level,
-                               struct ilo_view_surface *surf)
-{
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      view_init_null_gen7(dev,
-            width, height, depth, level, surf);
-   } else {
-      view_init_null_gen6(dev,
-            width, height, depth, level, surf);
-   }
-
-   surf->bo = NULL;
-   surf->scanout = false;
-}
-
-void
-ilo_gpe_init_view_surface_for_buffer(const struct ilo_dev *dev,
-                                     const struct ilo_buffer *buf,
-                                     unsigned offset, unsigned size,
-                                     unsigned struct_size,
-                                     enum pipe_format elem_format,
-                                     bool is_rt, bool render_cache_rw,
-                                     struct ilo_view_surface *surf)
-{
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      view_init_for_buffer_gen7(dev, buf, offset, size,
-            struct_size, elem_format, is_rt, render_cache_rw, surf);
-   } else {
-      view_init_for_buffer_gen6(dev, buf, offset, size,
-            struct_size, elem_format, is_rt, render_cache_rw, surf);
-   }
-
-   /* do not increment reference count */
-   surf->bo = buf->bo;
-   surf->scanout = false;
-}
-
-void
-ilo_gpe_init_view_surface_for_image(const struct ilo_dev *dev,
-                                    const struct ilo_image *img,
-                                    enum pipe_texture_target target,
-                                    enum pipe_format format,
-                                    unsigned first_level,
-                                    unsigned num_levels,
-                                    unsigned first_layer,
-                                    unsigned num_layers,
-                                    bool is_rt,
-                                    struct ilo_view_surface *surf)
-{
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      view_init_for_image_gen7(dev, img, target, format,
-            first_level, num_levels, first_layer, num_layers,
-            is_rt, surf);
-   } else {
-      view_init_for_image_gen6(dev, img, target, format,
-            first_level, num_levels, first_layer, num_layers,
-            is_rt, surf);
-   }
-
-   surf->scanout = img->scanout;
-   /* do not increment reference count */
-   surf->bo = img->bo;
-}
-
-static void
-sampler_init_border_color_gen6(const struct ilo_dev *dev,
-                               const union pipe_color_union *color,
-                               uint32_t *dw, int num_dwords)
-{
-   float rgba[4] = {
-      color->f[0], color->f[1], color->f[2], color->f[3],
-   };
-
-   ILO_DEV_ASSERT(dev, 6, 6);
-
-   assert(num_dwords >= 12);
-
-   /*
-    * This state is not documented in the Sandy Bridge PRM, but in the
-    * Ironlake PRM.  SNORM8 seems to be in DW11 instead of DW1.
-    */
-
-   /* IEEE_FP */
-   dw[1] = fui(rgba[0]);
-   dw[2] = fui(rgba[1]);
-   dw[3] = fui(rgba[2]);
-   dw[4] = fui(rgba[3]);
-
-   /* FLOAT_16 */
-   dw[5] = util_float_to_half(rgba[0]) |
-           util_float_to_half(rgba[1]) << 16;
-   dw[6] = util_float_to_half(rgba[2]) |
-           util_float_to_half(rgba[3]) << 16;
-
-   /* clamp to [-1.0f, 1.0f] */
-   rgba[0] = CLAMP(rgba[0], -1.0f, 1.0f);
-   rgba[1] = CLAMP(rgba[1], -1.0f, 1.0f);
-   rgba[2] = CLAMP(rgba[2], -1.0f, 1.0f);
-   rgba[3] = CLAMP(rgba[3], -1.0f, 1.0f);
-
-   /* SNORM16 */
-   dw[9] =  (int16_t) util_iround(rgba[0] * 32767.0f) |
-            (int16_t) util_iround(rgba[1] * 32767.0f) << 16;
-   dw[10] = (int16_t) util_iround(rgba[2] * 32767.0f) |
-            (int16_t) util_iround(rgba[3] * 32767.0f) << 16;
-
-   /* SNORM8 */
-   dw[11] = (int8_t) util_iround(rgba[0] * 127.0f) |
-            (int8_t) util_iround(rgba[1] * 127.0f) << 8 |
-            (int8_t) util_iround(rgba[2] * 127.0f) << 16 |
-            (int8_t) util_iround(rgba[3] * 127.0f) << 24;
-
-   /* clamp to [0.0f, 1.0f] */
-   rgba[0] = CLAMP(rgba[0], 0.0f, 1.0f);
-   rgba[1] = CLAMP(rgba[1], 0.0f, 1.0f);
-   rgba[2] = CLAMP(rgba[2], 0.0f, 1.0f);
-   rgba[3] = CLAMP(rgba[3], 0.0f, 1.0f);
-
-   /* UNORM8 */
-   dw[0] = (uint8_t) util_iround(rgba[0] * 255.0f) |
-           (uint8_t) util_iround(rgba[1] * 255.0f) << 8 |
-           (uint8_t) util_iround(rgba[2] * 255.0f) << 16 |
-           (uint8_t) util_iround(rgba[3] * 255.0f) << 24;
-
-   /* UNORM16 */
-   dw[7] = (uint16_t) util_iround(rgba[0] * 65535.0f) |
-           (uint16_t) util_iround(rgba[1] * 65535.0f) << 16;
-   dw[8] = (uint16_t) util_iround(rgba[2] * 65535.0f) |
-           (uint16_t) util_iround(rgba[3] * 65535.0f) << 16;
-}
-
-/**
- * Translate a pipe texture mipfilter to the matching hardware mipfilter.
- */
-static int
-gen6_translate_tex_mipfilter(unsigned filter)
-{
-   switch (filter) {
-   case PIPE_TEX_MIPFILTER_NEAREST: return GEN6_MIPFILTER_NEAREST;
-   case PIPE_TEX_MIPFILTER_LINEAR:  return GEN6_MIPFILTER_LINEAR;
-   case PIPE_TEX_MIPFILTER_NONE:    return GEN6_MIPFILTER_NONE;
-   default:
-      assert(!"unknown mipfilter");
-      return GEN6_MIPFILTER_NONE;
-   }
-}
-
-/**
- * Translate a pipe texture filter to the matching hardware mapfilter.
- */
-static int
-gen6_translate_tex_filter(unsigned filter)
-{
-   switch (filter) {
-   case PIPE_TEX_FILTER_NEAREST: return GEN6_MAPFILTER_NEAREST;
-   case PIPE_TEX_FILTER_LINEAR:  return GEN6_MAPFILTER_LINEAR;
-   default:
-      assert(!"unknown sampler filter");
-      return GEN6_MAPFILTER_NEAREST;
-   }
-}
-
-/**
- * Translate a pipe texture coordinate wrapping mode to the matching hardware
- * wrapping mode.
- */
-static int
-gen6_translate_tex_wrap(unsigned wrap)
-{
-   switch (wrap) {
-   case PIPE_TEX_WRAP_CLAMP:              return GEN8_TEXCOORDMODE_HALF_BORDER;
-   case PIPE_TEX_WRAP_REPEAT:             return GEN6_TEXCOORDMODE_WRAP;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:      return GEN6_TEXCOORDMODE_CLAMP;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:    return GEN6_TEXCOORDMODE_CLAMP_BORDER;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:      return GEN6_TEXCOORDMODE_MIRROR;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-   default:
-      assert(!"unknown sampler wrap mode");
-      return GEN6_TEXCOORDMODE_WRAP;
-   }
-}
-
-/**
- * Translate a pipe shadow compare function to the matching hardware shadow
- * function.
- */
-static int
-gen6_translate_shadow_func(unsigned func)
-{
-   /*
-    * For PIPE_FUNC_x, the reference value is on the left-hand side of the
-    * comparison, and 1.0 is returned when the comparison is true.
-    *
-    * For GEN6_COMPAREFUNCTION_x, the reference value is on the right-hand side of
-    * the comparison, and 0.0 is returned when the comparison is true.
-    */
-   switch (func) {
-   case PIPE_FUNC_NEVER:      return GEN6_COMPAREFUNCTION_ALWAYS;
-   case PIPE_FUNC_LESS:       return GEN6_COMPAREFUNCTION_LEQUAL;
-   case PIPE_FUNC_EQUAL:      return GEN6_COMPAREFUNCTION_NOTEQUAL;
-   case PIPE_FUNC_LEQUAL:     return GEN6_COMPAREFUNCTION_LESS;
-   case PIPE_FUNC_GREATER:    return GEN6_COMPAREFUNCTION_GEQUAL;
-   case PIPE_FUNC_NOTEQUAL:   return GEN6_COMPAREFUNCTION_EQUAL;
-   case PIPE_FUNC_GEQUAL:     return GEN6_COMPAREFUNCTION_GREATER;
-   case PIPE_FUNC_ALWAYS:     return GEN6_COMPAREFUNCTION_NEVER;
-   default:
-      assert(!"unknown shadow compare function");
-      return GEN6_COMPAREFUNCTION_NEVER;
-   }
-}
-
-void
-ilo_gpe_init_sampler_cso(const struct ilo_dev *dev,
-                         const struct pipe_sampler_state *state,
-                         struct ilo_sampler_cso *sampler)
-{
-   int mip_filter, min_filter, mag_filter, max_aniso;
-   int lod_bias, max_lod, min_lod;
-   int wrap_s, wrap_t, wrap_r, wrap_cube;
-   uint32_t dw0, dw1, dw3;
-
-   ILO_DEV_ASSERT(dev, 6, 8);
-
-   memset(sampler, 0, sizeof(*sampler));
-
-   mip_filter = gen6_translate_tex_mipfilter(state->min_mip_filter);
-   min_filter = gen6_translate_tex_filter(state->min_img_filter);
-   mag_filter = gen6_translate_tex_filter(state->mag_img_filter);
-
-   sampler->anisotropic = state->max_anisotropy;
-
-   if (state->max_anisotropy >= 2 && state->max_anisotropy <= 16)
-      max_aniso = state->max_anisotropy / 2 - 1;
-   else if (state->max_anisotropy > 16)
-      max_aniso = GEN6_ANISORATIO_16;
-   else
-      max_aniso = GEN6_ANISORATIO_2;
-
-   /*
-    *
-    * Here is how the hardware calculate per-pixel LOD, from my reading of the
-    * PRMs:
-    *
-    *  1) LOD is set to log2(ratio of texels to pixels) if not specified in
-    *     other ways.  The number of texels is measured using level
-    *     SurfMinLod.
-    *  2) Bias is added to LOD.
-    *  3) LOD is clamped to [MinLod, MaxLod], and the clamped value is
-    *     compared with Base to determine whether magnification or
-    *     minification is needed.  (if preclamp is disabled, LOD is compared
-    *     with Base before clamping)
-    *  4) If magnification is needed, or no mipmapping is requested, LOD is
-    *     set to floor(MinLod).
-    *  5) LOD is clamped to [0, MIPCnt], and SurfMinLod is added to LOD.
-    *
-    * With Gallium interface, Base is always zero and
-    * pipe_sampler_view::u.tex.first_level specifies SurfMinLod.
-    */
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      const float scale = 256.0f;
-
-      /* [-16.0, 16.0) in S4.8 */
-      lod_bias = (int)
-         (CLAMP(state->lod_bias, -16.0f, 15.9f) * scale);
-      lod_bias &= 0x1fff;
-
-      /* [0.0, 14.0] in U4.8 */
-      max_lod = (int) (CLAMP(state->max_lod, 0.0f, 14.0f) * scale);
-      min_lod = (int) (CLAMP(state->min_lod, 0.0f, 14.0f) * scale);
-   }
-   else {
-      const float scale = 64.0f;
-
-      /* [-16.0, 16.0) in S4.6 */
-      lod_bias = (int)
-         (CLAMP(state->lod_bias, -16.0f, 15.9f) * scale);
-      lod_bias &= 0x7ff;
-
-      /* [0.0, 13.0] in U4.6 */
-      max_lod = (int) (CLAMP(state->max_lod, 0.0f, 13.0f) * scale);
-      min_lod = (int) (CLAMP(state->min_lod, 0.0f, 13.0f) * scale);
-   }
-
-   /*
-    * We want LOD to be clamped to determine magnification/minification, and
-    * get set to zero when it is magnification or when mipmapping is disabled.
-    * The hardware would set LOD to floor(MinLod) and that is a problem when
-    * MinLod is greater than or equal to 1.0f.
-    *
-    * With Base being zero, it is always minification when MinLod is non-zero.
-    * To achieve our goal, we just need to set MinLod to zero and set
-    * MagFilter to MinFilter when mipmapping is disabled.
-    */
-   if (state->min_mip_filter == PIPE_TEX_MIPFILTER_NONE && min_lod) {
-      min_lod = 0;
-      mag_filter = min_filter;
-   }
-
-   /* determine wrap s/t/r */
-   wrap_s = gen6_translate_tex_wrap(state->wrap_s);
-   wrap_t = gen6_translate_tex_wrap(state->wrap_t);
-   wrap_r = gen6_translate_tex_wrap(state->wrap_r);
-   if (ilo_dev_gen(dev) < ILO_GEN(8)) {
-      /*
-       * For nearest filtering, PIPE_TEX_WRAP_CLAMP means
-       * PIPE_TEX_WRAP_CLAMP_TO_EDGE;  for linear filtering,
-       * PIPE_TEX_WRAP_CLAMP means PIPE_TEX_WRAP_CLAMP_TO_BORDER while
-       * additionally clamping the texture coordinates to [0.0, 1.0].
-       *
-       * PIPE_TEX_WRAP_CLAMP is not supported natively until Gen8.  The
-       * clamping has to be taken care of in the shaders.  There are two
-       * filters here, but let the minification one has a say.
-       */
-      const bool clamp_is_to_edge =
-         (state->min_img_filter == PIPE_TEX_FILTER_NEAREST);
-
-      if (clamp_is_to_edge) {
-         if (wrap_s == GEN8_TEXCOORDMODE_HALF_BORDER)
-            wrap_s = GEN6_TEXCOORDMODE_CLAMP;
-         if (wrap_t == GEN8_TEXCOORDMODE_HALF_BORDER)
-            wrap_t = GEN6_TEXCOORDMODE_CLAMP;
-         if (wrap_r == GEN8_TEXCOORDMODE_HALF_BORDER)
-            wrap_r = GEN6_TEXCOORDMODE_CLAMP;
-      } else {
-         if (wrap_s == GEN8_TEXCOORDMODE_HALF_BORDER) {
-            wrap_s = GEN6_TEXCOORDMODE_CLAMP_BORDER;
-            sampler->saturate_s = true;
-         }
-         if (wrap_t == GEN8_TEXCOORDMODE_HALF_BORDER) {
-            wrap_t = GEN6_TEXCOORDMODE_CLAMP_BORDER;
-            sampler->saturate_t = true;
-         }
-         if (wrap_r == GEN8_TEXCOORDMODE_HALF_BORDER) {
-            wrap_r = GEN6_TEXCOORDMODE_CLAMP_BORDER;
-            sampler->saturate_r = true;
-         }
-      }
-   }
-
-   /*
-    * From the Sandy Bridge PRM, volume 4 part 1, page 107:
-    *
-    *     "When using cube map texture coordinates, only TEXCOORDMODE_CLAMP
-    *      and TEXCOORDMODE_CUBE settings are valid, and each TC component
-    *      must have the same Address Control mode."
-    *
-    * From the Ivy Bridge PRM, volume 4 part 1, page 96:
-    *
-    *     "This field (Cube Surface Control Mode) must be set to
-    *      CUBECTRLMODE_PROGRAMMED"
-    *
-    * Therefore, we cannot use "Cube Surface Control Mode" for semless cube
-    * map filtering.
-    */
-   if (state->seamless_cube_map &&
-       (state->min_img_filter != PIPE_TEX_FILTER_NEAREST ||
-        state->mag_img_filter != PIPE_TEX_FILTER_NEAREST)) {
-      wrap_cube = GEN6_TEXCOORDMODE_CUBE;
-   }
-   else {
-      wrap_cube = GEN6_TEXCOORDMODE_CLAMP;
-   }
-
-   if (!state->normalized_coords) {
-      /*
-       * From the Ivy Bridge PRM, volume 4 part 1, page 98:
-       *
-       *     "The following state must be set as indicated if this field
-       *      (Non-normalized Coordinate Enable) is enabled:
-       *
-       *      - TCX/Y/Z Address Control Mode must be TEXCOORDMODE_CLAMP,
-       *        TEXCOORDMODE_HALF_BORDER, or TEXCOORDMODE_CLAMP_BORDER.
-       *      - Surface Type must be SURFTYPE_2D or SURFTYPE_3D.
-       *      - Mag Mode Filter must be MAPFILTER_NEAREST or
-       *        MAPFILTER_LINEAR.
-       *      - Min Mode Filter must be MAPFILTER_NEAREST or
-       *        MAPFILTER_LINEAR.
-       *      - Mip Mode Filter must be MIPFILTER_NONE.
-       *      - Min LOD must be 0.
-       *      - Max LOD must be 0.
-       *      - MIP Count must be 0.
-       *      - Surface Min LOD must be 0.
-       *      - Texture LOD Bias must be 0."
-       */
-      assert(wrap_s == GEN6_TEXCOORDMODE_CLAMP ||
-             wrap_s == GEN6_TEXCOORDMODE_CLAMP_BORDER);
-      assert(wrap_t == GEN6_TEXCOORDMODE_CLAMP ||
-             wrap_t == GEN6_TEXCOORDMODE_CLAMP_BORDER);
-      assert(wrap_r == GEN6_TEXCOORDMODE_CLAMP ||
-             wrap_r == GEN6_TEXCOORDMODE_CLAMP_BORDER);
-
-      assert(mag_filter == GEN6_MAPFILTER_NEAREST ||
-             mag_filter == GEN6_MAPFILTER_LINEAR);
-      assert(min_filter == GEN6_MAPFILTER_NEAREST ||
-             min_filter == GEN6_MAPFILTER_LINEAR);
-
-      /* work around a bug in util_blitter */
-      mip_filter = GEN6_MIPFILTER_NONE;
-
-      assert(mip_filter == GEN6_MIPFILTER_NONE);
-   }
-
-   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
-      dw0 = 1 << 28 |
-            mip_filter << 20 |
-            lod_bias << 1;
-
-      sampler->dw_filter = mag_filter << 17 |
-                           min_filter << 14;
-
-      sampler->dw_filter_aniso = GEN6_MAPFILTER_ANISOTROPIC << 17 |
-                                 GEN6_MAPFILTER_ANISOTROPIC << 14 |
-                                 1;
-
-      dw1 = min_lod << 20 |
-            max_lod << 8;
-
-      if (state->compare_mode != PIPE_TEX_COMPARE_NONE)
-         dw1 |= gen6_translate_shadow_func(state->compare_func) << 1;
-
-      dw3 = max_aniso << 19;
-
-      /* round the coordinates for linear filtering */
-      if (min_filter != GEN6_MAPFILTER_NEAREST) {
-         dw3 |= (GEN6_SAMPLER_DW3_U_MIN_ROUND |
-                 GEN6_SAMPLER_DW3_V_MIN_ROUND |
-                 GEN6_SAMPLER_DW3_R_MIN_ROUND);
-      }
-      if (mag_filter != GEN6_MAPFILTER_NEAREST) {
-         dw3 |= (GEN6_SAMPLER_DW3_U_MAG_ROUND |
-                 GEN6_SAMPLER_DW3_V_MAG_ROUND |
-                 GEN6_SAMPLER_DW3_R_MAG_ROUND);
-      }
-
-      if (!state->normalized_coords)
-         dw3 |= 1 << 10;
-
-      sampler->dw_wrap = wrap_s << 6 |
-                         wrap_t << 3 |
-                         wrap_r;
-
-      /*
-       * As noted in the classic i965 driver, the HW may still reference
-       * wrap_t and wrap_r for 1D textures.  We need to set them to a safe
-       * mode
-       */
-      sampler->dw_wrap_1d = wrap_s << 6 |
-                            GEN6_TEXCOORDMODE_WRAP << 3 |
-                            GEN6_TEXCOORDMODE_WRAP;
-
-      sampler->dw_wrap_cube = wrap_cube << 6 |
-                              wrap_cube << 3 |
-                              wrap_cube;
-
-      STATIC_ASSERT(Elements(sampler->payload) >= 7);
-
-      sampler->payload[0] = dw0;
-      sampler->payload[1] = dw1;
-      sampler->payload[2] = dw3;
-
-      memcpy(&sampler->payload[3],
-            state->border_color.ui, sizeof(state->border_color.ui));
-   }
-   else {
-      dw0 = 1 << 28 |
-            mip_filter << 20 |
-            lod_bias << 3;
-
-      if (state->compare_mode != PIPE_TEX_COMPARE_NONE)
-         dw0 |= gen6_translate_shadow_func(state->compare_func);
-
-      sampler->dw_filter = (min_filter != mag_filter) << 27 |
-                           mag_filter << 17 |
-                           min_filter << 14;
-
-      sampler->dw_filter_aniso = GEN6_MAPFILTER_ANISOTROPIC << 17 |
-                                 GEN6_MAPFILTER_ANISOTROPIC << 14;
-
-      dw1 = min_lod << 22 |
-            max_lod << 12;
-
-      sampler->dw_wrap = wrap_s << 6 |
-                         wrap_t << 3 |
-                         wrap_r;
-
-      sampler->dw_wrap_1d = wrap_s << 6 |
-                            GEN6_TEXCOORDMODE_WRAP << 3 |
-                            GEN6_TEXCOORDMODE_WRAP;
-
-      sampler->dw_wrap_cube = wrap_cube << 6 |
-                              wrap_cube << 3 |
-                              wrap_cube;
-
-      dw3 = max_aniso << 19;
-
-      /* round the coordinates for linear filtering */
-      if (min_filter != GEN6_MAPFILTER_NEAREST) {
-         dw3 |= (GEN6_SAMPLER_DW3_U_MIN_ROUND |
-                 GEN6_SAMPLER_DW3_V_MIN_ROUND |
-                 GEN6_SAMPLER_DW3_R_MIN_ROUND);
-      }
-      if (mag_filter != GEN6_MAPFILTER_NEAREST) {
-         dw3 |= (GEN6_SAMPLER_DW3_U_MAG_ROUND |
-                 GEN6_SAMPLER_DW3_V_MAG_ROUND |
-                 GEN6_SAMPLER_DW3_R_MAG_ROUND);
-      }
-
-      if (!state->normalized_coords)
-         dw3 |= 1;
-
-      STATIC_ASSERT(Elements(sampler->payload) >= 15);
-
-      sampler->payload[0] = dw0;
-      sampler->payload[1] = dw1;
-      sampler->payload[2] = dw3;
-
-      sampler_init_border_color_gen6(dev,
-            &state->border_color, &sampler->payload[3], 12);
-   }
-}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_cc.c b/src/gallium/drivers/ilo/core/ilo_state_cc.c
new file mode 100644
index 0000000..83ee8de
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_cc.c
@@ -0,0 +1,890 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_cc.h"
+
+/* Assert that the stencil test is only enabled with cv_has_buffer set. */
+static bool
+cc_validate_gen6_stencil(const struct ilo_dev *dev,
+                         const struct ilo_state_cc_info *info)
+{
+   const struct ilo_state_cc_stencil_info *s = &info->stencil;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!s->test_enable)
+      return true;
+
+   /*
+    * Sandy Bridge PRM, volume 2 part 1, page 359:
+    *     "If the Depth Buffer is either undefined or does not have a surface
+    *      format of D32_FLOAT_S8X24_UINT or D24_UNORM_S8_UINT and separate
+    *      stencil buffer is disabled, Stencil Test Enable must be DISABLED"
+    *
+    * Same volume, page 370:
+    *     "This field (Stencil Test Enable) cannot be enabled if Surface
+    *      Format in 3DSTATE_DEPTH_BUFFER is set to D16_UNORM."
+    */
+   assert(s->cv_has_buffer);
+   return true;
+}
+
+/* Assert that depth test/write are only enabled with cv_has_buffer set. */
+static bool
+cc_validate_gen6_depth(const struct ilo_dev *dev,
+                       const struct ilo_state_cc_info *info)
+{
+   const struct ilo_state_cc_depth_info *z = &info->depth;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!z->test_enable && !z->write_enable)
+      return true;
+
+   /*
+    * Sandy Bridge PRM, volume 2 part 1, page 360:
+    *     "Enabling the Depth Test function without defining a Depth Buffer is
+    *      UNDEFINED."
+    *
+    * Same volume, page 375:
+    *     "A Depth Buffer must be defined before enabling writes to it, or
+    *      operation is UNDEFINED."
+    */
+   assert(z->cv_has_buffer);
+   return true;
+}
+
+/*
+ * Pack DEPTH_STENCIL_STATE for Gen6-Gen7.5 into cc->ds[0..2].  Returns false
+ * if either the stencil or the depth validation helper reports a failure.
+ */
+static bool
+cc_set_gen6_DEPTH_STENCIL_STATE(struct ilo_state_cc *cc,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_cc_info *info)
+{
+   const struct ilo_state_cc_stencil_info *stencil = &info->stencil;
+   const struct ilo_state_cc_depth_info *depth = &info->depth;
+   const struct ilo_state_cc_params_info *params = &info->params;
+   uint32_t dw0 = 0, dw1 = 0, dw2;
+
+   ILO_DEV_ASSERT(dev, 6, 7.5);
+
+   if (!cc_validate_gen6_stencil(dev, info))
+      return false;
+   if (!cc_validate_gen6_depth(dev, info))
+      return false;
+
+   if (stencil->test_enable) {
+      /* without two-sided stencil, the back face mirrors the front face */
+      const struct ilo_state_cc_stencil_op_info *front = &stencil->front;
+      const struct ilo_state_cc_stencil_op_info *back =
+         (stencil->twosided_enable) ? &stencil->back : &stencil->front;
+      const struct ilo_state_cc_stencil_params_info *front_p =
+         &params->stencil_front;
+      const struct ilo_state_cc_stencil_params_info *back_p =
+         (stencil->twosided_enable) ? &params->stencil_back :
+                                      &params->stencil_front;
+
+      dw0 |= GEN6_ZS_DW0_STENCIL_TEST_ENABLE;
+      if (stencil->twosided_enable)
+         dw0 |= GEN6_ZS_DW0_STENCIL1_ENABLE;
+
+      dw0 |= front->test_func << GEN6_ZS_DW0_STENCIL_FUNC__SHIFT |
+             front->fail_op << GEN6_ZS_DW0_STENCIL_FAIL_OP__SHIFT |
+             front->zfail_op << GEN6_ZS_DW0_STENCIL_ZFAIL_OP__SHIFT |
+             front->zpass_op << GEN6_ZS_DW0_STENCIL_ZPASS_OP__SHIFT;
+
+      dw0 |= back->test_func << GEN6_ZS_DW0_STENCIL1_FUNC__SHIFT |
+             back->fail_op << GEN6_ZS_DW0_STENCIL1_FAIL_OP__SHIFT |
+             back->zfail_op << GEN6_ZS_DW0_STENCIL1_ZFAIL_OP__SHIFT |
+             back->zpass_op << GEN6_ZS_DW0_STENCIL1_ZPASS_OP__SHIFT;
+
+      /*
+       * Ivy Bridge PRM, volume 2 part 1, page 363:
+       *
+       *     "If this field (Stencil Buffer Write Enable) is enabled, Stencil
+       *      Test Enable must also be enabled."
+       *
+       * Depth writes differ: they do not depend on the depth test being
+       * enabled.
+       */
+      if (front_p->write_mask || back_p->write_mask)
+         dw0 |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
+
+      dw1 |= front_p->test_mask << GEN6_ZS_DW1_STENCIL_TEST_MASK__SHIFT |
+             front_p->write_mask << GEN6_ZS_DW1_STENCIL_WRITE_MASK__SHIFT |
+             back_p->test_mask << GEN6_ZS_DW1_STENCIL1_TEST_MASK__SHIFT |
+             back_p->write_mask << GEN6_ZS_DW1_STENCIL1_WRITE_MASK__SHIFT;
+   }
+
+   /* the depth function is programmed to ALWAYS when the test is disabled */
+   if (depth->test_enable) {
+      dw2 = GEN6_ZS_DW2_DEPTH_TEST_ENABLE |
+            depth->test_func << GEN6_ZS_DW2_DEPTH_FUNC__SHIFT;
+   } else {
+      dw2 = GEN6_COMPAREFUNCTION_ALWAYS << GEN6_ZS_DW2_DEPTH_FUNC__SHIFT;
+   }
+
+   /* the write enable is set regardless of depth->test_enable */
+   if (depth->write_enable)
+      dw2 |= GEN6_ZS_DW2_DEPTH_WRITE_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(cc->ds) >= 3);
+   cc->ds[0] = dw0;
+   cc->ds[1] = dw1;
+   cc->ds[2] = dw2;
+
+   return true;
+}
+
+/*
+ * Pack 3DSTATE_WM_DEPTH_STENCIL DW1/DW2 for Gen8 into cc->ds[0..1].  Returns
+ * false if either the stencil or the depth validation helper fails.
+ */
+static bool
+cc_set_gen8_3DSTATE_WM_DEPTH_STENCIL(struct ilo_state_cc *cc,
+                                     const struct ilo_dev *dev,
+                                     const struct ilo_state_cc_info *info)
+{
+   const struct ilo_state_cc_stencil_info *stencil = &info->stencil;
+   const struct ilo_state_cc_depth_info *depth = &info->depth;
+   const struct ilo_state_cc_params_info *params = &info->params;
+   uint32_t dw1 = 0, dw2 = 0;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   if (!cc_validate_gen6_stencil(dev, info))
+      return false;
+   if (!cc_validate_gen6_depth(dev, info))
+      return false;
+
+   if (stencil->test_enable) {
+      /* without two-sided stencil, the back face mirrors the front face */
+      const struct ilo_state_cc_stencil_op_info *front = &stencil->front;
+      const struct ilo_state_cc_stencil_op_info *back =
+         (stencil->twosided_enable) ? &stencil->back : &stencil->front;
+      const struct ilo_state_cc_stencil_params_info *front_p =
+         &params->stencil_front;
+      const struct ilo_state_cc_stencil_params_info *back_p =
+         (stencil->twosided_enable) ? &params->stencil_back :
+                                      &params->stencil_front;
+
+      dw1 |= GEN8_ZS_DW1_STENCIL_TEST_ENABLE;
+      if (stencil->twosided_enable)
+         dw1 |= GEN8_ZS_DW1_STENCIL1_ENABLE;
+
+      dw1 |= front->test_func << GEN8_ZS_DW1_STENCIL_FUNC__SHIFT |
+             front->fail_op << GEN8_ZS_DW1_STENCIL_FAIL_OP__SHIFT |
+             front->zfail_op << GEN8_ZS_DW1_STENCIL_ZFAIL_OP__SHIFT |
+             front->zpass_op << GEN8_ZS_DW1_STENCIL_ZPASS_OP__SHIFT;
+
+      dw1 |= back->test_func << GEN8_ZS_DW1_STENCIL1_FUNC__SHIFT |
+             back->fail_op << GEN8_ZS_DW1_STENCIL1_FAIL_OP__SHIFT |
+             back->zfail_op << GEN8_ZS_DW1_STENCIL1_ZFAIL_OP__SHIFT |
+             back->zpass_op << GEN8_ZS_DW1_STENCIL1_ZPASS_OP__SHIFT;
+
+      if (front_p->write_mask || back_p->write_mask)
+         dw1 |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
+
+      dw2 |= front_p->test_mask << GEN8_ZS_DW2_STENCIL_TEST_MASK__SHIFT |
+             front_p->write_mask << GEN8_ZS_DW2_STENCIL_WRITE_MASK__SHIFT |
+             back_p->test_mask << GEN8_ZS_DW2_STENCIL1_TEST_MASK__SHIFT |
+             back_p->write_mask << GEN8_ZS_DW2_STENCIL1_WRITE_MASK__SHIFT;
+   }
+
+   if (depth->test_enable) {
+      dw1 |= GEN8_ZS_DW1_DEPTH_TEST_ENABLE |
+             depth->test_func << GEN8_ZS_DW1_DEPTH_FUNC__SHIFT;
+   } else {
+      dw1 |= GEN6_COMPAREFUNCTION_ALWAYS << GEN8_ZS_DW1_DEPTH_FUNC__SHIFT;
+   }
+
+   if (depth->write_enable)
+      dw1 |= GEN8_ZS_DW1_DEPTH_WRITE_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(cc->ds) >= 2);
+   cc->ds[0] = dw1;
+   cc->ds[1] = dw2;
+
+   return true;
+}
+
+/*
+ * Report whether factor is one of the four SRC1 blend factors (SRC1_COLOR,
+ * SRC1_ALPHA, and their inverted forms); every other value yields false.
+ */
+static bool
+is_dual_source_blend_factor(enum gen_blend_factor factor)
+{
+   const bool src1_color = (factor == GEN6_BLENDFACTOR_SRC1_COLOR ||
+                            factor == GEN6_BLENDFACTOR_INV_SRC1_COLOR);
+   const bool src1_alpha = (factor == GEN6_BLENDFACTOR_SRC1_ALPHA ||
+                            factor == GEN6_BLENDFACTOR_INV_SRC1_ALPHA);
+   return src1_color || src1_alpha;
+}
+
+/* Return whether RT 0 uses SRC1 blend factors; other RTs must agree. */
+static bool
+cc_get_gen6_dual_source_blending(const struct ilo_dev *dev,
+                                 const struct ilo_state_cc_info *info)
+{
+   const struct ilo_state_cc_blend_info *blend = &info->blend;
+   bool rt0_dual = false;
+   uint8_t rt;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (blend->rt_count) {
+      rt0_dual = (is_dual_source_blend_factor(blend->rt[0].rgb_src) ||
+                  is_dual_source_blend_factor(blend->rt[0].rgb_dst) ||
+                  is_dual_source_blend_factor(blend->rt[0].a_src) ||
+                  is_dual_source_blend_factor(blend->rt[0].a_dst));
+   }
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 356:
+    *     "Dual Source Blending: When using "Dual Source" Render Target
+    *      Write messages, the Source1 pixel color+alpha passed in the
+    *      message can be selected as a src/dst blend factor. See Color
+    *      Buffer Blending.  In single-source mode, those blend factor
+    *      selections are invalid. If SRC1 is included in a src/dst blend
+    *      factor and a DualSource RT Write message is not utilized,
+    *      results are UNDEFINED. (This reflects the same restriction in DX
+    *      APIs, where undefined results are produced if "o1" is not
+    *      written by a PS - there are no default values defined). If SRC1
+    *      is not included in a src/dst blend factor, dual source blending
+    *      must be disabled."
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 356:
+    *     "The single source message will not cause a write to the render
+    *      target if Dual Source Blend Enable in 3DSTATE_WM is enabled."
+    *
+    *     "The dual source message will revert to a single source message
+    *      using source 0 if Dual Source Blend Enable in 3DSTATE_WM is
+    *      disabled."
+    *
+    * Hence every render target must agree with render target 0.
+    */
+   for (rt = 1; rt < blend->rt_count; rt++) {
+      assert(rt0_dual ==
+         (is_dual_source_blend_factor(blend->rt[rt].rgb_src) ||
+          is_dual_source_blend_factor(blend->rt[rt].rgb_dst) ||
+          is_dual_source_blend_factor(blend->rt[rt].a_src) ||
+          is_dual_source_blend_factor(blend->rt[rt].a_dst)));
+   }
+
+   return rt0_dual;
+}
+
+/* Assert PRM restrictions on AlphaToCoverage/AlphaToOne/AlphaTest state. */
+static bool
+cc_validate_gen6_alpha(const struct ilo_dev *dev,
+                       const struct ilo_state_cc_info *info)
+{
+   const struct ilo_state_cc_alpha_info *alpha = &info->alpha;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 356:
+    *
+    *     "Alpha values from the pixel shader are treated as FLOAT32 format
+    *      for computing the AlphaToCoverage Mask."
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 378:
+    *
+    *     "If set (AlphaToCoverage Enable), Source0 Alpha is converted to a
+    *      temporary 1/2/4-bit coverage mask and the mask bit corresponding to
+    *      the sample# ANDed with the sample mask bit. If set, sample coverage
+    *      is computed based on src0 alpha value. Value of 0 disables all
+    *      samples and value of 1 enables all samples for that pixel. The same
+    *      coverage needs to apply to all the RTs in MRT case. Further, any
+    *      value of src0 alpha between 0 and 1 monotonically increases the
+    *      number of enabled pixels.
+    *
+    *      The same coverage needs to be applied to all the RTs in MRT case."
+    *
+    *     "If set (AlphaToOne Enable), Source0 Alpha is set to 1.0f after
+    *      (possibly) being used to generate the AlphaToCoverage coverage
+    *      mask.
+    *
+    *      The same coverage needs to be applied to all the RTs in MRT case.
+    *
+    *      If Dual Source Blending is enabled, this bit must be disabled."
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 382:
+    *
+    *     "Alpha Test can only be enabled if Pixel Shader outputs a float
+    *      alpha value.
+    *
+    *      Alpha Test is applied independently on each render target by
+    *      comparing that render target's alpha value against the alpha
+    *      reference value. If the alpha test fails, the corresponding pixel
+    *      write will be supressed only for that render target. The
+    *      depth/stencil update will occur if alpha test passes for any render
+    *      target."
+    *
+    * From the Sandy Bridge PRM, volume 4 part 1, page 194:
+    *
+    *     "Multiple render targets are supported with the single source and
+    *      replicate data messages. Each render target is accessed with a
+    *      separate Render Target Write message, each with a different surface
+    *      indicated (different binding table index). The depth buffer is
+    *      written only by the message(s) to the last render target, indicated
+    *      by the Last Render Target Select bit set to clear the pixel
+    *      scoreboard bits."
+    *
+    * When AlphaToCoverage/AlphaToOne/AlphaTest is enabled, it is
+    * required/desirable for the RT write messages to set "Source0 Alpha
+    * Present to RenderTarget" in the MRT case.  It is also required/desirable
+    * for the alpha values to be FLOAT32.
+    */
+   if (alpha->alpha_to_coverage || alpha->alpha_to_one || alpha->test_enable)
+      assert(alpha->cv_float_source0_alpha);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 356:
+    *     "[DevSNB]: When NumSamples = 1, AlphaToCoverage and AlphaTo
+    *      Coverage Dither both must be disabled."
+    *
+    * Hence, on Gen6, AlphaToCoverage requires a sample count other than one.
+    */
+   if (ilo_dev_gen(dev) == ILO_GEN(6) && alpha->alpha_to_coverage)
+      assert(!alpha->cv_sample_count_one);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 378:
+    *     "If Dual Source Blending is enabled, this bit (AlphaToOne Enable)
+    *      must be disabled."
+    */
+   if (alpha->alpha_to_one)
+      assert(!cc_get_gen6_dual_source_blending(dev, info));
+   return true;
+}
+
+/* Assert that rt_count does not exceed ILO_STATE_CC_BLEND_MAX_RT_COUNT. */
+static bool
+cc_validate_gen6_blend(const struct ilo_dev *dev,
+                       const struct ilo_state_cc_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* violations trip the assertion; the return value is always true */
+   assert(info->blend.rt_count <= ILO_STATE_CC_BLEND_MAX_RT_COUNT);
+
+   return true;
+}
+
+/* Map a blend factor to its equivalent for a destination alpha of one. */
+static enum gen_blend_factor
+get_dst_alpha_one_blend_factor(enum gen_blend_factor factor, bool is_rgb)
+{
+   if (factor == GEN6_BLENDFACTOR_DST_ALPHA)
+      return GEN6_BLENDFACTOR_ONE;
+   if (factor == GEN6_BLENDFACTOR_INV_DST_ALPHA)
+      return GEN6_BLENDFACTOR_ZERO;
+   if (factor != GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE)
+      return factor;
+
+   /* SRC_ALPHA_SATURATE: zero for the RGB factor, one for the alpha factor */
+   return (is_rgb) ? GEN6_BLENDFACTOR_ZERO : GEN6_BLENDFACTOR_ONE;
+}
+
+/* Validate one render target and copy it to dst with effective factors. */
+static void
+cc_get_gen6_effective_rt(const struct ilo_dev *dev,
+                         const struct ilo_state_cc_info *info,
+                         uint8_t rt_index,
+                         struct ilo_state_cc_blend_rt_info *dst)
+{
+   const struct ilo_state_cc_blend_rt_info *src = &info->blend.rt[rt_index];
+
+   if (src->logicop_enable || src->blend_enable ||
+       src->argb_write_disables != 0xf)
+      assert(src->cv_has_buffer);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 365:
+    *     "Logic Ops are only supported on *_UNORM surfaces (excluding _SRGB
+    *      variants), otherwise Logic Ops must be DISABLED."
+    *
+    * From the Broadwell PRM, volume 7, page 671:
+    *     "Logic Ops are supported on all blendable render targets and render
+    *      targets with *INT formats."
+    */
+   if (ilo_dev_gen(dev) < ILO_GEN(8) && src->logicop_enable)
+      assert(src->cv_is_unorm);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 361:
+    *
+    *     "Only certain surface formats support Color Buffer Blending.  Refer
+    *      to the Surface Format tables in Sampling Engine. Blending must be
+    *      disabled on a RenderTarget if blending is not supported."
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 365:
+    *
+    *     "Color Buffer Blending and Logic Ops must not be enabled
+    *      simultaneously, or behavior is UNDEFINED."
+    */
+   if (src->blend_enable)
+      assert(!src->cv_is_integer && !src->logicop_enable);
+
+   *dst = *src;
+   if (!src->blend_enable) {
+      dst->rgb_src = GEN6_BLENDFACTOR_ONE;
+      dst->a_src = GEN6_BLENDFACTOR_ONE;
+      dst->rgb_dst = GEN6_BLENDFACTOR_ZERO;
+      dst->a_dst = GEN6_BLENDFACTOR_ZERO;
+      dst->rgb_func = GEN6_BLENDFUNCTION_ADD;
+      dst->a_func = GEN6_BLENDFUNCTION_ADD;
+      return;
+   }
+
+   /* 0x0 is reserved in enum gen_blend_factor */
+   assert(src->rgb_src && src->rgb_dst && src->a_src && src->a_dst);
+
+   if (src->force_dst_alpha_one) {
+      dst->rgb_src = get_dst_alpha_one_blend_factor(src->rgb_src, true);
+      dst->rgb_dst = get_dst_alpha_one_blend_factor(src->rgb_dst, true);
+      dst->a_src = get_dst_alpha_one_blend_factor(src->a_src, false);
+      dst->a_dst = get_dst_alpha_one_blend_factor(src->a_dst, false);
+      dst->force_dst_alpha_one = false;
+   }
+}
+
+/* Pack the per-render-target BLEND_STATE dwords (Gen6-Gen7.5) into cc. */
+static bool
+cc_set_gen6_BLEND_STATE(struct ilo_state_cc *cc,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_cc_info *info)
+{
+   const struct ilo_state_cc_alpha_info *alpha = &info->alpha;
+   const struct ilo_state_cc_blend_info *blend = &info->blend;
+   uint32_t dw_rt[2 * ILO_STATE_CC_BLEND_MAX_RT_COUNT];
+   uint32_t dw1_common;
+   uint8_t rt_idx;
+
+   ILO_DEV_ASSERT(dev, 6, 7.5);
+
+   if (!cc_validate_gen6_alpha(dev, info))
+      return false;
+   if (!cc_validate_gen6_blend(dev, info))
+      return false;
+
+   /*
+    * The Sandy Bridge PRM, volume 2 part 1, page 360 requires pre-blend and
+    * post-blend color clamps in most cases, and they are desirable or
+    * ignored in the remaining ones, so they are always enabled here.
+    */
+   dw1_common = GEN6_RT_DW1_COLORCLAMP_RTFORMAT |
+                GEN6_RT_DW1_PRE_BLEND_CLAMP |
+                GEN6_RT_DW1_POST_BLEND_CLAMP;
+
+   if (alpha->alpha_to_coverage) {
+      dw1_common |= GEN6_RT_DW1_ALPHA_TO_COVERAGE;
+
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 379:
+       *
+       *     "[DevSNB]: This bit (AlphaToCoverage Dither Enable) must be
+       *      disabled."
+       */
+      if (ilo_dev_gen(dev) >= ILO_GEN(7))
+         dw1_common |= GEN6_RT_DW1_ALPHA_TO_COVERAGE_DITHER;
+   }
+
+   if (alpha->alpha_to_one)
+      dw1_common |= GEN6_RT_DW1_ALPHA_TO_ONE;
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 371:
+    *
+    *     "When Alpha Test is disabled, Alpha Test Function must be
+    *      COMPAREFUNCTION_ALWAYS."
+    */
+   if (alpha->test_enable) {
+      dw1_common |= GEN6_RT_DW1_ALPHA_TEST_ENABLE |
+                    alpha->test_func << GEN6_RT_DW1_ALPHA_TEST_FUNC__SHIFT;
+   } else {
+      dw1_common |= GEN6_COMPAREFUNCTION_ALWAYS <<
+         GEN6_RT_DW1_ALPHA_TEST_FUNC__SHIFT;
+   }
+
+   if (blend->dither_enable)
+      dw1_common |= GEN6_RT_DW1_DITHER_ENABLE;
+
+   for (rt_idx = 0; rt_idx < blend->rt_count; rt_idx++) {
+      struct ilo_state_cc_blend_rt_info rt;
+      uint32_t dw0, dw1;
+
+      cc_get_gen6_effective_rt(dev, info, rt_idx, &rt);
+
+      /* 0x0 is reserved for blend factors and we have to set them all */
+      dw0 = rt.rgb_func << GEN6_RT_DW0_COLOR_FUNC__SHIFT |
+            rt.rgb_src << GEN6_RT_DW0_SRC_COLOR_FACTOR__SHIFT |
+            rt.rgb_dst << GEN6_RT_DW0_DST_COLOR_FACTOR__SHIFT |
+            rt.a_func << GEN6_RT_DW0_ALPHA_FUNC__SHIFT |
+            rt.a_src << GEN6_RT_DW0_SRC_ALPHA_FACTOR__SHIFT |
+            rt.a_dst << GEN6_RT_DW0_DST_ALPHA_FACTOR__SHIFT;
+
+      if (rt.blend_enable) {
+         const bool same_alpha = (rt.a_src == rt.rgb_src &&
+                                  rt.a_dst == rt.rgb_dst &&
+                                  rt.a_func == rt.rgb_func);
+
+         dw0 |= GEN6_RT_DW0_BLEND_ENABLE;
+         if (!same_alpha)
+            dw0 |= GEN6_RT_DW0_INDEPENDENT_ALPHA_ENABLE;
+      }
+
+      dw1 = dw1_common |
+            rt.argb_write_disables << GEN6_RT_DW1_WRITE_DISABLES__SHIFT;
+
+      if (rt.logicop_enable) {
+         dw1 |= GEN6_RT_DW1_LOGICOP_ENABLE |
+                rt.logicop_func << GEN6_RT_DW1_LOGICOP_FUNC__SHIFT;
+      }
+
+      dw_rt[2 * rt_idx + 0] = dw0;
+      dw_rt[2 * rt_idx + 1] = dw1;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(cc->blend) >= ARRAY_SIZE(dw_rt));
+   memcpy(&cc->blend[0], dw_rt, sizeof(uint32_t) * 2 * blend->rt_count);
+   cc->blend_state_count = info->blend.rt_count;
+
+   return true;
+}
+
+/*
+ * Pack the Gen8 BLEND_STATE: the common dword goes to cc->blend[1] and the
+ * per-render-target dword pairs follow from cc->blend[2].
+ */
+static bool
+cc_set_gen8_BLEND_STATE(struct ilo_state_cc *cc,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_cc_info *info)
+{
+   const struct ilo_state_cc_alpha_info *alpha = &info->alpha;
+   const struct ilo_state_cc_blend_info *blend = &info->blend;
+   uint32_t dw_rt[2 * ILO_STATE_CC_BLEND_MAX_RT_COUNT];
+   uint32_t dw_common = 0;
+   bool indep_alpha = false;
+   uint8_t rt_idx;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   if (!cc_validate_gen6_alpha(dev, info))
+      return false;
+   if (!cc_validate_gen6_blend(dev, info))
+      return false;
+
+   for (rt_idx = 0; rt_idx < blend->rt_count; rt_idx++) {
+      struct ilo_state_cc_blend_rt_info rt;
+      uint32_t dw0, dw1;
+
+      cc_get_gen6_effective_rt(dev, info, rt_idx, &rt);
+
+      dw0 = rt.argb_write_disables << GEN8_RT_DW0_WRITE_DISABLES__SHIFT |
+            rt.rgb_func << GEN8_RT_DW0_COLOR_FUNC__SHIFT |
+            rt.rgb_src << GEN8_RT_DW0_SRC_COLOR_FACTOR__SHIFT |
+            rt.rgb_dst << GEN8_RT_DW0_DST_COLOR_FACTOR__SHIFT |
+            rt.a_func << GEN8_RT_DW0_ALPHA_FUNC__SHIFT |
+            rt.a_src << GEN8_RT_DW0_SRC_ALPHA_FACTOR__SHIFT |
+            rt.a_dst << GEN8_RT_DW0_DST_ALPHA_FACTOR__SHIFT;
+
+      if (rt.blend_enable) {
+         dw0 |= GEN8_RT_DW0_BLEND_ENABLE;
+         if (rt.a_src != rt.rgb_src ||
+             rt.a_dst != rt.rgb_dst ||
+             rt.a_func != rt.rgb_func)
+            indep_alpha = true;
+      }
+
+      dw1 = GEN8_RT_DW1_COLORCLAMP_RTFORMAT |
+            GEN8_RT_DW1_PRE_BLEND_CLAMP |
+            GEN8_RT_DW1_POST_BLEND_CLAMP;
+      if (rt.logicop_enable) {
+         dw1 |= GEN8_RT_DW1_LOGICOP_ENABLE |
+                rt.logicop_func << GEN8_RT_DW1_LOGICOP_FUNC__SHIFT;
+      }
+
+      dw_rt[2 * rt_idx + 0] = dw0;
+      dw_rt[2 * rt_idx + 1] = dw1;
+   }
+
+   if (alpha->alpha_to_coverage) {
+      dw_common |= GEN8_BLEND_DW0_ALPHA_TO_COVERAGE |
+                   GEN8_BLEND_DW0_ALPHA_TO_COVERAGE_DITHER;
+   }
+   if (alpha->alpha_to_one)
+      dw_common |= GEN8_BLEND_DW0_ALPHA_TO_ONE;
+   if (indep_alpha)
+      dw_common |= GEN8_BLEND_DW0_INDEPENDENT_ALPHA_ENABLE;
+
+   if (alpha->test_enable) {
+      dw_common |= GEN8_BLEND_DW0_ALPHA_TEST_ENABLE |
+                   alpha->test_func << GEN8_BLEND_DW0_ALPHA_TEST_FUNC__SHIFT;
+   } else {
+      dw_common |= GEN6_COMPAREFUNCTION_ALWAYS <<
+         GEN8_BLEND_DW0_ALPHA_TEST_FUNC__SHIFT;
+   }
+
+   if (blend->dither_enable)
+      dw_common |= GEN8_BLEND_DW0_DITHER_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(cc->blend) >= 2 + ARRAY_SIZE(dw_rt));
+   cc->blend[1] = dw_common;
+   memcpy(&cc->blend[2], dw_rt, sizeof(uint32_t) * 2 * blend->rt_count);
+   cc->blend_state_count = info->blend.rt_count;
+
+   return true;
+}
+
+/*
+ * Pack 3DSTATE_PS_BLEND DW1 for Gen8 into cc->blend[0]; always returns true.
+ */
+static bool
+cc_set_gen8_3DSTATE_PS_BLEND(struct ilo_state_cc *cc,
+                             const struct ilo_dev *dev,
+                             const struct ilo_state_cc_info *info)
+{
+   const struct ilo_state_cc_alpha_info *alpha = &info->alpha;
+   const struct ilo_state_cc_blend_info *blend = &info->blend;
+   uint32_t dw1 = 0;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   if (alpha->alpha_to_coverage)
+      dw1 |= GEN8_PS_BLEND_DW1_ALPHA_TO_COVERAGE;
+   if (alpha->test_enable)
+      dw1 |= GEN8_PS_BLEND_DW1_ALPHA_TEST_ENABLE;
+
+   /* the blend fields are only filled in when there are render targets */
+   if (blend->rt_count) {
+      struct ilo_state_cc_blend_rt_info rt0;
+      uint8_t rt_idx = 0;
+
+      cc_get_gen6_effective_rt(dev, info, 0, &rt0);
+
+      /* 0x0 is reserved for blend factors and we have to set them all */
+      dw1 |= rt0.rgb_src << GEN8_PS_BLEND_DW1_SRC_COLOR_FACTOR__SHIFT |
+             rt0.rgb_dst << GEN8_PS_BLEND_DW1_DST_COLOR_FACTOR__SHIFT |
+             rt0.a_src << GEN8_PS_BLEND_DW1_SRC_ALPHA_FACTOR__SHIFT |
+             rt0.a_dst << GEN8_PS_BLEND_DW1_DST_ALPHA_FACTOR__SHIFT;
+
+      if (rt0.blend_enable) {
+         dw1 |= GEN8_PS_BLEND_DW1_BLEND_ENABLE;
+         if (rt0.a_src != rt0.rgb_src || rt0.a_dst != rt0.rgb_dst)
+            dw1 |= GEN8_PS_BLEND_DW1_INDEPENDENT_ALPHA_ENABLE;
+      }
+
+      /* WRITABLE_RT is set if any RT has write disables other than 0xf */
+      while (rt_idx < blend->rt_count &&
+             blend->rt[rt_idx].argb_write_disables == 0xf)
+         rt_idx++;
+      if (rt_idx < blend->rt_count)
+         dw1 |= GEN8_PS_BLEND_DW1_WRITABLE_RT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(cc->blend) >= 1);
+   cc->blend[0] = dw1;
+
+   return true;
+}
+
+/* Pack stencil refs, alpha ref and blend color into cc->cc[0..5]. */
+static bool
+cc_params_set_gen6_COLOR_CALC_STATE(struct ilo_state_cc *cc,
+                                    const struct ilo_dev *dev,
+                                    const struct ilo_state_cc_params_info *params)
+{
+   const uint32_t stencil_refs =
+      params->stencil_front.test_ref << GEN6_CC_DW0_STENCIL_REF__SHIFT |
+      params->stencil_back.test_ref << GEN6_CC_DW0_STENCIL1_REF__SHIFT;
+   unsigned chan;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   STATIC_ASSERT(ARRAY_SIZE(cc->cc) >= 6);
+   /* DW0 carries both stencil references plus the ALPHATEST_FLOAT32 flag */
+   cc->cc[0] = stencil_refs | GEN6_CC_DW0_ALPHATEST_FLOAT32;
+   cc->cc[1] = fui(params->alpha_ref);
+
+   for (chan = 0; chan < 4; chan++)
+      cc->cc[2 + chan] = fui(params->blend_rgba[chan]);
+
+   return true;
+}
+
+bool
+ilo_state_cc_init(struct ilo_state_cc *cc,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_cc_info *info)
+{
+   assert(ilo_is_zeroed(cc, sizeof(*cc))); /* cc is expected to be zeroed */
+   return ilo_state_cc_set_info(cc, dev, info); /* then pack it from info */
+}
+
+/* Rebuild all of cc from info, picking the Gen8 or Gen6-Gen7.5 packers. */
+bool
+ilo_state_cc_set_info(struct ilo_state_cc *cc,
+                      const struct ilo_dev *dev,
+                      const struct ilo_state_cc_info *info)
+{
+   bool ok;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      ok = cc_set_gen8_3DSTATE_WM_DEPTH_STENCIL(cc, dev, info);
+      ok &= cc_set_gen8_BLEND_STATE(cc, dev, info);
+      ok &= cc_set_gen8_3DSTATE_PS_BLEND(cc, dev, info);
+   } else {
+      ok = cc_set_gen6_DEPTH_STENCIL_STATE(cc, dev, info);
+      ok &= cc_set_gen6_BLEND_STATE(cc, dev, info);
+   }
+
+   ok &= cc_params_set_gen6_COLOR_CALC_STATE(cc, dev, &info->params);
+
+   assert(ok);
+   return ok;
+}
+
+bool
+ilo_state_cc_set_params(struct ilo_state_cc *cc,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_cc_params_info *params)
+{
+   /* modify stencil masks; ds[] is left alone while the stencil test is off */
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      uint32_t dw1 = cc->ds[0];
+      uint32_t dw2 = cc->ds[1];
+
+      if (dw1 & GEN8_ZS_DW1_STENCIL_TEST_ENABLE) {
+         const bool twosided_enable = (dw1 & GEN8_ZS_DW1_STENCIL1_ENABLE);
+         const struct ilo_state_cc_stencil_params_info *front_p =
+            &params->stencil_front;
+         const struct ilo_state_cc_stencil_params_info *back_p =
+            (twosided_enable) ? &params->stencil_back : front_p;
+         /* widen the uint8_t masks so the shifts below work on unsigned */
+         const uint32_t front_test = front_p->test_mask;
+         const uint32_t front_write = front_p->write_mask;
+         const uint32_t back_test = back_p->test_mask;
+         const uint32_t back_write = back_p->write_mask;
+         if (front_write || back_write)
+            dw1 |= GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
+         else
+            dw1 &= ~GEN8_ZS_DW1_STENCIL_WRITE_ENABLE;
+         dw2 = front_test << GEN8_ZS_DW2_STENCIL_TEST_MASK__SHIFT |
+               front_write << GEN8_ZS_DW2_STENCIL_WRITE_MASK__SHIFT |
+               back_test << GEN8_ZS_DW2_STENCIL1_TEST_MASK__SHIFT |
+               back_write << GEN8_ZS_DW2_STENCIL1_WRITE_MASK__SHIFT;
+      }
+      cc->ds[0] = dw1;
+      cc->ds[1] = dw2;
+   } else {
+      uint32_t dw0 = cc->ds[0];
+      uint32_t dw1 = cc->ds[1];
+
+      if (dw0 & GEN6_ZS_DW0_STENCIL_TEST_ENABLE) {
+         const bool twosided_enable = (dw0 & GEN6_ZS_DW0_STENCIL1_ENABLE);
+         const struct ilo_state_cc_stencil_params_info *front_p =
+            &params->stencil_front;
+         const struct ilo_state_cc_stencil_params_info *back_p =
+            (twosided_enable) ? &params->stencil_back : front_p;
+         /* widen the uint8_t masks so the shifts below work on unsigned */
+         const uint32_t front_test = front_p->test_mask;
+         const uint32_t front_write = front_p->write_mask;
+         const uint32_t back_test = back_p->test_mask;
+         const uint32_t back_write = back_p->write_mask;
+         if (front_write || back_write)
+            dw0 |= GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
+         else
+            dw0 &= ~GEN6_ZS_DW0_STENCIL_WRITE_ENABLE;
+         dw1 = front_test << GEN6_ZS_DW1_STENCIL_TEST_MASK__SHIFT |
+               front_write << GEN6_ZS_DW1_STENCIL_WRITE_MASK__SHIFT |
+               back_test << GEN6_ZS_DW1_STENCIL1_TEST_MASK__SHIFT |
+               back_write << GEN6_ZS_DW1_STENCIL1_WRITE_MASK__SHIFT;
+      }
+      cc->ds[0] = dw0;
+      cc->ds[1] = dw1;
+   }
+
+   /* modify COLOR_CALC_STATE; its status is the only one there is to report,
+    * so return it rather than an unconditional true
+    */
+   return cc_params_set_gen6_COLOR_CALC_STATE(cc, dev, params);
+}
+
+void
+ilo_state_cc_full_delta(const struct ilo_state_cc *cc,
+                        const struct ilo_dev *dev,
+                        struct ilo_state_cc_delta *delta)
+{
+   /* Report every CC state that exists on this generation as dirty. */
+   const bool is_gen8 = (ilo_dev_gen(dev) >= ILO_GEN(8));
+   uint32_t dirty = ILO_STATE_CC_BLEND_STATE | ILO_STATE_CC_COLOR_CALC_STATE;
+
+   /* depth/stencil, plus PS_BLEND on Gen8+, use generation-specific bits */
+   dirty |= (is_gen8) ? ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL |
+                        ILO_STATE_CC_3DSTATE_PS_BLEND :
+                        ILO_STATE_CC_DEPTH_STENCIL_STATE;
+   delta->dirty = dirty;
+}
+
+void
+ilo_state_cc_get_delta(const struct ilo_state_cc *cc,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_cc *old,
+                       struct ilo_state_cc_delta *delta)
+{
+   /* Set one dirty bit in delta for each state that differs from old. */
+   const bool resized = (cc->blend_state_count != old->blend_state_count);
+   delta->dirty = 0;
+   if (memcmp(cc->ds, old->ds, sizeof(cc->ds))) {
+      if (ilo_dev_gen(dev) >= ILO_GEN(8))
+         delta->dirty |= ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL;
+      else
+         delta->dirty |= ILO_STATE_CC_DEPTH_STENCIL_STATE;
+   }
+   /* memcmp only spans cc's entries, so a count change is checked on its own */
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      if (cc->blend[0] != old->blend[0])
+         delta->dirty |= ILO_STATE_CC_3DSTATE_PS_BLEND;
+
+      if (resized || memcmp(&cc->blend[1], &old->blend[1],
+               sizeof(uint32_t) * (1 + 2 * cc->blend_state_count)))
+         delta->dirty |= ILO_STATE_CC_BLEND_STATE;
+   } else if (resized || memcmp(cc->blend, old->blend,
+            sizeof(uint32_t) * 2 * cc->blend_state_count)) {
+      delta->dirty |= ILO_STATE_CC_BLEND_STATE;
+   }
+   if (memcmp(cc->cc, old->cc, sizeof(cc->cc)))
+      delta->dirty |= ILO_STATE_CC_COLOR_CALC_STATE;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_cc.h b/src/gallium/drivers/ilo/core/ilo_state_cc.h
new file mode 100644
index 0000000..5b96a60
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_cc.h
@@ -0,0 +1,199 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_CC_H
+#define ILO_STATE_CC_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 38:
+ *
+ *     "Render Target Index. Specifies the render target index that will be
+ *      used to select blend state from BLEND_STATE.
+ *      Format = U3"
+ */
+#define ILO_STATE_CC_BLEND_MAX_RT_COUNT 8
+
+enum ilo_state_cc_dirty_bits {
+   ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL           = (1 << 0), /* Gen8+ */
+   ILO_STATE_CC_3DSTATE_PS_BLEND                   = (1 << 1), /* Gen8+ */
+   ILO_STATE_CC_DEPTH_STENCIL_STATE                = (1 << 2), /* pre-Gen8 */
+   ILO_STATE_CC_BLEND_STATE                        = (1 << 3), /* all gens */
+   ILO_STATE_CC_COLOR_CALC_STATE                   = (1 << 4), /* all gens */
+};
+
+/**
+ * AlphaCoverage and AlphaTest.  Embedded in ilo_state_cc_info as "alpha".
+ */
+struct ilo_state_cc_alpha_info {
+   bool cv_sample_count_one;   /* NOTE(review): cv_* meaning - confirm */
+   bool cv_float_source0_alpha;
+
+   bool alpha_to_coverage;
+   bool alpha_to_one;
+
+   bool test_enable;
+   enum gen_compare_function test_func;
+};
+
+struct ilo_state_cc_stencil_op_info {   /* one face: used as front and back */
+   enum gen_compare_function test_func;
+   enum gen_stencil_op fail_op;
+   enum gen_stencil_op zfail_op;
+   enum gen_stencil_op zpass_op;
+};
+
+/**
+ * StencilTest.  Refs and masks live in ilo_state_cc_params_info instead.
+ */
+struct ilo_state_cc_stencil_info {
+   bool cv_has_buffer;
+
+   bool test_enable;
+   bool twosided_enable;
+
+   struct ilo_state_cc_stencil_op_info front;
+   struct ilo_state_cc_stencil_op_info back;
+};
+
+/**
+ * DepthTest.  Embedded in ilo_state_cc_info as "depth".
+ */
+struct ilo_state_cc_depth_info {
+   bool cv_has_buffer;
+
+   bool test_enable;
+   /* independent from test_enable */
+   bool write_enable;
+
+   enum gen_compare_function test_func;
+};
+
+struct ilo_state_cc_blend_rt_info {   /* see ilo_state_cc_blend_info::rt */
+   bool cv_has_buffer;
+   bool cv_is_unorm;
+   bool cv_is_integer;
+
+   uint8_t argb_write_disables;
+
+   bool logicop_enable;
+   enum gen_logic_op logicop_func;
+
+   bool blend_enable;
+   bool force_dst_alpha_one;
+   enum gen_blend_factor rgb_src;
+   enum gen_blend_factor rgb_dst;
+   enum gen_blend_function rgb_func;
+   enum gen_blend_factor a_src;
+   enum gen_blend_factor a_dst;
+   enum gen_blend_function a_func;
+};
+
+/**
+ * ColorBufferBlending, Dithering, and LogicOps.
+ */
+struct ilo_state_cc_blend_info {
+   const struct ilo_state_cc_blend_rt_info *rt;
+   uint8_t rt_count;   /* presumably the number of entries in rt - confirm */
+
+   bool dither_enable;
+};
+
+struct ilo_state_cc_stencil_params_info {
+   uint8_t test_ref;     /* goes into COLOR_CALC_STATE DW0 */
+   uint8_t test_mask;    /* goes into the depth/stencil words */
+   uint8_t write_mask;   /* ditto; non-zero also enables stencil writes */
+};
+
+/**
+ * CC parameters.  These can be updated alone via ilo_state_cc_set_params().
+ */
+struct ilo_state_cc_params_info {
+   float alpha_ref;   /* stored through fui() in COLOR_CALC_STATE */
+
+   struct ilo_state_cc_stencil_params_info stencil_front;
+   struct ilo_state_cc_stencil_params_info stencil_back;
+
+   float blend_rgba[4];   /* stored through fui() in COLOR_CALC_STATE */
+};
+
+/**
+ * Pixel processing.  Input to ilo_state_cc_init() and ilo_state_cc_set_info().
+ */
+struct ilo_state_cc_info {
+   struct ilo_state_cc_alpha_info alpha;
+   struct ilo_state_cc_stencil_info stencil;
+   struct ilo_state_cc_depth_info depth;
+   struct ilo_state_cc_blend_info blend;
+
+   struct ilo_state_cc_params_info params;   /* passed on to COLOR_CALC_STATE */
+};
+
+struct ilo_state_cc {
+   uint32_t ds[3];   /* depth/stencil words; set_params rewrites [0] and [1] */
+
+   uint8_t blend_state_count;   /* sizes the blend[] compare in get_delta */
+   uint32_t blend[1 + 1 + 2 * ILO_STATE_CC_BLEND_MAX_RT_COUNT];
+
+   uint32_t cc[6];   /* COLOR_CALC_STATE: refs, alpha ref, blend RGBA */
+};
+
+struct ilo_state_cc_delta {
+   uint32_t dirty;   /* OR of enum ilo_state_cc_dirty_bits */
+};
+
+bool
+ilo_state_cc_init(struct ilo_state_cc *cc,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_cc_info *info);
+
+bool
+ilo_state_cc_set_info(struct ilo_state_cc *cc,
+                      const struct ilo_dev *dev,
+                      const struct ilo_state_cc_info *info);
+
+bool
+ilo_state_cc_set_params(struct ilo_state_cc *cc,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_cc_params_info *params);
+
+void
+ilo_state_cc_full_delta(const struct ilo_state_cc *cc,
+                        const struct ilo_dev *dev,
+                        struct ilo_state_cc_delta *delta);
+
+void
+ilo_state_cc_get_delta(const struct ilo_state_cc *cc,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_cc *old,
+                       struct ilo_state_cc_delta *delta);
+
+#endif /* ILO_STATE_CC_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_compute.c b/src/gallium/drivers/ilo/core/ilo_state_compute.c
new file mode 100644
index 0000000..a5fe5e1
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_compute.c
@@ -0,0 +1,435 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_compute.h"
+
+struct compute_urb_configuration {
+   int idrt_entry_count;   /* entries reserved for interface descriptors */
+   int curbe_entry_count;  /* CURBE allocation, in 256-bit registers */
+
+   int urb_entry_count;    /* 1 on Gen8+, where 0 is not allowed; else 0 */
+   /* in 256-bit register increments */
+   int urb_entry_size;
+};
+
+static int
+get_gen6_rob_entry_count(const struct ilo_dev *dev)
+{
+   const bool is_gt2 = (dev->gt == 2);
+   ILO_DEV_ASSERT(dev, 6, 8);
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
+    *
+    *     "ROB has 64KB of storage; 2048 entries."
+    *
+    * The valid ranges of "CURBE Allocation Size" further imply that interface
+    * entries and CURBE data must live in the ROB, and that the ROB should be
+    * 16KB, or 512 entries, on Gen7 GT1.
+    */
+   if (ilo_dev_gen(dev) < ILO_GEN(7))
+      return (is_gt2) ? 2048 : 1024;
+   else if (ilo_dev_gen(dev) < ILO_GEN(7.5))
+      return (is_gt2) ? 2048 : 512;
+   else
+      return 2048;
+}
+
+static int
+get_gen6_idrt_entry_count(const struct ilo_dev *dev)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+      /* From the Haswell PRM, volume 7, page 836:
+       *
+       *     "The first 64 URB entries are reserved for the interface
+       *      description..." */
+      return 64;
+   }
+
+   /* From the Ivy Bridge PRM, volume 2 part 2, page 21:
+    *
+    *     "The first 32 URB entries are reserved for the interface
+    *      descriptor..." */
+   return 32;
+}
+
+static int
+get_gen6_curbe_entry_count(const struct ilo_dev *dev, uint32_t curbe_size)
+{
+   int entry_count;
+   ILO_DEV_ASSERT(dev, 6, 8);
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 2, page 21:
+    *
+    *     "(CURBE Allocation Size) Specifies the total length allocated for
+    *      CURBE, in 256-bit register increments."
+    */
+   entry_count = (curbe_size + 31) / 32;   /* round up to 32-byte units */
+   /* the reserved interface entries and the CURBE must fit in the ROB */
+   assert(get_gen6_idrt_entry_count(dev) + entry_count <=
+         get_gen6_rob_entry_count(dev));
+
+   return entry_count;
+}
+
+static bool
+compute_get_gen6_urb_configuration(const struct ilo_dev *dev,
+                                   const struct ilo_state_compute_info *info,
+                                   struct compute_urb_configuration *urb)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 2, page 52:
+    *
+    *     "(URB Entry Allocation Size) Specifies the length of each URB entry
+    *      used by the unit, in 256-bit register increments - 1."
+    */
+   urb->urb_entry_size = 1;
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 451:
+    *
+    *     "Please note that 0 is not allowed for this field (Number of URB
+    *      Entries)."
+    */
+   urb->urb_entry_count = (ilo_dev_gen(dev) < ILO_GEN(8)) ? 0 : 1;
+
+   /* fixed interface-descriptor reservation, then the requested CURBE */
+   urb->idrt_entry_count = get_gen6_idrt_entry_count(dev);
+   urb->curbe_entry_count =
+      get_gen6_curbe_entry_count(dev, info->curbe_alloc_size);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 2, page 22:
+    *
+    *     "MEDIA_VFE_STATE specifies the amount of CURBE space, the URB handle
+    *      size and the number of URB handles. The driver must ensure that
+    *      ((URB_handle_size * URB_num_handle) - CURBE - 32) <=
+    *      URB_allocation_in_L3."
+    */
+   assert(urb->idrt_entry_count + urb->curbe_entry_count +
+         urb->urb_entry_count * urb->urb_entry_size <=
+         info->cv_urb_alloc_size / 32);
+   return true;
+}
+
+static int
+compute_interface_get_gen6_read_end(const struct ilo_dev *dev,
+                                    const struct ilo_state_compute_interface_info *interface)
+{
+   /* both lengths are rounded up to whole 256-bit (32-byte) registers */
+   const int per_thread_len = (interface->curbe_read_length + 31) / 32;
+   const int cross_thread_len =
+      (interface->cross_thread_curbe_read_length + 31) / 32;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   assert(interface->curbe_read_offset % 32 == 0);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 2, page 60:
+    *
+    *     "(Constant URB Entry Read Length) [0,63]"
+    */
+   assert(per_thread_len <= 63);
+
+   /* From the Haswell PRM, volume 2d, page 199:
+    *
+    *     "(Cross-Thread Constant Data Read Length) [0,127]"
+    */
+   if (ilo_dev_gen(dev) < ILO_GEN(7.5))
+      assert(!cross_thread_len);
+   else
+      assert(cross_thread_len <= 127);
+
+   /* nothing is read at all: report an end of 0, whatever the offset is */
+   if (!per_thread_len && !cross_thread_len)
+      return 0;
+
+   return interface->curbe_read_offset / 32 + cross_thread_len +
+      per_thread_len * interface->thread_group_size;
+}
+
+static bool
+compute_validate_gen6(const struct ilo_dev *dev,
+                      const struct ilo_state_compute_info *info,
+                      const struct compute_urb_configuration *urb)
+{
+   int needed_entries = 0;
+   uint8_t idx;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   /* each interface takes one of the reserved descriptor entries */
+   assert(info->interface_count <= urb->idrt_entry_count);
+
+   /* the CURBE has to reach the furthest read end of any interface */
+   for (idx = 0; idx < info->interface_count; idx++) {
+      const int end =
+         compute_interface_get_gen6_read_end(dev, &info->interfaces[idx]);
+
+      needed_entries = (end > needed_entries) ? end : needed_entries;
+   }
+
+   assert(needed_entries <= urb->curbe_entry_count);
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 452:
+    *
+    *     "CURBE Allocation Size should be 0 for GPGPU workloads that uses
+    *      indirect instead of CURBE."
+    */
+   if (!needed_entries)
+      assert(!urb->curbe_entry_count);
+
+   /* every check above is an assert, so there is no failure return */
+   return true;
+}
+
+static uint8_t
+compute_get_gen6_scratch_space(const struct ilo_dev *dev,
+                               const struct ilo_state_compute_info *info)
+{
+   uint32_t max_size = 0;
+   uint8_t idx;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* a single value is returned, so size it for the largest request */
+   for (idx = 0; idx < info->interface_count; idx++) {
+      const uint32_t size = info->interfaces[idx].scratch_size;
+
+      max_size = (size > max_size) ? size : max_size;
+   }
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+      /* next power of two, starting from 1KB on Gen8+ and 2KB on Gen7.5 */
+      const unsigned base_log2 = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 10 : 11;
+
+      assert(max_size <= 2 * 1024 * 1024);
+      if (max_size <= (1u << base_log2))
+         return 0;
+
+      return util_last_bit(max_size - 1) - base_log2;
+   }
+
+   /* before Gen7.5: linear, in 1KB steps, with 0 meaning up to 1KB */
+   assert(max_size <= 12 * 1024);
+   if (max_size <= 1024)
+      return 0;
+   return (max_size - 1) / 1024;
+}
+
+static bool
+compute_set_gen6_MEDIA_VFE_STATE(struct ilo_state_compute *compute,
+                                 const struct ilo_dev *dev,
+                                 const struct ilo_state_compute_info *info)
+{
+   struct compute_urb_configuration urb;
+   uint32_t vfe_dw1, vfe_dw2, vfe_dw4;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* settle the URB split first, then check the interfaces against it */
+   if (!compute_get_gen6_urb_configuration(dev, info, &urb))
+      return false;
+   if (!compute_validate_gen6(dev, info, &urb))
+      return false;
+
+   vfe_dw1 = compute_get_gen6_scratch_space(dev, info) <<
+      GEN6_VFE_DW1_SCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   vfe_dw2 = (dev->thread_count - 1) << GEN6_VFE_DW2_MAX_THREADS__SHIFT |
+             urb.urb_entry_count << GEN6_VFE_DW2_URB_ENTRY_COUNT__SHIFT |
+             GEN6_VFE_DW2_RESET_GATEWAY_TIMER |
+             GEN6_VFE_DW2_BYPASS_GATEWAY_CONTROL;
+   /* the GPGPU mode bit is only set on Gen7 and Gen7.5 */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5))
+      vfe_dw2 |= GEN7_VFE_DW2_GPGPU_MODE;
+
+   /* the size is stored minus one, so it must not be zero */
+   assert(urb.urb_entry_size);
+   vfe_dw4 = (urb.urb_entry_size - 1) << GEN6_VFE_DW4_URB_ENTRY_SIZE__SHIFT |
+             urb.curbe_entry_count << GEN6_VFE_DW4_CURBE_SIZE__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(compute->vfe) >= 3);
+   compute->vfe[0] = vfe_dw1;
+   compute->vfe[1] = vfe_dw2;
+   compute->vfe[2] = vfe_dw4;
+
+   return true;
+}
+
+static uint8_t
+compute_interface_get_gen6_sampler_count(const struct ilo_dev *dev,
+                                         const struct ilo_state_compute_interface_info *interface)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);   /* below: groups of 4 samplers, at most 4 */
+   return (interface->sampler_count > 12) ?
+      4 : (interface->sampler_count + 3) / 4;
+}
+
+static uint8_t
+compute_interface_get_gen6_surface_count(const struct ilo_dev *dev,
+                                         const struct ilo_state_compute_interface_info *interface)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);   /* below: the count saturates at 31 */
+   return (interface->surface_count > 31) ? 31 : interface->surface_count;
+}
+
+static uint8_t
+compute_interface_get_gen7_slm_size(const struct ilo_dev *dev,
+                                    const struct ilo_state_compute_interface_info *interface)
+{
+   const uint32_t blocks_4k = (interface->slm_size + 4095) / 4096;
+   ILO_DEV_ASSERT(dev, 7, 8);
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 2, page 61:
+    *
+    *     "The amount is specified in 4k blocks, but only powers of 2 are
+    *      allowed: 0, 4k, 8k, 16k, 32k and 64k per half-slice."
+    */
+   assert(interface->slm_size <= 64 * 1024);
+   /* 0 stays 0; util_next_power_of_two(0) is presumably 1 - TODO confirm */
+   return (blocks_4k) ? util_next_power_of_two(blocks_4k) : 0;
+}
+
+static bool
+compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(struct ilo_state_compute *compute,
+                                           const struct ilo_dev *dev,
+                                           const struct ilo_state_compute_info *info)
+{
+   uint8_t i;
+   /* Fill compute->idrt[i] with six descriptor words for each interface. */
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (i = 0; i < info->interface_count; i++) {
+      const struct ilo_state_compute_interface_info *interface =
+         &info->interfaces[i];
+      uint16_t read_offset, per_thread_read_len, cross_thread_read_len;
+      uint8_t sampler_count, surface_count;
+      uint32_t dw0, dw2, dw3, dw4, dw5, dw6;
+      /* kernel offset is checked for 64-byte alignment, group size for != 0 */
+      assert(interface->kernel_offset % 64 == 0);
+      assert(interface->thread_group_size);
+      /* CURBE offset and lengths are converted to 32-byte (256-bit) units */
+      read_offset = interface->curbe_read_offset / 32;
+      per_thread_read_len = (interface->curbe_read_length + 31) / 32;
+      cross_thread_read_len =
+         (interface->cross_thread_curbe_read_length + 31) / 32;
+
+      sampler_count =
+         compute_interface_get_gen6_sampler_count(dev, interface);
+      surface_count =
+         compute_interface_get_gen6_surface_count(dev, interface);
+
+      dw0 = interface->kernel_offset;
+      dw2 = sampler_count << GEN6_IDRT_DW2_SAMPLER_COUNT__SHIFT;
+      dw3 = surface_count << GEN6_IDRT_DW3_BINDING_TABLE_SIZE__SHIFT;
+      dw4 = per_thread_read_len << GEN6_IDRT_DW4_CURBE_READ_LEN__SHIFT |
+            read_offset << GEN6_IDRT_DW4_CURBE_READ_OFFSET__SHIFT;
+
+      dw5 = 0;
+      dw6 = 0;
+      if (ilo_dev_gen(dev) >= ILO_GEN(7)) {   /* DW5/DW6 stay 0 before Gen7 */
+         const uint8_t slm_size =
+            compute_interface_get_gen7_slm_size(dev, interface);
+
+         dw5 |= GEN7_IDRT_DW5_ROUNDING_MODE_RTNE;
+
+         if (slm_size) {   /* the barrier is enabled together with SLM */
+            dw5 |= GEN7_IDRT_DW5_BARRIER_ENABLE |
+                   slm_size << GEN7_IDRT_DW5_SLM_SIZE__SHIFT;
+         }
+
+         /*
+          * From the Haswell PRM, volume 2d, page 199:
+          *
+          *     "(Number of Threads in GPGPU Thread Group) Specifies the
+          *      number of threads that are in this thread group.  Used to
+          *      program the barrier for the number of messages to expect. The
+          *      minimum value is 0 (which will disable the barrier), while
+          *      the maximum value is the number of threads in a subslice for
+          *      local barriers."
+          *
+          * From the Broadwell PRM, volume 2d, page 183:
+          *
+          *     "(Number of Threads in GPGPU Thread Group) Specifies the
+          *      number of threads that are in this thread group.  The minimum
+          *      value is 1, while the maximum value is the number of threads
+          *      in a subslice for local barriers. See vol1b Configurations
+          *      for the number of threads per subslice for different
+          *      products.  The maximum value for global barriers is limited
+          *      by the number of threads in the system, or by 511, whichever
+          *      is lower. This field should not be set to 0 even if the
+          *      barrier is disabled, since an accurate value is needed for
+          *      proper pre-emption."
+          */
+         if (slm_size || ilo_dev_gen(dev) >= ILO_GEN(8)) {
+            dw5 |= interface->thread_group_size <<
+               GEN7_IDRT_DW5_THREAD_GROUP_SIZE__SHIFT;
+         }
+
+         if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {   /* field used from Gen7.5 */
+            dw6 |= cross_thread_read_len <<
+               GEN75_IDRT_DW6_CROSS_THREAD_CURBE_READ_LEN__SHIFT;
+         }
+      }
+
+      STATIC_ASSERT(ARRAY_SIZE(compute->idrt[i]) >= 6);
+      compute->idrt[i][0] = dw0;
+      compute->idrt[i][1] = dw2;   /* no DW1 is computed or stored here */
+      compute->idrt[i][2] = dw3;
+      compute->idrt[i][3] = dw4;
+      compute->idrt[i][4] = dw5;
+      compute->idrt[i][5] = dw6;
+   }
+
+   return true;
+}
+
+bool
+ilo_state_compute_init(struct ilo_state_compute *compute,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_compute_info *info)
+{
+   bool ret = true;
+   /* compute and the caller-provided descriptor storage must be zero-filled */
+   assert(ilo_is_zeroed(compute, sizeof(*compute)));
+   assert(ilo_is_zeroed(info->data, info->data_size));
+
+   assert(ilo_state_compute_data_size(dev, info->interface_count) <=
+         info->data_size);
+   compute->idrt = (uint32_t (*)[6]) info->data;
+   /* record how many descriptors idrt holds; the setters below do not */
+   compute->idrt_count = info->interface_count;
+
+   ret &= compute_set_gen6_MEDIA_VFE_STATE(compute, dev, info);
+   ret &= compute_set_gen6_INTERFACE_DESCRIPTOR_DATA(compute, dev, info);
+   assert(ret);
+   return ret;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_compute.h b/src/gallium/drivers/ilo/core/ilo_state_compute.h
new file mode 100644
index 0000000..346f7b6
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_compute.h
@@ -0,0 +1,92 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_COMPUTE_H
+#define ILO_STATE_COMPUTE_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Haswell PRM, volume 7, page 836:
+ *
+ *     "The first 64 URB entries are reserved for the interface
+ *      description..."
+ */
+#define ILO_STATE_COMPUTE_MAX_INTERFACE_COUNT 64
+
+struct ilo_state_compute_interface_info {
+   /* usually 0 unless there are multiple interfaces */
+   uint32_t kernel_offset;   /* asserted to be a multiple of 64 */
+
+   uint32_t scratch_size;   /* the largest over all interfaces is programmed */
+
+   uint8_t sampler_count;   /* encoded in groups of 4, capped at 4 groups */
+   uint8_t surface_count;   /* encoded value is capped at 31 */
+
+   uint16_t thread_group_size;   /* asserted to be non-zero */
+   uint32_t slm_size;   /* bytes, at most 64KB */
+
+   uint16_t curbe_read_offset;   /* bytes, asserted to be a multiple of 32 */
+   uint16_t curbe_read_length;   /* bytes, rounded up to a multiple of 32 */
+   uint16_t cross_thread_curbe_read_length;   /* ditto; 0 before Gen7.5 */
+};
+
+struct ilo_state_compute_info {
+   void *data;   /* zero-filled storage that will back the descriptors */
+   size_t data_size;   /* at least ilo_state_compute_data_size() bytes */
+
+   const struct ilo_state_compute_interface_info *interfaces;
+   uint8_t interface_count;   /* number of entries in interfaces */
+
+   uint32_t cv_urb_alloc_size;   /* divided by 32 to bound the entry total */
+   uint32_t curbe_alloc_size;   /* rounded up to 256-bit registers */
+};
+
+struct ilo_state_compute {
+   uint32_t vfe[3];   /* MEDIA_VFE_STATE DW1, DW2 and DW4 */
+
+   uint32_t (*idrt)[6];   /* points into ilo_state_compute_info::data */
+   uint8_t idrt_count;
+};
+
+static inline size_t
+ilo_state_compute_data_size(const struct ilo_dev *dev,
+                            uint8_t interface_count)
+{
+   /* sizeof is unevaluated: no state object is needed to size a descriptor */
+   return interface_count * sizeof(*((struct ilo_state_compute *) 0)->idrt);
+}
+
+bool
+ilo_state_compute_init(struct ilo_state_compute *compute,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_compute_info *info);
+
+#endif /* ILO_STATE_COMPUTE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_raster.c b/src/gallium/drivers/ilo/core/ilo_state_raster.c
new file mode 100644
index 0000000..ed64a1f
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_raster.c
@@ -0,0 +1,1252 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_raster.h"
+
+static bool
+raster_validate_gen6_clip(const struct ilo_dev *dev,
+                          const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_clip_info *c = &info->clip;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* at least one viewport is required */
+   assert(c->viewport_count != 0);
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 188:
+    *
+    *     ""Clip Distance Cull Test Enable Bitmask" and "Clip Distance Clip
+    *      Test Enable Bitmask" should not have overlapping bits in the mask,
+    *      else the results are undefined."
+    */
+   assert((c->user_cull_enables & c->user_clip_enables) == 0);
+
+   /* below Gen9, the near and far Z test enables must match */
+   assert(ilo_dev_gen(dev) >= ILO_GEN(9) ||
+          c->z_near_enable == c->z_far_enable);
+   return true;
+}
+
+static bool
+raster_set_gen6_3DSTATE_CLIP(struct ilo_state_raster *rs,
+                             const struct ilo_dev *dev,
+                             const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_clip_info *clip = &info->clip;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   /* provoking vertex selectors, preset for !first_vertex_provoking */
+   int pv_tri = 2, pv_line = 1, pv_trifan = 2;
+   uint32_t dw1, dw2, dw3;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!raster_validate_gen6_clip(dev, info))
+      return false;
+
+   /* DW1: user cull enables, statistics, and Gen7+ culling controls */
+   dw1 = clip->user_cull_enables << GEN6_CLIP_DW1_UCP_CULL_ENABLES__SHIFT;
+   if (clip->stats_enable)
+      dw1 |= GEN6_CLIP_DW1_STATISTICS;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      /*
+       * From the Ivy Bridge PRM, volume 2 part 1, page 219:
+       *
+       *     "Workaround : Due to Hardware issue "EarlyCull" needs to be
+       *      enabled only for the cases where the incoming primitive topology
+       *      into the clipper guaranteed to be Trilist."
+       *
+       * What does this mean?
+       */
+      dw1 |= GEN7_CLIP_DW1_SUBPIXEL_8BITS |
+             GEN7_CLIP_DW1_EARLY_CULL_ENABLE;
+
+      if (ilo_dev_gen(dev) <= ILO_GEN(7.5)) {
+         dw1 |= tri->front_winding << GEN7_CLIP_DW1_FRONT_WINDING__SHIFT |
+                tri->cull_mode << GEN7_CLIP_DW1_CULL_MODE__SHIFT;
+      }
+   }
+
+   /* DW2: clip mode, API mode, individual tests, and provoking vertices */
+   dw2 = clip->user_clip_enables << GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT |
+         GEN6_CLIPMODE_NORMAL << GEN6_CLIP_DW2_CLIP_MODE__SHIFT;
+   dw2 |= (clip->z_near_zero) ? GEN6_CLIP_DW2_APIMODE_D3D :
+                                GEN6_CLIP_DW2_APIMODE_OGL;
+
+   if (clip->clip_enable)
+      dw2 |= GEN6_CLIP_DW2_CLIP_ENABLE;
+
+   if (clip->xy_test_enable)
+      dw2 |= GEN6_CLIP_DW2_XY_TEST_ENABLE;
+
+   /* the Z test enable bit is only set here below Gen8 */
+   if (ilo_dev_gen(dev) < ILO_GEN(8) && clip->z_near_enable)
+      dw2 |= GEN6_CLIP_DW2_Z_TEST_ENABLE;
+
+   if (clip->gb_test_enable)
+      dw2 |= GEN6_CLIP_DW2_GB_TEST_ENABLE;
+
+   if (scan->barycentric_interps & (GEN6_INTERP_NONPERSPECTIVE_PIXEL |
+                                    GEN6_INTERP_NONPERSPECTIVE_CENTROID |
+                                    GEN6_INTERP_NONPERSPECTIVE_SAMPLE))
+      dw2 |= GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE;
+
+   if (setup->first_vertex_provoking) {
+      pv_tri = 0;
+      pv_line = 0;
+      pv_trifan = 1;
+   }
+   dw2 |= pv_tri << GEN6_CLIP_DW2_TRI_PROVOKE__SHIFT |
+          pv_line << GEN6_CLIP_DW2_LINE_PROVOKE__SHIFT |
+          pv_trifan << GEN6_CLIP_DW2_TRIFAN_PROVOKE__SHIFT;
+
+   /* DW3: fixed point width range and the highest viewport index */
+   dw3 = 0x1 << GEN6_CLIP_DW3_MIN_POINT_WIDTH__SHIFT |
+         0x7ff << GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT |
+         (clip->viewport_count - 1) << GEN6_CLIP_DW3_MAX_VPINDEX__SHIFT;
+   if (clip->force_rtaindex_zero)
+      dw3 |= GEN6_CLIP_DW3_FORCE_RTAINDEX_ZERO;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->clip) >= 3);
+   rs->clip[0] = dw1;
+   rs->clip[1] = dw2;
+   rs->clip[2] = dw3;
+
+   return true;
+}
+
+static bool
+raster_params_is_gen6_line_aa_allowed(const struct ilo_dev *dev,
+                                      const struct ilo_state_raster_params_info *params)
+{
+   bool hiz_conflict;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 321:
+    *
+    *     "[DevSNB+]: This field (Hierarchical Depth Buffer Enable) must be
+    *      disabled if Anti-aliasing Enable in 3DSTATE_SF is enabled."
+    *
+    * The restriction is applied on Gen6 only.
+    */
+   hiz_conflict = (ilo_dev_gen(dev) == ILO_GEN(6) && params->hiz_enable);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 251:
+    *
+    *     "This field (Anti-aliasing Enable) must be disabled if any of the
+    *      render targets have integer (UINT or SINT) surface format."
+    */
+   return !(params->any_integer_rt || hiz_conflict);
+}
+
+static void
+raster_get_gen6_effective_line(const struct ilo_dev *dev,
+                               const struct ilo_state_raster_info *info,
+                               struct ilo_state_raster_line_info *line)
+{
+   const bool line_msaa = info->setup.line_msaa_enable;
+
+   /* start from the requested state, then drop what cannot take effect */
+   *line = info->line;
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 251:
+    *
+    *     "This field (Anti-aliasing Enable) is ignored when Multisample
+    *      Rasterization Mode is MSRASTMODE_ON_xx."
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 251:
+    *
+    *     "Setting a Line Width of 0.0 specifies the rasterization of the
+    *      "thinnest" (one-pixel-wide), non-antialiased lines. Note that
+    *      this effectively overrides the effect of AAEnable (though the
+    *      AAEnable state variable is not modified). Lines rendered with
+    *      zero Line Width are rasterized using GIQ (Grid Intersection
+    *      Quantization) rules as specified by the GDI and Direct3D APIs."
+    *
+    *     "Software must not program a value of 0.0 when running in
+    *      MSRASTMODE_ON_xxx modes - zero-width lines are not available
+    *      when multisampling rasterization is enabled."
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 294:
+    *
+    *     "Line stipple, controlled via the Line Stipple Enable state variable
+    *      in WM_STATE, discards certain pixels that are produced by non-AA
+    *      line rasterization."
+    */
+   if (line_msaa ||
+       !raster_params_is_gen6_line_aa_allowed(dev, &info->params))
+      line->aa_enable = false;
+
+   if (line_msaa || line->aa_enable) {
+      line->giq_last_pixel = false;
+      line->giq_enable = false;
+      line->stipple_enable = false;
+   }
+}
+
+static bool
+raster_validate_gen8_raster(const struct ilo_dev *dev,
+                            const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const bool solid = (tri->fill_mode_front == GEN6_FILLMODE_SOLID &&
+                       tri->fill_mode_back == GEN6_FILLMODE_SOLID);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 249:
+    *
+    *     "This setting (SOLID) is required when rendering rectangle
+    *      (RECTLIST) objects."
+    */
+   assert(solid || !info->setup.cv_is_rectangle);
+
+   /* violations only trip the assertion; the result is always true */
+   return true;
+}
+
+static enum gen_msrast_mode
+raster_setup_get_gen6_msrast_mode(const struct ilo_dev *dev,
+                                  const struct ilo_state_raster_setup_info *setup)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* msaa_enable picks _PATTERN vs _PIXEL; line_msaa_enable picks ON vs OFF */
+   if (setup->msaa_enable) {
+      return (setup->line_msaa_enable) ? GEN6_MSRASTMODE_ON_PATTERN :
+                                         GEN6_MSRASTMODE_OFF_PATTERN;
+   }
+   return (setup->line_msaa_enable) ? GEN6_MSRASTMODE_ON_PIXEL :
+                                      GEN6_MSRASTMODE_OFF_PIXEL;
+}
+
+static int
+get_gen6_line_width(const struct ilo_dev *dev, float fwidth,
+                    bool line_aa_enable, bool line_giq_enable)
+{
+   /* round to the nearest U3.7 value (7 fractional bits) */
+   int width = (int) (fwidth * 128.0f + 0.5f);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * Smooth lines should intersect ceil(line_width) or (ceil(line_width) + 1)
+    * pixels in the minor direction.  We have to make the lines slightly
+    * thicker, 0.5 pixel on both sides, so that they intersect that many
+    * pixels.
+    */
+   if (line_aa_enable)
+      width += 128;
+
+   /* clamped to [1, 1023] */
+   width = CLAMP(width, 1, 1023);
+
+   /*
+    * When GIQ is requested and the width is exactly 1.0 (128 in U3.7),
+    * return 0 instead.
+    */
+   return (line_giq_enable && width == 128) ? 0 : width;
+}
+
+static int
+get_gen6_point_width(const struct ilo_dev *dev, float fwidth)
+{
+   /* round to the nearest U8.3 value (3 fractional bits) */
+   const int rounded = (int) (fwidth * 8.0f + 0.5f);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * The result is clamped to [1, 2047].
+    */
+   return CLAMP(rounded, 1, 2047);
+}
+
+static bool
+raster_set_gen7_3DSTATE_SF(struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster_info *info,
+                           const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_clip_info *clip = &info->clip;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_point_info *point = &info->point;
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const struct ilo_state_raster_params_info *params = &info->params;
+   const enum gen_msrast_mode msrast =
+      raster_setup_get_gen6_msrast_mode(dev, setup);
+   const int line_width = get_gen6_line_width(dev, params->line_width,
+         line->aa_enable, line->giq_enable);
+   const int point_width = get_gen6_point_width(dev, params->point_width);
+   /* provoking vertex selectors, preset for !first_vertex_provoking */
+   int pv_tri = 2, pv_line = 1, pv_trifan = 2;
+   uint32_t dw1, dw2, dw3;
+
+   ILO_DEV_ASSERT(dev, 6, 7.5);
+
+   if (!raster_validate_gen8_raster(dev, info))
+      return false;
+
+   dw1 = tri->fill_mode_front << GEN7_SF_DW1_FILL_MODE_FRONT__SHIFT |
+         tri->fill_mode_back << GEN7_SF_DW1_FILL_MODE_BACK__SHIFT |
+         tri->front_winding << GEN7_SF_DW1_FRONT_WINDING__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7) && ilo_dev_gen(dev) <= ILO_GEN(7.5)) {
+      enum gen_depth_format format = tri->depth_offset_format;
+
+      /* do it here as we want 0x0 to be valid */
+      switch (tri->depth_offset_format) {
+      case GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT:
+         format = GEN6_ZFORMAT_D32_FLOAT;
+         break;
+      case GEN6_ZFORMAT_D24_UNORM_S8_UINT:
+         format = GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+         break;
+      default:
+         /* other formats are used unchanged */
+         break;
+      }
+
+      dw1 |= format << GEN7_SF_DW1_DEPTH_FORMAT__SHIFT;
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 248:
+    *
+    *     "This bit (Statistics Enable) should be set whenever clipping is
+    *      enabled and the Statistics Enable bit is set in CLIP_STATE. It
+    *      should be cleared if clipping is disabled or Statistics Enable in
+    *      CLIP_STATE is clear."
+    */
+   if (clip->clip_enable && clip->stats_enable)
+      dw1 |= GEN7_SF_DW1_STATISTICS;
+
+   if (setup->viewport_transform)
+      dw1 |= GEN7_SF_DW1_VIEWPORT_TRANSFORM;
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 258:
+    *
+    *     "This bit (Legacy Global Depth Bias Enable, Global Depth Offset
+    *      Enable Solid , Global Depth Offset Enable Wireframe, and Global
+    *      Depth Offset Enable Point) should be set whenever non zero depth
+    *      bias (Slope, Bias) values are used. Setting this bit may have some
+    *      degradation of performance for some workloads."
+    *
+    * But it seems fine to ignore that.
+    */
+   if (tri->depth_offset_solid)
+      dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_SOLID;
+   if (tri->depth_offset_wireframe)
+      dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME;
+   if (tri->depth_offset_point)
+      dw1 |= GEN7_SF_DW1_DEPTH_OFFSET_POINT;
+
+   dw2 = tri->cull_mode << GEN7_SF_DW2_CULL_MODE__SHIFT |
+         line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT |
+         GEN7_SF_DW2_AA_LINE_CAP_1_0 |
+         msrast << GEN7_SF_DW2_MSRASTMODE__SHIFT;
+
+   if (line->aa_enable)
+      dw2 |= GEN7_SF_DW2_AA_LINE_ENABLE;
+   /* the line stipple enable is only written here on Gen7.5 */
+   if (ilo_dev_gen(dev) == ILO_GEN(7.5) && line->stipple_enable)
+      dw2 |= GEN75_SF_DW2_LINE_STIPPLE_ENABLE;
+   if (setup->scissor_enable)
+      dw2 |= GEN7_SF_DW2_SCISSOR_ENABLE;
+
+   dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE |
+         GEN7_SF_DW3_SUBPIXEL_8BITS;
+
+   /* this has no effect when line_width != 0 */
+   if (line->giq_last_pixel)
+      dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE;
+
+   if (setup->first_vertex_provoking) {
+      pv_tri = 0;
+      pv_line = 0;
+      pv_trifan = 1;
+   }
+   dw3 |= pv_tri << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
+          pv_line << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
+          pv_trifan << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
+
+   /* setup->point_aa_enable is ignored */
+   if (!point->programmable_width) {
+      dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH |
+             point_width << GEN7_SF_DW3_POINT_WIDTH__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->sf) >= 3);
+   rs->sf[0] = dw1;
+   rs->sf[1] = dw2;
+   rs->sf[2] = dw3;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->raster) >= 4);
+   rs->raster[0] = 0;
+   rs->raster[1] = fui(params->depth_offset_const);
+   rs->raster[2] = fui(params->depth_offset_scale);
+   rs->raster[3] = fui(params->depth_offset_clamp);
+
+   rs->line_aa_enable = line->aa_enable;
+   rs->line_giq_enable = line->giq_enable;
+
+   return true;
+}
+
+static bool
+raster_set_gen8_3DSTATE_SF(struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster_info *info,
+                           const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_clip_info *clip = &info->clip;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_point_info *point = &info->point;
+   const struct ilo_state_raster_params_info *params = &info->params;
+   const int line_width = get_gen6_line_width(dev, params->line_width,
+         line->aa_enable, line->giq_enable);
+   const int point_width = get_gen6_point_width(dev, params->point_width);
+   /* provoking vertex selectors, preset for !first_vertex_provoking */
+   int pv_tri = 2, pv_line = 1, pv_trifan = 2;
+   uint32_t dw1 = 0, dw2, dw3;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   /* DW1 carries only the statistics and viewport transform enables */
+   if (clip->clip_enable && clip->stats_enable)
+      dw1 |= GEN7_SF_DW1_STATISTICS;
+   if (setup->viewport_transform)
+      dw1 |= GEN7_SF_DW1_VIEWPORT_TRANSFORM;
+
+   dw2 = line_width << GEN7_SF_DW2_LINE_WIDTH__SHIFT |
+         GEN7_SF_DW2_AA_LINE_CAP_1_0;
+
+   dw3 = GEN7_SF_DW3_TRUE_AA_LINE_DISTANCE |
+         GEN7_SF_DW3_SUBPIXEL_8BITS;
+
+   /* this has no effect when line_width != 0 */
+   if (line->giq_last_pixel)
+      dw3 |= GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE;
+
+   if (setup->first_vertex_provoking) {
+      pv_tri = 0;
+      pv_line = 0;
+      pv_trifan = 1;
+   }
+   dw3 |= pv_tri << GEN7_SF_DW3_TRI_PROVOKE__SHIFT |
+          pv_line << GEN7_SF_DW3_LINE_PROVOKE__SHIFT |
+          pv_trifan << GEN7_SF_DW3_TRIFAN_PROVOKE__SHIFT;
+
+   /* the fixed point width is only programmed when it is not programmable */
+   if (!point->programmable_width) {
+      dw3 |= GEN7_SF_DW3_USE_POINT_WIDTH |
+             point_width << GEN7_SF_DW3_POINT_WIDTH__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->sf) >= 3);
+   rs->sf[0] = dw1;
+   rs->sf[1] = dw2;
+   rs->sf[2] = dw3;
+
+   return true;
+}
+
+static bool
+raster_set_gen8_3DSTATE_RASTER(struct ilo_state_raster *rs,
+                               const struct ilo_dev *dev,
+                               const struct ilo_state_raster_info *info,
+                               const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_clip_info *clip = &info->clip;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_point_info *point = &info->point;
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const struct ilo_state_raster_params_info *params = &info->params;
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   if (!raster_validate_gen8_raster(dev, info))
+      return false;
+
+   /* triangle winding, culling, fill modes, and depth offset enables */
+   dw1 = tri->front_winding << GEN8_RASTER_DW1_FRONT_WINDING__SHIFT |
+         tri->cull_mode << GEN8_RASTER_DW1_CULL_MODE__SHIFT |
+         tri->fill_mode_front << GEN8_RASTER_DW1_FILL_MODE_FRONT__SHIFT |
+         tri->fill_mode_back << GEN8_RASTER_DW1_FILL_MODE_BACK__SHIFT;
+   if (tri->depth_offset_solid)
+      dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID;
+   if (tri->depth_offset_wireframe)
+      dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME;
+   if (tri->depth_offset_point)
+      dw1 |= GEN8_RASTER_DW1_DEPTH_OFFSET_POINT;
+
+   /* point and line anti-aliasing */
+   if (point->aa_enable)
+      dw1 |= GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE;
+   if (line->aa_enable)
+      dw1 |= GEN8_RASTER_DW1_AA_LINE_ENABLE;
+
+   /* where should line_msaa_enable be set? */
+   if (setup->msaa_enable)
+      dw1 |= GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE;
+   if (setup->scissor_enable)
+      dw1 |= GEN8_RASTER_DW1_SCISSOR_ENABLE;
+
+   /* below Gen9 there is one Z test enable; Gen9+ splits near and far */
+   if (ilo_dev_gen(dev) < ILO_GEN(9)) {
+      if (clip->z_near_enable)
+         dw1 |= GEN8_RASTER_DW1_Z_TEST_ENABLE;
+   } else {
+      if (clip->z_near_enable)
+         dw1 |= GEN9_RASTER_DW1_Z_TEST_NEAR_ENABLE;
+      if (clip->z_far_enable)
+         dw1 |= GEN9_RASTER_DW1_Z_TEST_FAR_ENABLE;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->raster) >= 4);
+   rs->raster[0] = dw1;
+   rs->raster[1] = fui(params->depth_offset_const);
+   rs->raster[2] = fui(params->depth_offset_scale);
+   rs->raster[3] = fui(params->depth_offset_clamp);
+
+   rs->line_aa_enable = line->aa_enable;
+   rs->line_giq_enable = line->giq_enable;
+
+   return true;
+}
+
+static enum gen_sample_count
+get_gen6_sample_count(const struct ilo_dev *dev, uint8_t sample_count)
+{
+   /* fallback for unexpected counts: 1 sample, supported from Gen6 */
+   enum gen_sample_count c = GEN6_NUMSAMPLES_1;
+   int min_gen = ILO_GEN(6);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* map the count to its enum and the first Gen that supports it */
+   switch (sample_count) {
+   case 1:
+      break;
+   case 2:
+      c = GEN8_NUMSAMPLES_2;
+      min_gen = ILO_GEN(8);
+      break;
+   case 4:
+      c = GEN6_NUMSAMPLES_4;
+      min_gen = ILO_GEN(6);
+      break;
+   case 8:
+      c = GEN7_NUMSAMPLES_8;
+      min_gen = ILO_GEN(7);
+      break;
+   case 16:
+      c = GEN8_NUMSAMPLES_16;
+      min_gen = ILO_GEN(8);
+      break;
+   default:
+      /* min_gen keeps its initial value, so the check below is defined */
+      assert(!"unexpected sample count");
+      break;
+   }
+
+   assert(ilo_dev_gen(dev) >= min_gen);
+
+   return c;
+}
+
+/* Fill rs->sample[0] with the pixel location and the sample count. */
+static bool
+raster_set_gen8_3DSTATE_MULTISAMPLE(struct ilo_state_raster *rs,
+                                    const struct ilo_dev *dev,
+                                    const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   const enum gen_sample_count num_samples =
+      get_gen6_sample_count(dev, scan->sample_count);
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 307:
+    *
+    *     "Setting Multisample Rasterization Mode to MSRASTMODE_xxx_PATTERN
+    *      when Number of Multisamples == NUMSAMPLES_1 is UNDEFINED."
+    */
+   assert(!info->setup.msaa_enable || scan->sample_count > 1);
+
+   /* pixel location and number of samples share the dword */
+   dw1 = scan->pixloc << GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__SHIFT |
+         num_samples << GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->sample) >= 1);
+   rs->sample[0] = dw1;
+
+   return true;
+}
+
+static bool
+raster_set_gen6_3DSTATE_SAMPLE_MASK(struct ilo_state_raster *rs,
+                                    const struct ilo_dev *dev,
+                                    const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   uint32_t valid_bits, dw1;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 294:
+    *
+    *     "If Number of Multisamples is NUMSAMPLES_1, bits 7:1 of this field
+    *      (Sample Mask) must be zero.
+    *
+    *      If Number of Multisamples is NUMSAMPLES_4, bits 7:4 of this field
+    *      must be zero."
+    */
+   valid_bits = (1 << scan->sample_count) - 1;
+   dw1 = (scan->sample_mask & valid_bits) << GEN6_SAMPLE_MASK_DW1_VAL__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->sample) >= 2);
+   rs->sample[1] = dw1;
+
+   return true;
+}
+
+static bool
+raster_validate_gen6_wm(const struct ilo_dev *dev,
+                        const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   const bool normal_op = (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_NORMAL);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* on Gen6 only the normal early-Z control is accepted */
+   assert(ilo_dev_gen(dev) != ILO_GEN(6) ||
+          scan->earlyz_control == GEN7_EDSC_NORMAL);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 272:
+    *
+    *     "This bit (Statistics Enable) must be disabled if either of these
+    *      bits is set: Depth Buffer Clear , Hierarchical Depth Buffer Resolve
+    *      Enable or Depth Buffer Resolve Enable."
+    */
+   assert(normal_op || !scan->stats_enable);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 273:
+    *
+    *     "If this field (Depth Buffer Resolve Enable) is enabled, the Depth
+    *      Buffer Clear and Hierarchical Depth Buffer Resolve Enable fields
+    *      must both be disabled."
+    *
+    *     "If this field (Hierarchical Depth Buffer Resolve Enable) is
+    *      enabled, the Depth Buffer Clear and Depth Buffer Resolve Enable
+    *      fields must both be disabled."
+    *
+    * This is guaranteed.
+    */
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 314-315:
+    *
+    *     "Stencil buffer clear can be performed at the same time by enabling
+    *      Stencil Buffer Write Enable."
+    *
+    *     "Note also that stencil buffer clear can be performed without depth
+    *      buffer clear."
+    */
+   assert(!scan->earlyz_stencil_clear || normal_op ||
+          scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR);
+
+   /* violations only trip the assertions; the result is always true */
+   return true;
+}
+
+/*
+ * Set the three dwords stored in rs->wm for Gen6: early-Z operation and
+ * statistics (DW4), stipple enables and AA line parameters (DW5), and
+ * interpolation modes plus the multisample rasterization mode (DW6).
+ *
+ * Returns false only when raster_validate_gen6_wm() does.
+ */
+static bool
+raster_set_gen6_3dstate_wm(struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster_info *info,
+                           const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   const enum gen_msrast_mode msrast =
+      raster_setup_get_gen6_msrast_mode(dev, setup);
+   /* only scan conversion states are set, as in Gen8+ */
+   uint32_t dw4 = 0, dw5, dw6;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   if (!raster_validate_gen6_wm(dev, info))
+      return false;
+
+   if (scan->stats_enable)
+      dw4 |= GEN6_WM_DW4_STATISTICS;
+
+   /* select the early-Z operation */
+   if (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR) {
+      dw4 |= GEN6_WM_DW4_DEPTH_CLEAR;
+   } else if (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE) {
+      dw4 |= GEN6_WM_DW4_DEPTH_RESOLVE;
+   } else if (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE) {
+      dw4 |= GEN6_WM_DW4_HIZ_RESOLVE;
+   } else if (scan->earlyz_stencil_clear) {
+      /* a stencil clear on its own also sets the depth clear bit */
+      dw4 |= GEN6_WM_DW4_DEPTH_CLEAR;
+   }
+
+   dw5 = GEN6_WM_DW5_AA_LINE_CAP_1_0 | /* same as in 3DSTATE_SF */
+         GEN6_WM_DW5_AA_LINE_WIDTH_2_0;
+
+   if (tri->poly_stipple_enable)
+      dw5 |= GEN6_WM_DW5_POLY_STIPPLE_ENABLE;
+   if (line->stipple_enable)
+      dw5 |= GEN6_WM_DW5_LINE_STIPPLE_ENABLE;
+
+   dw6 = scan->zw_interp << GEN6_WM_DW6_ZW_INTERP__SHIFT |
+         scan->barycentric_interps << GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT |
+         GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT |
+         msrast << GEN6_WM_DW6_MSRASTMODE__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->wm) >= 3);
+   rs->wm[0] = dw4;
+   rs->wm[1] = dw5;
+   rs->wm[2] = dw6;
+
+   return true;
+}
+
+/*
+ * Set rs->wm[0] for Gen7+: early depth/stencil control, interpolation
+ * modes, AA line parameters, statistics, and stipple enables.  Below Gen8
+ * the dword also carries the early-Z operation and the multisample
+ * rasterization mode.
+ */
+static bool
+raster_set_gen8_3DSTATE_WM(struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster_info *info,
+                           const struct ilo_state_raster_line_info *line)
+{
+   const struct ilo_state_raster_tri_info *tri = &info->tri;
+   const struct ilo_state_raster_setup_info *setup = &info->setup;
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   const enum gen_msrast_mode msrast =
+      raster_setup_get_gen6_msrast_mode(dev, setup);
+   uint32_t dw1;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!raster_validate_gen6_wm(dev, info))
+      return false;
+
+   dw1 = scan->earlyz_control << GEN7_WM_DW1_EDSC__SHIFT |
+         scan->zw_interp << GEN7_WM_DW1_ZW_INTERP__SHIFT |
+         scan->barycentric_interps << GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT |
+         GEN7_WM_DW1_AA_LINE_CAP_1_0 | /* same as in 3DSTATE_SF */
+         GEN7_WM_DW1_AA_LINE_WIDTH_2_0 |
+         GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT;
+
+   if (scan->stats_enable)
+      dw1 |= GEN7_WM_DW1_STATISTICS;
+   if (tri->poly_stipple_enable)
+      dw1 |= GEN7_WM_DW1_POLY_STIPPLE_ENABLE;
+   if (line->stipple_enable)
+      dw1 |= GEN7_WM_DW1_LINE_STIPPLE_ENABLE;
+
+   /* the early-Z operation and MSRASTMODE are only written below Gen8 */
+   if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+      if (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR) {
+         dw1 |= GEN7_WM_DW1_DEPTH_CLEAR;
+      } else if (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE) {
+         dw1 |= GEN7_WM_DW1_DEPTH_RESOLVE;
+      } else if (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE) {
+         dw1 |= GEN7_WM_DW1_HIZ_RESOLVE;
+      } else if (scan->earlyz_stencil_clear) {
+         /* a stencil clear on its own also sets the depth clear bit */
+         dw1 |= GEN7_WM_DW1_DEPTH_CLEAR;
+      }
+
+      dw1 |= msrast << GEN7_WM_DW1_MSRASTMODE__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(rs->wm) >= 1);
+   rs->wm[0] = dw1;
+
+   return true;
+}
+
+/*
+ * Set rs->wm[1] and rs->wm[2] for Gen8: sample count, depth/stencil
+ * clear and resolve operations, and the sample mask.
+ */
+static bool
+raster_set_gen8_3dstate_wm_hz_op(struct ilo_state_raster *rs,
+                                 const struct ilo_dev *dev,
+                                 const struct ilo_state_raster_info *info)
+{
+   const struct ilo_state_raster_scan_info *scan = &info->scan;
+   const enum gen_sample_count count =
+      get_gen6_sample_count(dev, scan->sample_count);
+   /* one mask bit per sample */
+   const uint32_t mask = (1 << scan->sample_count) - 1;
+   uint32_t dw1, dw4;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   dw1 = count << GEN8_WM_HZ_DW1_NUM_SAMPLES__SHIFT;
+
+   /* the stencil clear bit is independent of the early-Z operation */
+   if (scan->earlyz_stencil_clear)
+      dw1 |= GEN8_WM_HZ_DW1_STENCIL_CLEAR;
+
+   if (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR)
+      dw1 |= GEN8_WM_HZ_DW1_DEPTH_CLEAR;
+   else if (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE)
+      dw1 |= GEN8_WM_HZ_DW1_DEPTH_RESOLVE;
+   else if (scan->earlyz_op == ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE)
+      dw1 |= GEN8_WM_HZ_DW1_HIZ_RESOLVE;
+
+   dw4 = (scan->sample_mask & mask) << GEN8_WM_HZ_DW4_SAMPLE_MASK__SHIFT;
+
+   /* rs->wm[0] is not written here */
+   STATIC_ASSERT(ARRAY_SIZE(rs->wm) >= 3);
+   rs->wm[1] = dw1;
+   rs->wm[2] = dw4;
+
+   return true;
+}
+
+static bool
+sample_pattern_get_gen6_packed_offsets(const struct ilo_dev *dev,
+                                       uint8_t sample_count,
+                                       const struct ilo_state_sample_pattern_offset_info *in,
+                                       uint8_t *out)
+{
+   uint8_t farthest = 0, i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (i = 0; i < sample_count; i++) {
+      /* signed offsets from (8, 8), then the squared distance */
+      const int8_t dx = (int8_t) in[i].x - 8;
+      const int8_t dy = (int8_t) in[i].y - 8;
+      const uint8_t dist = dx * dx + dy * dy;
+
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 305:
+       *
+       *     "Programming Note: When programming the sample offsets (for
+       *      NUMSAMPLES_4 or _8 and MSRASTMODE_xxx_PATTERN), the order of the
+       *      samples 0 to 3 (or 7 for 8X) must have monotonically increasing
+       *      distance from the pixel center. This is required to get the
+       *      correct centroid computation in the device."
+       */
+      assert(dist >= farthest);
+      farthest = dist;
+
+      assert(in[i].x < 16);
+      assert(in[i].y < 16);
+      /* pack X into the high nibble and Y into the low nibble */
+      out[i] = in[i].x << 4 | in[i].y;
+   }
+
+   return true;
+}
+
+static bool
+line_stipple_set_gen6_3DSTATE_LINE_STIPPLE(struct ilo_state_line_stipple *stipple,
+                                           const struct ilo_dev *dev,
+                                           const struct ilo_state_line_stipple_info *info)
+{
+   uint32_t dw1, dw2;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   assert(info->repeat_count >= 1 && info->repeat_count <= 256);
+
+   dw1 = info->pattern;
+   /* DW2 pairs the repeat count with its fixed-point reciprocal */
+   if (ilo_dev_gen(dev) < ILO_GEN(7)) {
+      /* in U1.13 */
+      const uint16_t inverse = 8192 / info->repeat_count;
+      dw2 = inverse << GEN6_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
+            info->repeat_count << GEN6_LINE_STIPPLE_DW2_REPEAT_COUNT__SHIFT;
+   } else {
+      /* in U1.16 */
+      const uint32_t inverse = 65536 / info->repeat_count;
+      dw2 = inverse << GEN7_LINE_STIPPLE_DW2_INVERSE_REPEAT_COUNT__SHIFT |
+            info->repeat_count << GEN6_LINE_STIPPLE_DW2_REPEAT_COUNT__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(stipple->stipple) >= 2);
+   stipple->stipple[0] = dw1;
+   stipple->stipple[1] = dw2;
+
+   return true;
+}
+
+static bool
+sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN(struct ilo_state_sample_pattern *pattern,
+                                               const struct ilo_dev *dev,
+                                               const struct ilo_state_sample_pattern_info *info)
+{
+   bool ok; /* once false, the remaining patterns are not packed */
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_1x) >= 1);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_2x) >= 2);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_4x) >= 4);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_8x) >= 8);
+   STATIC_ASSERT(ARRAY_SIZE(pattern->pattern_16x) >= 16);
+   ok = sample_pattern_get_gen6_packed_offsets(dev, 1,
+         info->pattern_1x, pattern->pattern_1x);
+   ok = ok && sample_pattern_get_gen6_packed_offsets(dev, 2,
+         info->pattern_2x, pattern->pattern_2x);
+   ok = ok && sample_pattern_get_gen6_packed_offsets(dev, 4,
+         info->pattern_4x, pattern->pattern_4x);
+   ok = ok && sample_pattern_get_gen6_packed_offsets(dev, 8,
+         info->pattern_8x, pattern->pattern_8x);
+   ok = ok && sample_pattern_get_gen6_packed_offsets(dev, 16,
+         info->pattern_16x, pattern->pattern_16x);
+   return ok;
+}
+
+static bool
+poly_stipple_set_gen6_3DSTATE_POLY_STIPPLE_PATTERN(struct ilo_state_poly_stipple *stipple,
+                                                   const struct ilo_dev *dev,
+                                                   const struct ilo_state_poly_stipple_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+   STATIC_ASSERT(ARRAY_SIZE(stipple->stipple) >= 32);
+
+   /* no conversion is needed: the pattern dwords are copied as they are */
+   memcpy(&stipple->stipple[0], &info->pattern[0], sizeof(info->pattern));
+   return true;
+}
+
+bool
+ilo_state_raster_init(struct ilo_state_raster *rs, const struct ilo_dev *dev,
+                      const struct ilo_state_raster_info *info)
+{
+   /* rs must still be all zeros; the setup itself is done by set_info */
+   assert(ilo_is_zeroed(rs, sizeof(*rs)));
+   return ilo_state_raster_set_info(rs, dev, info);
+}
+
+bool
+ilo_state_raster_init_for_rectlist(struct ilo_state_raster *rs,
+                                   const struct ilo_dev *dev, uint8_t sample_count,
+                                   enum ilo_state_raster_earlyz_op earlyz_op,
+                                   bool earlyz_stencil_clear)
+{
+   struct ilo_state_raster_info rect;
+
+   /* everything that is not set explicitly below stays zeroed */
+   memset(&rect, 0, sizeof(rect));
+
+   rect.scan.earlyz_stencil_clear = earlyz_stencil_clear;
+   rect.scan.earlyz_op = earlyz_op;
+   rect.scan.sample_mask = ~0u;
+   rect.scan.sample_count = sample_count;
+   rect.setup.msaa_enable = (sample_count > 1);
+   rect.setup.cv_is_rectangle = true;
+   rect.clip.viewport_count = 1;
+
+   return ilo_state_raster_init(rs, dev, &rect);
+}
+
+bool
+ilo_state_raster_set_info(struct ilo_state_raster *rs,
+                          const struct ilo_dev *dev,
+                          const struct ilo_state_raster_info *info)
+{
+   const bool gen8 = (ilo_dev_gen(dev) >= ILO_GEN(8));
+   struct ilo_state_raster_line_info eff_line;
+   bool ret = true;
+
+   /* every setter runs even after a failure; ret only accumulates results */
+   ret &= raster_set_gen6_3DSTATE_CLIP(rs, dev, info);
+   /* eff_line is filled in once and handed to the SF/RASTER/WM setters */
+   raster_get_gen6_effective_line(dev, info, &eff_line);
+
+   if (!gen8) {
+      ret &= raster_set_gen7_3DSTATE_SF(rs, dev, info, &eff_line);
+   } else {
+      ret &= raster_set_gen8_3DSTATE_SF(rs, dev, info, &eff_line);
+      ret &= raster_set_gen8_3DSTATE_RASTER(rs, dev, info, &eff_line);
+   }
+
+   ret &= raster_set_gen8_3DSTATE_MULTISAMPLE(rs, dev, info);
+   ret &= raster_set_gen6_3DSTATE_SAMPLE_MASK(rs, dev, info);
+
+   if (ilo_dev_gen(dev) < ILO_GEN(7)) {
+      ret &= raster_set_gen6_3dstate_wm(rs, dev, info, &eff_line);
+   } else {
+      ret &= raster_set_gen8_3DSTATE_WM(rs, dev, info, &eff_line);
+      if (gen8)
+         ret &= raster_set_gen8_3dstate_wm_hz_op(rs, dev, info);
+   }
+
+   assert(ret);
+   return ret;
+}
+
+bool
+ilo_state_raster_set_params(struct ilo_state_raster *rs,
+                            const struct ilo_dev *dev,
+                            const struct ilo_state_raster_params_info *params)
+{
+   /* AA lines need both the flag saved in rs and the params to allow them */
+   const bool aa_lines = (rs->line_aa_enable &&
+         raster_params_is_gen6_line_aa_allowed(dev, params));
+   const int width = get_gen6_line_width(dev, params->line_width,
+         aa_lines, rs->line_giq_enable);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * Toggle the AA line bit, which is in rs->raster[0] on Gen8+ and in
+    * rs->sf[1] before that.  It is left alone when rs never had line AA
+    * enabled in the first place.
+    */
+   if (rs->line_aa_enable) {
+      const bool gen8 = (ilo_dev_gen(dev) >= ILO_GEN(8));
+      const uint32_t aa_bit = (gen8) ?
+         GEN8_RASTER_DW1_AA_LINE_ENABLE : GEN7_SF_DW2_AA_LINE_ENABLE;
+      uint32_t *dw = (gen8) ? &rs->raster[0] : &rs->sf[1];
+
+      *dw = (aa_lines) ? (*dw | aa_bit) : (*dw & ~aa_bit);
+   }
+
+   /* the line width field is always rewritten */
+   rs->sf[1] &= ~GEN7_SF_DW2_LINE_WIDTH__MASK;
+   rs->sf[1] |= width << GEN7_SF_DW2_LINE_WIDTH__SHIFT;
+
+   /* the point width field only when SF is set up to use it */
+   if (rs->sf[2] & GEN7_SF_DW3_USE_POINT_WIDTH) {
+      const int point = get_gen6_point_width(dev, params->point_width);
+
+      rs->sf[2] &= ~GEN7_SF_DW3_POINT_WIDTH__MASK;
+      rs->sf[2] |= point << GEN7_SF_DW3_POINT_WIDTH__SHIFT;
+   }
+
+   /* all three depth offset terms are always refreshed */
+   rs->raster[1] = fui(params->depth_offset_const);
+   rs->raster[2] = fui(params->depth_offset_scale);
+   rs->raster[3] = fui(params->depth_offset_clamp);
+
+   return true;
+}
+
+void
+ilo_state_raster_full_delta(const struct ilo_state_raster *rs,
+                            const struct ilo_dev *dev,
+                            struct ilo_state_raster_delta *delta)
+{
+   /* flag every command (two of them on Gen8+ only); rs is not inspected */
+   uint32_t all = ILO_STATE_RASTER_3DSTATE_CLIP |
+                  ILO_STATE_RASTER_3DSTATE_SF |
+                  ILO_STATE_RASTER_3DSTATE_MULTISAMPLE |
+                  ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK |
+                  ILO_STATE_RASTER_3DSTATE_WM |
+                  ILO_STATE_RASTER_3DSTATE_AA_LINE_PARAMETERS;
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))
+      all |= ILO_STATE_RASTER_3DSTATE_RASTER |
+             ILO_STATE_RASTER_3DSTATE_WM_HZ_OP;
+   delta->dirty = all;
+}
+
+void
+ilo_state_raster_get_delta(const struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster *old,
+                           struct ilo_state_raster_delta *delta)
+{
+   const bool gen8 = (ilo_dev_gen(dev) >= ILO_GEN(8));
+   uint32_t dirty = 0;
+
+   /* each dword array is compared as a whole against the old state */
+   if (memcmp(rs->clip, old->clip, sizeof(rs->clip)) != 0)
+      dirty |= ILO_STATE_RASTER_3DSTATE_CLIP;
+   if (memcmp(rs->sf, old->sf, sizeof(rs->sf)) != 0)
+      dirty |= ILO_STATE_RASTER_3DSTATE_SF;
+
+   /* before Gen8, a change in rs->raster is reported as 3DSTATE_SF */
+   if (memcmp(rs->raster, old->raster, sizeof(rs->raster)) != 0)
+      dirty |= (gen8) ? ILO_STATE_RASTER_3DSTATE_RASTER :
+                        ILO_STATE_RASTER_3DSTATE_SF;
+
+   /* rs->sample backs both 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */
+   if (memcmp(rs->sample, old->sample, sizeof(rs->sample)) != 0)
+      dirty |= ILO_STATE_RASTER_3DSTATE_MULTISAMPLE |
+               ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK;
+
+   /* a change in rs->wm also flags 3DSTATE_WM_HZ_OP on Gen8+ */
+   if (memcmp(rs->wm, old->wm, sizeof(rs->wm)) != 0) {
+      dirty |= ILO_STATE_RASTER_3DSTATE_WM;
+      if (gen8)
+         dirty |= ILO_STATE_RASTER_3DSTATE_WM_HZ_OP;
+   }
+   delta->dirty = dirty;
+}
+
+bool
+ilo_state_sample_pattern_init(struct ilo_state_sample_pattern *pattern,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sample_pattern_info *info)
+{
+   /* there is only one command to set up, so its result is the result */
+   const bool ret =
+      sample_pattern_set_gen8_3DSTATE_SAMPLE_PATTERN(pattern, dev, info);
+
+   /* a failure trips the assert in debug builds and is returned otherwise */
+   assert(ret);
+   return ret;
+}
+
+bool
+ilo_state_sample_pattern_init_default(struct ilo_state_sample_pattern *pattern,
+                                      const struct ilo_dev *dev)
+{
+   /*
+    * Built-in sample offsets for every supported sample count.  Each entry
+    * is an { x, y } pair in U0.4, that is, in units of 1/16.
+    */
+   static const struct ilo_state_sample_pattern_info builtin = {
+      /* a single sample at (8/16, 8/16) */
+      .pattern_1x = { {  8,  8 } },
+
+      /* two samples on the x == y diagonal */
+      .pattern_2x = { {  4,  4 }, { 12, 12 } },
+
+      .pattern_4x = { {  6,  2 }, { 14,  6 }, {  2, 10 }, { 10, 14 } },
+
+      /* \see brw_multisample_positions_8x */
+      .pattern_8x = {
+         {  7,  9 }, {  9, 13 }, { 11,  3 }, { 13, 11 },
+         {  1,  7 }, {  5,  1 }, { 15,  5 }, {  3, 15 },
+      },
+
+      .pattern_16x = {
+         {  8, 10 }, { 11,  8 }, {  5,  6 }, {  6,  4 },
+         { 12, 11 }, { 13,  9 }, { 14,  7 }, { 10,  2 },
+         {  4, 13 }, {  3,  3 }, {  7,  1 }, { 15,  5 },
+         {  1, 12 }, {  9,  0 }, {  2, 14 }, {  0, 15 },
+      },
+   };
+
+   return ilo_state_sample_pattern_init(pattern, dev, &builtin);
+}
+
+const uint8_t *
+ilo_state_sample_pattern_get_packed_offsets(const struct ilo_state_sample_pattern *pattern,
+                                            const struct ilo_dev *dev, uint8_t sample_count)
+{
+   const uint8_t *packed = NULL; /* stays NULL for an unsupported count */
+
+   switch (sample_count) {
+   case 1:  packed = pattern->pattern_1x;  break;
+   case 2:  packed = pattern->pattern_2x;  break;
+   case 4:  packed = pattern->pattern_4x;  break;
+   case 8:  packed = pattern->pattern_8x;  break;
+   case 16: packed = pattern->pattern_16x; break;
+   default: assert(!"unknown sample count"); break;
+   }
+   return packed;
+}
+
+void
+ilo_state_sample_pattern_get_offset(const struct ilo_state_sample_pattern *pattern,
+                                    const struct ilo_dev *dev,
+                                    uint8_t sample_count, uint8_t sample_index,
+                                    uint8_t *x, uint8_t *y)
+{
+   /* each packed byte holds x in its high nibble and y in its low nibble */
+   const uint8_t *packed =
+      ilo_state_sample_pattern_get_packed_offsets(pattern, dev, sample_count);
+
+   assert(sample_index < sample_count);
+   *x = (packed[sample_index] >> 4) & 0xf;
+   *y = packed[sample_index] & 0xf;
+}
+
+/**
+ * Set up the line stipple state from info.  No need to initialize stipple
+ * first.
+ */
+bool
+ilo_state_line_stipple_set_info(struct ilo_state_line_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_line_stipple_info *info)
+{
+   const bool ret =
+      line_stipple_set_gen6_3DSTATE_LINE_STIPPLE(stipple, dev, info);
+
+   /* a failure trips the assert in debug builds and is returned otherwise */
+   assert(ret);
+
+   return ret;
+}
+
+/**
+ * Set up the polygon stipple state from info.  No need to initialize
+ * stipple first.
+ */
+bool
+ilo_state_poly_stipple_set_info(struct ilo_state_poly_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_poly_stipple_info *info)
+{
+   const bool ret =
+      poly_stipple_set_gen6_3DSTATE_POLY_STIPPLE_PATTERN(stipple, dev, info);
+
+   /* a failure trips the assert in debug builds and is returned otherwise */
+   assert(ret);
+
+   return ret;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_raster.h b/src/gallium/drivers/ilo/core/ilo_state_raster.h
new file mode 100644
index 0000000..fc90b49
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_raster.h
@@ -0,0 +1,301 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_RASTER_H
+#define ILO_STATE_RASTER_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+enum ilo_state_raster_dirty_bits { /* OR-ed into ilo_state_raster_delta::dirty */
+   ILO_STATE_RASTER_3DSTATE_CLIP                   = (1 << 0),
+   ILO_STATE_RASTER_3DSTATE_SF                     = (1 << 1),
+   ILO_STATE_RASTER_3DSTATE_RASTER                 = (1 << 2), /* only reported on Gen8+ */
+   ILO_STATE_RASTER_3DSTATE_MULTISAMPLE            = (1 << 3),
+   ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK            = (1 << 4),
+   ILO_STATE_RASTER_3DSTATE_WM                     = (1 << 5),
+   ILO_STATE_RASTER_3DSTATE_WM_HZ_OP               = (1 << 6), /* only reported on Gen8+ */
+   ILO_STATE_RASTER_3DSTATE_AA_LINE_PARAMETERS     = (1 << 7), /* set by full_delta() only */
+};
+
+enum ilo_state_raster_earlyz_op { /* for ilo_state_raster_scan_info::earlyz_op */
+   ILO_STATE_RASTER_EARLYZ_NORMAL, /* value 0, so a zeroed info selects it */
+   ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR,
+   ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE,
+   ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE,
+};
+
+/**
+ * VUE readback, VertexClipTest, ClipDetermination, and primitive output.
+ */
+struct ilo_state_raster_clip_info { /* ilo_state_raster_info::clip */
+   bool clip_enable;
+   /* statistics counters CL_INVOCATION_COUNT and CL_PRIMITIVES_COUNT */
+   bool stats_enable;
+
+   uint8_t viewport_count; /* ilo_state_raster_init_for_rectlist() uses 1 */
+   bool force_rtaindex_zero;
+
+   /* these should be mutually exclusive */
+   uint8_t user_cull_enables; /* NOTE(review): presumably one bit per plane */
+   uint8_t user_clip_enables; /* NOTE(review): presumably one bit per plane */
+
+   bool gb_test_enable; /* NOTE(review): "gb" is presumably guardband -- confirm */
+   bool xy_test_enable;
+
+   /* far/near must be enabled together prior to Gen9 */
+   bool z_far_enable;
+   bool z_near_enable;
+   bool z_near_zero; /* NOTE(review): presumably a near plane at z = 0 -- confirm */
+};
+
+/**
+ * Primitive assembly, viewport transformation, scissoring, MSAA, etc.
+ */
+struct ilo_state_raster_setup_info { /* ilo_state_raster_info::setup */
+   bool cv_is_rectangle; /* set by ilo_state_raster_init_for_rectlist() */
+
+   bool first_vertex_provoking;
+   bool viewport_transform;
+
+   bool scissor_enable;
+
+   /* MSAA enables for lines and non-lines */
+   bool msaa_enable; /* the rectlist init sets it when sample_count > 1 */
+   bool line_msaa_enable;
+};
+
+/**
+ * 3DOBJ_POINT rasterization rules.
+ */
+struct ilo_state_raster_point_info { /* ilo_state_raster_info::point */
+   /* ignored when msaa_enable (presumably setup.msaa_enable) is set */
+   bool aa_enable;
+
+   bool programmable_width; /* NOTE(review): presumably width from the shader */
+};
+
+/**
+ * 3DOBJ_LINE rasterization rules (also the type of the "effective line").
+ */
+struct ilo_state_raster_line_info { /* ilo_state_raster_info::line */
+   /* ignored when line_msaa_enable is set */
+   bool aa_enable;
+
+   /* ignored when line_msaa_enable or aa_enable is set */
+   bool stipple_enable;
+   bool giq_enable;
+   bool giq_last_pixel;
+};
+
+/**
+ * 3DOBJ_TRIANGLE rasterization rules.
+ */
+struct ilo_state_raster_tri_info { /* ilo_state_raster_info::tri */
+   enum gen_front_winding front_winding;
+   enum gen_cull_mode cull_mode;
+   enum gen_fill_mode fill_mode_front;
+   enum gen_fill_mode fill_mode_back;
+
+   enum gen_depth_format depth_offset_format; /* NOTE(review): presumably the depth buffer format */
+   bool depth_offset_solid;     /* NOTE(review): the three enables presumably map */
+   bool depth_offset_wireframe; /* to the solid, wireframe and point fill modes; */
+   bool depth_offset_point;     /* confirm against the SF/RASTER setters */
+
+   bool poly_stipple_enable;
+};
+
+/**
+ * Scan conversion.
+ */
+struct ilo_state_raster_scan_info { /* ilo_state_raster_info::scan */
+   /* statistics counters PS_DEPTH_COUNT and PS_INVOCATION_COUNT */
+   bool stats_enable;
+
+   uint8_t sample_count; /* sample patterns exist for 1, 2, 4, 8 and 16 */
+
+   /* pixel location for non-MSAA or 1x-MSAA */
+   enum gen_pixel_location pixloc;
+
+   uint32_t sample_mask; /* the rectlist init passes ~0u */
+
+   /* interpolations */
+   enum gen_zw_interp zw_interp;
+   uint8_t barycentric_interps; /* NOTE(review): presumably a bitmask -- confirm */
+
+   /* Gen7+ only */
+   enum gen_edsc_mode earlyz_control;
+   enum ilo_state_raster_earlyz_op earlyz_op;
+   bool earlyz_stencil_clear;
+};
+
+/**
+ * Raster parameters; ilo_state_raster_set_params() patches them into rs.
+ */
+struct ilo_state_raster_params_info {
+   bool any_integer_rt; /* NOTE(review): not read directly by set_params(); */
+   bool hiz_enable;     /* presumably used by the line AA check -- confirm */
+
+   float point_width; /* applied only when SF has USE_POINT_WIDTH set */
+   float line_width;  /* always applied */
+
+   /* const term will be scaled by 'r' */
+   float depth_offset_const; /* stored in rs->raster[1] through fui() */
+   float depth_offset_scale; /* stored in rs->raster[2] through fui() */
+   float depth_offset_clamp; /* stored in rs->raster[3] through fui() */
+};
+
+struct ilo_state_raster_info { /* input of ilo_state_raster_init()/set_info() */
+   struct ilo_state_raster_clip_info clip;
+   struct ilo_state_raster_setup_info setup;
+   struct ilo_state_raster_point_info point;
+   struct ilo_state_raster_line_info line;
+   struct ilo_state_raster_tri_info tri;
+   struct ilo_state_raster_scan_info scan;
+
+   struct ilo_state_raster_params_info params; /* same type set_params() takes */
+};
+
+struct ilo_state_raster { /* get_delta() compares these arrays as a whole */
+   uint32_t clip[3];   /* reported as 3DSTATE_CLIP */
+   uint32_t sf[3];     /* [1]: line width, pre-Gen8 AA line bit; [2]: point width */
+   uint32_t raster[4]; /* [0]: Gen8+ AA line bit; [1..3]: depth offset terms */
+   uint32_t sample[2]; /* reported as 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */
+   uint32_t wm[3];     /* reported as 3DSTATE_WM, plus 3DSTATE_WM_HZ_OP on Gen8+ */
+
+   bool line_aa_enable;  /* read back by ilo_state_raster_set_params() */
+   bool line_giq_enable; /* read back by ilo_state_raster_set_params() */
+};
+
+struct ilo_state_raster_delta {
+   uint32_t dirty; /* OR of enum ilo_state_raster_dirty_bits */
+};
+
+struct ilo_state_sample_pattern_offset_info {
+   /* in U0.4, i.e. a 4-bit value in units of 1/16 */
+   uint8_t x;
+   uint8_t y;
+};
+
+struct ilo_state_sample_pattern_info { /* one offset per sample, per sample count */
+   struct ilo_state_sample_pattern_offset_info pattern_1x[1];
+   struct ilo_state_sample_pattern_offset_info pattern_2x[2];
+   struct ilo_state_sample_pattern_offset_info pattern_4x[4];
+   struct ilo_state_sample_pattern_offset_info pattern_8x[8];
+   struct ilo_state_sample_pattern_offset_info pattern_16x[16];
+};
+
+struct ilo_state_sample_pattern { /* one byte per sample: x in bits 7:4, y in 3:0 */
+   uint8_t pattern_1x[1];
+   uint8_t pattern_2x[2];
+   uint8_t pattern_4x[4];
+   uint8_t pattern_8x[8];
+   uint8_t pattern_16x[16];
+};
+
+struct ilo_state_line_stipple_info {
+   uint16_t pattern;
+   uint16_t repeat_count; /* must be in [1, 256]; asserted when it is set */
+};
+
+struct ilo_state_line_stipple {
+   uint32_t stipple[2]; /* [0]: pattern; [1]: repeat count and its inverse */
+};
+
+struct ilo_state_poly_stipple_info {
+   uint32_t pattern[32]; /* copied unchanged into ilo_state_poly_stipple */
+};
+
+struct ilo_state_poly_stipple {
+   uint32_t stipple[32]; /* copy of ilo_state_poly_stipple_info::pattern */
+};
+
+bool /* rs must be zeroed (asserted); then same as ilo_state_raster_set_info() */
+ilo_state_raster_init(struct ilo_state_raster *rs,
+                      const struct ilo_dev *dev,
+                      const struct ilo_state_raster_info *info);
+
+bool /* init with one viewport, rectangle primitives and a full sample mask */
+ilo_state_raster_init_for_rectlist(struct ilo_state_raster *rs,
+                                   const struct ilo_dev *dev,
+                                   uint8_t sample_count,
+                                   enum ilo_state_raster_earlyz_op earlyz_op,
+                                   bool earlyz_stencil_clear);
+
+bool /* rebuilds all of rs from info; a failure also trips an assert */
+ilo_state_raster_set_info(struct ilo_state_raster *rs,
+                          const struct ilo_dev *dev,
+                          const struct ilo_state_raster_info *info);
+
+bool /* patches widths, AA line enable and depth offset; always returns true */
+ilo_state_raster_set_params(struct ilo_state_raster *rs,
+                            const struct ilo_dev *dev,
+                            const struct ilo_state_raster_params_info *params);
+
+void /* marks every command dirty without looking at rs */
+ilo_state_raster_full_delta(const struct ilo_state_raster *rs,
+                            const struct ilo_dev *dev,
+                            struct ilo_state_raster_delta *delta);
+
+void /* marks the commands whose dwords differ between rs and old */
+ilo_state_raster_get_delta(const struct ilo_state_raster *rs,
+                           const struct ilo_dev *dev,
+                           const struct ilo_state_raster *old,
+                           struct ilo_state_raster_delta *delta);
+
+bool /* a failure also trips an assert */
+ilo_state_sample_pattern_init(struct ilo_state_sample_pattern *pattern,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sample_pattern_info *info);
+
+bool /* ilo_state_sample_pattern_init() with a built-in table */
+ilo_state_sample_pattern_init_default(struct ilo_state_sample_pattern *pattern,
+                                      const struct ilo_dev *dev);
+
+const uint8_t * /* NULL (after an assert) unless sample_count is 1, 2, 4, 8 or 16 */
+ilo_state_sample_pattern_get_packed_offsets(const struct ilo_state_sample_pattern *pattern,
+                                            const struct ilo_dev *dev,
+                                            uint8_t sample_count);
+
+void /* unpacks one sample's 4-bit x and y; sample_index < sample_count */
+ilo_state_sample_pattern_get_offset(const struct ilo_state_sample_pattern *pattern,
+                                    const struct ilo_dev *dev,
+                                    uint8_t sample_count, uint8_t sample_index,
+                                    uint8_t *x, uint8_t *y);
+bool /* stipple needs no prior initialization */
+ilo_state_line_stipple_set_info(struct ilo_state_line_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_line_stipple_info *info);
+
+bool /* stipple needs no prior initialization */
+ilo_state_poly_stipple_set_info(struct ilo_state_poly_stipple *stipple,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_poly_stipple_info *info);
+
+#endif /* ILO_STATE_RASTER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sampler.c b/src/gallium/drivers/ilo/core/ilo_state_sampler.c
new file mode 100644
index 0000000..3787f68
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sampler.c
@@ -0,0 +1,742 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "util/u_half.h"
+
+#include "ilo_debug.h"
+#include "ilo_state_surface.h"
+#include "ilo_state_sampler.h"
+
+static bool
+sampler_validate_gen6_non_normalized(const struct ilo_dev *dev,
+                                     const struct ilo_state_sampler_info *info)
+{
+   const enum gen_texcoord_mode wrap_modes[3] = {
+      info->tcx_ctrl, info->tcy_ctrl, info->tcz_ctrl,
+   };
+   int axis;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 98:
+    *
+    *     "The following state must be set as indicated if this field
+    *      (Non-normalized Coordinate Enable) is enabled:
+    *
+    *      - TCX/Y/Z Address Control Mode must be TEXCOORDMODE_CLAMP,
+    *        TEXCOORDMODE_HALF_BORDER, or TEXCOORDMODE_CLAMP_BORDER.
+    *      - Surface Type must be SURFTYPE_2D or SURFTYPE_3D.
+    *      - Mag Mode Filter must be MAPFILTER_NEAREST or
+    *        MAPFILTER_LINEAR.
+    *      - Min Mode Filter must be MAPFILTER_NEAREST or
+    *        MAPFILTER_LINEAR.
+    *      - Mip Mode Filter must be MIPFILTER_NONE.
+    *      - Min LOD must be 0.
+    *      - Max LOD must be 0.
+    *      - MIP Count must be 0.
+    *      - Surface Min LOD must be 0.
+    *      - Texture LOD Bias must be 0."
+    */
+   /*
+    * Only the sampler-side requirements are checked below; the surface type,
+    * MIP count and surface min LOD are not part of the sampler info.
+    */
+   for (axis = 0; axis < 3; axis++) {
+      if (wrap_modes[axis] != GEN6_TEXCOORDMODE_CLAMP &&
+          wrap_modes[axis] != GEN6_TEXCOORDMODE_CLAMP_BORDER &&
+          wrap_modes[axis] != GEN8_TEXCOORDMODE_HALF_BORDER)
+         assert(!"bad non-normalized coordinate wrap mode");
+   }
+
+   assert(info->mip_filter == GEN6_MIPFILTER_NONE);
+
+   assert((info->min_filter == GEN6_MAPFILTER_NEAREST ||
+           info->min_filter == GEN6_MAPFILTER_LINEAR) &&
+          (info->mag_filter == GEN6_MAPFILTER_NEAREST ||
+           info->mag_filter == GEN6_MAPFILTER_LINEAR));
+
+   assert(info->min_lod == 0.0f &&
+          info->max_lod == 0.0f &&
+          info->lod_bias == 0.0f);
+
+   /* violations only trip the asserts above; the result is always true */
+   return true;
+}
+
+static bool
+sampler_validate_gen6_sampler(const struct ilo_dev *dev,
+                              const struct ilo_state_sampler_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* non-normalized coordinates bring restrictions of their own */
+   if (info->non_normalized &&
+       !sampler_validate_gen6_non_normalized(dev, info))
+      return false;
+
+   /* HALF_BORDER is a GEN8_ mode and must not show up on anything older */
+   if (ilo_dev_gen(dev) < ILO_GEN(8))
+      assert(info->tcx_ctrl != GEN8_TEXCOORDMODE_HALF_BORDER &&
+             info->tcy_ctrl != GEN8_TEXCOORDMODE_HALF_BORDER &&
+             info->tcz_ctrl != GEN8_TEXCOORDMODE_HALF_BORDER);
+   return true;
+}
+
+static uint32_t
+sampler_get_gen6_integer_filters(const struct ilo_dev *dev,
+                                 const struct ilo_state_sampler_info *info)
+{
+   enum gen_mip_filter mip = info->mip_filter;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 103:
+    *
+    *     "MIPFILTER_LINEAR is not supported for surface formats that do not
+    *      support "Sampling Engine Filtering" as indicated in the Surface
+    *      Formats table unless using the sample_c message type."
+    *
+    *     "Only MAPFILTER_NEAREST is supported for surface formats that do not
+    *      support "Sampling Engine Filtering" as indicated in the Surface
+    *      Formats table unless using the sample_c message type."
+    */
+   if (mip == GEN6_MIPFILTER_LINEAR)
+      mip = GEN6_MIPFILTER_NEAREST;
+
+   /* min and mag are forced to NEAREST whatever info asks for */
+   return mip << GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT |
+          GEN6_MAPFILTER_NEAREST << GEN6_SAMPLER_DW0_MAG_FILTER__SHIFT |
+          GEN6_MAPFILTER_NEAREST << GEN6_SAMPLER_DW0_MIN_FILTER__SHIFT;
+}
+
+static uint32_t
+sampler_get_gen6_3d_filters(const struct ilo_dev *dev,
+                            const struct ilo_state_sampler_info *info)
+{
+   enum gen_map_filter min = info->min_filter;
+   enum gen_map_filter mag = info->mag_filter;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 103:
+    *
+    *     "Only MAPFILTER_NEAREST and MAPFILTER_LINEAR are supported for
+    *      surfaces of type SURFTYPE_3D."
+    *
+    * Any other min/mag filter is replaced by MAPFILTER_LINEAR.
+    */
+   if (min != GEN6_MAPFILTER_NEAREST && min != GEN6_MAPFILTER_LINEAR)
+      min = GEN6_MAPFILTER_LINEAR;
+   if (mag != GEN6_MAPFILTER_NEAREST && mag != GEN6_MAPFILTER_LINEAR)
+      mag = GEN6_MAPFILTER_LINEAR;
+
+   return info->mip_filter << GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT |
+          mag << GEN6_SAMPLER_DW0_MAG_FILTER__SHIFT |
+          min << GEN6_SAMPLER_DW0_MIN_FILTER__SHIFT;
+}
+
+static uint32_t
+get_gen6_addr_controls(const struct ilo_dev *dev,
+                       enum gen_texcoord_mode tcx_ctrl,
+                       enum gen_texcoord_mode tcy_ctrl,
+                       enum gen_texcoord_mode tcz_ctrl)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* the wrap fields use the DW1 shifts before Gen7 and the DW3 ones after */
+   if (ilo_dev_gen(dev) < ILO_GEN(7))
+      return tcx_ctrl << GEN6_SAMPLER_DW1_U_WRAP__SHIFT |
+             tcy_ctrl << GEN6_SAMPLER_DW1_V_WRAP__SHIFT |
+             tcz_ctrl << GEN6_SAMPLER_DW1_R_WRAP__SHIFT;
+
+   return tcx_ctrl << GEN7_SAMPLER_DW3_U_WRAP__SHIFT |
+          tcy_ctrl << GEN7_SAMPLER_DW3_V_WRAP__SHIFT |
+          tcz_ctrl << GEN7_SAMPLER_DW3_R_WRAP__SHIFT;
+}
+
+static uint32_t
+sampler_get_gen6_1d_addr_controls(const struct ilo_dev *dev,
+                                  const struct ilo_state_sampler_info *info)
+{
+   enum gen_texcoord_mode tcx = info->tcx_ctrl;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* TEXCOORDMODE_CUBE is replaced by CLAMP; Y and Z always use CLAMP */
+   if (tcx == GEN6_TEXCOORDMODE_CUBE)
+      tcx = GEN6_TEXCOORDMODE_CLAMP;
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 100:
+    *
+    *     "If this field (TCY Address Control Mode) is set to
+    *      TEXCOORDMODE_CLAMP_BORDER or TEXCOORDMODE_HALF_BORDER and a 1D
+    *      surface is sampled, incorrect blending with the border color in the
+    *      vertical direction may occur."
+    */
+   return get_gen6_addr_controls(dev, tcx,
+         GEN6_TEXCOORDMODE_CLAMP, GEN6_TEXCOORDMODE_CLAMP);
+}
+
+static uint32_t
+sampler_get_gen6_2d_3d_addr_controls(const struct ilo_dev *dev,
+                                     const struct ilo_state_sampler_info *info)
+{
+   enum gen_texcoord_mode tc[3] = { info->tcx_ctrl, info->tcy_ctrl,
+                                    info->tcz_ctrl };
+   int i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* TEXCOORDMODE_CUBE is replaced by CLAMP on every axis */
+   for (i = 0; i < 3; i++) {
+      if (tc[i] == GEN6_TEXCOORDMODE_CUBE)
+         tc[i] = GEN6_TEXCOORDMODE_CLAMP;
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 108:
+    *
+    *     "[DevSNB]: if this field (TCZ Address Control Mode) is set to
+    *      TEXCOORDMODE_CLAMP_BORDER samples outside the map will clamp to 0
+    *      instead of boarder color"
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 100:
+    *
+    *     "If this field is set to TEXCOORDMODE_CLAMP_BORDER for 3D maps on
+    *      formats without an alpha channel, samples straddling the map in the
+    *      Z direction may have their alpha channels off by 1."
+    *
+    * Do we want to do something here?
+    */
+   return get_gen6_addr_controls(dev, tc[0], tc[1], tc[2]);
+}
+
+static uint32_t
+sampler_get_gen6_cube_addr_controls(const struct ilo_dev *dev,
+                                    const struct ilo_state_sampler_info *info)
+{
+   enum gen_texcoord_mode mode = info->tcx_ctrl;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 99:
+    *
+    *     "When using cube map texture coordinates, only TEXCOORDMODE_CLAMP
+    *      and TEXCOORDMODE_CUBE settings are valid, and each TC component
+    *      must have the same Address Control mode.
+    *
+    *      When TEXCOORDMODE_CUBE is not used accessing a cube map, the map's
+    *      Cube Face Enable field must be programmed to 111111b (all faces
+    *      enabled)."
+    *
+    * From the Haswell PRM, volume 2d, page 278:
+    *
+    *     "When using cube map texture coordinates, each TC component must
+    *      have the same Address Control Mode.
+    *
+    *      When TEXCOORDMODE_CUBE is not used accessing a cube map, the map's
+    *      Cube Face Enable field must be programmed to 111111b (all faces
+    *      enabled)."
+    *
+    * We always enable all cube faces and only need to make sure all address
+    * control modes are the same.
+    */
+   if (ilo_dev_gen(dev) < ILO_GEN(7.5) &&
+       mode != GEN6_TEXCOORDMODE_CUBE &&
+       mode != GEN6_TEXCOORDMODE_CLAMP)
+      mode = GEN6_TEXCOORDMODE_CLAMP;
+
+   /* tcx decides for all three components; info's tcy/tcz are not used */
+   return get_gen6_addr_controls(dev, mode, mode, mode);
+}
+
+static uint16_t
+get_gen6_lod_bias(const struct ilo_dev *dev, float bias)
+{
+   /* [-16.0, 16.0) in S4.6 before Gen7 or S4.8 on Gen7+ */
+   const int frac_bits = (ilo_dev_gen(dev) < ILO_GEN(7)) ? 6 : 8;
+   const int one = 1 << frac_bits;
+   const int field_mask = (one << 5) - 1; /* sign + 4 integer + fraction bits */
+   const int fixed_max = 16 * one - 1;
+   int fixed;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* clamp in float before converting to fixed point */
+   if (bias < -16.0f)
+      bias = -16.0f;
+   else if (bias > 16.0f)
+      bias = 16.0f;
+
+   /* +16.0 itself has no encoding; cap at the largest positive code */
+   fixed = (int) (bias * (float) one);
+   if (fixed > fixed_max)
+      fixed = fixed_max;
+   return (fixed & field_mask);
+}
+
+static uint16_t
+get_gen6_lod_clamp(const struct ilo_dev *dev, float clamp)
+{
+   /* [0.0, 13.0] in U4.6 before Gen7, [0.0, 14.0] in U4.8 on Gen7+ */
+   const bool gen7 = (ilo_dev_gen(dev) >= ILO_GEN(7));
+   const float upper = (gen7) ? 14.0f : 13.0f;
+   const float one = (float) (1 << ((gen7) ? 8 : 6));
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (clamp < 0.0f)
+      clamp = 0.0f;
+   else if (clamp > upper)
+      clamp = upper;
+
+   return (int) (clamp * one);
+}
+
+static bool
+sampler_set_gen6_SAMPLER_STATE(struct ilo_state_sampler *sampler,
+                               const struct ilo_dev *dev,
+                               const struct ilo_state_sampler_info *info)
+{
+   uint16_t lod_bias, max_lod, min_lod;   /* fixed-point encodings of info's LODs */
+   uint32_t dw0, dw1, dw3;   /* stored to sampler->sampler[0], [1] and [2] */
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!sampler_validate_gen6_sampler(dev, info))
+      return false;
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 15:
+    *
+    *     "The per-pixel LOD is computed in an implementation-dependent manner
+    *      and approximates the log2 of the texel/pixel ratio at the given
+    *      pixel. The computation is typically based on the differential
+    *      texel-space distances associated with a one-pixel differential
+    *      distance along the screen x- and y-axes. These texel-space
+    *      distances are computed by evaluating neighboring pixel texture
+    *      coordinates, these coordinates being in units of texels on the base
+    *      MIP level (multiplied by the corresponding surface size in
+    *      texels)."
+    *
+    * Judging from the LOD computation pseudocode on page 16-18, the "base MIP
+    * level" should be given by SurfMinLod.  To summarize, for the "sample"
+    * message,
+    *
+    *   1) LOD is set to log2(texel/pixel ratio).  The number of texels is
+    *      measured against level SurfMinLod.
+    *   2) Bias is added to LOD.
+    *   3) if pre-clamp is enabled, LOD is clamped to [MinLod, MaxLod] first
+    *   4) LOD is compared with Base to determine whether magnification or
+    *      minification is needed.
+    *   5) If magnification is needed, or no mipmapping is requested, LOD is
+    *      set to floor(MinLod).
+    *   6) LOD is clamped to [0, MIPCnt], and SurfMinLod is added to LOD.
+    *
+    * As an example, we could set SurfMinLod to GL_TEXTURE_BASE_LEVEL and Base
+    * to 0 to match GL.  But GL expects LOD to be set to 0, instead of
+    * floor(MinLod), in 5).  Since this is only an issue when MinLod is
+    * greater than or equal to one, and, with Base being 0, a non-zero MinLod
+    * implies minification, we only need to deal with the case when mipmapping
+    * is disabled.  We can thus do (not done here; presumably by the caller):
+    *
+    *   if (MipFilter == MIPFILTER_NONE && MinLod) {
+    *     MinLod = 0;
+    *     MagFilter = MinFilter;
+    *   }
+    */
+
+   lod_bias = get_gen6_lod_bias(dev, info->lod_bias);
+   min_lod = get_gen6_lod_clamp(dev, info->min_lod);
+   max_lod = get_gen6_lod_clamp(dev, info->max_lod);
+
+   dw0 = GEN6_SAMPLER_DW0_LOD_PRECLAMP_ENABLE |
+         0 << GEN6_SAMPLER_DW0_BASE_LOD__SHIFT |
+         info->mip_filter << GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT |
+         info->mag_filter << GEN6_SAMPLER_DW0_MAG_FILTER__SHIFT |
+         info->min_filter << GEN6_SAMPLER_DW0_MIN_FILTER__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      dw0 |= GEN7_SAMPLER_DW0_BORDER_COLOR_MODE_DX10_OGL |
+             lod_bias << GEN7_SAMPLER_DW0_LOD_BIAS__SHIFT;
+
+      if (info->min_filter == GEN6_MAPFILTER_ANISOTROPIC ||
+          info->mag_filter == GEN6_MAPFILTER_ANISOTROPIC)
+         dw0 |= GEN7_SAMPLER_DW0_ANISO_ALGO_EWA;
+   } else {
+      dw0 |= lod_bias << GEN6_SAMPLER_DW0_LOD_BIAS__SHIFT |
+             info->shadow_func << GEN6_SAMPLER_DW0_SHADOW_FUNC__SHIFT;
+
+      /*
+       * From the Sandy Bridge PRM, volume 4 part 1, page 102:
+       *
+       *     "(Min and Mag State Not Equal) Must be set to 1 if any of the
+       *      following are true:
+       *
+       *      - Mag Mode Filter and Min Mode Filter are not the same
+       *      - Address Rounding Enable: U address mag filter and U address
+       *        min filter are not the same
+       *      - Address Rounding Enable: V address mag filter and V address
+       *        min filter are not the same
+       *      - Address Rounding Enable: R address mag filter and R address
+       *        min filter are not the same"
+       *
+       * We set address rounding for U, V, and R uniformly.  Only need to
+       * check the filters.
+       */
+      if (info->min_filter != info->mag_filter)
+         dw0 |= GEN6_SAMPLER_DW0_MIN_MAG_NOT_EQUAL;
+   }
+
+   dw1 = 0;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      /*
+       * From the Ivy Bridge PRM, volume 4 part 1, page 96:
+       *
+       *     "This field (Cube Surface Control Mode) must be set to
+       *      CUBECTRLMODE_PROGRAMMED"
+       */
+      dw1 |= min_lod << GEN7_SAMPLER_DW1_MIN_LOD__SHIFT |
+             max_lod << GEN7_SAMPLER_DW1_MAX_LOD__SHIFT |
+             info->shadow_func << GEN7_SAMPLER_DW1_SHADOW_FUNC__SHIFT |
+             GEN7_SAMPLER_DW1_CUBECTRLMODE_PROGRAMMED;
+   } else {
+      dw1 |= min_lod << GEN6_SAMPLER_DW1_MIN_LOD__SHIFT |
+             max_lod << GEN6_SAMPLER_DW1_MAX_LOD__SHIFT |
+             GEN6_SAMPLER_DW1_CUBECTRLMODE_PROGRAMMED |
+             info->tcx_ctrl << GEN6_SAMPLER_DW1_U_WRAP__SHIFT |
+             info->tcy_ctrl << GEN6_SAMPLER_DW1_V_WRAP__SHIFT |
+             info->tcz_ctrl << GEN6_SAMPLER_DW1_R_WRAP__SHIFT;
+   }
+
+   dw3 = info->max_anisotropy << GEN6_SAMPLER_DW3_MAX_ANISO__SHIFT;
+
+   /* round the coordinates for any filter other than NEAREST (e.g. linear) */
+   if (info->min_filter != GEN6_MAPFILTER_NEAREST) {
+      dw3 |= GEN6_SAMPLER_DW3_U_MIN_ROUND |
+             GEN6_SAMPLER_DW3_V_MIN_ROUND |
+             GEN6_SAMPLER_DW3_R_MIN_ROUND;
+   }
+   if (info->mag_filter != GEN6_MAPFILTER_NEAREST) {
+      dw3 |= GEN6_SAMPLER_DW3_U_MAG_ROUND |
+             GEN6_SAMPLER_DW3_V_MAG_ROUND |
+             GEN6_SAMPLER_DW3_R_MAG_ROUND;
+   }
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      dw3 |= GEN7_SAMPLER_DW3_TRIQUAL_FULL |
+             info->tcx_ctrl << GEN7_SAMPLER_DW3_U_WRAP__SHIFT |
+             info->tcy_ctrl << GEN7_SAMPLER_DW3_V_WRAP__SHIFT |
+             info->tcz_ctrl << GEN7_SAMPLER_DW3_R_WRAP__SHIFT;
+
+      if (info->non_normalized)
+         dw3 |= GEN7_SAMPLER_DW3_NON_NORMALIZED_COORD;
+   } else {
+      if (info->non_normalized)
+         dw3 |= GEN6_SAMPLER_DW3_NON_NORMALIZED_COORD;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(sampler->sampler) >= 3);
+   sampler->sampler[0] = dw0;
+   sampler->sampler[1] = dw1;
+   sampler->sampler[2] = dw3;   /* slot 2 holds DW3; no DW2 is built here */
+
+   sampler->filter_integer = sampler_get_gen6_integer_filters(dev, info);
+   sampler->filter_3d = sampler_get_gen6_3d_filters(dev, info);
+   sampler->addr_ctrl_1d = sampler_get_gen6_1d_addr_controls(dev, info);
+   sampler->addr_ctrl_2d_3d = sampler_get_gen6_2d_3d_addr_controls(dev, info);
+   sampler->addr_ctrl_cube = sampler_get_gen6_cube_addr_controls(dev, info);
+
+   sampler->non_normalized = info->non_normalized;   /* checked by ilo_state_sampler_set_surface() */
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 21:
+    *
+    *     "[DevSNB] Errata: Incorrect behavior is observed in cases where the
+    *      min and mag mode filters are different and SurfMinLOD is nonzero.
+    *      The determination of MagMode uses the following equation instead of
+    *      the one in the above pseudocode:
+    *
+    *      MagMode = (LOD + SurfMinLOD - Base <= 0)"
+    *
+    * As a way to work around that, request Base to be set to SurfMinLod.
+    */
+   if (ilo_dev_gen(dev) == ILO_GEN(6) &&
+       info->min_filter != info->mag_filter)
+      sampler->base_to_surf_min_lod = true;
+
+   return true;
+}
+
+static bool
+sampler_border_set_gen6_SAMPLER_BORDER_COLOR_STATE(struct ilo_state_sampler_border *border,
+                                                   const struct ilo_dev *dev,
+                                                   const struct ilo_state_sampler_border_info *info)
+{
+   uint32_t dw[12];   /* one encoding of the color per format; copied to border->color */
+   float rgba[4];     /* working copy of info->rgba.f, clamped progressively below */
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 117:
+    *
+    *     "For ([DevSNB]), if border color is used, all formats must be
+    *      provided.  Hardware will choose the appropriate format based on
+    *      Surface Format and Texture Border Color Mode. The values
+    *      represented by each format should be the same (other than being
+    *      subject to range-based clamping and precision) to avoid unexpected
+    *      behavior."
+    *
+    * XXX We do not honor info->is_integer yet.
+    */
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /* make a copy so that we can clamp for SNORM and UNORM */
+   memcpy(rgba, info->rgba.f, sizeof(rgba));
+
+   /* IEEE_FP */
+   dw[1] = fui(rgba[0]);
+   dw[2] = fui(rgba[1]);
+   dw[3] = fui(rgba[2]);
+   dw[4] = fui(rgba[3]);
+
+   /* FLOAT_16; widen before shifting so that bit 31 is never shifted into an int */
+   dw[5] = (uint32_t) util_float_to_half(rgba[0]) |
+           (uint32_t) util_float_to_half(rgba[1]) << 16;
+   dw[6] = (uint32_t) util_float_to_half(rgba[2]) |
+           (uint32_t) util_float_to_half(rgba[3]) << 16;
+
+   /* clamp to [-1.0f, 1.0f] */
+   rgba[0] = CLAMP(rgba[0], -1.0f, 1.0f);
+   rgba[1] = CLAMP(rgba[1], -1.0f, 1.0f);
+   rgba[2] = CLAMP(rgba[2], -1.0f, 1.0f);
+   rgba[3] = CLAMP(rgba[3], -1.0f, 1.0f);
+
+   /* SNORM16; narrow to unsigned lanes so a negative value cannot sign-extend into its neighbour */
+   dw[9] =  (uint32_t) (uint16_t) util_iround(rgba[0] * 32767.0f) |
+            (uint32_t) (uint16_t) util_iround(rgba[1] * 32767.0f) << 16;
+   dw[10] = (uint32_t) (uint16_t) util_iround(rgba[2] * 32767.0f) |
+            (uint32_t) (uint16_t) util_iround(rgba[3] * 32767.0f) << 16;
+
+   /* SNORM8; same narrowing, each lane keeps only its two's complement byte */
+   dw[11] = (uint32_t) (uint8_t) util_iround(rgba[0] * 127.0f) |
+            (uint32_t) (uint8_t) util_iround(rgba[1] * 127.0f) << 8 |
+            (uint32_t) (uint8_t) util_iround(rgba[2] * 127.0f) << 16 |
+            (uint32_t) (uint8_t) util_iround(rgba[3] * 127.0f) << 24;
+
+   /* clamp to [0.0f, 1.0f] */
+   rgba[0] = CLAMP(rgba[0], 0.0f, 1.0f);
+   rgba[1] = CLAMP(rgba[1], 0.0f, 1.0f);
+   rgba[2] = CLAMP(rgba[2], 0.0f, 1.0f);
+   rgba[3] = CLAMP(rgba[3], 0.0f, 1.0f);
+
+   /* UNORM8 */
+   dw[0] = (uint32_t) (uint8_t) util_iround(rgba[0] * 255.0f) |
+           (uint32_t) (uint8_t) util_iround(rgba[1] * 255.0f) << 8 |
+           (uint32_t) (uint8_t) util_iround(rgba[2] * 255.0f) << 16 |
+           (uint32_t) (uint8_t) util_iround(rgba[3] * 255.0f) << 24;
+
+   /* UNORM16 */
+   dw[7] = (uint32_t) (uint16_t) util_iround(rgba[0] * 65535.0f) |
+           (uint32_t) (uint16_t) util_iround(rgba[1] * 65535.0f) << 16;
+   dw[8] = (uint32_t) (uint16_t) util_iround(rgba[2] * 65535.0f) |
+           (uint32_t) (uint16_t) util_iround(rgba[3] * 65535.0f) << 16;
+
+   STATIC_ASSERT(ARRAY_SIZE(border->color) >= 12);
+   memcpy(border->color, dw, sizeof(dw));
+
+   return true;
+}
+
+static bool
+sampler_border_set_gen7_SAMPLER_BORDER_COLOR_STATE(struct ilo_state_sampler_border *border,
+                                                   const struct ilo_dev *dev,
+                                                   const struct ilo_state_sampler_border_info *info)
+{
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 116:
+    *
+    *     "In DX10/OGL mode, the format of the border color is
+    *      R32G32B32A32_FLOAT, regardless of the surface format chosen."
+    *
+    * From the Haswell PRM, volume 2d, page 240:
+    *
+    *     "So, SW will have to program the table in SAMPLER_BORDER_COLOR_STATE
+    *      at offsets DWORD16 to 19, as per the integer surface format type."
+    *
+    * From the Broadwell PRM, volume 2d, page 297:
+    *
+    *     "DX10/OGL mode: the format of the border color depends on the format
+    *      of the surface being sampled. If the map format is UINT, then the
+    *      border color format is R32G32B32A32_UINT. If the map format is
+    *      SINT, then the border color format is R32G32B32A32_SINT. Otherwise,
+    *      the border color format is R32G32B32A32_FLOAT."
+    *
+    * XXX every Gen is different; info->is_integer is not consulted here
+    */
+
+   STATIC_ASSERT(ARRAY_SIZE(border->color) >= 4);
+   memcpy(border->color, info->rgba.f, sizeof(info->rgba.f));   /* fills color[0..3] only */
+
+   return true;
+}
+
+bool
+ilo_state_sampler_init(struct ilo_state_sampler *sampler,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_sampler_info *info)
+{
+   bool ok;
+
+   /* the setter never clears base_to_surf_min_lod; start from a zeroed object */
+   assert(ilo_is_zeroed(sampler, sizeof(*sampler)));
+
+   ok = sampler_set_gen6_SAMPLER_STATE(sampler, dev, info);
+   /* the setter fails only when \p info does not validate */
+   assert(ok);
+
+   return ok;
+}
+
+bool
+ilo_state_sampler_init_disabled(struct ilo_state_sampler *sampler,
+                                const struct ilo_dev *dev)
+{
+   uint32_t *dw = sampler->sampler;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   assert(ilo_is_zeroed(sampler, sizeof(*sampler)));
+
+   /* only the disable bit is set; the other two stored dwords are zero */
+   dw[0] = GEN6_SAMPLER_DW0_DISABLE;
+   dw[1] = dw[2] = 0;
+   return true;
+}
+
+/**
+ * Specialize \p sampler for sampling from \p surf.  The change is lossy, so
+ * callers that need the original sampler should copy it beforehand.
+ */
+bool
+ilo_state_sampler_set_surface(struct ilo_state_sampler *sampler,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_surface *surf)
+{
+   uint32_t *dw = sampler->sampler;
+   uint32_t wrap;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (sampler->non_normalized) {
+      /* see sampler_validate_gen6_non_normalized() */
+      assert(surf->type == GEN6_SURFTYPE_2D ||
+             surf->type == GEN6_SURFTYPE_3D);
+      assert(!surf->min_lod && !surf->mip_count);
+   }
+
+   /* SNB erratum workaround requested at init time: Base follows SurfMinLod */
+   if (sampler->base_to_surf_min_lod) {
+      const uint8_t base_lod =
+         surf->min_lod << GEN6_SAMPLER_DW0_BASE_LOD__RADIX;
+
+      dw[0] &= ~GEN6_SAMPLER_DW0_BASE_LOD__MASK;
+      dw[0] |= base_lod << GEN6_SAMPLER_DW0_BASE_LOD__SHIFT;
+   }
+
+   /* integer and 3D surfaces use the filter fields precomputed for them */
+   if (surf->is_integer || surf->type == GEN6_SURFTYPE_3D) {
+      const uint32_t filter_mask = (GEN6_SAMPLER_DW0_MIP_FILTER__MASK |
+                                    GEN6_SAMPLER_DW0_MIN_FILTER__MASK |
+                                    GEN6_SAMPLER_DW0_MAG_FILTER__MASK);
+      uint32_t filters = sampler->filter_3d;
+
+      /* the integer variant wins when both conditions hold */
+      if (surf->is_integer)
+         filters = sampler->filter_integer;
+
+      assert((filters & filter_mask) == filters);
+      dw[0] = (dw[0] & ~filter_mask) | filters;
+   }
+
+   /* pick the address controls precomputed for this surface type */
+   if (surf->type == GEN6_SURFTYPE_1D) {
+      wrap = sampler->addr_ctrl_1d;
+   } else if (surf->type == GEN6_SURFTYPE_2D ||
+              surf->type == GEN6_SURFTYPE_3D) {
+      wrap = sampler->addr_ctrl_2d_3d;
+   } else if (surf->type == GEN6_SURFTYPE_CUBE) {
+      wrap = sampler->addr_ctrl_cube;
+   } else {
+      assert(!"unexpected surface type");
+      wrap = 0;
+   }
+
+   /* the wrap fields live in DW3 on Gen7+ and in DW1 on Gen6 */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      const uint32_t wrap_mask = (GEN7_SAMPLER_DW3_U_WRAP__MASK |
+                                  GEN7_SAMPLER_DW3_V_WRAP__MASK |
+                                  GEN7_SAMPLER_DW3_R_WRAP__MASK);
+
+      assert((wrap & wrap_mask) == wrap);
+      dw[2] = (dw[2] & ~wrap_mask) | wrap;
+   } else {
+      const uint32_t wrap_mask = (GEN6_SAMPLER_DW1_U_WRAP__MASK |
+                                  GEN6_SAMPLER_DW1_V_WRAP__MASK |
+                                  GEN6_SAMPLER_DW1_R_WRAP__MASK);
+
+      assert((wrap & wrap_mask) == wrap);
+      dw[1] = (dw[1] & ~wrap_mask) | wrap;
+   }
+
+   return true;
+}
+
+bool
+ilo_state_sampler_border_init(struct ilo_state_sampler_border *border,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sampler_border_info *info)
+{
+   bool ok;
+
+   /*
+    * Gen6 and Gen7+ fill SAMPLER_BORDER_COLOR_STATE differently; each setter
+    * asserts the generations it supports.
+    */
+   ok = (ilo_dev_gen(dev) >= ILO_GEN(7)) ?
+      sampler_border_set_gen7_SAMPLER_BORDER_COLOR_STATE(border, dev, info) :
+      sampler_border_set_gen6_SAMPLER_BORDER_COLOR_STATE(border, dev, info);
+
+   assert(ok);
+
+   return ok;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sampler.h b/src/gallium/drivers/ilo/core/ilo_state_sampler.h
new file mode 100644
index 0000000..75c7620
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sampler.h
@@ -0,0 +1,103 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_SAMPLER_H
+#define ILO_STATE_SAMPLER_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+struct ilo_state_surface;
+
+struct ilo_state_sampler_info {
+   bool non_normalized;   /* sets the NON_NORMALIZED_COORD bit of SAMPLER_STATE */
+
+   float lod_bias;   /* clamped and converted to fixed point when packed */
+   float min_lod;    /* clamped to [0.0, 13.0] on Gen6, [0.0, 14.0] on Gen7+ */
+   float max_lod;    /* clamped the same way as min_lod */
+
+   enum gen_mip_filter mip_filter;
+   enum gen_map_filter min_filter;
+   enum gen_map_filter mag_filter;
+   enum gen_aniso_ratio max_anisotropy;
+
+   enum gen_texcoord_mode tcx_ctrl;   /* packed into the U_WRAP field */
+   enum gen_texcoord_mode tcy_ctrl;   /* packed into the V_WRAP field */
+   enum gen_texcoord_mode tcz_ctrl;   /* packed into the R_WRAP field */
+
+   enum gen_prefilter_op shadow_func;   /* packed into the SHADOW_FUNC field */
+};
+
+struct ilo_state_sampler_border_info {
+   union {
+      float f[4];       /* the member read by the setters in ilo_state_sampler.c */
+      uint32_t ui[4];
+   } rgba;
+
+   bool is_integer;   /* not consulted yet by the setters in ilo_state_sampler.c */
+};
+
+struct ilo_state_sampler {
+   uint32_t sampler[3];   /* SAMPLER_STATE DW0, DW1 and DW3, in this order */
+
+   uint32_t filter_integer;   /* DW0 filter fields swapped in for integer surfaces */
+   uint32_t filter_3d;        /* DW0 filter fields swapped in for 3D surfaces */
+
+   uint32_t addr_ctrl_1d;      /* wrap fields swapped in for 1D surfaces */
+   uint32_t addr_ctrl_2d_3d;   /* wrap fields swapped in for 2D and 3D surfaces */
+   uint32_t addr_ctrl_cube;    /* wrap fields swapped in for cube surfaces */
+
+   bool non_normalized;         /* copied from the info at init time */
+   bool base_to_surf_min_lod;   /* Gen6 erratum: set Base to SurfMinLod per surface */
+};
+
+struct ilo_state_sampler_border {
+   uint32_t color[12];   /* the Gen6 setter fills all 12 dwords; the Gen7+ one only the first 4 */
+};
+
+bool
+ilo_state_sampler_init(struct ilo_state_sampler *sampler,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_sampler_info *info);
+
+bool
+ilo_state_sampler_init_disabled(struct ilo_state_sampler *sampler,
+                                const struct ilo_dev *dev);
+
+bool
+ilo_state_sampler_set_surface(struct ilo_state_sampler *sampler,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_surface *surf);
+
+bool
+ilo_state_sampler_border_init(struct ilo_state_sampler_border *border,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sampler_border_info *info);
+
+#endif /* ILO_STATE_SAMPLER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sbe.c b/src/gallium/drivers/ilo/core/ilo_state_sbe.c
new file mode 100644
index 0000000..5d1d400
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sbe.c
@@ -0,0 +1,350 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_sbe.h"
+
+static bool
+sbe_validate_gen8(const struct ilo_dev *dev,
+                  const struct ilo_state_sbe_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   assert(info->attr_count <= ILO_STATE_SBE_MAX_ATTR_COUNT);
+
+   assert(info->vue_read_base + info->vue_read_count <=
+         info->cv_vue_attr_count);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 248:
+    *
+    *     "(Vertex URB Entry Read Length)
+    *      Format: U5
+    *      Range [1,16]
+    *
+    *      Specifies the amount of URB data read for each Vertex URB entry, in
+    *      256-bit register increments.
+    *
+    *      Programming Notes
+    *      It is UNDEFINED to set this field to 0 indicating no Vertex URB
+    *      data to be read."
+    *
+    *     "(Vertex URB Entry Read Offset)
+    *      Format: U6
+    *      Range [0,63]
+    *
+    *      Specifies the offset (in 256-bit units) at which Vertex URB data is
+    *      to be read from the URB."
+    */
+   assert(info->vue_read_base % 2 == 0 && info->vue_read_base <= 126);
+   assert(info->vue_read_count <= 32);   /* in attributes; halved when programmed */
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 268:
+    *
+    *     "This field (Point Sprite Texture Coordinate Enable) must be
+    *      programmed to 0 when non-point primitives are rendered."
+    */
+   if (ilo_dev_gen(dev) < ILO_GEN(7.5) && info->point_sprite_enables)
+      assert(info->cv_is_point);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 246:
+    *
+    *     "(Number of SF Output Attributes) 33-48: Specifies 17-32 attributes
+    *      (# attributes = field value - 16). Swizzling performed on
+    *      Attributes 16-31 (as required) only. Attributes 0-15 passed through
+    *      unmodified.
+    *
+    *      Note :
+    *
+    *      Attribute n Component Override and Constant Source states apply to
+    *      Attributes 16-31 (as required) instead of Attributes 0-15. E.g.,
+    *      this allows an Attribute 16-31 component to be overridden with the
+    *      PrimitiveID value.
+    *
+    *      Attribute n WrapShortest Enables still apply to Attributes 0-15.
+    *
+    *      Attribute n Swizzle Select and Attribute n Source Attribute states
+    *      are ignored and none of the swizzling functions available through
+    *      these controls are performed."
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 247:
+    *
+    *     "This bit (Attribute Swizzle Enable) controls the use of the
+    *      Attribute n Swizzle Select and Attribute n Source Attribute fields
+    *      only. If ENABLED, those fields are used as described below. If
+    *      DISABLED, attributes are copied from their corresponding source
+    *      attributes, for the purposes of Swizzle Select only.
+    *
+    *      Note that the following fields are unaffected by this bit, and are
+    *      therefore always used to control their respective fields:
+    *      Attribute n Component Override X/Y/Z/W
+    *      Attribute n Constant Source
+    *      Attribute n WrapShortest Enables"
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 264:
+    *
+    *     "When Attribute Swizzle Enable is ENABLED, this bit (Attribute
+    *      Swizzle Control Mode) controls whether attributes 0-15 or 16-31 are
+    *      subject to the following swizzle controls:
+    *
+    *      - Attribute n Component Override X/Y/Z/W
+    *      - Attribute n Constant Source
+    *      - Attribute n Swizzle Select
+    *      - Attribute n Source Attribute
+    *      - Attribute n Wrap Shortest Enables"
+    *
+    *     "SWIZ_16_31... Only valid when 16 or more attributes are output."
+    */
+   assert(info->swizzle_count <= ILO_STATE_SBE_MAX_SWIZZLE_COUNT);
+   if (info->swizzle_16_31) {
+      assert(ilo_dev_gen(dev) >= ILO_GEN(7) &&
+             info->swizzle_enable &&
+             info->attr_count > 16);
+   }
+
+   return true;   /* violations are reported only through the asserts above */
+}
+
+/*
+ * Return how many VUE attributes have to be read so that both the attributes
+ * passed through as-is and the sources named by the swizzles are available.
+ */
+static uint8_t
+sbe_get_gen8_min_read_count(const struct ilo_dev *dev,
+                            const struct ilo_state_sbe_info *info)
+{
+   uint8_t count = 0;
+   uint8_t idx;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* attributes outside the swizzle table are read as they are */
+   if (!info->swizzle_enable || info->swizzle_count < info->attr_count) {
+      const bool upper_16_fully_swizzled = (info->swizzle_16_31 &&
+            info->swizzle_count + 16 == info->attr_count);
+
+      count = (upper_16_fully_swizzled) ? 16 : info->attr_count;
+   }
+
+   /* the swizzle sources matter only when swizzling is enabled */
+   if (!info->swizzle_enable)
+      return count;
+
+   /* every swizzle must be able to reach its source attribute */
+   for (idx = 0; idx < info->swizzle_count; idx++) {
+      const struct ilo_state_sbe_swizzle_info *entry = &info->swizzles[idx];
+      int needed = entry->attr + 1;
+
+      /* the two FACING selects need one attribute more than the others */
+      if (entry->attr_select == GEN6_INPUTATTR_FACING ||
+          entry->attr_select == GEN6_INPUTATTR_FACING_W)
+         needed++;
+
+      if (count < needed)
+         count = needed;
+   }
+
+   return count;
+}
+
+static uint8_t
+sbe_get_gen8_read_length(const struct ilo_dev *dev,
+                         const struct ilo_state_sbe_info *info)
+{
+   uint8_t attr_count, pair_count;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 248:
+    *
+    *     "(Vertex URB Entry Read Length)
+    *      This field should be set to the minimum length required to read the
+    *      maximum source attribute. The maximum source attribute is indicated
+    *      by the maximum value of the enabled Attribute # Source Attribute if
+    *      Attribute Swizzle Enable is set, Number of Output Attributes -1 if
+    *      enable is not set.
+    *      read_length = ceiling((max_source_attr+1)/2)
+    *
+    *      [errata] Corruption/Hang possible if length programmed larger than
+    *      recommended"
+    */
+   if (!info->has_min_read_count) {
+      attr_count = sbe_get_gen8_min_read_count(dev, info);
+      assert(attr_count <= info->vue_read_count);
+   } else {
+      attr_count = info->vue_read_count;
+      assert(attr_count == sbe_get_gen8_min_read_count(dev, info));
+   }
+
+   /*
+    * The field counts pairs of attributes, rounded up.  URB entries are
+    * aligned to 1024-bits or 512-bits, so there is no need to worry about
+    * reading past entries.  A zero length is bumped to one; the PRM calls
+    * programming a zero length UNDEFINED.
+    */
+   pair_count = (attr_count + 1) / 2;
+
+   return (pair_count) ? pair_count : 1;
+}
+
+static bool
+sbe_set_gen8_3DSTATE_SBE(struct ilo_state_sbe *sbe,
+                         const struct ilo_dev *dev,
+                         const struct ilo_state_sbe_info *info)
+{
+   uint8_t vue_read_offset, vue_read_len;
+   uint8_t attr_count;
+   uint32_t dw1, dw2, dw3;   /* stored to sbe->sbe[0], [1] and [2] */
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!sbe_validate_gen8(dev, info))
+      return false;
+
+   vue_read_offset = info->vue_read_base / 2;   /* halved, as the read length is */
+   vue_read_len = sbe_get_gen8_read_length(dev, info);
+
+   attr_count = info->attr_count;
+   if (ilo_dev_gen(dev) == ILO_GEN(6) && info->swizzle_16_31)
+      attr_count += 16;   /* NOTE(review): sbe_validate_gen8() asserts Gen7+ for swizzle_16_31; confirm this path is reachable */
+
+   dw1 = attr_count << GEN7_SBE_DW1_ATTR_COUNT__SHIFT |
+         vue_read_len << GEN7_SBE_DW1_URB_READ_LEN__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      dw1 |= GEN8_SBE_DW1_USE_URB_READ_LEN |
+             GEN8_SBE_DW1_USE_URB_READ_OFFSET |
+             vue_read_offset << GEN8_SBE_DW1_URB_READ_OFFSET__SHIFT;
+   } else {
+      dw1 |= vue_read_offset << GEN7_SBE_DW1_URB_READ_OFFSET__SHIFT;
+   }
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7) && info->swizzle_16_31)
+      dw1 |= GEN7_SBE_DW1_ATTR_SWIZZLE_16_31;
+
+   if (info->swizzle_enable)
+      dw1 |= GEN7_SBE_DW1_ATTR_SWIZZLE_ENABLE;
+
+   dw1 |= (info->point_sprite_origin_lower_left) ?
+      GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_LOWERLEFT :
+      GEN7_SBE_DW1_POINT_SPRITE_TEXCOORD_UPPERLEFT;
+
+   dw2 = info->point_sprite_enables;
+   dw3 = info->const_interp_enables;
+
+   STATIC_ASSERT(ARRAY_SIZE(sbe->sbe) >= 3);
+   sbe->sbe[0] = dw1;   /* the array index is the dword number minus one */
+   sbe->sbe[1] = dw2;
+   sbe->sbe[2] = dw3;
+
+   return true;
+}
+
+static bool
+sbe_set_gen8_3DSTATE_SBE_SWIZ(struct ilo_state_sbe *sbe,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_sbe_info *info)
+{
+   uint16_t swiz[ILO_STATE_SBE_MAX_SWIZZLE_COUNT];
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* pack every entry; bounded by the table so a bad swizzle_count cannot overrun swiz[] */
+   for (i = 0; i < ARRAY_SIZE(swiz); i++) {
+      if (i < info->swizzle_count) {
+         const struct ilo_state_sbe_swizzle_info *swizzle = &info->swizzles[i];
+
+         /* U5 */
+         assert(swizzle->attr < 32);
+         swiz[i] = swizzle->attr_select << GEN8_SBE_SWIZ_SWIZZLE_SELECT__SHIFT |
+                   swizzle->attr << GEN8_SBE_SWIZ_SRC_ATTR__SHIFT;
+         if (swizzle->force_zeros) {
+            swiz[i] |= GEN8_SBE_SWIZ_OVERRIDE_W |
+                       GEN8_SBE_SWIZ_OVERRIDE_Z |
+                       GEN8_SBE_SWIZ_OVERRIDE_Y |
+                       GEN8_SBE_SWIZ_OVERRIDE_X |
+                       GEN8_SBE_SWIZ_CONST_0000;
+         }
+      } else {
+         /* entries not given by the caller select attribute i with INPUTATTR_NORMAL */
+         swiz[i] = GEN6_INPUTATTR_NORMAL << GEN8_SBE_SWIZ_SWIZZLE_SELECT__SHIFT |
+                   i << GEN8_SBE_SWIZ_SRC_ATTR__SHIFT;
+      }
+   }
+
+   STATIC_ASSERT(sizeof(sbe->swiz) == sizeof(swiz));
+   memcpy(sbe->swiz, swiz, sizeof(swiz));
+   return true;
+}
+
+bool
+ilo_state_sbe_init(struct ilo_state_sbe *sbe,
+                   const struct ilo_dev *dev,
+                   const struct ilo_state_sbe_info *info)
+{
+   assert(ilo_is_zeroed(sbe, sizeof(*sbe)));   /* \p sbe must come in zeroed */
+   return ilo_state_sbe_set_info(sbe, dev, info);   /* all the work is shared with set_info */
+}
+
+bool
+ilo_state_sbe_init_for_rectlist(struct ilo_state_sbe *sbe,
+                                const struct ilo_dev *dev,
+                                uint8_t read_base,
+                                uint8_t read_count)
+{
+   /* read exactly the attributes asked for; every other field stays zero */
+   const struct ilo_state_sbe_info info = {
+      .attr_count = read_count,
+      .cv_vue_attr_count = read_base + read_count,
+      .vue_read_base = read_base,
+      .vue_read_count = read_count,
+      .has_min_read_count = true,
+   };
+
+   return ilo_state_sbe_set_info(sbe, dev, &info);
+}
+
+bool
+ilo_state_sbe_set_info(struct ilo_state_sbe *sbe,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_sbe_info *info)
+{
+   bool ret = true;   /* cleared when either setter below fails */
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   ret &= sbe_set_gen8_3DSTATE_SBE(sbe, dev, info);
+   ret &= sbe_set_gen8_3DSTATE_SBE_SWIZ(sbe, dev, info);
+
+   assert(ret);
+   /* report the setters' verdict, also when assert() is compiled out */
+   return ret;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sbe.h b/src/gallium/drivers/ilo/core/ilo_state_sbe.h
new file mode 100644
index 0000000..122999a
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sbe.h
@@ -0,0 +1,103 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_SBE_H
+#define ILO_STATE_SBE_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 264:
+ *
+ *     "Number of SF Output Attributes sets the number of attributes that will
+ *      be output from the SF stage, not including position. This can be used
+ *      to specify up to 32, and may differ from the number of input
+ *      attributes."
+ *
+ *     "The first or last set of 16 attributes can be swizzled according to
+ *      certain state fields."
+ */
+#define ILO_STATE_SBE_MAX_ATTR_COUNT 32
+#define ILO_STATE_SBE_MAX_SWIZZLE_COUNT 16
+
+struct ilo_state_sbe_swizzle_info {
+   /* select an attribute from read ones */
+   enum gen_inputattr_select attr_select;
+   uint8_t attr;   /* source attribute; asserted to be below 32 (U5 field) */
+
+   bool force_zeros;   /* sets the X/Y/Z/W override bits with CONST_0000 */
+};
+
+struct ilo_state_sbe_info {
+   uint8_t attr_count;   /* at most ILO_STATE_SBE_MAX_ATTR_COUNT */
+
+   /* which VUE attributes to read */
+   uint8_t cv_vue_attr_count;   /* must cover vue_read_base + vue_read_count */
+   uint8_t vue_read_base;       /* must be even and at most 126 */
+   uint8_t vue_read_count;      /* at most 32 */
+   bool has_min_read_count;     /* vue_read_count is already the minimum and is used as-is */
+
+   bool cv_is_point;
+   bool point_sprite_origin_lower_left;
+   /* force sprite coordinates to the four corner vertices of the point */
+   uint32_t point_sprite_enables;
+
+   /* force attr at the provoking vertex to a0 and zero to a1/a2 */
+   uint32_t const_interp_enables;
+
+   bool swizzle_enable;
+   /* swizzle attribute 16 to 31 instead; Gen7+ only */
+   bool swizzle_16_31;
+   uint8_t swizzle_count;   /* at most ILO_STATE_SBE_MAX_SWIZZLE_COUNT */
+   const struct ilo_state_sbe_swizzle_info *swizzles;   /* swizzle_count entries are read */
+};
+
+struct ilo_state_sbe {
+   uint32_t sbe[3];    /* 3DSTATE_SBE DW1, DW2 and DW3, in this order */
+   uint32_t swiz[8];   /* the 16 16-bit swizzle entries, copied in as raw bytes */
+};
+
+bool
+ilo_state_sbe_init(struct ilo_state_sbe *sbe,
+                   const struct ilo_dev *dev,
+                   const struct ilo_state_sbe_info *info);
+
+bool
+ilo_state_sbe_init_for_rectlist(struct ilo_state_sbe *sbe,
+                                const struct ilo_dev *dev,
+                                uint8_t read_base,
+                                uint8_t read_count);
+
+bool
+ilo_state_sbe_set_info(struct ilo_state_sbe *sbe,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_sbe_info *info);
+
+#endif /* ILO_STATE_SBE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader.c b/src/gallium/drivers/ilo/core/ilo_state_shader.c
new file mode 100644
index 0000000..f67326c
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader.c
@@ -0,0 +1,737 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_shader.h"
+
+enum vertex_stage {
+   STAGE_VS,   /* stage programmed by 3DSTATE_VS */
+   STAGE_HS,   /* stage programmed by 3DSTATE_HS */
+   STAGE_DS,   /* stage programmed by 3DSTATE_DS */
+   STAGE_GS,   /* stage programmed by 3DSTATE_GS */
+};
+
+struct vertex_ff {   /* values vertex_get_gen6_ff() derives for the VS/HS/DS/GS packers */
+   uint8_t grf_start;       /* copied from kernel->grf_start */
+   uint8_t scratch_space;   /* power-of-two exponent: 0 for <= 1KB, 1 for 2KB, ... */
+
+   uint8_t sampler_count;   /* groups of 4 samplers, rounded up; 4 when more than 12 */
+   uint8_t surface_count;   /* copied from resource->surface_count */
+   bool has_uav;
+
+   uint8_t vue_read_offset;   /* urb->read_base in pairs of attributes */
+   uint8_t vue_read_len;      /* urb->read_count in pairs, rounded up */
+
+   uint8_t user_clip_enables;   /* only packed on Gen8 by the VS/DS/GS packers */
+};
+
+static bool
+vertex_validate_gen6_kernel(const struct ilo_dev *dev,
+                            enum vertex_stage stage,
+                            const struct ilo_state_shader_kernel_info *kernel)
+{
+   /*
+    * The "Dispatch GRF Start Register for URB Data" field is 4 bits wide
+    * for GS and 5 bits wide for every other stage.
+    */
+   const uint8_t grf_start_limit = 1 << ((stage == STAGE_GS) ? 4 : 5);
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 134:
+    *
+    *     "(Per-Thread Scratch Space)
+    *      Range    [0,11] indicating [1K Bytes, 2M Bytes]"
+    */
+   const uint32_t scratch_size_limit = 2u << 20;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* the offset is not saved anywhere, so only zero is accepted */
+   assert(kernel->offset == 0);
+
+   assert(grf_start_limit > kernel->grf_start);
+   assert(scratch_size_limit >= kernel->scratch_size);
+
+   return true;
+}
+
+static bool
+vertex_validate_gen6_urb(const struct ilo_dev *dev,
+                         enum vertex_stage stage,
+                         const struct ilo_state_shader_urb_info *urb)
+{
+   /* "Vertex/Patch URB Entry Read Offset" is a 6-bit field counting pairs */
+   const uint8_t read_base_limit = 2 * 63;
+   /*
+    * "Vertex/Patch URB Entry Read Length" also counts pairs; DS may go up
+    * to 64 pairs while the other stages are bound by the 6-bit field
+    */
+   const uint8_t read_count_limit = (stage == STAGE_DS) ? 2 * 64 : 2 * 63;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   assert(urb->cv_input_attr_count >= urb->read_base + urb->read_count);
+
+   assert(!(urb->read_base & 1) && urb->read_base <= read_base_limit);
+
+   /*
+    * Reading past the entries is not a concern because URB entries are
+    * aligned to 1024-bits (Gen6) or 512-bits (Gen7+).
+    */
+   assert(read_count_limit >= urb->read_count);
+
+   return true;
+}
+
+static bool
+vertex_get_gen6_ff(const struct ilo_dev *dev,
+                   enum vertex_stage stage,
+                   const struct ilo_state_shader_kernel_info *kernel,
+                   const struct ilo_state_shader_resource_info *resource,
+                   const struct ilo_state_shader_urb_info *urb,
+                   struct vertex_ff *ff)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+   /* validate kernel and urb, then fill *ff with the encoded field values */
+   if (!vertex_validate_gen6_kernel(dev, stage, kernel) ||
+       !vertex_validate_gen6_urb(dev, stage, urb))
+      return false;   /* both validators assert and then return true */
+
+   ff->grf_start = kernel->grf_start;
+   /* exponent of the next power of two, counted from 1KB (0 for 1KB or less) */
+   ff->scratch_space = (kernel->scratch_size > 1024) ?
+      (util_last_bit(kernel->scratch_size - 1) - 10): 0;
+
+   ff->sampler_count = (resource->sampler_count <= 12) ?
+      (resource->sampler_count + 3) / 4 : 4;   /* groups of 4; more than 12 gives 4 */
+   ff->surface_count = resource->surface_count;
+   ff->has_uav = resource->has_uav;
+
+   ff->vue_read_offset = urb->read_base / 2;       /* in pairs of attributes */
+   ff->vue_read_len = (urb->read_count + 1) / 2;   /* in pairs, rounded up */
+
+   /* need to read something unless VUE handles are included: VS, and GS on Gen6 */
+   switch (stage) {
+   case STAGE_VS:
+      if (!ff->vue_read_len)
+         ff->vue_read_len = 1;
+
+      /* one GRF per attribute; NOTE(review): the bound uses read_count * 2 -- confirm */
+      assert(kernel->grf_start + urb->read_count * 2 <= 128);
+      break;
+   case STAGE_GS:
+      if (ilo_dev_gen(dev) == ILO_GEN(6) && !ff->vue_read_len)
+         ff->vue_read_len = 1;
+      break;
+   default:   /* HS and DS keep a zero read length as is */
+      break;
+   }
+
+   ff->user_clip_enables = urb->user_clip_enables;
+
+   return true;
+}
+
+static uint16_t
+vs_get_gen6_thread_count(const struct ilo_dev *dev,
+                         const struct ilo_state_vs_info *info)
+{
+   uint16_t max_threads;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * Pick the limit for "Maximum Number of Threads" of 3DSTATE_VS: Gen8
+    * and Gen7.5 use fixed limits (Gen7.5 depending on the GT level), while
+    * Gen6, Gen7, and anything unlisted use the device's thread count.  The
+    * info argument is not consulted.
+    */
+   if (ilo_dev_gen(dev) == ILO_GEN(8)) {
+      max_threads = 504;
+   } else if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      max_threads = (dev->gt < 2) ? 70 : 280;
+   } else {
+      max_threads = dev->thread_count;
+   }
+
+   /* callers receive the limit biased by -1 */
+   return max_threads - 1;
+}
+
+static bool
+vs_set_gen6_3DSTATE_VS(struct ilo_state_vs *vs,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_vs_info *info)
+{
+   struct vertex_ff ff;
+   uint16_t thread_count;
+   uint32_t dw2, dw3, dw4, dw5;
+   /* pack info into vs->vs[]; fails only if vertex_get_gen6_ff() rejects it */
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!vertex_get_gen6_ff(dev, STAGE_VS, &info->kernel,
+            &info->resource, &info->urb, &ff))
+      return false;
+
+   thread_count = vs_get_gen6_thread_count(dev, info);   /* already minus one */
+
+   dw2 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+   if (false)   /* never taken: the alternate FP mode is left off */
+      dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
+      dw2 |= GEN75_THREADDISP_ACCESS_UAV;
+
+   dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw4 = ff.grf_start << GEN6_VS_DW4_URB_GRF_START__SHIFT |
+         ff.vue_read_len << GEN6_VS_DW4_URB_READ_LEN__SHIFT |
+         ff.vue_read_offset << GEN6_VS_DW4_URB_READ_OFFSET__SHIFT;
+
+   dw5 = 0;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))   /* Gen7.5+ has its own shift macro */
+      dw5 |= thread_count << GEN75_VS_DW5_MAX_THREADS__SHIFT;
+   else
+      dw5 |= thread_count << GEN6_VS_DW5_MAX_THREADS__SHIFT;
+
+   if (info->stats_enable)
+      dw5 |= GEN6_VS_DW5_STATISTICS;
+   if (info->dispatch_enable)
+      dw5 |= GEN6_VS_DW5_VS_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(vs->vs) >= 5);
+   vs->vs[0] = dw2;
+   vs->vs[1] = dw3;
+   vs->vs[2] = dw4;
+   vs->vs[3] = dw5;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))   /* vs[4] is left untouched before Gen8 */
+      vs->vs[4] = ff.user_clip_enables << GEN8_VS_DW8_UCP_CLIP_ENABLES__SHIFT;
+
+   return true;
+}
+
+static uint16_t
+hs_get_gen7_thread_count(const struct ilo_dev *dev,
+                         const struct ilo_state_hs_info *info)
+{
+   uint16_t max_threads;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * Limit for "Maximum Number of Threads" of 3DSTATE_HS: fixed on Gen8
+    * and on Gen7.5 (by GT level); Gen7 and anything unlisted use the
+    * device's thread count.  The info argument is not consulted.
+    */
+   if (ilo_dev_gen(dev) == ILO_GEN(8)) {
+      max_threads = 504;
+   } else if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      max_threads = (dev->gt < 2) ? 70 : 256;
+   } else {
+      max_threads = dev->thread_count;
+   }
+
+   /* callers receive the limit biased by -1 */
+   return max_threads - 1;
+}
+
+static bool
+hs_set_gen7_3DSTATE_HS(struct ilo_state_hs *hs,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_hs_info *info)
+{
+   struct vertex_ff ff;
+   uint16_t thread_count;
+   uint32_t dw1, dw2, dw4, dw5;
+   /* pack info into hs->hs[]; fails only if vertex_get_gen6_ff() rejects it */
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!vertex_get_gen6_ff(dev, STAGE_HS, &info->kernel,
+            &info->resource, &info->urb, &ff))
+      return false;
+
+   thread_count = hs_get_gen7_thread_count(dev, info);   /* already minus one */
+
+   dw1 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))   /* Gen7.5+ has its own shift macro */
+      dw1 |= thread_count << GEN75_HS_DW1_DISPATCH_MAX_THREADS__SHIFT;
+   else
+      dw1 |= thread_count << GEN7_HS_DW1_DISPATCH_MAX_THREADS__SHIFT;
+
+   dw2 = 0 << GEN7_HS_DW2_INSTANCE_COUNT__SHIFT;   /* instance count field is 0 */
+
+   if (info->dispatch_enable)
+      dw2 |= GEN7_HS_DW2_HS_ENABLE;
+   if (info->stats_enable)
+      dw2 |= GEN7_HS_DW2_STATISTICS;
+
+   dw4 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw5 = GEN7_HS_DW5_INCLUDE_VERTEX_HANDLES |   /* set unconditionally */
+         ff.grf_start << GEN7_HS_DW5_URB_GRF_START__SHIFT |
+         ff.vue_read_len << GEN7_HS_DW5_URB_READ_LEN__SHIFT |
+         ff.vue_read_offset << GEN7_HS_DW5_URB_READ_OFFSET__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
+      dw5 |= GEN75_HS_DW5_ACCESS_UAV;
+
+   STATIC_ASSERT(ARRAY_SIZE(hs->hs) >= 4);
+   hs->hs[0] = dw1;
+   hs->hs[1] = dw2;
+   hs->hs[2] = dw4;   /* there is no dw3 local; hs[2] holds dw4 */
+   hs->hs[3] = dw5;
+
+   return true;
+}
+
+static bool
+ds_set_gen7_3DSTATE_TE(struct ilo_state_ds *ds,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ds_info *info)
+{
+   uint32_t te_dw1 = 0;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * TE is switched on, in HW mode, exactly when DS dispatch is enabled;
+    * otherwise DW1 stays zero.
+    */
+   if (info->dispatch_enable)
+      te_dw1 = GEN7_TE_DW1_TE_ENABLE | GEN7_TE_DW1_MODE_HW;
+
+   STATIC_ASSERT(ARRAY_SIZE(ds->te) >= 3);
+   ds->te[0] = te_dw1;
+   ds->te[1] = fui(63.0f);
+   ds->te[2] = fui(64.0f);
+
+   return true;
+}
+
+static uint16_t
+ds_get_gen7_thread_count(const struct ilo_dev *dev,
+                         const struct ilo_state_ds_info *info)
+{
+   uint16_t max_threads;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * Limit for "Maximum Number of Threads" of 3DSTATE_DS: fixed on Gen8
+    * and on Gen7.5 (by GT level); Gen7 and anything unlisted use the
+    * device's thread count.  The info argument is not consulted.
+    */
+   if (ilo_dev_gen(dev) == ILO_GEN(8)) {
+      max_threads = 504;
+   } else if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      max_threads = (dev->gt < 2) ? 70 : 280;
+   } else {
+      max_threads = dev->thread_count;
+   }
+
+   /* callers receive the limit biased by -1 */
+   return max_threads - 1;
+}
+
+static bool
+ds_set_gen7_3DSTATE_DS(struct ilo_state_ds *ds,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ds_info *info)
+{
+   struct vertex_ff ff;
+   uint16_t thread_count;
+   uint32_t dw2, dw3, dw4, dw5;
+   /* pack info into ds->ds[]; fails only if vertex_get_gen6_ff() rejects it */
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!vertex_get_gen6_ff(dev, STAGE_DS, &info->kernel,
+            &info->resource, &info->urb, &ff))
+      return false;
+
+   thread_count = ds_get_gen7_thread_count(dev, info);   /* already minus one */
+
+   dw2 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
+      dw2 |= GEN75_THREADDISP_ACCESS_UAV;
+
+   dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw4 = ff.grf_start << GEN7_DS_DW4_URB_GRF_START__SHIFT |
+         ff.vue_read_len << GEN7_DS_DW4_URB_READ_LEN__SHIFT |
+         ff.vue_read_offset << GEN7_DS_DW4_URB_READ_OFFSET__SHIFT;
+
+   dw5 = 0;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))   /* Gen7.5+ has its own shift macro */
+      dw5 |= thread_count << GEN75_DS_DW5_MAX_THREADS__SHIFT;
+   else
+      dw5 |= thread_count << GEN7_DS_DW5_MAX_THREADS__SHIFT;
+
+   if (info->stats_enable)
+      dw5 |= GEN7_DS_DW5_STATISTICS;
+   if (info->dispatch_enable)
+      dw5 |= GEN7_DS_DW5_DS_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(ds->ds) >= 5);
+   ds->ds[0] = dw2;
+   ds->ds[1] = dw3;
+   ds->ds[2] = dw4;
+   ds->ds[3] = dw5;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))   /* ds[4] is left untouched before Gen8 */
+      ds->ds[4] = ff.user_clip_enables << GEN8_DS_DW8_UCP_CLIP_ENABLES__SHIFT;
+
+   return true;
+}
+
+static bool
+gs_get_gen6_ff(const struct ilo_dev *dev,
+               const struct ilo_state_gs_info *info,
+               struct vertex_ff *ff)
+{
+   const struct ilo_state_shader_urb_info *urb = &info->urb;
+   const struct ilo_state_gs_sol_info *sol = &info->sol;
+   /* vertex_get_gen6_ff() for STAGE_GS plus GS-only checks on the output size */
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!vertex_get_gen6_ff(dev, STAGE_GS, &info->kernel,
+            &info->resource, &info->urb, ff))
+      return false;
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 168-169:
+    *
+    *     "[0,62] indicating [1,63] 16B units"
+    *
+    *     "Programming Restrictions: The vertex size must be programmed as a
+    *      multiple of 32B units with the following exception: Rendering is
+    *      disabled (as per SOL stage state) and the vertex size output by the
+    *      GS thread is 16B.
+    *
+    *      If rendering is enabled (as per SOL state) the vertex size must be
+    *      programmed as a multiple of 32B units. In other words, the only
+    *      time software can program a vertex size with an odd number of 16B
+    *      units is when rendering is disabled."
+    */
+   assert(urb->output_attr_count <= 63);   /* each attribute counts as a 16B unit */
+   if (!sol->render_disable)
+      assert(urb->output_attr_count % 2 == 0);   /* multiple of 32B when rendering */
+
+   return true;
+}
+
+static uint16_t
+gs_get_gen6_thread_count(const struct ilo_dev *dev,
+                         const struct ilo_state_gs_info *info)
+{
+   const struct ilo_state_gs_sol_info *sol = &info->sol;
+   uint16_t thread_count;
+   /* returns the GS thread limit minus one; used by the Gen6 and Gen7+ packers */
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* Maximum Number of Threads of 3DSTATE_GS */
+   switch (ilo_dev_gen(dev)) {
+   case ILO_GEN(8):
+      thread_count = 504;
+      break;
+   case ILO_GEN(7.5):
+      thread_count = (dev->gt >= 2) ? 256 : 70;
+      break;
+   case ILO_GEN(7):
+   case ILO_GEN(6):
+   default:
+      thread_count = dev->thread_count;
+
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 154:
+       *
+       *     "Maximum Number of Threads valid range is [0,27] when Rendering
+       *      Enabled bit is set."
+       * NOTE(review): Gen7 shares this case, so it gets the same clamp -- confirm.
+       * According to the classic driver, [0, 20] for GT1.
+       */
+      if (!sol->render_disable)
+         thread_count = (dev->gt == 2) ? 27 : 20;   /* returned as 26 or 19 */
+      break;
+   }
+
+   return thread_count - 1;
+}
+
+static bool
+gs_set_gen6_3DSTATE_GS(struct ilo_state_gs *gs,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_gs_info *info)
+{
+   const struct ilo_state_gs_sol_info *sol = &info->sol;
+   struct vertex_ff ff;
+   uint16_t thread_count;
+   uint32_t dw2, dw3, dw4, dw5, dw6;
+   /* Gen6-only packer for gs->gs[]; fails only if gs_get_gen6_ff() rejects info */
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   if (!gs_get_gen6_ff(dev, info, &ff))
+      return false;
+
+   thread_count = gs_get_gen6_thread_count(dev, info);   /* already minus one */
+
+   dw2 = GEN6_THREADDISP_SPF |   /* set unconditionally */
+         ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+   dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw4 = ff.vue_read_len << GEN6_GS_DW4_URB_READ_LEN__SHIFT |
+         ff.vue_read_offset << GEN6_GS_DW4_URB_READ_OFFSET__SHIFT |
+         ff.grf_start << GEN6_GS_DW4_URB_GRF_START__SHIFT;
+
+   dw5 = thread_count << GEN6_GS_DW5_MAX_THREADS__SHIFT;
+
+   if (info->stats_enable)
+      dw5 |= GEN6_GS_DW5_STATISTICS;
+   if (sol->stats_enable)
+      dw5 |= GEN6_GS_DW5_SO_STATISTICS;
+   if (!sol->render_disable)
+      dw5 |= GEN6_GS_DW5_RENDER_ENABLE;
+
+   dw6 = 0;
+
+   /* GEN7_REORDER_TRAILING is handled by the kernel */
+   if (sol->tristrip_reorder == GEN7_REORDER_LEADING)
+      dw6 |= GEN6_GS_DW6_REORDER_LEADING_ENABLE;
+
+   if (sol->sol_enable) {
+      dw6 |= GEN6_GS_DW6_SVBI_PAYLOAD_ENABLE;
+
+      if (sol->svbi_post_inc) {   /* post-increment is enabled only when non-zero */
+         dw6 |= GEN6_GS_DW6_SVBI_POST_INC_ENABLE |
+                sol->svbi_post_inc << GEN6_GS_DW6_SVBI_POST_INC_VAL__SHIFT;
+      }
+   }
+
+   if (info->dispatch_enable)
+      dw6 |= GEN6_GS_DW6_GS_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(gs->gs) >= 5);
+   gs->gs[0] = dw2;
+   gs->gs[1] = dw3;
+   gs->gs[2] = dw4;
+   gs->gs[3] = dw5;
+   gs->gs[4] = dw6;   /* on Gen7+ this slot holds the Gen8 clip enables instead */
+
+   return true;
+}
+
+static uint8_t
+gs_get_gen7_vertex_size(const struct ilo_dev *dev,
+                        const struct ilo_state_gs_info *info)
+{
+   const uint8_t attr_count = info->urb.output_attr_count;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+   /* biased by -1, except that zero attributes also encode as zero */
+   return (attr_count == 0) ? 0 : attr_count - 1;
+}
+
+static bool
+gs_set_gen7_3DSTATE_GS(struct ilo_state_gs *gs,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_gs_info *info)
+{
+   struct vertex_ff ff;
+   uint16_t thread_count;
+   uint8_t vertex_size;
+   uint32_t dw2, dw3, dw4, dw5;
+   /* Gen7+ packer for gs->gs[]; fails only if gs_get_gen6_ff() rejects info */
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!gs_get_gen6_ff(dev, info, &ff))
+      return false;
+
+   thread_count = gs_get_gen6_thread_count(dev, info);   /* already minus one */
+   vertex_size = gs_get_gen7_vertex_size(dev, info);     /* output_attr_count - 1, or 0 */
+
+   dw2 = ff.sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff.surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff.has_uav)
+      dw2 |= GEN75_THREADDISP_ACCESS_UAV;
+
+   dw3 = ff.scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw4 = vertex_size << GEN7_GS_DW4_OUTPUT_SIZE__SHIFT |
+         0 << GEN7_GS_DW4_OUTPUT_TOPO__SHIFT |   /* output topology field is 0 */
+         ff.vue_read_len << GEN7_GS_DW4_URB_READ_LEN__SHIFT |
+         GEN7_GS_DW4_INCLUDE_VERTEX_HANDLES |   /* set unconditionally */
+         ff.vue_read_offset << GEN7_GS_DW4_URB_READ_OFFSET__SHIFT |
+         ff.grf_start << GEN7_GS_DW4_URB_GRF_START__SHIFT;
+
+   dw5 = 0;   /* redundant: both branches below assign dw5 */
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))   /* Gen7.5+ has its own shift macro */
+      dw5 = thread_count << GEN75_GS_DW5_MAX_THREADS__SHIFT;
+   else
+      dw5 = thread_count << GEN7_GS_DW5_MAX_THREADS__SHIFT;
+
+   if (info->stats_enable)
+      dw5 |= GEN7_GS_DW5_STATISTICS;
+   if (info->dispatch_enable)
+      dw5 |= GEN7_GS_DW5_GS_ENABLE;
+
+   STATIC_ASSERT(ARRAY_SIZE(gs->gs) >= 5);
+   gs->gs[0] = dw2;
+   gs->gs[1] = dw3;
+   gs->gs[2] = dw4;
+   gs->gs[3] = dw5;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))   /* gs[4] is left untouched on Gen7/Gen7.5 */
+      gs->gs[4] = ff.user_clip_enables << GEN8_GS_DW9_UCP_CLIP_ENABLES__SHIFT;
+
+   return true;
+}
+
+bool
+ilo_state_vs_init(struct ilo_state_vs *vs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_vs_info *info)
+{
+   bool ok;
+
+   /* the caller must hand in zero-initialized state */
+   assert(ilo_is_zeroed(vs, sizeof(*vs)));
+
+   ok = vs_set_gen6_3DSTATE_VS(vs, dev, info);
+   assert(ok);
+
+   return ok;
+}
+
+bool
+ilo_state_vs_init_disabled(struct ilo_state_vs *vs,
+                           const struct ilo_dev *dev)
+{
+   /* an all-zero info leaves dispatch_enable and stats_enable false */
+   struct ilo_state_vs_info disabled_info;
+
+   memset(&disabled_info, 0, sizeof(disabled_info));
+   return ilo_state_vs_init(vs, dev, &disabled_info);
+}
+
+bool
+ilo_state_hs_init(struct ilo_state_hs *hs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_hs_info *info)
+{
+   bool ok = true;
+
+   assert(ilo_is_zeroed(hs, sizeof(*hs)));
+
+   /* 3DSTATE_HS is only packed on Gen7+; older devices trivially succeed */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      ok = hs_set_gen7_3DSTATE_HS(hs, dev, info);
+   assert(ok);
+
+   return ok;
+}
+
+bool
+ilo_state_hs_init_disabled(struct ilo_state_hs *hs,
+                           const struct ilo_dev *dev)
+{
+   /* an all-zero info leaves dispatch_enable and stats_enable false */
+   struct ilo_state_hs_info disabled_info;
+
+   memset(&disabled_info, 0, sizeof(disabled_info));
+   return ilo_state_hs_init(hs, dev, &disabled_info);
+}
+
+bool
+ilo_state_ds_init(struct ilo_state_ds *ds,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_ds_info *info)
+{
+   bool te_ok = true, ds_ok = true;
+
+   assert(ilo_is_zeroed(ds, sizeof(*ds)));
+
+   /* both packers always run on Gen7+, even when the first one fails */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      te_ok = ds_set_gen7_3DSTATE_TE(ds, dev, info);
+      ds_ok = ds_set_gen7_3DSTATE_DS(ds, dev, info);
+   }
+   assert(te_ok && ds_ok);
+
+   return te_ok && ds_ok;
+}
+
+bool
+ilo_state_ds_init_disabled(struct ilo_state_ds *ds,
+                           const struct ilo_dev *dev)
+{
+   /* an all-zero info leaves dispatch_enable and stats_enable false */
+   struct ilo_state_ds_info disabled_info;
+
+   memset(&disabled_info, 0, sizeof(disabled_info));
+   return ilo_state_ds_init(ds, dev, &disabled_info);
+}
+
+bool
+ilo_state_gs_init(struct ilo_state_gs *gs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_gs_info *info)
+{
+   bool ok;
+
+   assert(ilo_is_zeroed(gs, sizeof(*gs)));
+
+   /* Gen6 has its own 3DSTATE_GS packer; Gen7 and later share the other one */
+   ok = (ilo_dev_gen(dev) < ILO_GEN(7)) ?
+      gs_set_gen6_3DSTATE_GS(gs, dev, info) :
+      gs_set_gen7_3DSTATE_GS(gs, dev, info);
+
+   assert(ok);
+
+   return ok;
+}
+
+bool
+ilo_state_gs_init_disabled(struct ilo_state_gs *gs,
+                           const struct ilo_dev *dev)
+{
+   /* an all-zero info leaves dispatch_enable false and render_disable false */
+   struct ilo_state_gs_info disabled_info;
+
+   memset(&disabled_info, 0, sizeof(disabled_info));
+   return ilo_state_gs_init(gs, dev, &disabled_info);
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader.h b/src/gallium/drivers/ilo/core/ilo_state_shader.h
new file mode 100644
index 0000000..44690c5
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader.h
@@ -0,0 +1,256 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_SHADER_H
+#define ILO_STATE_SHADER_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/**
+ * Per-kernel information; a stage info embeds one of these per kernel.
+ */
+struct ilo_state_shader_kernel_info {
+   /* usually 0 unless the shader has multiple kernels */
+   uint32_t offset;        /* the VS/HS/DS/GS packers assert that it is 0 */
+
+   uint8_t grf_start;      /* VS/HS/DS/GS packers assert < 32 (< 16 for GS) */
+   uint8_t pcb_attr_count;
+
+   uint32_t scratch_size;  /* in bytes; the vertex packers assert <= 2MB */
+};
+
+/**
+ * Shader resources used to fill the thread-dispatch fields of a stage.
+ */
+struct ilo_state_shader_resource_info {
+   /* for prefetches */
+   uint8_t sampler_count;   /* packed in groups of 4, capped at 4 groups */
+   uint8_t surface_count;   /* packed as the binding table size */
+
+   bool has_uav;            /* sets the ACCESS_UAV bit on Gen7.5+ */
+};
+
+/**
+ * URB inputs/outputs of a vertex-pipeline stage.
+ */
+struct ilo_state_shader_urb_info {
+   uint8_t cv_input_attr_count;   /* read_base + read_count must not exceed it */
+
+   uint8_t read_base;    /* in attributes; must be even and at most 126 */
+   uint8_t read_count;   /* in attributes; packed in pairs, rounded up */
+
+   uint8_t output_attr_count;   /* GS: at most 63, even unless rendering is disabled */
+
+   uint8_t user_cull_enables;
+   uint8_t user_clip_enables;   /* only packed on Gen8 (VS/DS/GS) */
+};
+
+struct ilo_state_vs_info {
+   struct ilo_state_shader_kernel_info kernel;
+   struct ilo_state_shader_resource_info resource;
+   struct ilo_state_shader_urb_info urb;
+
+   bool dispatch_enable;   /* sets GEN6_VS_DW5_VS_ENABLE */
+   bool stats_enable;      /* sets GEN6_VS_DW5_STATISTICS */
+};
+
+struct ilo_state_hs_info {
+   struct ilo_state_shader_kernel_info kernel;
+   struct ilo_state_shader_resource_info resource;
+   struct ilo_state_shader_urb_info urb;
+
+   bool dispatch_enable;   /* sets GEN7_HS_DW2_HS_ENABLE */
+   bool stats_enable;      /* sets GEN7_HS_DW2_STATISTICS */
+};
+
+struct ilo_state_ds_info {
+   struct ilo_state_shader_kernel_info kernel;
+   struct ilo_state_shader_resource_info resource;
+   struct ilo_state_shader_urb_info urb;
+
+   bool dispatch_enable;   /* sets GEN7_DS_DW5_DS_ENABLE and the TE enable bits */
+   bool stats_enable;      /* sets GEN7_DS_DW5_STATISTICS */
+};
+
+/**
+ * Stream output.  Must be consistent with ilo_state_sol_info.
+ */
+struct ilo_state_gs_sol_info {
+   bool sol_enable;       /* Gen6 packer: enables the SVBI payload */
+   bool stats_enable;     /* Gen6 packer: sets GEN6_GS_DW5_SO_STATISTICS */
+   bool render_disable;   /* false: RENDER_ENABLE on Gen6, 27/20 thread clamp on Gen6/Gen7 */
+
+   uint16_t svbi_post_inc;   /* Gen6 packer: used when sol_enable is set and it is non-zero */
+
+   enum gen_reorder_mode tristrip_reorder;   /* Gen6 packer: only LEADING sets a bit */
+};
+
+struct ilo_state_gs_info {
+   struct ilo_state_shader_kernel_info kernel;
+   struct ilo_state_shader_resource_info resource;
+   struct ilo_state_shader_urb_info urb;
+
+   struct ilo_state_gs_sol_info sol;
+
+   bool dispatch_enable;   /* sets the GS_ENABLE bit (dw6 on Gen6, dw5 on Gen7+) */
+   bool stats_enable;      /* sets the GS STATISTICS bit in dw5 */
+};
+
+struct ilo_state_ps_io_info {   /* embedded as the io member of ilo_state_ps_info */
+   /* inputs */
+   enum gen_position_offset posoffset;
+   uint8_t attr_count;
+   bool use_z;
+   bool use_w;
+   bool use_coverage_mask;
+
+   /* outputs */
+   enum gen_pscdepth_mode pscdepth;
+   bool has_rt_write;
+   bool write_pixel_mask;
+   bool write_omask;
+};
+
+struct ilo_state_ps_params_info {   /* also taken on its own by ilo_state_ps_set_params() */
+   /* compatibility with raster states */
+   uint32_t sample_mask;
+   bool earlyz_control_psexec;
+
+   /* compatibility with cc states */
+   bool alpha_may_kill;
+   bool dual_source_blending;
+   bool has_writeable_rt;
+};
+
+struct ilo_state_ps_info {
+   struct ilo_state_shader_kernel_info kernel_8;    /* presumably the SIMD8 kernel -- confirm */
+   struct ilo_state_shader_kernel_info kernel_16;   /* presumably the SIMD16 kernel -- confirm */
+   struct ilo_state_shader_kernel_info kernel_32;   /* presumably the SIMD32 kernel -- confirm */
+   struct ilo_state_shader_resource_info resource;
+
+   struct ilo_state_ps_io_info io;
+   struct ilo_state_ps_params_info params;
+
+   /* bitmask of GEN6_PS_DISPATCH_x */
+   uint8_t valid_kernels;
+   bool per_sample_dispatch;
+   bool sample_count_one;
+   bool cv_per_sample_interp;
+   bool cv_has_earlyz_op;
+
+   bool rt_clear_enable;
+   bool rt_resolve_enable;
+
+   bool cv_has_depth_buffer;
+};
+
+struct ilo_state_vs {
+   uint32_t vs[5];   /* 3DSTATE_VS dw2..dw5, then the Gen8-only UCP clip enables */
+};
+
+struct ilo_state_hs {
+   uint32_t hs[4];   /* 3DSTATE_HS dw1, dw2, dw4, dw5; only written on Gen7+ */
+};
+
+struct ilo_state_ds {
+   uint32_t te[3];   /* 3DSTATE_TE dw1, then fui(63.0f) and fui(64.0f) */
+   uint32_t ds[5];   /* 3DSTATE_DS dw2..dw5, then the Gen8-only UCP clip enables */
+};
+
+struct ilo_state_gs {
+   uint32_t gs[5];   /* Gen6: dw2..dw6; Gen7+: dw2..dw5, then Gen8-only UCP clip enables */
+};
+
+struct ilo_state_ps {
+   uint32_t ps[8];   /* NOTE(review): presumably packed PS dwords; the packer is not in view */
+
+   struct ilo_state_ps_dispatch_conds {   /* also embedded in struct pixel_ff */
+      bool ps_valid;
+
+      bool has_rt_write;
+      bool write_odepth;
+      bool write_ostencil;
+      bool has_uav_write;
+      bool ps_may_kill;
+   } conds;
+};
+
+bool
+ilo_state_vs_init(struct ilo_state_vs *vs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_vs_info *info);   /* vs must be zeroed */
+
+bool
+ilo_state_vs_init_disabled(struct ilo_state_vs *vs,
+                           const struct ilo_dev *dev);   /* init with zeroed info */
+
+bool
+ilo_state_hs_init(struct ilo_state_hs *hs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_hs_info *info);   /* hs must be zeroed */
+
+bool
+ilo_state_hs_init_disabled(struct ilo_state_hs *hs,
+                           const struct ilo_dev *dev);   /* init with zeroed info */
+
+/* the DS state also carries the 3DSTATE_TE payload; both are packed on Gen7+ only */
+bool
+ilo_state_ds_init(struct ilo_state_ds *ds,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_ds_info *info);   /* ds must be zeroed */
+
+bool
+ilo_state_ds_init_disabled(struct ilo_state_ds *ds,
+                           const struct ilo_dev *dev);   /* init with zeroed info */
+
+bool
+ilo_state_gs_init(struct ilo_state_gs *gs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_gs_info *info);   /* gs must be zeroed */
+
+bool
+ilo_state_gs_init_disabled(struct ilo_state_gs *gs,
+                           const struct ilo_dev *dev);   /* init with zeroed info */
+
+bool
+ilo_state_ps_init(struct ilo_state_ps *ps,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_ps_info *info);
+
+bool
+ilo_state_ps_init_disabled(struct ilo_state_ps *ps,
+                           const struct ilo_dev *dev);
+
+bool
+ilo_state_ps_set_params(struct ilo_state_ps *ps,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_ps_params_info *params);
+
+#endif /* ILO_STATE_SHADER_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
new file mode 100644
index 0000000..f4d801e
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_shader_ps.c
@@ -0,0 +1,771 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_shader.h"
+
+struct pixel_ff {   /* scratch state filled in by ps_get_gen6_ff() */
+   uint8_t dispatch_modes;       /* mask of GEN6_PS_DISPATCH_x */
+
+   uint32_t kernel_offsets[3];   /* per kernel slot; slot 0 must be 0 */
+   uint8_t grf_starts[3];        /* per kernel slot, same slots as above */
+   bool pcb_enable;              /* an enabled kernel has pcb_attr_count */
+   uint8_t scratch_space;        /* power-of-two step above 1KB, 0 for <= 1KB */
+
+   uint8_t sampler_count;        /* in units of 4 samplers, at most 4 */
+   uint8_t surface_count;
+   bool has_uav;
+
+   uint16_t thread_count;        /* from ps_get_gen6_thread_count() */
+
+   struct ilo_state_ps_dispatch_conds conds; /* saved in ilo_state_ps */
+
+   bool kill_pixel;              /* derived from params and conds */
+   bool dispatch_enable;         /* derived from params and conds */
+   bool dual_source_blending;    /* copied from params */
+   uint32_t sample_mask;         /* copied from params */
+};
+
+static bool
+ps_kernel_validate_gen6(const struct ilo_dev *dev,
+                        const struct ilo_state_shader_kernel_info *kernel)
+{
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 271:
+    *
+    *     "(Per-Thread Scratch Space)
+    *      Range  [0,11] indicating [1k bytes, 2M bytes] in powers of two"
+    */
+   const uint32_t scratch_size_limit = 2 * 1024 * 1024;
+   /* "Dispatch GRF Start Register for Constant/Setup Data" is U7 */
+   const uint8_t grf_start_limit = 128;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* "Kernel Start Pointer" is 64-byte aligned */
+   assert(kernel->offset % 64 == 0);
+
+   /* limits are only asserted; this function always returns true */
+   assert(kernel->grf_start < grf_start_limit);
+   assert(kernel->scratch_size <= scratch_size_limit);
+   return true;
+}
+
+static bool
+ps_validate_gen6(const struct ilo_dev *dev,
+                 const struct ilo_state_ps_info *info)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* all three kernel slots are checked, enabled or not */
+   if (!ps_kernel_validate_gen6(dev, &info->kernel_8))
+      return false;
+   if (!ps_kernel_validate_gen6(dev, &info->kernel_16))
+      return false;
+   if (!ps_kernel_validate_gen6(dev, &info->kernel_32))
+      return false;
+
+   if (ilo_dev_gen(dev) == ILO_GEN(6)) {
+      /* unsupported on Gen6 */
+      assert(!io->use_coverage_mask);
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 275:
+       *
+       *     "If a NULL Depth Buffer is selected, the Pixel Shader Computed
+       *      Depth field must be set to disabled."
+       */
+      if (io->pscdepth != GEN7_PSCDEPTH_OFF)
+         assert(info->cv_has_depth_buffer);
+   }
+
+   if (!info->per_sample_dispatch) {
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 281:
+       *
+       *     "MSDISPMODE_PERSAMPLE is required in order to select
+       *      POSOFFSET_SAMPLE."
+       */
+      assert(io->posoffset != GEN6_POSOFFSET_SAMPLE);
+
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 282:
+       *
+       *     "MSDISPMODE_PERSAMPLE is required in order to select
+       *      INTERP_SAMPLE."
+       *
+       * and from page 283 of the same volume:
+       *
+       *     "MSDISPMODE_PERSAMPLE is required in order to select Perspective
+       *      Sample or Non-perspective Sample barycentric coordinates."
+       */
+      assert(!info->cv_per_sample_interp);
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 314:
+    *
+    *     "Pixel Shader Dispatch, Alpha... must all be disabled."
+    *
+    * Kernels and early-z ops are thus mutually exclusive: with an early-z op
+    * there must be no valid kernel, and without any valid kernel io must be
+    * zeroed.
+    */
+   if (!info->valid_kernels)
+      assert(ilo_is_zeroed(io, sizeof(*io)));
+   else
+      assert(!info->cv_has_earlyz_op);
+
+   return true;
+}
+
+static uint8_t
+ps_get_gen6_dispatch_modes(const struct ilo_dev *dev,
+                           const struct ilo_state_ps_info *info)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint8_t modes = info->valid_kernels;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (modes == 0)
+      return 0;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 334:
+    *
+    *     "Not valid on [DevSNB] if 4x PERPIXEL mode with pixel shader
+    *      computed depth."
+    *
+    *     "Valid on all products, except when in non-1x PERSAMPLE mode
+    *      (applies to [DevSNB+] only)"
+    *
+    * From the Sandy Bridge PRM, volume 4 part 1, page 239:
+    *
+    *     "[DevSNB]: When Pixel Shader outputs oDepth and PS invocation mode
+    *      is PERPIXEL, Message Type for Render Target Write must be SIMD8.
+    *
+    *      Errata: [DevSNB+]: When Pixel Shader outputs oMask, this message
+    *      type is not supported: SIMD8 (including SIMD8_DUALSRC_xx)."
+    *
+    * The valid combinations per platform are hard to pin down.  Going by the
+    * Gen6 restrictions on RT write messages, the oDepth and oMask issues are
+    * treated as Gen6-specific, while the PERSAMPLE issue is treated as
+    * universal and limits dispatching to a single mode.
+    */
+   if (ilo_dev_gen(dev) == ILO_GEN(6)) {
+      if (!info->per_sample_dispatch && io->pscdepth != GEN7_PSCDEPTH_OFF)
+         modes &= GEN6_PS_DISPATCH_8;
+      if (io->write_omask)
+         modes &= ~GEN6_PS_DISPATCH_8;
+   }
+   if (info->per_sample_dispatch && !info->sample_count_one) {
+      /* prefer 32 over 16 over 8 */
+      const uint8_t preferred =
+         (modes & GEN6_PS_DISPATCH_32) ? GEN6_PS_DISPATCH_32 :
+         (modes & GEN6_PS_DISPATCH_16) ? GEN6_PS_DISPATCH_16 :
+                                         GEN6_PS_DISPATCH_8;
+
+      modes &= preferred;
+   }
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 149:
+    *
+    *     "When Render Target Fast Clear Enable is ENABLED or Render Target
+    *      Resolve Type = RESOLVE_PARTIAL or RESOLVE_FULL, this bit (8 Pixel
+    *      Dispatch or Dual-8 Pixel Dispatch Enable) must be DISABLED."
+    */
+   if (info->rt_clear_enable || info->rt_resolve_enable)
+      modes &= ~GEN6_PS_DISPATCH_8;
+
+   assert(modes);
+
+   return modes;
+}
+
+static uint16_t
+ps_get_gen6_thread_count(const struct ilo_dev *dev,
+                         const struct ilo_state_ps_info *info)
+{
+   uint16_t max_threads;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* Maximum Number of Threads of 3DSTATE_PS */
+   if (ilo_dev_gen(dev) == ILO_GEN(8)) {
+      /* scaled automatically */
+      max_threads = 64 - 1;
+   } else if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      if (dev->gt == 3)
+         max_threads = 408;
+      else if (dev->gt == 2)
+         max_threads = 204;
+      else
+         max_threads = 102;
+   } else if (ilo_dev_gen(dev) == ILO_GEN(7)) {
+      max_threads = (dev->gt == 2) ? 172 : 48;
+   } else {
+      /* Gen6, and the fallback for any other gen */
+      /* from the classic driver instead of the PRM */
+      max_threads = (dev->gt == 2) ? 80 : 40;
+   }
+
+   /* one is subtracted for every gen, Gen8 included (64 - 1 - 1) */
+   return max_threads - 1;
+}
+
+static bool
+ps_params_get_gen6_kill_pixel(const struct ilo_dev *dev,
+                              const struct ilo_state_ps_params_info *params,
+                              const struct ilo_state_ps_dispatch_conds *conds)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+   /*
+    * Set when either conds->ps_may_kill or params->alpha_may_kill is set.
+    * From the Sandy Bridge PRM, volume 2 part 1, page 275:
+    *
+    *     "This bit (Pixel Shader Kill Pixel), if ENABLED, indicates that the
+    *      PS kernel or color calculator has the ability to kill (discard)
+    *      pixels or samples, other than due to depth or stencil testing.
+    *      This bit is required to be ENABLED in the following situations:
+    *
+    *      The API pixel shader program contains "killpix" or "discard"
+    *      instructions, or other code in the pixel shader kernel that can
+    *      cause the final pixel mask to differ from the pixel mask received
+    *      on dispatch.
+    *
+    *      A sampler with chroma key enabled with kill pixel mode is used by
+    *      the pixel shader.
+    *
+    *      Any render target has Alpha Test Enable or AlphaToCoverage Enable
+    *      enabled.
+    *
+    *      The pixel shader kernel generates and outputs oMask.
+    *
+    *      Note: As ClipDistance clipping is fully supported in hardware and
+    *      therefore not via PS instructions, there should be no need to
+    *      ENABLE this bit due to ClipDistance clipping."
+    */
+   return conds->ps_may_kill ? true : params->alpha_may_kill;
+}
+
+static bool
+ps_params_get_gen6_dispatch_enable(const struct ilo_dev *dev,
+                                   const struct ilo_state_ps_params_info *params,
+                                   const struct ilo_state_ps_dispatch_conds *conds)
+{
+   /*
+    * Dispatching can be skipped when EarlyZ alone suffices.  It cannot be
+    * skipped when any of these holds
+    *
+    *  - PS writes RTs and RTs are writeable
+    *  - PS changes depth value and depth test/write is enabled
+    *  - PS changes stencil value and stencil test is enabled
+    *  - PS writes UAVs
+    *  - PS or CC kills pixels
+    *  - EDSC is PSEXEC, and depth test/write or stencil test is enabled
+    */
+   bool dispatch_required =
+      (conds->has_rt_write && params->has_writeable_rt) ||
+      conds->write_odepth ||
+      conds->write_ostencil ||
+      conds->has_uav_write ||
+      ps_params_get_gen6_kill_pixel(dev, params, conds) ||
+      params->earlyz_control_psexec;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 280:
+    *
+    *     "If EDSC_PSEXEC mode is selected, Thread Dispatch Enable must be
+    *      set."
+    */
+   if (params->earlyz_control_psexec && ilo_dev_gen(dev) < ILO_GEN(8))
+      dispatch_required = true;
+
+   /* assert that there is a valid kernel whenever dispatching is required;
+    * the result itself is not altered by the check */
+   assert(!dispatch_required || conds->ps_valid);
+
+   return dispatch_required;
+}
+
+static bool
+ps_get_gen6_ff_kernels(const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       struct pixel_ff *ff)
+{
+   /* kernels[i] is the kernel that goes with dispatch mode modes[i] */
+   const struct ilo_state_shader_kernel_info *kernels[3] = {
+      &info->kernel_8, &info->kernel_16, &info->kernel_32,
+   };
+   const uint8_t modes[3] = {
+      GEN6_PS_DISPATCH_8, GEN6_PS_DISPATCH_16, GEN6_PS_DISPATCH_32,
+   };
+   uint32_t max_scratch_size = 0;
+   int i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   ff->dispatch_modes = ps_get_gen6_dispatch_modes(dev, info);
+
+   if (!util_is_power_of_two(ff->dispatch_modes)) {
+      /* several modes: slots 0, 1, 2 are kernel_8, kernel_32, kernel_16 */
+      ff->kernel_offsets[0] = kernels[0]->offset;
+      ff->grf_starts[0] = kernels[0]->grf_start;
+      ff->kernel_offsets[1] = kernels[2]->offset;
+      ff->grf_starts[1] = kernels[2]->grf_start;
+      ff->kernel_offsets[2] = kernels[1]->offset;
+      ff->grf_starts[2] = kernels[1]->grf_start;
+   } else {
+      /* a single mode: its kernel goes to slot 0, whichever width it has */
+      for (i = 0; i < 3; i++) {
+         if (ff->dispatch_modes & modes[i]) {
+            ff->kernel_offsets[0] = kernels[i]->offset;
+            ff->grf_starts[0] = kernels[i]->grf_start;
+            break;
+         }
+      }
+   }
+
+   /* we do not want to save it */
+   assert(ff->kernel_offsets[0] == 0);
+
+   /* push constants and scratch space are decided by the enabled kernels */
+   ff->pcb_enable = false;
+   for (i = 0; i < 3; i++) {
+      if (!(ff->dispatch_modes & modes[i]))
+         continue;
+
+      if (kernels[i]->pcb_attr_count)
+         ff->pcb_enable = true;
+      if (max_scratch_size < kernels[i]->scratch_size)
+         max_scratch_size = kernels[i]->scratch_size;
+   }
+
+   /*
+    * next power of two, starting from 1KB (encoded as 0)
+    */
+   if (max_scratch_size > 1024)
+      ff->scratch_space = util_last_bit(max_scratch_size - 1) - 10;
+   else
+      ff->scratch_space = 0;
+
+   /* GPU hangs on Haswell if none of the dispatch mode bits is set */
+   if (!ff->dispatch_modes && ilo_dev_gen(dev) == ILO_GEN(7.5))
+      ff->dispatch_modes |= GEN6_PS_DISPATCH_8;
+
+   return true;
+}
+
+static bool
+ps_get_gen6_ff(const struct ilo_dev *dev,
+               const struct ilo_state_ps_info *info,
+               struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_params_info *params = &info->params;
+   const struct ilo_state_shader_resource_info *resource = &info->resource;
+   const struct ilo_state_ps_io_info *io = &info->io;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   memset(ff, 0, sizeof(*ff));
+
+   if (!(ps_validate_gen6(dev, info) && ps_get_gen6_ff_kernels(dev, info, ff)))
+      return false;
+
+   ff->sampler_count = (resource->sampler_count > 12) ?
+      4 : (resource->sampler_count + 3) / 4;
+   ff->surface_count = resource->surface_count;
+   ff->has_uav = resource->has_uav;
+
+   ff->thread_count = ps_get_gen6_thread_count(dev, info);
+   /* shader-derived conditions, saved by ilo_state_ps_init() in ps->conds */
+   ff->conds.ps_valid = (info->valid_kernels != 0x0);
+   ff->conds.has_rt_write = io->has_rt_write;
+   ff->conds.write_odepth = (io->pscdepth != GEN7_PSCDEPTH_OFF);
+   ff->conds.write_ostencil = false;
+   ff->conds.has_uav_write = resource->has_uav;
+   ff->conds.ps_may_kill = (io->write_pixel_mask || io->write_omask);
+   /* params-derived state, see also ilo_state_ps_set_params() */
+   ff->kill_pixel = ps_params_get_gen6_kill_pixel(dev, params, &ff->conds);
+   ff->dispatch_enable =
+      ps_params_get_gen6_dispatch_enable(dev, params, &ff->conds);
+   ff->dual_source_blending = params->dual_source_blending;
+   ff->sample_mask = params->sample_mask;
+
+   return true;
+}
+
+static bool
+ps_set_gen6_3dstate_wm(struct ilo_state_ps *ps,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw2, dw3, dw4, dw5, dw6;
+   /* Gen6 only: pack DW2-DW6 of 3DSTATE_WM, plus kernel offsets 1 and 2 */
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   dw2 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+   /* the alternate floating-point mode is never selected */
+   if (false)
+      dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+   dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw4 = ff->grf_starts[0] << GEN6_WM_DW4_URB_GRF_START0__SHIFT |
+         ff->grf_starts[1] << GEN6_WM_DW4_URB_GRF_START1__SHIFT |
+         ff->grf_starts[2] << GEN6_WM_DW4_URB_GRF_START2__SHIFT;
+
+   dw5 = ff->thread_count << GEN6_WM_DW5_MAX_THREADS__SHIFT |
+         ff->dispatch_modes << GEN6_WM_DW5_PS_DISPATCH_MODE__SHIFT;
+   /* kill, dispatch and dual-source bits: see ilo_state_ps_set_params() */
+   if (ff->kill_pixel)
+      dw5 |= GEN6_WM_DW5_PS_KILL_PIXEL;
+
+   if (io->pscdepth != GEN7_PSCDEPTH_OFF)
+      dw5 |= GEN6_WM_DW5_PS_COMPUTE_DEPTH;
+   if (io->use_z)
+      dw5 |= GEN6_WM_DW5_PS_USE_DEPTH;
+
+   if (ff->dispatch_enable)
+      dw5 |= GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+
+   if (io->write_omask)
+      dw5 |= GEN6_WM_DW5_PS_COMPUTE_OMASK;
+   if (io->use_w)
+      dw5 |= GEN6_WM_DW5_PS_USE_W;
+
+   if (ff->dual_source_blending)
+      dw5 |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+
+   dw6 = io->attr_count << GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT |
+         io->posoffset << GEN6_WM_DW6_PS_POSOFFSET__SHIFT;
+
+   dw6 |= (info->per_sample_dispatch) ?
+      GEN6_WM_DW6_MSDISPMODE_PERSAMPLE : GEN6_WM_DW6_MSDISPMODE_PERPIXEL;
+   /* ps[0..4] are DW2..DW6; ps[5..6] are the offsets of kernel slots 1 and 2 */
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 7);
+   ps->ps[0] = dw2;
+   ps->ps[1] = dw3;
+   ps->ps[2] = dw4;
+   ps->ps[3] = dw5;
+   ps->ps[4] = dw6;
+   ps->ps[5] = ff->kernel_offsets[1];
+   ps->ps[6] = ff->kernel_offsets[2];
+
+   return true;
+}
+
+static bool
+ps_set_gen7_3dstate_wm(struct ilo_state_ps *ps,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw1, dw2;
+   /* Gen7/7.5 only: pack DW1 and DW2 of 3DSTATE_WM into ps[0..1] */
+   ILO_DEV_ASSERT(dev, 7, 7.5);
+
+   dw1 = io->pscdepth << GEN7_WM_DW1_PSCDEPTH__SHIFT;
+   /* the next two bits are rewritten by ilo_state_ps_set_params() */
+   if (ff->dispatch_enable)
+      dw1 |= GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+   if (ff->kill_pixel)
+      dw1 |= GEN7_WM_DW1_PS_KILL_PIXEL;
+
+   if (io->use_z)
+      dw1 |= GEN7_WM_DW1_PS_USE_DEPTH;
+   if (io->use_w)
+      dw1 |= GEN7_WM_DW1_PS_USE_W;
+   if (io->use_coverage_mask)
+      dw1 |= GEN7_WM_DW1_PS_USE_COVERAGE_MASK;
+
+   dw2 = (info->per_sample_dispatch) ?
+      GEN7_WM_DW2_MSDISPMODE_PERSAMPLE : GEN7_WM_DW2_MSDISPMODE_PERPIXEL;
+   /* ps[2..7] are filled by ps_set_gen7_3DSTATE_PS() */
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 2);
+   ps->ps[0] = dw1;
+   ps->ps[1] = dw2;
+
+   return true;
+}
+
+static bool
+ps_set_gen7_3DSTATE_PS(struct ilo_state_ps *ps,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw2, dw3, dw4, dw5;
+   /* Gen7/7.5 only: pack DW2-DW5 of 3DSTATE_PS, plus kernel offsets 1 and 2 */
+   ILO_DEV_ASSERT(dev, 7, 7.5);
+
+   dw2 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+   /* the alternate floating-point mode is never selected */
+   if (false)
+      dw2 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+   dw3 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw4 = io->posoffset << GEN7_PS_DW4_POSOFFSET__SHIFT |
+         ff->dispatch_modes << GEN7_PS_DW4_DISPATCH_MODE__SHIFT;
+   /* Gen7.5 uses its own MAX_THREADS shift and adds an 8-bit sample mask */
+   if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      dw4 |= ff->thread_count << GEN75_PS_DW4_MAX_THREADS__SHIFT |
+             (ff->sample_mask & 0xff) << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
+   } else {
+      dw4 |= ff->thread_count << GEN7_PS_DW4_MAX_THREADS__SHIFT;
+   }
+
+   if (ff->pcb_enable)
+      dw4 |= GEN7_PS_DW4_PUSH_CONSTANT_ENABLE;
+   if (io->attr_count)
+      dw4 |= GEN7_PS_DW4_ATTR_ENABLE;
+   if (io->write_omask)
+      dw4 |= GEN7_PS_DW4_COMPUTE_OMASK;
+   if (info->rt_clear_enable)
+      dw4 |= GEN7_PS_DW4_RT_FAST_CLEAR;
+   if (ff->dual_source_blending)
+      dw4 |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+   if (info->rt_resolve_enable)
+      dw4 |= GEN7_PS_DW4_RT_RESOLVE;
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5) && ff->has_uav)
+      dw4 |= GEN75_PS_DW4_ACCESS_UAV;
+
+   dw5 = ff->grf_starts[0] << GEN7_PS_DW5_URB_GRF_START0__SHIFT |
+         ff->grf_starts[1] << GEN7_PS_DW5_URB_GRF_START1__SHIFT |
+         ff->grf_starts[2] << GEN7_PS_DW5_URB_GRF_START2__SHIFT;
+   /* ps[0..1] belong to 3DSTATE_WM, see ps_set_gen7_3dstate_wm() */
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 8);
+   ps->ps[2] = dw2;
+   ps->ps[3] = dw3;
+   ps->ps[4] = dw4;
+   ps->ps[5] = dw5;
+   ps->ps[6] = ff->kernel_offsets[1];
+   ps->ps[7] = ff->kernel_offsets[2];
+
+   return true;
+}
+
+static bool
+ps_set_gen8_3DSTATE_PS(struct ilo_state_ps *ps,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_ps_info *info,
+                       const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw3, dw4, dw6, dw7;
+   /* Gen8 only: pack DW3, DW4, DW6 and DW7 of 3DSTATE_PS into ps[0..3] */
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   dw3 = ff->sampler_count << GEN6_THREADDISP_SAMPLER_COUNT__SHIFT |
+         ff->surface_count << GEN6_THREADDISP_BINDING_TABLE_SIZE__SHIFT;
+   /* the alternate floating-point mode is never selected */
+   if (false)
+      dw3 |= GEN6_THREADDISP_FP_MODE_ALT;
+
+   dw4 = ff->scratch_space << GEN6_THREADSCRATCH_SPACE_PER_THREAD__SHIFT;
+
+   dw6 = ff->thread_count << GEN8_PS_DW6_MAX_THREADS__SHIFT |
+         io->posoffset << GEN8_PS_DW6_POSOFFSET__SHIFT |
+         ff->dispatch_modes << GEN8_PS_DW6_DISPATCH_MODE__SHIFT;
+
+   if (ff->pcb_enable)
+      dw6 |= GEN8_PS_DW6_PUSH_CONSTANT_ENABLE;
+
+   if (info->rt_clear_enable)
+      dw6 |= GEN8_PS_DW6_RT_FAST_CLEAR;
+   if (info->rt_resolve_enable)
+      dw6 |= GEN8_PS_DW6_RT_RESOLVE;
+
+   dw7 = ff->grf_starts[0] << GEN8_PS_DW7_URB_GRF_START0__SHIFT |
+         ff->grf_starts[1] << GEN8_PS_DW7_URB_GRF_START1__SHIFT |
+         ff->grf_starts[2] << GEN8_PS_DW7_URB_GRF_START2__SHIFT;
+   /* NOTE(review): ps[4] is written again by ps_set_gen8_3DSTATE_PS_EXTRA() */
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 6);
+   ps->ps[0] = dw3;
+   ps->ps[1] = dw4;
+   ps->ps[2] = dw6;
+   ps->ps[3] = dw7;
+   ps->ps[4] = ff->kernel_offsets[1];
+   ps->ps[5] = ff->kernel_offsets[2];
+
+   return true;
+}
+
+static bool
+ps_set_gen8_3DSTATE_PS_EXTRA(struct ilo_state_ps *ps,
+                             const struct ilo_dev *dev,
+                             const struct ilo_state_ps_info *info,
+                             const struct pixel_ff *ff)
+{
+   const struct ilo_state_ps_io_info *io = &info->io;
+   uint32_t dw1;
+   /* Gen8 only: pack DW1 of 3DSTATE_PS_EXTRA into ps[4] */
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   dw1 = io->pscdepth << GEN8_PSX_DW1_PSCDEPTH__SHIFT;
+
+   if (info->valid_kernels)
+      dw1 |= GEN8_PSX_DW1_VALID;
+   if (!io->has_rt_write)
+      dw1 |= GEN8_PSX_DW1_UAV_ONLY;
+   if (io->write_omask)
+      dw1 |= GEN8_PSX_DW1_COMPUTE_OMASK;
+   if (io->write_pixel_mask)
+      dw1 |= GEN8_PSX_DW1_KILL_PIXEL;
+
+   if (io->use_z)
+      dw1 |= GEN8_PSX_DW1_USE_DEPTH;
+   if (io->use_w)
+      dw1 |= GEN8_PSX_DW1_USE_W;
+   if (io->attr_count)
+      dw1 |= GEN8_PSX_DW1_ATTR_ENABLE;
+
+   if (info->per_sample_dispatch)
+      dw1 |= GEN8_PSX_DW1_PER_SAMPLE;
+   if (ff->has_uav)
+      dw1 |= GEN8_PSX_DW1_ACCESS_UAV;
+   if (io->use_coverage_mask)
+      dw1 |= GEN8_PSX_DW1_USE_COVERAGE_MASK;
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 151:
+    *
+    *     "When this bit (Pixel Shader Valid) clear the rest of this command
+    *      should also be clear."
+    */
+   if (!info->valid_kernels)
+      dw1 = 0;
+   /* NOTE(review): ps[4] also gets kernel_offsets[1] in 3DSTATE_PS -- confirm */
+   STATIC_ASSERT(ARRAY_SIZE(ps->ps) >= 5);
+   ps->ps[4] = dw1;
+
+   return true;
+}
+
+bool
+ilo_state_ps_init(struct ilo_state_ps *ps,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_ps_info *info)
+{
+   struct pixel_ff ff;
+   bool ok;
+
+   assert(ilo_is_zeroed(ps, sizeof(*ps)));
+
+   ok = ps_get_gen6_ff(dev, info, &ff);
+   /* the commands are packed whether or not ok is set; ff is initialized */
+   if (ilo_dev_gen(dev) < ILO_GEN(7)) {
+      ok &= ps_set_gen6_3dstate_wm(ps, dev, info, &ff);
+   } else if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+      ok &= ps_set_gen7_3dstate_wm(ps, dev, info, &ff);
+      ok &= ps_set_gen7_3DSTATE_PS(ps, dev, info, &ff);
+   } else {
+      ok &= ps_set_gen8_3DSTATE_PS(ps, dev, info, &ff);
+      ok &= ps_set_gen8_3DSTATE_PS_EXTRA(ps, dev, info, &ff);
+   }
+
+   /* keep the dispatch conditions for ilo_state_ps_set_params() */
+   ps->conds = ff.conds;
+
+   assert(ok);
+
+   return ok;
+}
+
+bool
+ilo_state_ps_init_disabled(struct ilo_state_ps *ps,
+                           const struct ilo_dev *dev)
+{
+   struct ilo_state_ps_info no_kernel_info;
+
+   /* all zeros: no valid kernel and zeroed io, as ps_validate_gen6() wants */
+   memset(&no_kernel_info, 0, sizeof(no_kernel_info));
+   return ilo_state_ps_init(ps, dev, &no_kernel_info);
+}
+
+bool
+ilo_state_ps_set_params(struct ilo_state_ps *ps,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_ps_params_info *params)
+{
+   bool dispatch_enable, kill_pixel;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* nothing is updated from params on Gen8 */
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))
+      return true;
+
+   dispatch_enable =
+      ps_params_get_gen6_dispatch_enable(dev, params, &ps->conds);
+   kill_pixel = ps_params_get_gen6_kill_pixel(dev, params, &ps->conds);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      /* ps[0] is DW1 of 3DSTATE_WM */
+      ps->ps[0] &= ~(GEN7_WM_DW1_PS_DISPATCH_ENABLE |
+                     GEN7_WM_DW1_PS_KILL_PIXEL);
+      if (dispatch_enable)
+         ps->ps[0] |= GEN7_WM_DW1_PS_DISPATCH_ENABLE;
+      if (kill_pixel)
+         ps->ps[0] |= GEN7_WM_DW1_PS_KILL_PIXEL;
+
+      /* ps[4] is DW4 of 3DSTATE_PS; it has a sample mask only on Gen7.5 */
+      if (ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+         ps->ps[4] &= ~GEN75_PS_DW4_SAMPLE_MASK__MASK;
+         ps->ps[4] |=
+            (params->sample_mask & 0xff) << GEN75_PS_DW4_SAMPLE_MASK__SHIFT;
+      }
+      ps->ps[4] &= ~GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+      if (params->dual_source_blending)
+         ps->ps[4] |= GEN7_PS_DW4_DUAL_SOURCE_BLEND;
+   } else {
+      /* ps[3] is DW5 of 3DSTATE_WM */
+      ps->ps[3] &= ~(GEN6_WM_DW5_PS_DISPATCH_ENABLE |
+                     GEN6_WM_DW5_PS_KILL_PIXEL |
+                     GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND);
+      if (dispatch_enable)
+         ps->ps[3] |= GEN6_WM_DW5_PS_DISPATCH_ENABLE;
+      if (kill_pixel)
+         ps->ps[3] |= GEN6_WM_DW5_PS_KILL_PIXEL;
+      if (params->dual_source_blending)
+         ps->ps[3] |= GEN6_WM_DW5_PS_DUAL_SOURCE_BLEND;
+   }
+
+   return true;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.c b/src/gallium/drivers/ilo/core/ilo_state_sol.c
new file mode 100644
index 0000000..38c0b71
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.c
@@ -0,0 +1,464 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_buffer.h"
+#include "ilo_state_sol.h"
+
+/* Assert that one stream's VUE read range and SO decls are within limits. */
+static bool
+sol_stream_validate_gen7(const struct ilo_dev *dev,
+                         const struct ilo_state_sol_stream_info *stream)
+{
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   assert(stream->vue_read_base + stream->vue_read_count <=
+         stream->cv_vue_attr_count);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 200:
+    *
+    *     "(Stream 0 Vertex Read Offset)
+    *      Format: U1 count of 256-bit units
+    *
+    *      Specifies amount of data to skip over before reading back Stream 0
+    *      vertex data. Must be zero if the GS is enabled and the Output
+    *      Vertex Size field in 3DSTATE_GS is programmed to 0 (i.e., one 16B
+    *      unit)."
+    *
+    *     "(Stream 0 Vertex Read Length)
+    *      Format: U5-1 count of 256-bit units
+    *
+    *      Specifies amount of vertex data to read back for Stream 0 vertices,
+    *      starting at the Stream 0 Vertex Read Offset location. Maximum
+    *      readback is 17 256-bit units (34 128-bit vertex attributes). Read
+    *      data past the end of the valid vertex data has undefined contents,
+    *      and therefore shouldn't be used to source stream out data.  Must be
+    *      zero (i.e., read length = 256b) if the GS is enabled and the Output
+    *      Vertex Size field in 3DSTATE_GS is programmed to 0 (i.e., one 16B
+    *      unit)."
+    */
+   assert(stream->vue_read_base == 0 || stream->vue_read_base == 2);
+   assert(stream->vue_read_count <= 34);
+   assert(stream->decl_count <= ILO_STATE_SOL_MAX_DECL_COUNT);
+
+   for (i = 0; i < stream->decl_count; i++) {
+      const struct ilo_state_sol_decl_info *decl = &stream->decls[i];
+
+      assert(decl->is_hole || decl->attr < stream->vue_read_count);
+
+      /*
+       * From the Ivy Bridge PRM, volume 2 part 1, page 205:
+       *
+       *     "There is only enough internal storage for the 128-bit vertex
+       *      header and 32 128-bit vertex attributes."
+       */
+      assert(decl->attr < 33); /* the header plus 32 attributes */
+
+      assert(decl->component_base < 4 &&
+             decl->component_base + decl->component_count <= 4);
+      assert(decl->buffer < ILO_STATE_SOL_MAX_BUFFER_COUNT);
+   }
+
+   return true;
+}
+
+/* Assert that every stream and every buffer stride in info is valid. */
+static bool
+sol_validate_gen7(const struct ilo_dev *dev,
+                  const struct ilo_state_sol_info *info)
+{
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 198:
+    *
+    *     "This bit (Render Stream Select) is used even if SO Function Enable
+    *      is DISABLED."
+    *
+    * From the Haswell PRM, volume 2b, page 796:
+    *
+    *     "SO Function Enable must also be ENABLED in order for this field
+    *      (Render Stream Select) to select a stream for rendering. When SO
+    *      Function Enable is DISABLED and Rendering Disable is cleared (i.e.,
+    *      rendering is enabled), StreamID is ignored downstream of the SO
+    *      stage, allowing any stream to be rendered."
+    *
+    * We want Gen7 behavior, but we have to require users to follow Gen7.5
+    * behavior: info->sol_enable must be set for info->render_stream to work.
+    */
+
+   for (i = 0; i < ARRAY_SIZE(info->streams); i++) {
+      if (!sol_stream_validate_gen7(dev, &info->streams[i]))
+         return false;
+   }
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 208:
+    *
+    *     "(Surface Pitch)
+    *      [0,2048]  Must be 0 or a multiple of 4 Bytes."
+    */
+   for (i = 0; i < ARRAY_SIZE(info->buffer_strides); i++) {
+      assert(info->buffer_strides[i] <= 2048 &&
+             info->buffer_strides[i] % 4 == 0);
+   }
+   return true;
+}
+
+/*
+ * Validate info, then pack the two 3DSTATE_STREAMOUT dwords into
+ * sol->streamout and copy the buffer strides into sol->strides.  Returns
+ * false, leaving sol untouched, when validation fails.
+ */
+static bool
+sol_set_gen7_3DSTATE_STREAMOUT(struct ilo_state_sol *sol,
+                               const struct ilo_dev *dev,
+                               const struct ilo_state_sol_info *info)
+{
+   uint8_t read_offset[ILO_STATE_SOL_MAX_STREAM_COUNT];
+   uint8_t read_len[ILO_STATE_SOL_MAX_STREAM_COUNT];
+   uint32_t dw1, dw2;
+   uint8_t s;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!sol_validate_gen7(dev, info))
+      return false;
+
+   /*
+    * Attributes are read in pairs and the length is encoded minus 1.  URB
+    * entries are aligned to 512-bits.  There is no need to worry about
+    * reading past entries.
+    */
+   for (s = 0; s < ARRAY_SIZE(info->streams); s++) {
+      const uint8_t pairs = (info->streams[s].vue_read_count + 1) / 2;
+
+      read_offset[s] = info->streams[s].vue_read_base / 2;
+      read_len[s] = (pairs) ? pairs - 1 : 0;
+   }
+
+   dw1 = info->render_stream << GEN7_SO_DW1_RENDER_STREAM_SELECT__SHIFT |
+         info->tristrip_reorder << GEN7_SO_DW1_REORDER_MODE__SHIFT;
+   dw1 |= (info->sol_enable) ? GEN7_SO_DW1_SO_ENABLE : 0;
+   dw1 |= (info->render_disable) ? GEN7_SO_DW1_RENDER_DISABLE : 0;
+   dw1 |= (info->stats_enable) ? GEN7_SO_DW1_STATISTICS : 0;
+
+   /* before Gen8, a buffer is enabled here iff its stride is non-zero */
+   if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+      uint8_t buffer_enables = 0;
+
+      for (s = 0; s < ARRAY_SIZE(info->buffer_strides); s++) {
+         if (info->buffer_strides[s])
+            buffer_enables |= 1 << s;
+      }
+
+      dw1 |= buffer_enables << GEN7_SO_DW1_BUFFER_ENABLES__SHIFT;
+   }
+
+   dw2 = read_offset[3] << GEN7_SO_DW2_STREAM3_READ_OFFSET__SHIFT |
+         read_len[3] << GEN7_SO_DW2_STREAM3_READ_LEN__SHIFT |
+         read_offset[2] << GEN7_SO_DW2_STREAM2_READ_OFFSET__SHIFT |
+         read_len[2] << GEN7_SO_DW2_STREAM2_READ_LEN__SHIFT |
+         read_offset[1] << GEN7_SO_DW2_STREAM1_READ_OFFSET__SHIFT |
+         read_len[1] << GEN7_SO_DW2_STREAM1_READ_LEN__SHIFT |
+         read_offset[0] << GEN7_SO_DW2_STREAM0_READ_OFFSET__SHIFT |
+         read_len[0] << GEN7_SO_DW2_STREAM0_READ_LEN__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(sol->streamout) >= 2);
+   sol->streamout[0] = dw1;
+   sol->streamout[1] = dw2;
+
+   memcpy(sol->strides, info->buffer_strides, sizeof(sol->strides));
+
+   return true;
+}
+
+/*
+ * Pack the two 3DSTATE_SO_DECL_LIST header dwords into sol->so_decl and
+ * max_decl_count 64-bit entries (one 16-bit decl per stream) into sol->decl.
+ */
+static bool
+sol_set_gen7_3DSTATE_SO_DECL_LIST(struct ilo_state_sol *sol,
+                                  const struct ilo_dev *dev,
+                                  const struct ilo_state_sol_info *info,
+                                  uint8_t max_decl_count)
+{
+   uint64_t decl_list[ILO_STATE_SOL_MAX_DECL_COUNT];
+   uint8_t decl_counts[ILO_STATE_SOL_MAX_STREAM_COUNT];
+   uint8_t buffer_selects[ILO_STATE_SOL_MAX_STREAM_COUNT];
+   uint32_t dw1, dw2;
+   uint8_t i, j;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+   memset(decl_list, 0, sizeof(decl_list[0]) * max_decl_count);
+
+   for (i = 0; i < ARRAY_SIZE(info->streams); i++) {
+      const struct ilo_state_sol_stream_info *stream = &info->streams[i];
+
+      assert(stream->decl_count <= max_decl_count);
+      decl_counts[i] = stream->decl_count;
+      buffer_selects[i] = 0;
+
+      for (j = 0; j < stream->decl_count; j++) {
+         const struct ilo_state_sol_decl_info *decl = &stream->decls[j];
+         const uint8_t mask = ((1 << decl->component_count) - 1) <<
+            decl->component_base;
+         uint16_t val = decl->buffer << GEN7_SO_DECL_OUTPUT_SLOT__SHIFT |
+                        mask << GEN7_SO_DECL_COMPONENT_MASK__SHIFT;
+
+         val |= (decl->is_hole) ? GEN7_SO_DECL_HOLE_FLAG :
+            decl->attr << GEN7_SO_DECL_REG_INDEX__SHIFT;
+
+         decl_list[j] |= (uint64_t) val << (16 * i);
+         buffer_selects[i] |= 1 << decl->buffer;
+      }
+   }
+
+   dw1 = buffer_selects[3] << GEN7_SO_DECL_DW1_STREAM3_BUFFER_SELECTS__SHIFT |
+         buffer_selects[2] << GEN7_SO_DECL_DW1_STREAM2_BUFFER_SELECTS__SHIFT |
+         buffer_selects[1] << GEN7_SO_DECL_DW1_STREAM1_BUFFER_SELECTS__SHIFT |
+         buffer_selects[0] << GEN7_SO_DECL_DW1_STREAM0_BUFFER_SELECTS__SHIFT;
+   dw2 = decl_counts[3] << GEN7_SO_DECL_DW2_STREAM3_ENTRY_COUNT__SHIFT |
+         decl_counts[2] << GEN7_SO_DECL_DW2_STREAM2_ENTRY_COUNT__SHIFT |
+         decl_counts[1] << GEN7_SO_DECL_DW2_STREAM1_ENTRY_COUNT__SHIFT |
+         decl_counts[0] << GEN7_SO_DECL_DW2_STREAM0_ENTRY_COUNT__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(sol->so_decl) >= 2);
+   sol->so_decl[0] = dw1;
+   sol->so_decl[1] = dw2;
+
+   STATIC_ASSERT(ARRAY_SIZE(sol->decl[0]) == 2);
+   if (max_decl_count) /* sol->decl may be NULL when there are no decls */
+      memcpy(sol->decl, decl_list, sizeof(sol->decl[0]) * max_decl_count);
+   sol->decl_count = max_decl_count;
+
+   return true;
+}
+
+/* Assert that a buffer's range and write offset controls are consistent. */
+static bool
+sol_buffer_validate_gen7(const struct ilo_dev *dev,
+                         const struct ilo_state_sol_buffer_info *info)
+{
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (info->buf) /* a bound range must be non-empty and start in the bo */
+      assert(info->offset < info->buf->bo_size && info->size);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 208:
+    *
+    *     "(Surface Base Address) This field specifies the starting DWord
+    *      address..."
+    */
+   assert(info->offset % 4 == 0);
+
+   /* loading or saving the write offset is Gen8+ only */
+   if (info->write_offset_load || info->write_offset_save)
+      assert(ilo_dev_gen(dev) >= ILO_GEN(8));
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 206:
+    *
+    *     "This field (Stream Offset) specifies the Offset in stream output
+    *      buffer to start at, or whether to append to the end of an existing
+    *      buffer. The Offset must be DWORD aligned."
+    */
+   if (info->write_offset_imm_enable) {
+      assert(info->write_offset_load);
+      assert(info->write_offset_imm % 4 == 0);
+   }
+   return true;
+}
+
+/* Return info->size clamped to the bo and rounded down to whole DWords. */
+static uint32_t
+sol_buffer_get_gen6_size(const struct ilo_dev *dev,
+                         const struct ilo_state_sol_buffer_info *info)
+{
+   uint32_t size;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* clamp without computing offset + size, which may wrap around */
+   if (!info->buf || info->offset >= info->buf->bo_size)
+      return 0;
+
+   size = (info->size <= info->buf->bo_size - info->offset) ? info->size :
+      info->buf->bo_size - info->offset;
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 208:
+    *
+    *     "(Surface End Address) This field specifies the ending DWord
+    *      address..."
+    */
+   return size & ~3;
+}
+
+/* Store the start and end offsets of the SO buffer range in sb->so_buf. */
+static bool
+sol_buffer_set_gen7_3dstate_so_buffer(struct ilo_state_sol_buffer *sb,
+                                      const struct ilo_dev *dev,
+                                      const struct ilo_state_sol_buffer_info *info)
+{
+   const uint32_t len = sol_buffer_get_gen6_size(dev, info);
+   const uint32_t begin = info->offset;
+
+   ILO_DEV_ASSERT(dev, 7, 7.5);
+   if (!sol_buffer_validate_gen7(dev, info))
+      return false;
+
+   STATIC_ASSERT(ARRAY_SIZE(sb->so_buf) >= 2);
+   sb->so_buf[0] = begin;
+   sb->so_buf[1] = (len) ? begin + len : 0; /* no end for an empty range */
+   return true;
+}
+
+/* Pack the four Gen8 3DSTATE_SO_BUFFER dwords kept in sb->so_buf. */
+static bool
+sol_buffer_set_gen8_3dstate_so_buffer(struct ilo_state_sol_buffer *sb,
+                                      const struct ilo_dev *dev,
+                                      const struct ilo_state_sol_buffer_info *info)
+{
+   const uint32_t bytes = sol_buffer_get_gen6_size(dev, info);
+   uint32_t ctrl = 0;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   if (!sol_buffer_validate_gen7(dev, info))
+      return false;
+
+   /* the buffer is enabled whenever a bo is given */
+   ctrl |= (info->buf) ? GEN8_SO_BUF_DW1_ENABLE : 0;
+   ctrl |= (info->write_offset_load) ?
+      GEN8_SO_BUF_DW1_OFFSET_WRITE_ENABLE : 0;
+   ctrl |= (info->write_offset_save) ? GEN8_SO_BUF_DW1_OFFSET_ENABLE : 0;
+
+   STATIC_ASSERT(ARRAY_SIZE(sb->so_buf) >= 4);
+   sb->so_buf[0] = ctrl;
+   sb->so_buf[1] = info->offset;
+
+   /*
+    * From the Broadwell PRM, volume 2b, page 205:
+    *
+    *     "This field (Surface Size) specifies the size of buffer in number
+    *      DWords minus 1 of the buffer in Graphics Memory."
+    */
+   sb->so_buf[2] = (bytes) ? bytes / 4 - 1 : 0;
+
+   /* load from imm or sb->write_offset_bo */
+   if (info->write_offset_imm_enable)
+      sb->so_buf[3] = info->write_offset_imm;
+   else
+      sb->so_buf[3] = ~0u;
+
+   return true;
+}
+
+/* Initialize a zeroed sol from info; info->data backs sol->decl on Gen7+. */
+bool
+ilo_state_sol_init(struct ilo_state_sol *sol,
+                   const struct ilo_dev *dev,
+                   const struct ilo_state_sol_info *info)
+{
+   bool ret = true;
+
+   assert(ilo_is_zeroed(sol, sizeof(*sol)));
+   assert(ilo_is_zeroed(info->data, info->data_size));
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      uint8_t max_decl_count = 0, i;
+
+      /* size the decl list after the stream with the most decls */
+      for (i = 0; i < ARRAY_SIZE(info->streams); i++) {
+         if (info->streams[i].decl_count > max_decl_count)
+            max_decl_count = info->streams[i].decl_count;
+      }
+
+      assert(ilo_state_sol_data_size(dev, max_decl_count) <= info->data_size);
+      sol->decl = (uint32_t (*)[2]) info->data;
+
+      ret &= sol_set_gen7_3DSTATE_STREAMOUT(sol, dev, info);
+      ret &= sol_set_gen7_3DSTATE_SO_DECL_LIST(sol, dev, info, max_decl_count);
+   }
+
+   assert(ret);
+   return ret;
+}
+
+/* Initialize sol from an all-zero info, except for render_disable. */
+bool
+ilo_state_sol_init_disabled(struct ilo_state_sol *sol,
+                            const struct ilo_dev *dev,
+                            bool render_disable)
+{
+   struct ilo_state_sol_info off;
+
+   memset(&off, 0, sizeof(off));
+   off.render_disable = render_disable;
+   return ilo_state_sol_init(sol, dev, &off);
+}
+
+/* Initialize a zeroed sb and record whether its two bos are needed. */
+bool
+ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
+                          const struct ilo_dev *dev,
+                          const struct ilo_state_sol_buffer_info *info)
+{
+   bool ret;
+
+   assert(ilo_is_zeroed(sb, sizeof(*sb)));
+
+   ret = (ilo_dev_gen(dev) >= ILO_GEN(8)) ?
+      sol_buffer_set_gen8_3dstate_so_buffer(sb, dev, info) :
+      sol_buffer_set_gen7_3dstate_so_buffer(sb, dev, info);
+
+   sb->need_bo = (info->size != 0);
+   /* an immediate write offset is loaded without a write offset bo */
+   sb->need_write_offset_bo = (info->write_offset_save ||
+         (info->write_offset_load && !info->write_offset_imm_enable));
+
+   assert(ret);
+   return ret;
+}
+
+/* Initialize sb from an all-zero info. */
+bool
+ilo_state_sol_buffer_init_disabled(struct ilo_state_sol_buffer *sb,
+                                   const struct ilo_dev *dev)
+{
+   struct ilo_state_sol_buffer_info none;
+
+   memset(&none, 0, sizeof(none));
+   return ilo_state_sol_buffer_init(sb, dev, &none);
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_sol.h b/src/gallium/drivers/ilo/core/ilo_state_sol.h
new file mode 100644
index 0000000..2513fcb
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_sol.h
@@ -0,0 +1,166 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_SOL_H
+#define ILO_STATE_SOL_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 193:
+ *
+ *     "Incoming topologies are tagged with a 2-bit StreamID."
+ */
+#define ILO_STATE_SOL_MAX_STREAM_COUNT 4
+
+/*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 195:
+ *
+ *     "Up to four SO buffers are supported."
+ */
+#define ILO_STATE_SOL_MAX_BUFFER_COUNT 4
+
+/*
+ * From the Ivy Bridge PRM, volume 2 part 1, page 201:
+ *
+ *     "All 128 decls..."
+ */
+#define ILO_STATE_SOL_MAX_DECL_COUNT 128
+
+/**
+ * Output a vertex attribute.
+ */
+struct ilo_state_sol_decl_info {
+   /* select an attribute from read ones; not encoded when is_hole is set */
+   uint8_t attr;
+   bool is_hole;
+
+   /* which components to write; base + count must not exceed 4 */
+   uint8_t component_base;
+   uint8_t component_count;
+
+   /* destination buffer, below ILO_STATE_SOL_MAX_BUFFER_COUNT */
+   uint8_t buffer;
+};
+
+struct ilo_state_sol_stream_info {
+   /* read count attributes starting at base, out of cv_vue_attr_count */
+   uint8_t cv_vue_attr_count;
+   uint8_t vue_read_base;   /* must be 0 or 2 */
+   uint8_t vue_read_count;  /* at most 34 */
+
+   uint8_t decl_count;      /* at most ILO_STATE_SOL_MAX_DECL_COUNT */
+   const struct ilo_state_sol_decl_info *decls;
+};
+
+struct ilo_state_sol_info {
+   void *data;        /* zeroed memory that backs the decl list on Gen7+ */
+   size_t data_size;  /* at least ilo_state_sol_data_size() bytes */
+
+   bool sol_enable;
+   bool stats_enable;
+   enum gen_reorder_mode tristrip_reorder;
+
+   bool render_disable;
+   /* ignored when SOL is disabled */
+   uint8_t render_stream;
+
+   /* a buffer is disabled when its stride is zero; multiples of 4 up to 2048 */
+   uint16_t buffer_strides[ILO_STATE_SOL_MAX_BUFFER_COUNT];
+
+   struct ilo_state_sol_stream_info streams[ILO_STATE_SOL_MAX_STREAM_COUNT];
+};
+
+struct ilo_state_sol {
+   uint32_t streamout[2];  /* packed 3DSTATE_STREAMOUT dwords */
+   uint16_t strides[4];    /* copy of info->buffer_strides */
+
+   uint32_t so_decl[2];    /* packed 3DSTATE_SO_DECL_LIST header dwords */
+   uint32_t (*decl)[2];    /* decl entries, stored in info->data */
+   uint8_t decl_count;     /* number of entries in decl */
+};
+
+struct ilo_buffer;
+
+struct ilo_state_sol_buffer_info {
+   const struct ilo_buffer *buf;
+   uint32_t offset;  /* start of the range; must be DWord aligned */
+   uint32_t size;    /* clamped to the end of buf's bo when packed */
+
+   /*
+    * Gen8+ only.  When enabled, require a write offset bo of at least
+    * (sizeof(uint32_t) * ILO_STATE_SOL_MAX_BUFFER_COUNT) bytes
+    */
+   bool write_offset_load;
+   bool write_offset_save;
+
+   bool write_offset_imm_enable;  /* requires write_offset_load */
+   uint32_t write_offset_imm;     /* must be DWord aligned */
+};
+
+struct ilo_state_sol_buffer {
+   uint32_t so_buf[4];  /* packed dwords; Gen7 and Gen7.5 set only two */
+
+   bool need_bo;               /* set when info->size is non-zero */
+   bool need_write_offset_bo;  /* set for saves and non-immediate loads */
+
+   /* managed by users */
+   struct intel_bo *bo;
+   struct intel_bo *write_offset_bo;
+};
+
+/* Bytes of info->data that ilo_state_sol_init() needs for max_decl_count. */
+static inline size_t
+ilo_state_sol_data_size(const struct ilo_dev *dev, uint8_t max_decl_count)
+{
+   const size_t count = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? max_decl_count : 0;
+   return sizeof(*((const struct ilo_state_sol *) NULL)->decl) * count;
+}
+
+bool
+ilo_state_sol_init(struct ilo_state_sol *sol,
+                   const struct ilo_dev *dev,
+                   const struct ilo_state_sol_info *info);
+
+bool
+ilo_state_sol_init_disabled(struct ilo_state_sol *sol,
+                            const struct ilo_dev *dev,
+                            bool render_disable);
+
+bool
+ilo_state_sol_buffer_init(struct ilo_state_sol_buffer *sb,
+                          const struct ilo_dev *dev,
+                          const struct ilo_state_sol_buffer_info *info);
+
+bool
+ilo_state_sol_buffer_init_disabled(struct ilo_state_sol_buffer *sb,
+                                   const struct ilo_dev *dev);
+
+#endif /* ILO_STATE_SOL_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.c b/src/gallium/drivers/ilo/core/ilo_state_surface.c
new file mode 100644
index 0000000..5be9f8f
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.c
@@ -0,0 +1,1179 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_buffer.h"
+#include "ilo_image.h"
+#include "ilo_state_surface.h"
+
+/* Write the six dwords of a Gen6 null SURFACE_STATE to surf->surface. */
+static bool
+surface_set_gen6_null_SURFACE_STATE(struct ilo_state_surface *surf,
+                                    const struct ilo_dev *dev)
+{
+   uint32_t dw0, dw3;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 71:
+    *
+    *     "All of the remaining fields in surface state are ignored for null
+    *      surfaces, with the following exceptions:
+    *
+    *        - [DevSNB+]: Width, Height, Depth, and LOD fields must match the
+    *          depth buffer's corresponding state for all render target
+    *          surfaces, including null.
+    *        - Surface Format must be R8G8B8A8_UNORM."
+    *
+    * From the Sandy Bridge PRM, volume 4 part 1, page 82:
+    *
+    *     "If Surface Type is SURFTYPE_NULL, this field (Tiled Surface) must
+    *      be true"
+    *
+    * Note that we ignore the first exception for all surface types.
+    */
+   dw3 = GEN6_TILING_X << GEN6_SURFACE_DW3_TILING__SHIFT;
+   dw0 = GEN6_SURFTYPE_NULL << GEN6_SURFACE_DW0_TYPE__SHIFT |
+         GEN6_FORMAT_R8G8B8A8_UNORM << GEN6_SURFACE_DW0_FORMAT__SHIFT;
+
+   /* every dword other than DW0 and DW3 is left zero */
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 6);
+   memset(surf->surface, 0, sizeof(surf->surface[0]) * 6);
+
+   surf->surface[0] = dw0;
+   surf->surface[3] = dw3;
+
+   return true;
+}
+
+/* Write a null SURFACE_STATE of 8 dwords, or of 13 dwords on Gen8+. */
+static bool
+surface_set_gen7_null_SURFACE_STATE(struct ilo_state_surface *surf,
+                                    const struct ilo_dev *dev)
+{
+   const bool is_gen8 = (ilo_dev_gen(dev) >= ILO_GEN(8));
+   const int dw_count = (is_gen8) ? 13 : 8;
+   uint32_t dw0;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   dw0 = GEN6_SURFTYPE_NULL << GEN7_SURFACE_DW0_TYPE__SHIFT |
+         GEN6_FORMAT_R8G8B8A8_UNORM << GEN7_SURFACE_DW0_FORMAT__SHIFT;
+   dw0 |= GEN6_TILING_X << ((is_gen8) ? GEN8_SURFACE_DW0_TILING__SHIFT :
+                                        GEN7_SURFACE_DW0_TILING__SHIFT);
+
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 13);
+   surf->surface[0] = dw0;
+   memset(&surf->surface[1], 0, sizeof(uint32_t) * (dw_count - 1));
+
+   return true;
+}
+
+/* Check that info describes a buffer range and layout that can be packed. */
+static bool
+surface_validate_gen6_buffer(const struct ilo_dev *dev,
+                             const struct ilo_state_surface_buffer_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* SVB writes are Gen6-only */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB);
+
+   /* written so that a wrapped info->offset + info->size is rejected too */
+   if (info->offset > info->buf->bo_size ||
+       info->size > info->buf->bo_size - info->offset) {
+      ilo_warn("invalid buffer range\n");
+      return false;
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 81:
+    *
+    *     "For surfaces of type SURFTYPE_BUFFER: [0,2047] -> [1B, 2048B]
+    *      For surfaces of type SURFTYPE_STRBUF: [0,2047] -> [1B, 2048B]"
+    */
+   if (!info->struct_size || info->struct_size > 2048) {
+      ilo_warn("invalid buffer struct size\n");
+      return false;
+   }
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
+    *
+    *     "The Base Address for linear render target surfaces and surfaces
+    *      accessed with the typed surface read/write data port messages must
+    *      be element-size aligned, for non-YUV surface formats, or a multiple
+    *      of 2 element-sizes for YUV surface formats.  Other linear surfaces
+    *      have no alignment requirements (byte alignment is sufficient)."
+    *
+    *     "Certain message types used to access surfaces have more stringent
+    *      alignment requirements. Please refer to the specific message
+    *      documentation for additional restrictions."
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 233, 235, and 237:
+    *
+    *     "the surface base address must be OWord aligned"
+    *
+    * for OWord Block Read/Write, Unaligned OWord Block Read, and OWord Dual
+    * Block Read/Write.
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 246 and 249:
+    *
+    *     "The surface base address must be DWord aligned"
+    *
+    * for DWord Scattered Read/Write and Byte Scattered Read/Write.
+    *
+    * We have to rely on users to correctly set info->struct_size here.  DWord
+    * Scattered Read/Write has conflicting pitch and alignment, but we do not
+    * use them yet so we are fine.
+    *
+    * It is unclear if sampling engine surfaces require aligned offsets.
+    */
+   if (info->access != ILO_STATE_SURFACE_ACCESS_DP_SVB) {
+      assert(info->struct_size % info->format_size == 0);
+      if (info->offset % info->struct_size) {
+         ilo_warn("bad buffer offset\n");
+         return false;
+      }
+   }
+
+   if (info->format == GEN6_FORMAT_RAW) {
+      /*
+       * From the Sandy Bridge PRM, volume 4 part 1, page 97:
+       *
+       *     ""RAW" is supported only with buffers and structured buffers
+       *      accessed via the untyped surface read/write and untyped atomic
+       *      operation messages, which do not have a column in the table."
+       */
+      assert(info->access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED);
+
+      /*
+       * Nothing is said about Untyped* messages, but I guess they require the
+       * base address to be DWord aligned.
+       */
+      if (info->offset % 4) {
+         ilo_warn("bad RAW buffer offset\n");
+         return false;
+      }
+
+      if (info->struct_size > 1) {
+         /* no STRBUF on Gen6 */
+         if (ilo_dev_gen(dev) == ILO_GEN(6)) {
+            ilo_warn("no STRBUF support\n");
+            return false;
+         }
+
+         /*
+          * From the Ivy Bridge PRM, volume 4 part 1, page 70:
+          *
+          *     "For linear surfaces with Surface Type of SURFTYPE_STRBUF, the
+          *      pitch must be a multiple of 4 bytes."
+          */
+         if (info->struct_size % 4) {
+            ilo_warn("bad STRBUF pitch\n");
+            return false;
+         }
+      }
+   }
+
+   return true;
+}
+
+/* Compute the buffer's entry count minus 1 and store it in *count. */
+static bool
+surface_get_gen6_buffer_struct_count(const struct ilo_dev *dev,
+                                     const struct ilo_state_surface_buffer_info *info,
+                                     uint32_t *count)
+{
+   uint32_t entries, limit;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* NOTE(review): a tail of exactly format_size bytes adds no entry; verify */
+   entries = info->size / info->struct_size;
+   if (info->access == ILO_STATE_SURFACE_ACCESS_DP_SVB &&
+       info->size % info->struct_size > info->format_size)
+      entries++;
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 77:
+    *
+    *     "For buffer surfaces, the number of entries in the buffer ranges
+    *      from 1 to 2^27."
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 68:
+    *
+    *     "For typed buffer and structured buffer surfaces, the number of
+    *      entries in the buffer ranges from 1 to 2^27.  For raw buffer
+    *      surfaces, the number of entries in the buffer is the number of
+    *      bytes which can range from 1 to 2^30."
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 69:
+    *
+    *     "For SURFTYPE_BUFFER: The low two bits of this field (Width) must be
+    *      11 if the Surface Format is RAW (the size of the buffer must be a
+    *      multiple of 4 bytes)."
+    */
+   limit = 1 << 27;
+   if (info->format == GEN6_FORMAT_RAW && info->struct_size == 1) {
+      entries &= ~3;
+      if (ilo_dev_gen(dev) >= ILO_GEN(7))
+         limit = 1 << 30;
+   }
+
+   if (!entries || entries > limit) {
+      ilo_warn("too many or zero buffer structs\n");
+      return false;
+   }
+
+   *count = entries - 1;
+   return true;
+}
+
+/* Pack a Gen6 SURFACE_STATE for a buffer; fails on an invalid info. */
+static bool
+surface_set_gen6_buffer_SURFACE_STATE(struct ilo_state_surface *surf,
+                                     const struct ilo_dev *dev,
+                                     const struct ilo_state_surface_buffer_info *info)
+{
+   uint32_t dw0, dw1, dw2, dw3;
+   uint32_t struct_count;
+   int width, height, depth;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   if (!surface_validate_gen6_buffer(dev, info))
+      return false;
+   if (!surface_get_gen6_buffer_struct_count(dev, info, &struct_count))
+      return false;
+
+   /* the entry count minus 1 is split across three fields */
+   width  = struct_count & 0x7f;            /* bits [6:0] */
+   height = (struct_count >> 7) & 0x1fff;   /* bits [19:7] */
+   depth  = (struct_count >> 20) & 0x7f;    /* bits [26:20] */
+
+   dw0 = GEN6_SURFTYPE_BUFFER << GEN6_SURFACE_DW0_TYPE__SHIFT |
+         info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT;
+   dw1 = info->offset;
+   dw2 = height << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
+         width << GEN6_SURFACE_DW2_WIDTH__SHIFT;
+   dw3 = depth << GEN6_SURFACE_DW3_DEPTH__SHIFT |
+         (info->struct_size - 1) << GEN6_SURFACE_DW3_PITCH__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 6);
+   surf->surface[0] = dw0;
+   surf->surface[1] = dw1;
+   surf->surface[2] = dw2;
+   surf->surface[3] = dw3;
+   surf->surface[4] = 0;
+   surf->surface[5] = 0;
+
+   surf->type = GEN6_SURFTYPE_BUFFER;
+   surf->min_lod = 0;
+   surf->mip_count = 0;
+
+   return true;
+}
+
+/* Pack a Gen7+ SURFACE_STATE for a buffer; fails on an invalid info. */
+static bool
+surface_set_gen7_buffer_SURFACE_STATE(struct ilo_state_surface *surf,
+                                     const struct ilo_dev *dev,
+                                     const struct ilo_state_surface_buffer_info *info)
+{
+   const bool is_gen8 = (ilo_dev_gen(dev) >= ILO_GEN(8));
+   uint32_t dw0, dw1, dw2, dw3, dw7;
+   enum gen_surface_type type;
+   uint32_t struct_count;
+   int width, height, depth;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   if (!surface_validate_gen6_buffer(dev, info))
+      return false;
+   if (!surface_get_gen6_buffer_struct_count(dev, info, &struct_count))
+      return false;
+
+   /* RAW buffers with a struct size above 1 are STRBUF, the rest BUFFER */
+   if (info->format == GEN6_FORMAT_RAW && info->struct_size > 1)
+      type = GEN7_SURFTYPE_STRBUF;
+   else
+      type = GEN6_SURFTYPE_BUFFER;
+
+   /* the entry count minus 1 is split across three fields */
+   width  = struct_count & 0x7f;            /* bits [6:0] */
+   height = (struct_count >> 7) & 0x3fff;   /* bits [20:7] */
+   depth  = (struct_count >> 21) & 0x3ff;   /* bits [30:21] */
+
+   dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT |
+         info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT;
+   dw1 = (is_gen8) ? 0 : info->offset; /* Gen8+ stores the offset in DW8 */
+   dw2 = GEN_SHIFT32(height, GEN7_SURFACE_DW2_HEIGHT) |
+         GEN_SHIFT32(width, GEN7_SURFACE_DW2_WIDTH);
+   dw3 = GEN_SHIFT32(depth, GEN7_SURFACE_DW3_DEPTH) |
+         GEN_SHIFT32(info->struct_size - 1, GEN7_SURFACE_DW3_PITCH);
+
+   /* only Gen7.5+ programs the SCS fields of DW7 */
+   dw7 = 0;
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+      dw7 |= GEN_SHIFT32(GEN75_SCS_RED,   GEN75_SURFACE_DW7_SCS_R) |
+             GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) |
+             GEN_SHIFT32(GEN75_SCS_BLUE,  GEN75_SURFACE_DW7_SCS_B) |
+             GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A);
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 13);
+   surf->surface[0] = dw0;
+   surf->surface[1] = dw1;
+   surf->surface[2] = dw2;
+   surf->surface[3] = dw3;
+   memset(&surf->surface[4], 0, sizeof(surf->surface[0]) * 3);
+   surf->surface[7] = dw7;
+   if (is_gen8) {
+      surf->surface[8] = info->offset;
+      memset(&surf->surface[9], 0, sizeof(surf->surface[0]) * 4);
+   }
+
+   surf->type = type;
+   surf->min_lod = 0;
+   surf->mip_count = 0;
+
+   return true;
+}
+
+static enum gen_surface_type
+get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (img->target) {   /* gallium target -> 1D, 2D, or 3D surface type */
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+      return GEN6_SURFTYPE_1D;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:         /* SURFTYPE_CUBE is chosen by the callers */
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      return GEN6_SURFTYPE_2D;
+   case PIPE_TEXTURE_3D:
+      return GEN6_SURFTYPE_3D;
+   default:
+      assert(!"unknown texture target");
+      return GEN6_SURFTYPE_NULL;  /* only reached when asserts are disabled */
+   }
+}
+
+static bool
+surface_validate_gen6_image(const struct ilo_dev *dev,
+                            const struct ilo_state_surface_image_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);   /* shared by the Gen6 and Gen7+ image setters */
+
+   switch (info->access) {      /* any other access only asserts */
+   case ILO_STATE_SURFACE_ACCESS_SAMPLER:
+   case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+      break;
+   case ILO_STATE_SURFACE_ACCESS_DP_TYPED:   /* typed access needs Gen7+ */
+      assert(ilo_dev_gen(dev) >= ILO_GEN(7));
+      break;
+   default:
+      assert(!"unsupported surface access");
+      break;
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 78:
+    *
+    *     "For surface types other than SURFTYPE_BUFFER, the Width specified
+    *      by this field must be less than or equal to the surface pitch
+    *      (specified in bytes via the Surface Pitch field)."
+    */
+   assert(info->img->bo_stride && info->img->bo_stride <= 512 * 1024 &&
+          info->img->width0 <= info->img->bo_stride);
+
+   if (info->is_cube_map) {
+      assert(get_gen6_surface_type(dev, info->img) == GEN6_SURFTYPE_2D);
+
+      /*
+       * From the Sandy Bridge PRM, volume 4 part 1, page 78:
+       *
+       *     "For cube maps, Width must be set equal to the Height."
+       */
+      assert(info->img->width0 == info->img->height0);
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 72:
+    *
+    *     "Tile Walk TILEWALK_YMAJOR is UNDEFINED for render target formats
+    *      that have 128 bits-per-element (BPE)."
+    *
+    *     "If Number of Multisamples is set to a value other than
+    *      MULTISAMPLECOUNT_1, this field cannot be set to the following
+    *      formats:
+    *
+    *      - any format with greater than 64 bits per element
+    *      - any compressed texture format (BC*)
+    *      - any YCRCB* format"
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 63:
+    *
+    *      If Number of Multisamples is set to a value other than
+    *      MULTISAMPLECOUNT_1, this field cannot be set to the following
+    *      formats: any format with greater than 64 bits per element, if
+    *      Number of Multisamples is MULTISAMPLECOUNT_8, any compressed
+    *      texture format (BC*), and any YCRCB* format.
+    *
+    * TODO none of the format restrictions quoted above are checked yet
+    */
+
+   if (ilo_dev_gen(dev) < ILO_GEN(8) && info->img->tiling == GEN8_TILING_W) {
+      ilo_warn("tiling W is not supported\n");
+      return false;   /* the only check here that fails instead of asserting */
+   }
+
+   return true;
+}
+
+/* Return the largest width and height allowed for the surface type of img. */
+static void
+get_gen6_max_extent(const struct ilo_dev *dev,
+                    const struct ilo_image *img,
+                    uint16_t *max_w, uint16_t *max_h)
+{
+   const uint16_t limit = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
+   uint16_t w = 1, h = 1;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (get_gen6_surface_type(dev, img)) {
+   case GEN6_SURFTYPE_1D:
+      w = limit;
+      break;
+   case GEN6_SURFTYPE_2D:
+      w = h = limit;
+      break;
+   case GEN6_SURFTYPE_3D:
+      w = h = 2048;
+      break;
+   default:
+      assert(!"invalid surface type");
+      break;
+   }
+
+   *max_w = w;
+   *max_h = h;
+}
+
+/* Return the Width and Height field values (size minus one) of the image. */
+static bool
+surface_get_gen6_image_extent(const struct ilo_dev *dev,
+                              const struct ilo_state_surface_image_info *info,
+                              uint16_t *width, uint16_t *height)
+{
+   uint16_t img_w, img_h, limit_w, limit_h;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   img_w = info->img->width0;
+   img_h = info->img->height0;
+   get_gen6_max_extent(dev, info->img, &limit_w, &limit_h);
+   assert(img_w && img_w <= limit_w && img_h && img_h <= limit_h);
+
+   *width = img_w - 1;
+   *height = img_h - 1;
+
+   return true;
+}
+
+static bool
+surface_get_gen6_image_slices(const struct ilo_dev *dev,
+                              const struct ilo_state_surface_image_info *info,
+                              uint16_t *depth, uint16_t *min_array_elem,
+                              uint16_t *rt_view_extent)
+{
+   uint16_t max_slice, d;   /* d is the Depth field value plus one */
+
+   ILO_DEV_ASSERT(dev, 6, 8);   /* yields Depth, Min Array Element, View Extent */
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 63:
+    *
+    *     "If this field (Surface Array) is enabled, the Surface Type must be
+    *      SURFTYPE_1D, SURFTYPE_2D, or SURFTYPE_CUBE. If this field is
+    *      disabled and Surface Type is SURFTYPE_1D, SURFTYPE_2D, or
+    *      SURFTYPE_CUBE, the Depth field must be set to zero."
+    *
+    * From the Ivy Bridge PRM, volume 4 part 1, page 69:
+    *
+    *     "This field (Depth) specifies the total number of levels for a
+    *      volume texture or the number of array elements allowed to be
+    *      accessed starting at the Minimum Array Element for arrayed
+    *      surfaces.  If the volume texture is MIP-mapped, this field
+    *      specifies the depth of the base MIP level."
+    *
+    *     "For SURFTYPE_CUBE:For Sampling Engine Surfaces, the range of this
+    *      field is [0,340], indicating the number of cube array elements
+    *      (equal to the number of underlying 2D array elements divided by 6).
+    *      For other surfaces, this field must be zero."
+    *
+    *     "Errata: For SURFTYPE_CUBE sampling engine surfaces, the range of
+    *      this field is limited to [0,85].
+    *
+    *      Errata: If Surface Array is enabled, and Depth is between 1024 and
+    *      2047, an incorrect array slice may be accessed if the requested
+    *      array index in the message is greater than or equal to 4096."
+    *
+    * The errata are Gen7-specific, and they limit the number of usable
+    * layers to (86 * 6), about 512.
+    */
+
+   switch (get_gen6_surface_type(dev, info->img)) {
+   case GEN6_SURFTYPE_1D:
+   case GEN6_SURFTYPE_2D:
+      max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7.5)) ? 2048 : 512;
+
+      assert(info->img->array_size <= max_slice);
+      max_slice = info->img->array_size;   /* now bounds the slice range */
+
+      d = info->slice_count;
+      if (info->is_cube_map) {
+         if (info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) {
+            if (!d || d % 6) {
+               ilo_warn("invalid cube slice count\n");
+               return false;
+            }
+
+            if (ilo_dev_gen(dev) == ILO_GEN(7) && d > 86 * 6) {
+               ilo_warn("cube slice count exceeds Gen7 limit\n");
+               return false;
+            }
+         } else {
+            /*
+             * Minimum Array Element and Depth must be 0; Render Target View
+             * Extent is ignored.
+             */
+            if (info->slice_base || d != 6) {
+               ilo_warn("no cube RT array support in data port\n");
+               return false;
+            }
+         }
+
+         d /= 6;   /* from 2D slices (faces) to cubes */
+      }
+
+      if (!info->is_array && d > 1) {
+         ilo_warn("non-array surface with non-zero depth\n");
+         return false;
+      }
+      break;
+   case GEN6_SURFTYPE_3D:
+      max_slice = 2048;
+
+      assert(info->img->depth0 <= max_slice);
+      max_slice = u_minify(info->img->depth0, info->level_base);
+
+      d = info->img->depth0;   /* Depth is that of the base level */
+
+      if (info->is_array) {
+         ilo_warn("3D surfaces cannot be arrays\n");
+         return false;
+      }
+      break;
+   default:
+      assert(!"invalid surface type");
+      return false;
+      break;
+   }
+
+   if (!info->slice_count ||
+       info->slice_base + info->slice_count > max_slice) {
+      ilo_warn("invalid slice range\n");
+      return false;
+   }
+
+   assert(d);
+   *depth = d - 1;
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 84:
+    *
+    *     "For Sampling Engine and Render Target 1D and 2D Surfaces:
+    *      This field (Minimum Array Element) indicates the minimum array
+    *      element that can be accessed as part of this surface.  This field
+    *      is added to the delivered array index before it is used to address
+    *      the surface.
+    *
+    *      For Render Target 3D Surfaces:
+    *      This field indicates the minimum `R' coordinate on the LOD
+    *      currently being rendered to.  This field is added to the delivered
+    *      array index before it is used to address the surface.
+    *
+    *      For Sampling Engine Cube Surfaces on [DevSNB+] only:
+    *      This field indicates the minimum array element in the underlying 2D
+    *      surface array that can be accessed as part of this surface (the
+    *      cube array index is multipled by 6 to compute this value, although
+    *      this field is not restricted to only multiples of 6). This field is
+    *      added to the delivered array index before it is used to address the
+    *      surface.
+    *
+    *      For Other Surfaces:
+    *      This field must be set to zero."
+    *
+    * On Gen7+, typed surfaces are treated like sampling engine 1D and 2D
+    * surfaces.
+    */
+   *min_array_elem = info->slice_base;
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 84:
+    *
+    *     "For Render Target 3D Surfaces:
+    *      This field (Render Target View Extent) indicates the extent of the
+    *      accessible `R' coordinates minus 1 on the LOD currently being
+    *      rendered to.
+    *
+    *      For Render Target 1D and 2D Surfaces:
+    *      This field must be set to the same value as the Depth field.
+    *
+    *      For Other Surfaces:
+    *      This field is ignored."
+    */
+   *rt_view_extent = info->slice_count - 1;
+
+   return true;
+}
+
+static bool
+surface_get_gen6_image_levels(const struct ilo_dev *dev,
+                              const struct ilo_state_surface_image_info *info,
+                              uint8_t *min_lod, uint8_t *mip_count)
+{
+   uint8_t max_level = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 15 : 14;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   assert(info->img->level_count <= max_level);
+   max_level = info->img->level_count;   /* now bounds the level range */
+
+   if (!info->level_count ||
+       info->level_base + info->level_count > max_level) {
+      ilo_warn("invalid level range\n");
+      return false;
+   }
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 79:
+    *
+    *     "For Sampling Engine Surfaces:
+    *      This field (MIP Count / LOD) indicates the number of MIP levels
+    *      allowed to be accessed starting at Surface Min LOD, which must be
+    *      less than or equal to the number of MIP levels actually stored in
+    *      memory for this surface.
+    *
+    *      Force the mip map access to be between the mipmap specified by the
+    *      integer bits of the Min LOD and the ceiling of the value specified
+    *      here.
+    *
+    *      For Render Target Surfaces:
+    *      This field defines the MIP level that is currently being rendered
+    *      into. This is the absolute MIP level on the surface and is not
+    *      relative to the Surface Min LOD field, which is ignored for render
+    *      target surfaces.
+    *
+    *      For Other Surfaces:
+    *      This field is reserved : MBZ"
+    *
+    * From the Sandy Bridge PRM, volume 4 part 1, page 83:
+    *
+    *     "For Sampling Engine Surfaces:
+    *
+    *      This field (Surface Min LOD) indicates the most detailed LOD that
+    *      can be accessed as part of this surface.  This field is added to
+    *      the delivered LOD (sample_l, ld, or resinfo message types) before
+    *      it is used to address the surface.
+    *
+    *      For Other Surfaces:
+    *      This field is ignored."
+    *
+    * On Gen7+, typed surfaces are treated like sampling engine surfaces.
+    */
+   if (info->access == ILO_STATE_SURFACE_ACCESS_DP_RENDER) {
+      assert(info->level_count == 1);   /* a render target binds one level */
+
+      *min_lod = 0;
+      *mip_count = info->level_base;    /* the absolute level rendered to */
+   } else {
+      *min_lod = info->level_base;
+      *mip_count = info->level_count - 1;   /* level count, minus one */
+   }
+
+   return true;
+}
+
+/* Translate the image sample count to the hardware enum; always true. */
+static bool
+surface_get_gen6_image_sample_count(const struct ilo_dev *dev,
+                                    const struct ilo_state_surface_image_info *info,
+                                    enum gen_sample_count *sample_count)
+{
+   /* minimum generation asserted for the count; Gen6 in the default case */
+   int min_gen = ILO_GEN(6);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (info->img->sample_count) {
+   case 1:
+      *sample_count = GEN6_NUMSAMPLES_1;
+      break;
+   case 2:
+      *sample_count = GEN8_NUMSAMPLES_2;
+      min_gen = ILO_GEN(8);
+      break;
+   case 4:
+      *sample_count = GEN6_NUMSAMPLES_4;
+      break;
+   case 8:
+      *sample_count = GEN7_NUMSAMPLES_8;
+      min_gen = ILO_GEN(7);
+      break;
+   case 16:
+      *sample_count = GEN8_NUMSAMPLES_16;
+      min_gen = ILO_GEN(8);
+      break;
+   default:
+      assert(!"invalid sample count");
+      *sample_count = GEN6_NUMSAMPLES_1;
+      break;
+   }
+
+   assert(ilo_dev_gen(dev) >= min_gen);
+
+   return true;
+}
+
+static bool
+surface_get_gen6_image_alignments(const struct ilo_dev *dev,
+                                  const struct ilo_state_surface_image_info *info,
+                                  uint32_t *alignments)
+{
+   uint32_t a = 0;     /* accumulated HALIGN and VALIGN bits */
+   bool err = false;   /* set when align_i or align_j is not encodable */
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      switch (info->img->align_i) {
+      case 4:
+         a |= GEN8_SURFACE_DW0_HALIGN_4;
+         break;
+      case 8:
+         a |= GEN8_SURFACE_DW0_HALIGN_8;
+         break;
+      case 16:
+         a |= GEN8_SURFACE_DW0_HALIGN_16;
+         break;
+      default:
+         err = true;
+         break;
+      }
+
+      switch (info->img->align_j) {
+      case 4:
+         a |= GEN7_SURFACE_DW0_VALIGN_4;   /* NOTE(review): GEN7 name on Gen8 -- confirm */
+         break;
+      case 8:
+         a |= GEN8_SURFACE_DW0_VALIGN_8;
+         break;
+      case 16:
+         a |= GEN8_SURFACE_DW0_VALIGN_16;
+         break;
+      default:
+         err = true;
+         break;
+      }
+   } else if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      switch (info->img->align_i) {
+      case 4:
+         a |= GEN7_SURFACE_DW0_HALIGN_4;
+         break;
+      case 8:
+         a |= GEN7_SURFACE_DW0_HALIGN_8;
+         break;
+      default:
+         err = true;
+         break;
+      }
+
+      switch (info->img->align_j) {
+      case 2:
+         a |= GEN7_SURFACE_DW0_VALIGN_2;
+         break;
+      case 4:
+         a |= GEN7_SURFACE_DW0_VALIGN_4;
+         break;
+      default:
+         err = true;
+         break;
+      }
+   } else {   /* Gen6: no HALIGN bits are set; the VALIGN bit belongs to DW5 */
+      if (info->img->align_i != 4)
+         err = true;
+
+      switch (info->img->align_j) {
+      case 2:
+         a |= GEN6_SURFACE_DW5_VALIGN_2;
+         break;
+      case 4:
+         a |= GEN6_SURFACE_DW5_VALIGN_4;
+         break;
+      default:
+         err = true;
+         break;
+      }
+   }
+
+   if (err)
+      assert(!"invalid HALIGN or VALIGN");
+
+   *alignments = a;   /* callers OR this into DW0 on Gen7+ and use it as DW5 on Gen6 */
+
+   return true;       /* an unsupported alignment only asserts */
+}
+
+static bool
+surface_set_gen6_image_SURFACE_STATE(struct ilo_state_surface *surf,
+                                     const struct ilo_dev *dev,
+                                     const struct ilo_state_surface_image_info *info)
+{
+   uint16_t width, height, depth, array_base, view_extent;
+   uint8_t min_lod, mip_count;
+   enum gen_sample_count sample_count;
+   uint32_t alignments;
+   enum gen_surface_type type;
+   uint32_t dw0, dw2, dw3, dw4, dw5;
+
+   ILO_DEV_ASSERT(dev, 6, 6);   /* Gen6 only; Gen7+ has its own setter */
+
+   if (!surface_validate_gen6_image(dev, info) ||
+       !surface_get_gen6_image_extent(dev, info, &width, &height) ||
+       !surface_get_gen6_image_slices(dev, info, &depth, &array_base,
+                                      &view_extent) ||
+       !surface_get_gen6_image_levels(dev, info, &min_lod, &mip_count) ||
+       !surface_get_gen6_image_sample_count(dev, info, &sample_count) ||
+       !surface_get_gen6_image_alignments(dev, info, &alignments))
+      return false;
+
+   /* no ARYSPC_LOD0 on Gen6, so ILO_IMAGE_WALK_LOD cannot be expressed */
+   assert(info->img->walk != ILO_IMAGE_WALK_LOD);
+   /* no UMS/CMS: multisampled images must use interleaved samples */
+   if (info->img->sample_count > 1)
+      assert(info->img->interleaved_samples);
+
+   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
+      get_gen6_surface_type(dev, info->img);
+
+   dw0 = type << GEN6_SURFACE_DW0_TYPE__SHIFT |
+         info->format << GEN6_SURFACE_DW0_FORMAT__SHIFT |
+         GEN6_SURFACE_DW0_MIPLAYOUT_BELOW;
+
+   /*
+    * From the Sandy Bridge PRM, volume 4 part 1, page 74:
+    *
+    *     "CUBE_AVERAGE may only be selected if all of the Cube Face Enable
+    *      fields are equal to one."
+    *
+    * From the Sandy Bridge PRM, volume 4 part 1, page 75-76:
+    *
+    *     "For SURFTYPE_CUBE Surfaces accessed via the Sampling Engine:
+    *      Bits 5:0 of this field (Cube Face Enables) enable the individual
+    *      faces of a cube map.  Enabling a face indicates that the face is
+    *      present in the cube map, while disabling it indicates that that
+    *      face is represented by the texture map's border color. Refer to
+    *      Memory Data Formats for the correlation between faces and the cube
+    *      map memory layout. Note that storage for disabled faces must be
+    *      provided.
+    *
+    *      For other surfaces:
+    *      This field is reserved : MBZ"
+    *
+    *     "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this
+    *      field must be programmed to 111111b (all faces enabled)."
+    */
+   if (info->is_cube_map &&
+       info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER) {
+      dw0 |= GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE |
+             GEN6_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
+   }
+
+   dw2 = height << GEN6_SURFACE_DW2_HEIGHT__SHIFT |
+         width << GEN6_SURFACE_DW2_WIDTH__SHIFT |
+         mip_count << GEN6_SURFACE_DW2_MIP_COUNT_LOD__SHIFT;
+
+   dw3 = depth << GEN6_SURFACE_DW3_DEPTH__SHIFT |
+         (info->img->bo_stride - 1) << GEN6_SURFACE_DW3_PITCH__SHIFT |
+         info->img->tiling << GEN6_SURFACE_DW3_TILING__SHIFT;
+
+   dw4 = min_lod << GEN6_SURFACE_DW4_MIN_LOD__SHIFT |
+         array_base << GEN6_SURFACE_DW4_MIN_ARRAY_ELEMENT__SHIFT |
+         view_extent << GEN6_SURFACE_DW4_RT_VIEW_EXTENT__SHIFT |
+         sample_count << GEN6_SURFACE_DW4_MULTISAMPLECOUNT__SHIFT;
+
+   dw5 = alignments;   /* only a VALIGN bit on Gen6 */
+
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 6);
+   surf->surface[0] = dw0;
+   surf->surface[1] = 0;   /* NOTE(review): presumably the address, set later -- confirm */
+   surf->surface[2] = dw2;
+   surf->surface[3] = dw3;
+   surf->surface[4] = dw4;
+   surf->surface[5] = dw5;
+
+   surf->type = type;             /* SURFTYPE_CUBE for cube maps */
+   surf->min_lod = min_lod;       /* copies of the values packed above */
+   surf->mip_count = mip_count;
+
+   return true;
+}
+
+static bool
+surface_set_gen7_image_SURFACE_STATE(struct ilo_state_surface *surf,
+                                     const struct ilo_dev *dev,
+                                     const struct ilo_state_surface_image_info *info)
+{
+   uint16_t width, height, depth, array_base, view_extent;
+   uint8_t min_lod, mip_count;
+   uint32_t alignments;
+   enum gen_sample_count sample_count;
+   enum gen_surface_type type;
+   uint32_t dw0, dw1, dw2, dw3, dw4, dw5, dw7;
+
+   ILO_DEV_ASSERT(dev, 7, 8);   /* covers Gen7, Gen7.5, and Gen8 */
+
+   if (!surface_validate_gen6_image(dev, info) ||
+       !surface_get_gen6_image_extent(dev, info, &width, &height) ||
+       !surface_get_gen6_image_slices(dev, info, &depth, &array_base,
+                                      &view_extent) ||
+       !surface_get_gen6_image_levels(dev, info, &min_lod, &mip_count) ||
+       !surface_get_gen6_image_sample_count(dev, info, &sample_count) ||
+       !surface_get_gen6_image_alignments(dev, info, &alignments))
+      return false;
+
+   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
+      get_gen6_surface_type(dev, info->img);
+
+   dw0 = type << GEN7_SURFACE_DW0_TYPE__SHIFT |
+         info->format << GEN7_SURFACE_DW0_FORMAT__SHIFT |
+         alignments;   /* HALIGN and VALIGN are part of DW0 on Gen7+ */
+
+   if (info->is_array)
+      dw0 |= GEN7_SURFACE_DW0_IS_ARRAY;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {   /* Gen8 shift; no ARYSPC is set */
+      dw0 |= info->img->tiling << GEN8_SURFACE_DW0_TILING__SHIFT;
+   } else {
+      dw0 |= info->img->tiling << GEN7_SURFACE_DW0_TILING__SHIFT;
+
+      if (info->img->walk == ILO_IMAGE_WALK_LOD)
+         dw0 |= GEN7_SURFACE_DW0_ARYSPC_LOD0;
+      else
+         dw0 |= GEN7_SURFACE_DW0_ARYSPC_FULL;
+   }
+
+   /*
+    * From the Ivy Bridge PRM, volume 4 part 1, page 67:
+    *
+    *     "For SURFTYPE_CUBE Surfaces accessed via the Sampling Engine: Bits
+    *      5:0 of this field (Cube Face Enables) enable the individual faces
+    *      of a cube map. Enabling a face indicates that the face is present
+    *      in the cube map, while disabling it indicates that that face is
+    *      represented by the texture map's border color. Refer to Memory Data
+    *      Formats for the correlation between faces and the cube map memory
+    *      layout. Note that storage for disabled faces must be provided. For
+    *      other surfaces this field is reserved and MBZ."
+    *
+    *     "When TEXCOORDMODE_CLAMP is used when accessing a cube map, this
+    *      field must be programmed to 111111b (all faces enabled). This field
+    *      is ignored unless the Surface Type is SURFTYPE_CUBE."
+    */
+   if (info->is_cube_map &&
+       info->access == ILO_STATE_SURFACE_ACCESS_SAMPLER)
+      dw0 |= GEN7_SURFACE_DW0_CUBE_FACE_ENABLES__MASK;
+
+   dw1 = 0;
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {   /* Gen8 QPitch, in units of 4 */
+      assert(info->img->walk_layer_height % 4 == 0);
+      dw1 |= info->img->walk_layer_height / 4 <<
+         GEN8_SURFACE_DW1_QPITCH__SHIFT;
+   }
+
+   dw2 = height << GEN7_SURFACE_DW2_HEIGHT__SHIFT |
+         width << GEN7_SURFACE_DW2_WIDTH__SHIFT;
+
+   dw3 = depth << GEN7_SURFACE_DW3_DEPTH__SHIFT |
+         (info->img->bo_stride - 1) << GEN7_SURFACE_DW3_PITCH__SHIFT;
+
+   if (ilo_dev_gen(dev) == ILO_GEN(7.5))   /* ORs in zero: an explicit no-op */
+      dw3 |= 0 << GEN75_SURFACE_DW3_INTEGER_SURFACE_FORMAT__SHIFT;
+
+   dw4 = array_base << GEN7_SURFACE_DW4_MIN_ARRAY_ELEMENT__SHIFT |
+         view_extent << GEN7_SURFACE_DW4_RT_VIEW_EXTENT__SHIFT |
+         sample_count << GEN7_SURFACE_DW4_MULTISAMPLECOUNT__SHIFT;
+
+   /*
+    * MSFMT_MSS means the samples are not interleaved and MSFMT_DEPTH_STENCIL
+    * means the samples are interleaved.  The layouts are the same when the
+    * number of samples is 1.
+    */
+   if (info->img->interleaved_samples && info->img->sample_count > 1) {
+      assert(info->access != ILO_STATE_SURFACE_ACCESS_DP_RENDER);
+      dw4 |= GEN7_SURFACE_DW4_MSFMT_DEPTH_STENCIL;
+   } else {
+      dw4 |= GEN7_SURFACE_DW4_MSFMT_MSS;
+   }
+
+   dw5 = min_lod << GEN7_SURFACE_DW5_MIN_LOD__SHIFT |
+         mip_count << GEN7_SURFACE_DW5_MIP_COUNT_LOD__SHIFT;
+
+   dw7 = 0;   /* the shader channel selects are programmed only on Gen7.5+ */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+      dw7 |= GEN_SHIFT32(GEN75_SCS_RED,   GEN75_SURFACE_DW7_SCS_R) |
+             GEN_SHIFT32(GEN75_SCS_GREEN, GEN75_SURFACE_DW7_SCS_G) |
+             GEN_SHIFT32(GEN75_SCS_BLUE,  GEN75_SURFACE_DW7_SCS_B) |
+             GEN_SHIFT32(GEN75_SCS_ALPHA, GEN75_SURFACE_DW7_SCS_A);
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(surf->surface) >= 13);
+   surf->surface[0] = dw0;
+   surf->surface[1] = dw1;   /* zero before Gen8 */
+   surf->surface[2] = dw2;
+   surf->surface[3] = dw3;
+   surf->surface[4] = dw4;
+   surf->surface[5] = dw5;
+   surf->surface[6] = 0;
+   surf->surface[7] = dw7;
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {   /* DW8-DW12 are written on Gen8 only */
+      surf->surface[8] = 0;
+      surf->surface[9] = 0;
+      surf->surface[10] = 0;
+      surf->surface[11] = 0;
+      surf->surface[12] = 0;
+   }
+
+   surf->type = type;             /* SURFTYPE_CUBE for cube maps */
+   surf->min_lod = min_lod;       /* copies of the values packed into DW5 */
+   surf->mip_count = mip_count;
+
+   return true;
+}
+
+/* Initialize a zeroed surf as a null surface for the generation of dev. */
+bool
+ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
+                                const struct ilo_dev *dev)
+{
+   bool ok;
+
+   assert(ilo_is_zeroed(surf, sizeof(*surf)));
+
+   ok = (ilo_dev_gen(dev) >= ILO_GEN(7)) ?
+      surface_set_gen7_null_SURFACE_STATE(surf, dev) :
+      surface_set_gen6_null_SURFACE_STATE(surf, dev);
+
+   surf->type = GEN6_SURFTYPE_NULL;
+   surf->readonly = true;
+
+   assert(ok);
+
+   return ok;
+}
+
+/* Initialize a zeroed surf from a buffer description; false if rejected. */
+bool
+ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf,
+                                  const struct ilo_dev *dev,
+                                  const struct ilo_state_surface_buffer_info *info)
+{
+   bool ok;
+
+   assert(ilo_is_zeroed(surf, sizeof(*surf)));
+
+   ok = (ilo_dev_gen(dev) >= ILO_GEN(7)) ?
+      surface_set_gen7_buffer_SURFACE_STATE(surf, dev, info) :
+      surface_set_gen6_buffer_SURFACE_STATE(surf, dev, info);
+
+   surf->readonly = info->readonly;
+
+   assert(ok);
+
+   return ok;
+}
+
+/* Initialize a zeroed surf from an image description; false if rejected. */
+bool
+ilo_state_surface_init_for_image(struct ilo_state_surface *surf,
+                                 const struct ilo_dev *dev,
+                                 const struct ilo_state_surface_image_info *info)
+{
+   bool ok;
+
+   assert(ilo_is_zeroed(surf, sizeof(*surf)));
+
+   ok = (ilo_dev_gen(dev) >= ILO_GEN(7)) ?
+      surface_set_gen7_image_SURFACE_STATE(surf, dev, info) :
+      surface_set_gen6_image_SURFACE_STATE(surf, dev, info);
+
+   surf->is_integer = info->is_integer;
+   surf->readonly = info->readonly;
+   surf->scanout = info->img->scanout;
+
+   assert(ok);
+
+   return ok;
+}
+
+/* Replace the shader channel selects in DW7 with rgba[]; needs Gen7.5+. */
+bool
+ilo_state_surface_set_scs(struct ilo_state_surface *surf,
+                          const struct ilo_dev *dev,
+                          enum gen_surface_scs rgba[4])
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+   assert(ilo_dev_gen(dev) >= ILO_GEN(7.5));
+
+   /* clear the old selects before merging in the new ones */
+   surf->surface[7] &= ~GEN75_SURFACE_DW7_SCS__MASK;
+   surf->surface[7] |= GEN_SHIFT32(rgba[0], GEN75_SURFACE_DW7_SCS_R) |
+                       GEN_SHIFT32(rgba[1], GEN75_SURFACE_DW7_SCS_G) |
+                       GEN_SHIFT32(rgba[2], GEN75_SURFACE_DW7_SCS_B) |
+                       GEN_SHIFT32(rgba[3], GEN75_SURFACE_DW7_SCS_A);
+
+   return true;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface.h b/src/gallium/drivers/ilo/core/ilo_state_surface.h
new file mode 100644
index 0000000..9c02542
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface.h
@@ -0,0 +1,121 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_SURFACE_H
+#define ILO_STATE_SURFACE_H
+
+#include "genhw/genhw.h"
+#include "intel_winsys.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+struct ilo_buffer;
+struct ilo_image;
+
+enum ilo_state_surface_access {
+   ILO_STATE_SURFACE_ACCESS_SAMPLER,      /* sampling engine surfaces */
+   ILO_STATE_SURFACE_ACCESS_DP_RENDER,    /* render target surfaces */
+   ILO_STATE_SURFACE_ACCESS_DP_TYPED,     /* typed surfaces */
+   ILO_STATE_SURFACE_ACCESS_DP_UNTYPED,   /* untyped surfaces */
+   ILO_STATE_SURFACE_ACCESS_DP_DATA,      /* presumably data cache accesses -- confirm */
+   ILO_STATE_SURFACE_ACCESS_DP_SVB,       /* presumably streamed vertex buffers -- confirm */
+};
+
+struct ilo_state_surface_buffer_info {
+   const struct ilo_buffer *buf;
+
+   enum ilo_state_surface_access access;
+
+   enum gen_surface_format format;   /* programmed into DW0 */
+   uint8_t format_size;
+
+   bool readonly;                    /* copied to ilo_state_surface::readonly */
+   uint16_t struct_size;             /* programmed as the pitch, minus one */
+
+   uint32_t offset;                  /* written to DW1, or to DW8 on Gen8 */
+   uint32_t size;
+};
+
+struct ilo_state_surface_image_info {
+   const struct ilo_image *img;
+
+   enum ilo_state_surface_access access;   /* SAMPLER, DP_RENDER, or DP_TYPED */
+
+   enum gen_surface_format format;         /* programmed into DW0 */
+   bool is_integer;                        /* copied to ilo_state_surface */
+
+   bool readonly;                          /* copied to ilo_state_surface */
+   bool is_cube_map;                       /* selects SURFTYPE_CUBE */
+   bool is_array;                          /* must be set for multi-slice 1D/2D views */
+
+   uint8_t level_base;                     /* first mip level accessed */
+   uint8_t level_count;                    /* must be 1 for DP_RENDER */
+   uint16_t slice_base;                    /* becomes Minimum Array Element */
+   uint16_t slice_count;                   /* counts 2D slices: 6 per cube */
+};
+
+struct ilo_state_surface {
+   uint32_t surface[13];         /* SURFACE_STATE; Gen6 fills 6 dwords, Gen8 all 13 */
+
+   enum gen_surface_type type;   /* surface type programmed into DW0 */
+   uint8_t min_lod;              /* as programmed; 0 for buffers */
+   uint8_t mip_count;            /* as programmed; 0 for buffers */
+   bool is_integer;              /* copied from the image info */
+
+   bool readonly;                /* copied from the info; true for null surfaces */
+   bool scanout;                 /* copied from ilo_image::scanout */
+
+   /* managed by users */
+   struct intel_bo *bo;
+};
+
+bool
+ilo_state_surface_valid_format(const struct ilo_dev *dev,
+                               enum ilo_state_surface_access access,
+                               enum gen_surface_format format);
+
+bool
+ilo_state_surface_init_for_null(struct ilo_state_surface *surf,
+                                const struct ilo_dev *dev);
+
+bool
+ilo_state_surface_init_for_buffer(struct ilo_state_surface *surf,
+                                  const struct ilo_dev *dev,
+                                  const struct ilo_state_surface_buffer_info *info);
+
+bool
+ilo_state_surface_init_for_image(struct ilo_state_surface *surf,
+                                 const struct ilo_dev *dev,
+                                 const struct ilo_state_surface_image_info *info);
+
+bool
+ilo_state_surface_set_scs(struct ilo_state_surface *surf,
+                          const struct ilo_dev *dev,
+                          enum gen_surface_scs rgba[4]);
+
+#endif /* ILO_STATE_SURFACE_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_surface_format.c b/src/gallium/drivers/ilo/core/ilo_state_surface_format.c
new file mode 100644
index 0000000..a40c1b8
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_surface_format.c
@@ -0,0 +1,351 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2013 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "genhw/genhw.h"
+#include "ilo_state_surface.h"
+
+static bool
+surface_valid_sampler_format(const struct ilo_dev *dev,
+                             enum ilo_state_surface_access access,
+                             enum gen_surface_format format)
+{
+   /*
+    * Only `sampling` is read here (lowest capable gen, 0 = none).  Sources:
+    *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
+    *  - the Ivy Bridge PRM, volume 4 part 1, page 84-87
+    */
+   static const struct sampler_cap {
+      int sampling;
+      int filtering;
+      int shadow_map;
+      int chroma_key;
+   } table[] = {
+#define CAP(s, f, sm, ck) \
+      { ILO_GEN(s), ILO_GEN(f), ILO_GEN(sm), ILO_GEN(ck) }
+      [GEN6_FORMAT_R32G32B32A32_FLOAT]       = CAP(  1,   5,   0,   0),
+      [GEN6_FORMAT_R32G32B32A32_SINT]        = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R32G32B32A32_UINT]        = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R32G32B32X32_FLOAT]       = CAP(  1,   5,   0,   0),
+      [GEN6_FORMAT_R32G32B32_FLOAT]          = CAP(  1,   5,   0,   0),
+      [GEN6_FORMAT_R32G32B32_SINT]           = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R32G32B32_UINT]           = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R16G16B16A16_UNORM]       = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R16G16B16A16_SNORM]       = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R16G16B16A16_SINT]        = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R16G16B16A16_UINT]        = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R16G16B16A16_FLOAT]       = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R32G32_FLOAT]             = CAP(  1,   5,   0,   0),
+      [GEN6_FORMAT_R32G32_SINT]              = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R32G32_UINT]              = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R32_FLOAT_X8X24_TYPELESS] = CAP(  1,   5,   1,   0),
+      [GEN6_FORMAT_X32_TYPELESS_G8X24_UINT]  = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_L32A32_FLOAT]             = CAP(  1,   5,   0,   0),
+      [GEN6_FORMAT_R16G16B16X16_UNORM]       = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R16G16B16X16_FLOAT]       = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_A32X32_FLOAT]             = CAP(  1,   5,   0,   0),
+      [GEN6_FORMAT_L32X32_FLOAT]             = CAP(  1,   5,   0,   0),
+      [GEN6_FORMAT_I32X32_FLOAT]             = CAP(  1,   5,   0,   0),
+      [GEN6_FORMAT_B8G8R8A8_UNORM]           = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R10G10B10A2_UNORM]        = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB]   = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R10G10B10A2_UINT]         = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8G8B8A8_UNORM]           = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8G8B8A8_SNORM]           = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8G8B8A8_SINT]            = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R8G8B8A8_UINT]            = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R16G16_UNORM]             = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R16G16_SNORM]             = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R16G16_SINT]              = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R16G16_UINT]              = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R16G16_FLOAT]             = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_B10G10R10A2_UNORM]        = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB]   = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R11G11B10_FLOAT]          = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R32_SINT]                 = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R32_UINT]                 = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R32_FLOAT]                = CAP(  1,   5,   1,   0),
+      [GEN6_FORMAT_R24_UNORM_X8_TYPELESS]    = CAP(  1,   5,   1,   0),
+      [GEN6_FORMAT_X24_TYPELESS_G8_UINT]     = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_L16A16_UNORM]             = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_I24X8_UNORM]              = CAP(  1,   5,   1,   0),
+      [GEN6_FORMAT_L24X8_UNORM]              = CAP(  1,   5,   1,   0),
+      [GEN6_FORMAT_A24X8_UNORM]              = CAP(  1,   5,   1,   0),
+      [GEN6_FORMAT_I32_FLOAT]                = CAP(  1,   5,   1,   0),
+      [GEN6_FORMAT_L32_FLOAT]                = CAP(  1,   5,   1,   0),
+      [GEN6_FORMAT_A32_FLOAT]                = CAP(  1,   5,   1,   0),
+      [GEN6_FORMAT_B8G8R8X8_UNORM]           = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_B8G8R8X8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8G8B8X8_UNORM]           = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8G8B8X8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R9G9B9E5_SHAREDEXP]       = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_B10G10R10X2_UNORM]        = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_L16A16_FLOAT]             = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_B5G6R5_UNORM]             = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_B5G6R5_UNORM_SRGB]        = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_B5G5R5A1_UNORM]           = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_B4G4R4A4_UNORM]           = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8G8_UNORM]               = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8G8_SNORM]               = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_R8G8_SINT]                = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R8G8_UINT]                = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R16_UNORM]                = CAP(  1,   1,   1,   0),
+      [GEN6_FORMAT_R16_SNORM]                = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R16_SINT]                 = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R16_UINT]                 = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R16_FLOAT]                = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_A8P8_UNORM_PALETTE0]      = CAP(  5,   5,   0,   0),
+      [GEN6_FORMAT_A8P8_UNORM_PALETTE1]      = CAP(  5,   5,   0,   0),
+      [GEN6_FORMAT_I16_UNORM]                = CAP(  1,   1,   1,   0),
+      [GEN6_FORMAT_L16_UNORM]                = CAP(  1,   1,   1,   0),
+      [GEN6_FORMAT_A16_UNORM]                = CAP(  1,   1,   1,   0),
+      [GEN6_FORMAT_L8A8_UNORM]               = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_I16_FLOAT]                = CAP(  1,   1,   1,   0),
+      [GEN6_FORMAT_L16_FLOAT]                = CAP(  1,   1,   1,   0),
+      [GEN6_FORMAT_A16_FLOAT]                = CAP(  1,   1,   1,   0),
+      [GEN6_FORMAT_L8A8_UNORM_SRGB]          = CAP(4.5, 4.5,   0,   0),
+      [GEN6_FORMAT_R5G5_SNORM_B6_UNORM]      = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_P8A8_UNORM_PALETTE0]      = CAP(  5,   5,   0,   0),
+      [GEN6_FORMAT_P8A8_UNORM_PALETTE1]      = CAP(  5,   5,   0,   0),
+      [GEN6_FORMAT_R8_UNORM]                 = CAP(  1,   1,   0, 4.5),
+      [GEN6_FORMAT_R8_SNORM]                 = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8_SINT]                  = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_R8_UINT]                  = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_A8_UNORM]                 = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_I8_UNORM]                 = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_L8_UNORM]                 = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_P4A4_UNORM_PALETTE0]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_A4P4_UNORM_PALETTE0]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_P8_UNORM_PALETTE0]        = CAP(4.5, 4.5,   0,   0),
+      [GEN6_FORMAT_L8_UNORM_SRGB]            = CAP(4.5, 4.5,   0,   0),
+      [GEN6_FORMAT_P8_UNORM_PALETTE1]        = CAP(4.5, 4.5,   0,   0),
+      [GEN6_FORMAT_P4A4_UNORM_PALETTE1]      = CAP(4.5, 4.5,   0,   0),
+      [GEN6_FORMAT_A4P4_UNORM_PALETTE1]      = CAP(4.5, 4.5,   0,   0),
+      [GEN6_FORMAT_DXT1_RGB_SRGB]            = CAP(4.5, 4.5,   0,   0),
+      [GEN6_FORMAT_R1_UNORM]                 = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_YCRCB_NORMAL]             = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_YCRCB_SWAPUVY]            = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_P2_UNORM_PALETTE0]        = CAP(4.5, 4.5,   0,   0),
+      [GEN6_FORMAT_P2_UNORM_PALETTE1]        = CAP(4.5, 4.5,   0,   0),
+      [GEN6_FORMAT_BC1_UNORM]                = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_BC2_UNORM]                = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_BC3_UNORM]                = CAP(  1,   1,   0,   1),
+      [GEN6_FORMAT_BC4_UNORM]                = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_BC5_UNORM]                = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_BC1_UNORM_SRGB]           = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_BC2_UNORM_SRGB]           = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_BC3_UNORM_SRGB]           = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_MONO8]                    = CAP(  1,   0,   0,   0),
+      [GEN6_FORMAT_YCRCB_SWAPUV]             = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_YCRCB_SWAPY]              = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_DXT1_RGB]                 = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_FXT1]                     = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_BC4_SNORM]                = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_BC5_SNORM]                = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R16G16B16_FLOAT]          = CAP(  5,   5,   0,   0),
+      [GEN6_FORMAT_BC6H_SF16]                = CAP(  7,   7,   0,   0),
+      [GEN6_FORMAT_BC7_UNORM]                = CAP(  7,   7,   0,   0),
+      [GEN6_FORMAT_BC7_UNORM_SRGB]           = CAP(  7,   7,   0,   0),
+      [GEN6_FORMAT_BC6H_UF16]                = CAP(  7,   7,   0,   0),
+#undef CAP
+   };
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (format >= ARRAY_SIZE(table) || !table[format].sampling)
+      return false;
+   return (ilo_dev_gen(dev) >= table[format].sampling);
+}
+
+static bool
+surface_valid_dp_format(const struct ilo_dev *dev,
+                        enum ilo_state_surface_access access,
+                        enum gen_surface_format format)
+{
+   /*
+    * rt_write and typed_write hold the lowest gen allowed to use a format
+    * that way (0 = never); the other columns are not read here.  Based on:
+    *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
+    *  - the Ivy Bridge PRM, volume 4 part 1, page 172, 252-253, and 277-278
+    *  - the Haswell PRM, volume 7, page 262-264
+    */
+   static const struct dp_cap {
+      int rt_write;
+      int rt_write_blending;
+      int typed_write;
+      int media_color_processing;
+   } caps[] = {
+#define CAP(rt_write, rt_write_blending, typed_write, media_color_processing) \
+      { ILO_GEN(rt_write), ILO_GEN(rt_write_blending), ILO_GEN(typed_write), ILO_GEN(media_color_processing) }
+      [GEN6_FORMAT_R32G32B32A32_FLOAT]       = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_R32G32B32A32_SINT]        = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R32G32B32A32_UINT]        = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R16G16B16A16_UNORM]       = CAP(  1, 4.5,   7,   6),
+      [GEN6_FORMAT_R16G16B16A16_SNORM]       = CAP(  1,   6,   7,   0),
+      [GEN6_FORMAT_R16G16B16A16_SINT]        = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R16G16B16A16_UINT]        = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R16G16B16A16_FLOAT]       = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_R32G32_FLOAT]             = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_R32G32_SINT]              = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R32G32_UINT]              = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_B8G8R8A8_UNORM]           = CAP(  1,   1,   7,   6),
+      [GEN6_FORMAT_B8G8R8A8_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R10G10B10A2_UNORM]        = CAP(  1,   1,   7,   6),
+      [GEN6_FORMAT_R10G10B10A2_UNORM_SRGB]   = CAP(  0,   0,   0,   6),
+      [GEN6_FORMAT_R10G10B10A2_UINT]         = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R8G8B8A8_UNORM]           = CAP(  1,   1,   7,   6),
+      [GEN6_FORMAT_R8G8B8A8_UNORM_SRGB]      = CAP(  1,   1,   0,   6),
+      [GEN6_FORMAT_R8G8B8A8_SNORM]           = CAP(  1,   6,   7,   0),
+      [GEN6_FORMAT_R8G8B8A8_SINT]            = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R8G8B8A8_UINT]            = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R16G16_UNORM]             = CAP(  1, 4.5,   7,   0),
+      [GEN6_FORMAT_R16G16_SNORM]             = CAP(  1,   6,   7,   0),
+      [GEN6_FORMAT_R16G16_SINT]              = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R16G16_UINT]              = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R16G16_FLOAT]             = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_B10G10R10A2_UNORM]        = CAP(  1,   1,   7,   6),
+      [GEN6_FORMAT_B10G10R10A2_UNORM_SRGB]   = CAP(  1,   1,   0,   6),
+      [GEN6_FORMAT_R11G11B10_FLOAT]          = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_R32_SINT]                 = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R32_UINT]                 = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R32_FLOAT]                = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_B8G8R8X8_UNORM]           = CAP(  0,   0,   0,   6),
+      [GEN6_FORMAT_B5G6R5_UNORM]             = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_B5G6R5_UNORM_SRGB]        = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_B5G5R5A1_UNORM]           = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_B5G5R5A1_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_B4G4R4A4_UNORM]           = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_B4G4R4A4_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8G8_UNORM]               = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_R8G8_SNORM]               = CAP(  1,   6,   7,   0),
+      [GEN6_FORMAT_R8G8_SINT]                = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R8G8_UINT]                = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R16_UNORM]                = CAP(  1, 4.5,   7,   7),
+      [GEN6_FORMAT_R16_SNORM]                = CAP(  1,   6,   7,   0),
+      [GEN6_FORMAT_R16_SINT]                 = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R16_UINT]                 = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R16_FLOAT]                = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_B5G5R5X1_UNORM]           = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_B5G5R5X1_UNORM_SRGB]      = CAP(  1,   1,   0,   0),
+      [GEN6_FORMAT_R8_UNORM]                 = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_R8_SNORM]                 = CAP(  1,   6,   7,   0),
+      [GEN6_FORMAT_R8_SINT]                  = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_R8_UINT]                  = CAP(  1,   0,   7,   0),
+      [GEN6_FORMAT_A8_UNORM]                 = CAP(  1,   1,   7,   0),
+      [GEN6_FORMAT_YCRCB_NORMAL]             = CAP(  1,   0,   0,   6),
+      [GEN6_FORMAT_YCRCB_SWAPUVY]            = CAP(  1,   0,   0,   6),
+      [GEN6_FORMAT_YCRCB_SWAPUV]             = CAP(  1,   0,   0,   6),
+      [GEN6_FORMAT_YCRCB_SWAPY]              = CAP(  1,   0,   0,   6),
+#undef CAP
+   };
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   /* RAW has no caps[] entry: answer DP_UNTYPED ahead of the range check */
+   if (access == ILO_STATE_SURFACE_ACCESS_DP_UNTYPED)
+      return (format == GEN6_FORMAT_RAW);
+   else if (format >= ARRAY_SIZE(caps))
+      return false;
+
+   switch (access) {
+   case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+      return (caps[format].rt_write &&
+              ilo_dev_gen(dev) >= caps[format].rt_write);
+   case ILO_STATE_SURFACE_ACCESS_DP_TYPED:
+      return (caps[format].typed_write &&
+              ilo_dev_gen(dev) >= caps[format].typed_write);
+   case ILO_STATE_SURFACE_ACCESS_DP_DATA:
+      /* ignored, but can it be raw? */
+      assert(format != GEN6_FORMAT_RAW);
+      return true;
+   default:
+      return false;
+   }
+}
+
+static bool
+surface_valid_svb_format(const struct ilo_dev *dev,
+                         enum gen_surface_format format)
+{
+   /*
+    * The formats accepted for SVB access.  This list is based on:
+    *
+    *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
+    *  - the Ivy Bridge PRM, volume 2 part 1, page 195
+    *  - the Haswell PRM, volume 7, page 535
+    */
+   static const enum gen_surface_format svb_formats[] = {
+      GEN6_FORMAT_R32G32B32A32_FLOAT, GEN6_FORMAT_R32G32B32A32_SINT,
+      GEN6_FORMAT_R32G32B32A32_UINT,  GEN6_FORMAT_R32G32B32_FLOAT,
+      GEN6_FORMAT_R32G32B32_SINT,     GEN6_FORMAT_R32G32B32_UINT,
+      GEN6_FORMAT_R32G32_FLOAT,       GEN6_FORMAT_R32G32_SINT,
+      GEN6_FORMAT_R32G32_UINT,        GEN6_FORMAT_R32_SINT,
+      GEN6_FORMAT_R32_UINT,           GEN6_FORMAT_R32_FLOAT,
+   };
+   unsigned int i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* the list is short, so a linear scan is enough */
+   for (i = 0; i < ARRAY_SIZE(svb_formats); i++) {
+      if (svb_formats[i] == format)
+         return true;
+   }
+
+   return false;
+}
+
+bool
+ilo_state_surface_valid_format(const struct ilo_dev *dev,
+                               enum ilo_state_surface_access access,
+                               enum gen_surface_format format)
+{
+   /*
+    * Hand the query to the validator that owns the access type: sampler,
+    * data port (DP_RENDER/TYPED/UNTYPED/DATA) or SVB.  Any other access
+    * type is rejected.
+    */
+   switch (access) {
+   case ILO_STATE_SURFACE_ACCESS_SAMPLER:
+      return surface_valid_sampler_format(dev, access, format);
+
+   case ILO_STATE_SURFACE_ACCESS_DP_RENDER:
+   case ILO_STATE_SURFACE_ACCESS_DP_TYPED:
+   case ILO_STATE_SURFACE_ACCESS_DP_UNTYPED:
+   case ILO_STATE_SURFACE_ACCESS_DP_DATA:
+      return surface_valid_dp_format(dev, access, format);
+
+   case ILO_STATE_SURFACE_ACCESS_DP_SVB:
+      return surface_valid_svb_format(dev, format);
+
+   default:
+      return false;
+   }
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_urb.c b/src/gallium/drivers/ilo/core/ilo_state_urb.c
new file mode 100644
index 0000000..cbd150c
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_urb.c
@@ -0,0 +1,769 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_urb.h"
+
+struct urb_configuration { /* working values filled in by the urb_alloc and urb_init helpers */
+   uint8_t vs_pcb_alloc_kb; /* the *_pcb_alloc_kb fields are set by urb_alloc_gen7_pcb() */
+   uint8_t hs_pcb_alloc_kb;
+   uint8_t ds_pcb_alloc_kb;
+   uint8_t gs_pcb_alloc_kb;
+   uint8_t ps_pcb_alloc_kb;
+
+   uint8_t urb_offset_8kb; /* in 8KB units; urb_alloc_gen6_urb() subtracts it from the URB size */
+
+   uint8_t vs_urb_alloc_8kb; /* the *_urb_alloc_8kb fields are set by urb_alloc_gen6_urb(), in 8KB units */
+   uint8_t hs_urb_alloc_8kb;
+   uint8_t ds_urb_alloc_8kb;
+   uint8_t gs_urb_alloc_8kb;
+
+   uint8_t vs_entry_rows; /* rows per entry: 1024-bit rows from the gen6 helpers, 512-bit from gen7 */
+   uint8_t hs_entry_rows;
+   uint8_t ds_entry_rows;
+   uint8_t gs_entry_rows;
+
+   int vs_entry_count; /* entries that fit in the stage's allocation, after clamping */
+   int hs_entry_count;
+   int ds_entry_count;
+   int gs_entry_count;
+};
+
+static void
+urb_alloc_gen7_pcb(const struct ilo_dev *dev,
+                   const struct ilo_state_urb_info *info,
+                   struct urb_configuration *conf)
+{
+   /*
+    * From the Haswell PRM, volume 2b, page 940:
+    *
+    *     "[0,16] (0KB - 16KB) Increments of 1KB DevHSW:GT1, DevHSW:GT2
+    *      [0,32] (0KB - 32KB) Increments of 2KB DevHSW:GT3"
+    */
+   const bool wide_unit = (ilo_dev_gen(dev) >= ILO_GEN(8) ||
+                           (ilo_dev_gen(dev) == ILO_GEN(7.5) && dev->gt == 3));
+   const uint8_t unit_kb = (wide_unit) ? 2 : 1;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * Fixed splits of 16 increments, chosen only by which stages have
+    * constant data; workloads and the cost of reconfiguring are unknown.
+    */
+   if (!info->hs_const_data && !info->ds_const_data && !info->gs_const_data) {
+      conf->vs_pcb_alloc_kb = unit_kb * 8;
+      conf->ps_pcb_alloc_kb = unit_kb * 8;
+   } else if (!info->hs_const_data && !info->ds_const_data) {
+      conf->vs_pcb_alloc_kb = unit_kb * 6;
+      conf->gs_pcb_alloc_kb = unit_kb * 5;
+      conf->ps_pcb_alloc_kb = unit_kb * 5;
+   } else {
+      conf->vs_pcb_alloc_kb = unit_kb * 4;
+      conf->hs_pcb_alloc_kb = unit_kb * 3;
+      conf->ds_pcb_alloc_kb = unit_kb * 3;
+      conf->gs_pcb_alloc_kb = unit_kb * 3;
+      conf->ps_pcb_alloc_kb = unit_kb * 3;
+   }
+
+   conf->urb_offset_8kb = unit_kb * 16 / 8;
+}
+
+static void
+urb_alloc_gen6_urb(const struct ilo_dev *dev,
+                   const struct ilo_state_urb_info *info,
+                   struct urb_configuration *conf)
+{
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 34:
+    *
+    *     "(VS URB Starting Address) Offset from the start of the URB memory
+    *      where VS starts its allocation, specified in multiples of 8 KB."
+    *
+    * Same for other stages.
+    */
+   const int avail_8kb = dev->urb_size / 8192 - conf->urb_offset_8kb;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 173:
+    *
+    *     "Programming Note: If the GS stage is enabled, software must always
+    *      allocate at least one GS URB Entry. This is true even if the GS
+    *      thread never needs to output vertices to the urb, e.g., when only
+    *      performing stream output. This is an artifact of the need to pass
+    *      the GS thread an initial destination URB handle."
+    */
+   const bool gs_needs_entry =
+      (ilo_dev_gen(dev) == ILO_GEN(6) && info->gs_enable);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (info->hs_entry_size || info->ds_entry_size) {
+      /* four equal shares; VS and GS each absorb one unit of a remainder */
+      const int share_8kb = avail_8kb / 4;
+      const int extra_8kb = (avail_8kb % 4) ? 1 : 0;
+
+      assert(!extra_8kb || avail_8kb % 2 == 0);
+
+      conf->vs_urb_alloc_8kb = share_8kb + extra_8kb;
+      conf->hs_urb_alloc_8kb = share_8kb;
+      conf->ds_urb_alloc_8kb = share_8kb;
+      conf->gs_urb_alloc_8kb = share_8kb + extra_8kb;
+   } else if (info->gs_entry_size || gs_needs_entry) {
+      assert(avail_8kb % 2 == 0);
+      conf->vs_urb_alloc_8kb = avail_8kb / 2;
+      conf->gs_urb_alloc_8kb = avail_8kb / 2;
+   } else {
+      conf->vs_urb_alloc_8kb = avail_8kb;
+   }
+}
+
+static bool
+urb_init_gen6_vs_entry(const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info,
+                       struct urb_configuration *conf)
+{
+   /*
+    * Size the Gen6 VS URB entries; fails when more than 5 rows are needed.
+    * From the Sandy Bridge PRM, volume 2 part 1, page 28:
+    *
+    *     "(VS URB Entry Allocation Size)
+    *      Range [0,4] = [1,5] 1024-bit URB rows"
+    *
+    *     "(VS Number of URB Entries)
+    *      Range [24,256] in multiples of 4
+    *            [24, 128] in multiples of 4[DevSNBGT1]"
+    */
+   const int max_entry_count = (dev->gt == 2) ? 256 : 128; /* 128: SNB GT1 */
+   const int row_size = 1024 / 8;
+   int row_count, entry_count, entry_size;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /* VE and VS share the same VUE for each vertex */
+   entry_size = info->vs_entry_size;
+   if (entry_size < info->ve_entry_size)
+      entry_size = info->ve_entry_size;
+
+   row_count = (entry_size + row_size - 1) / row_size;
+   if (row_count > 5)
+      return false;
+   else if (!row_count)
+      row_count++;
+
+   entry_count = conf->vs_urb_alloc_8kb * 8192 / (row_size * row_count);
+   if (entry_count > max_entry_count)
+      entry_count = max_entry_count;
+   entry_count &= ~3; /* the count must be a multiple of 4 */
+   assert(entry_count >= 24);
+
+   conf->vs_entry_rows = row_count;
+   conf->vs_entry_count = entry_count;
+
+   return true;
+}
+
+static bool
+urb_init_gen6_gs_entry(const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info,
+                       struct urb_configuration *conf)
+{
+   /*
+    * Size the Gen6 GS URB entries; fails when more than 5 rows are needed.
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 29:
+    *
+    *     "(GS Number of URB Entries)
+    *      Range [0,256] in multiples of 4
+    *            [0, 254] in multiples of 4[DevSNBGT1]"
+    *
+    *     "(GS URB Entry Allocation Size)
+    *      Range [0,4] = [1,5] 1024-bit URB rows"
+    */
+   const int entry_cap = (dev->gt == 2) ? 256 : 252;
+   const int row_bytes = 1024 / 8;
+   int rows, entries;
+
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   rows = (info->gs_entry_size + row_bytes - 1) / row_bytes;
+   if (rows > 5)
+      return false;
+   if (!rows)
+      rows++;
+
+   entries = conf->gs_urb_alloc_8kb * 8192 / (row_bytes * rows);
+   entries = ((entries > entry_cap) ? entry_cap : entries) & ~3;
+
+   conf->gs_entry_rows = rows;
+   conf->gs_entry_count = entries;
+
+   return true;
+}
+
+static bool
+urb_init_gen7_vs_entry(const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info,
+                       struct urb_configuration *conf)
+{
+   /*
+    * Derive conf->vs_entry_rows and conf->vs_entry_count from the larger of
+    * the VS and VE entry sizes and conf->vs_urb_alloc_8kb.  Returns false,
+    * with conf untouched, when the count is below the per-gen minimum or
+    * the gen is unexpected.
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 34-35:
+    *
+    *     "VS URB Entry Allocation Size equal to 4(5 512-bit URB rows) may
+    *      cause performance to decrease due to banking in the URB. Element
+    *      sizes of 16 to 20 should be programmed with six 512-bit URB rows."
+    *
+    *     "(VS URB Entry Allocation Size)
+    *      Format: U9-1 count of 512-bit units"
+    *
+    *     "(VS Number of URB Entries)
+    *      [32,704]
+    *      [32,512]
+    *
+    *      Programming Restriction: VS Number of URB Entries must be divisible
+    *      by 8 if the VS URB Entry Allocation Size is less than 9 512-bit URB
+    *      entries."2:0" = reserved "000b""
+    *
+    * From the Haswell PRM, volume 2b, page 847:
+    *
+    *     "(VS Number of URB Entries)
+    *      [64,1664] DevHSW:GT3
+    *      [64,1664] DevHSW:GT2
+    *      [32,640]  DevHSW:GT1"
+    */
+   const int row_bytes = 512 / 8;
+   int rows, entries;
+   int vue_size;
+   int max_entries, min_entries;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 35:
+    *
+    *     "Programming Restriction: As the VS URB entry serves as both the
+    *      per-vertex input and output of the VS shader, the VS URB Allocation
+    *      Size must be sized to the maximum of the vertex input and output
+    *      structures."
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 42:
+    *
+    *     "If the VS function is enabled, the VF-written VUEs are not required
+    *      to have Vertex Headers, as the VS-incoming vertices are guaranteed
+    *      to be consumed by the VS (i.e., the VS thread is responsible for
+    *      overwriting the input vertex data)."
+    *
+    * VE and VS share the same VUE for each vertex.
+    */
+   vue_size = info->vs_entry_size;
+   if (vue_size < info->ve_entry_size)
+      vue_size = info->ve_entry_size;
+
+   rows = (vue_size + row_bytes - 1) / row_bytes;
+   if (!rows || rows == 5)
+      rows++;
+
+   entries = conf->vs_urb_alloc_8kb * 8192 / (row_bytes * rows);
+   if (rows < 9)
+      entries &= ~7;
+
+   if (ilo_dev_gen(dev) == ILO_GEN(8) || ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      max_entries = (dev->gt >= 2) ? 1664 : 640;
+      min_entries = (dev->gt >= 2) ? 64 : 32;
+   } else if (ilo_dev_gen(dev) == ILO_GEN(7)) {
+      max_entries = (dev->gt == 2) ? 704 : 512;
+      min_entries = 32;
+   } else {
+      assert(!"unexpected gen");
+      return false;
+   }
+
+   if (entries < min_entries)
+      return false;
+   if (entries > max_entries)
+      entries = max_entries;
+
+   conf->vs_entry_rows = rows;
+   conf->vs_entry_count = entries;
+
+   return true;
+}
+
+static bool
+urb_init_gen7_hs_entry(const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info,
+                       struct urb_configuration *conf)
+{
+   /*
+    * Derive conf->hs_entry_rows and conf->hs_entry_count from
+    * info->hs_entry_size and conf->hs_urb_alloc_8kb.  Returns false, with
+    * conf untouched, when a non-zero entry size yields no entries or the
+    * gen is unexpected.
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 37:
+    *
+    *     "HS Number of URB Entries must be divisible by 8 if the HS URB Entry
+    *      Allocation Size is less than 9 512-bit URB
+    *      entries."2:0" = reserved "000"
+    *
+    *      [0,64]
+    *      [0,32]"
+    *
+    * From the Haswell PRM, volume 2b, page 849:
+    *
+    *     "(HS Number of URB Entries)
+    *      [0,128] DevHSW:GT2
+    *      [0,64]  DevHSW:GT1"
+    */
+   const int row_bytes = 512 / 8;
+   int rows, entries;
+   int max_entries;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   rows = (info->hs_entry_size + row_bytes - 1) / row_bytes;
+   if (rows == 0)
+      rows++;
+
+   entries = conf->hs_urb_alloc_8kb * 8192 / (row_bytes * rows);
+   if (rows < 9)
+      entries &= ~7;
+
+   if (ilo_dev_gen(dev) == ILO_GEN(8) || ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      max_entries = (dev->gt >= 2) ? 128 : 64;
+   } else if (ilo_dev_gen(dev) == ILO_GEN(7)) {
+      max_entries = (dev->gt == 2) ? 64 : 32;
+   } else {
+      assert(!"unexpected gen");
+      return false;
+   }
+
+   if (info->hs_entry_size && !entries)
+      return false;
+   if (entries > max_entries)
+      entries = max_entries;
+
+   conf->hs_entry_rows = rows;
+   conf->hs_entry_count = entries;
+
+   return true;
+}
+
+static bool
+urb_init_gen7_ds_entry(const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info,
+                       struct urb_configuration *conf)
+{
+   /*
+    * Derive conf->ds_entry_rows and conf->ds_entry_count from
+    * info->ds_entry_size and conf->ds_urb_alloc_8kb.  Returns false, with
+    * conf untouched, when more than 10 rows are needed, when a non-zero
+    * entry size yields fewer than 10 entries, or the gen is unexpected.
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 38:
+    *
+    *     "(DS URB Entry Allocation Size)
+    *      [0,9]"
+    *
+    *     "(DS Number of URB Entries) If Domain Shader Thread Dispatch is
+    *      Enabled then the minimum number handles that must be allocated is
+    *      138 URB entries.
+    *      "2:0" = reserved "000"
+    *
+    *      [0,448]
+    *      [0,288]
+    *
+    *      DS Number of URB Entries must be divisible by 8 if the DS URB Entry
+    *      Allocation Size is less than 9 512-bit URB entries.If Domain Shader
+    *      Thread Dispatch is Enabled then the minimum number of handles that
+    *      must be allocated is 10 URB entries."
+    *
+    * From the Haswell PRM, volume 2b, page 851:
+    *
+    *     "(DS Number of URB Entries)
+    *      [0,960] DevHSW:GT2
+    *      [0,384] DevHSW:GT1"
+    */
+   const int row_bytes = 512 / 8;
+   int rows, entries;
+   int max_entries;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   rows = (info->ds_entry_size + row_bytes - 1) / row_bytes;
+   if (rows > 10)
+      return false;
+   if (rows == 0)
+      rows++;
+
+   entries = conf->ds_urb_alloc_8kb * 8192 / (row_bytes * rows);
+   if (rows < 9)
+      entries &= ~7;
+
+   if (ilo_dev_gen(dev) == ILO_GEN(8) || ilo_dev_gen(dev) == ILO_GEN(7.5)) {
+      max_entries = (dev->gt >= 2) ? 960 : 384;
+   } else if (ilo_dev_gen(dev) == ILO_GEN(7)) {
+      max_entries = (dev->gt == 2) ? 448 : 288;
+   } else {
+      assert(!"unexpected gen");
+      return false;
+   }
+
+   if (info->ds_entry_size && entries < 10)
+      return false;
+   if (entries > max_entries)
+      entries = max_entries;
+
+   conf->ds_entry_rows = rows;
+   conf->ds_entry_count = entries;
+
+   return true;
+}
+
+static bool
+urb_init_gen7_gs_entry(const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info,
+                       struct urb_configuration *conf)
+{
+   /*
+    * Size the GS URB entries: convert info->gs_entry_size to 512-bit rows
+    * and fit as many entries as conf->gs_urb_alloc_8kb allows.
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 40:
+    *
+    *     "(GS Number of URB Entries) GS Number of URB Entries must be
+    *      divisible by 8 if the GS URB Entry Allocation Size is less than 9
+    *      512-bit URB entries.
+    *      "2:0" = reserved "000"
+    *
+    *      [0,320]
+    *      [0,192]"
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 171:
+    *
+    *     "(DUAL_INSTANCE and DUAL_OBJECT) The GS must be allocated at least
+    *      two URB handles or behavior is UNDEFINED."
+    *
+    * From the Haswell PRM, volume 2b, page 853:
+    *
+    *     "(GS Number of URB Entries)
+    *      [0,640] DevHSW:GT2
+    *      [0,256] DevHSW:GT1
+    *
+    *      Only if GS is disabled can this field be programmed to 0.  If GS is
+    *      enabled this field shall be programmed to a value greater than 0.
+    *      For GS Dispatch Mode "Single", this field shall be programmed to a
+    *      value greater than or equal to 1. For other GS Dispatch Modes,
+    *      refer to the definition of Dispatch Mode (3DSTATE_GS) for minimum
+    *      values of this field."
+    */
+   const int bytes_per_row = 512 / 8;
+   int rows, count, max_count;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   rows = (info->gs_entry_size + bytes_per_row - 1) / bytes_per_row;
+   if (rows == 0)
+      rows = 1;
+
+   count = conf->gs_urb_alloc_8kb * 8192 / (bytes_per_row * rows);
+   if (rows < 9)
+      count &= ~7;
+
+   switch (ilo_dev_gen(dev)) {
+   case ILO_GEN(8):
+   case ILO_GEN(7.5):
+      max_count = (dev->gt >= 2) ? 640 : 256;
+      break;
+   case ILO_GEN(7):
+      max_count = (dev->gt == 2) ? 320 : 192;
+      break;
+   default:
+      assert(!"unexpected gen");
+      return false;
+   }
+
+   if (count <= max_count && count < 2 && info->gs_entry_size)
+      return false;
+   if (count > max_count)
+      count = max_count;
+
+   conf->gs_entry_rows = rows;
+   conf->gs_entry_count = count;
+   return true;
+}
+
+static bool
+urb_get_gen6_configuration(const struct ilo_dev *dev,
+                           const struct ilo_state_urb_info *info,
+                           struct urb_configuration *conf)
+{
+   const bool gen7_plus = (ilo_dev_gen(dev) >= ILO_GEN(7));
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * Zero conf, run the PCB (Gen7+ only) and URB allocation helpers, then
+    * size the entries stage by stage, stopping at the first failure.
+    */
+   memset(conf, 0, sizeof(*conf));
+   if (gen7_plus)
+      urb_alloc_gen7_pcb(dev, info, conf);
+   urb_alloc_gen6_urb(dev, info, conf);
+
+   if (gen7_plus) {
+      return urb_init_gen7_vs_entry(dev, info, conf) &&
+             urb_init_gen7_hs_entry(dev, info, conf) &&
+             urb_init_gen7_ds_entry(dev, info, conf) &&
+             urb_init_gen7_gs_entry(dev, info, conf);
+   }
+
+   return urb_init_gen6_vs_entry(dev, info, conf) &&
+          urb_init_gen6_gs_entry(dev, info, conf);
+}
+
+static bool
+urb_set_gen7_3dstate_push_constant_alloc(struct ilo_state_urb *urb,
+                                         const struct ilo_dev *dev,
+                                         const struct ilo_state_urb_info *info,
+                                         const struct urb_configuration *conf)
+{
+   /* per-stage PCB sizes, in the order the dwords are kept in urb->pcb */
+   const uint8_t sizes_kb[5] = {
+      conf->vs_pcb_alloc_kb,
+      conf->hs_pcb_alloc_kb,
+      conf->ds_pcb_alloc_kb,
+      conf->gs_pcb_alloc_kb,
+      conf->ps_pcb_alloc_kb,
+   };
+   uint8_t offset_kb = 0;
+   int stage;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   STATIC_ASSERT(ARRAY_SIZE(urb->pcb) >= 5);
+
+   for (stage = 0; stage < 5; stage++) {
+      const uint8_t size_kb = sizes_kb[stage];
+
+      /*
+       * careful for the valid range of offsets; empty stages stay zero
+       */
+      urb->pcb[stage] = (!size_kb) ? 0 :
+         offset_kb << GEN7_PCB_ALLOC_DW1_OFFSET__SHIFT |
+         size_kb << GEN7_PCB_ALLOC_DW1_SIZE__SHIFT;
+      offset_kb += size_kb;
+   }
+
+   return true;
+}
+
+static bool
+urb_set_gen6_3DSTATE_URB(struct ilo_state_urb *urb,
+                         const struct ilo_dev *dev,
+                         const struct ilo_state_urb_info *info,
+                         const struct urb_configuration *conf)
+{
+   ILO_DEV_ASSERT(dev, 6, 6);
+
+   /* the entry size fields hold (rows - 1): zero rows cannot be encoded */
+   assert(conf->vs_entry_rows && conf->gs_entry_rows);
+
+   STATIC_ASSERT(ARRAY_SIZE(urb->urb) >= 2);
+   /* DW1 describes the VS entries */
+   urb->urb[0] =
+      conf->vs_entry_count << GEN6_URB_DW1_VS_ENTRY_COUNT__SHIFT |
+      (conf->vs_entry_rows - 1) << GEN6_URB_DW1_VS_ENTRY_SIZE__SHIFT;
+   /* DW2 describes the GS entries */
+   urb->urb[1] =
+      (conf->gs_entry_rows - 1) << GEN6_URB_DW2_GS_ENTRY_SIZE__SHIFT |
+      conf->gs_entry_count << GEN6_URB_DW2_GS_ENTRY_COUNT__SHIFT;
+
+   return true;
+}
+
+static bool
+urb_set_gen7_3dstate_urb(struct ilo_state_urb *urb,
+                         const struct ilo_dev *dev,
+                         const struct ilo_state_urb_info *info,
+                         const struct urb_configuration *conf)
+{
+   /*
+    * Pack the four 3DSTATE_URB_{VS,HS,DS,GS} dwords into urb->urb.  Each
+    * stage with a non-zero allocation is placed right after the previous
+    * one, starting at conf->urb_offset_8kb; a stage without an allocation
+    * gets a zero dword and takes no space.
+    */
+   const struct {
+      uint8_t alloc_8kb;
+      uint8_t entry_rows;
+      int entry_count;
+   } stages[4] = {
+      /* in VS, HS, DS, GS order */
+      { conf->vs_urb_alloc_8kb, conf->vs_entry_rows, conf->vs_entry_count },
+      { conf->hs_urb_alloc_8kb, conf->hs_entry_rows, conf->hs_entry_count },
+      { conf->ds_urb_alloc_8kb, conf->ds_entry_rows, conf->ds_entry_count },
+      { conf->gs_urb_alloc_8kb, conf->gs_entry_rows, conf->gs_entry_count },
+   };
+
+   uint8_t offset_8kb = conf->urb_offset_8kb;
+   int i;
+
+   ILO_DEV_ASSERT(dev, 7, 8);
+
+   STATIC_ASSERT(ARRAY_SIZE(urb->urb) >= 4);
+
+   for (i = 0; i < 4; i++) {
+      /*
+       * careful for the valid range of offsets: only stages that own URB
+       * space encode, and advance, the running offset
+       */
+      if (!stages[i].alloc_8kb) {
+         urb->urb[i] = 0;
+         continue;
+      }
+
+      /* the size field holds (rows - 1), so rows must be non-zero */
+      assert(stages[i].entry_rows);
+
+      urb->urb[i] =
+         offset_8kb << GEN7_URB_DW1_OFFSET__SHIFT |
+         (stages[i].entry_rows - 1) << GEN7_URB_DW1_ENTRY_SIZE__SHIFT |
+         stages[i].entry_count << GEN7_URB_DW1_ENTRY_COUNT__SHIFT;
+      offset_8kb += stages[i].alloc_8kb;
+   }
+
+   return true;
+}
+
+bool
+ilo_state_urb_init(struct ilo_state_urb *urb, const struct ilo_dev *dev,
+                   const struct ilo_state_urb_info *info)
+{
+   /* init is a set_info on fresh state: urb must arrive zero-filled */
+   assert(ilo_is_zeroed(urb, sizeof(*urb)));
+   return ilo_state_urb_set_info(urb, dev, info);
+}
+
+bool
+ilo_state_urb_init_for_rectlist(struct ilo_state_urb *urb,
+                                const struct ilo_dev *dev,
+                                uint8_t vf_attr_count)
+{
+   /* only the VE entry size is set: four dwords per vertex attribute */
+   struct ilo_state_urb_info info;
+
+   memset(&info, 0, sizeof(info));
+   info.ve_entry_size = vf_attr_count * 4 * sizeof(uint32_t);
+   return ilo_state_urb_init(urb, dev, &info);
+}
+
+bool
+ilo_state_urb_set_info(struct ilo_state_urb *urb, const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info)
+{
+   struct urb_configuration conf;
+   bool ret = urb_get_gen6_configuration(dev, info, &conf);
+
+   /*
+    * Pack the URB configuration for info into urb.  A rejected conf is not
+    * packed: its entry rows may be zero, which would encode as (rows - 1).
+    */
+   if (ret && ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      ret &= urb_set_gen7_3dstate_push_constant_alloc(urb, dev, info, &conf);
+      ret &= urb_set_gen7_3dstate_urb(urb, dev, info, &conf);
+   } else if (ret) {
+      ret &= urb_set_gen6_3DSTATE_URB(urb, dev, info, &conf);
+   }
+
+   assert(ret);
+   return ret;
+}
+
+void
+ilo_state_urb_full_delta(const struct ilo_state_urb *urb,
+                         const struct ilo_dev *dev,
+                         struct ilo_state_urb_delta *delta)
+{
+   /* mark everything dirty; Gen7+ adds the HS/DS URB and the PCB bits */
+   uint32_t dirty = ILO_STATE_URB_3DSTATE_URB_VS |
+                    ILO_STATE_URB_3DSTATE_URB_GS;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      dirty |= ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS |
+               ILO_STATE_URB_3DSTATE_URB_HS |
+               ILO_STATE_URB_3DSTATE_URB_DS;
+   }
+   delta->dirty = dirty;
+}
+
+void
+ilo_state_urb_get_delta(const struct ilo_state_urb *urb,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_urb *old,
+                        struct ilo_state_urb_delta *delta)
+{
+   const bool gen7_plus = (ilo_dev_gen(dev) >= ILO_GEN(7));
+   const size_t urb_size =
+      (gen7_plus) ? sizeof(urb->urb) : sizeof(uint32_t) * 2;
+   uint32_t dirty = 0;
+
+   if (gen7_plus && memcmp(urb->pcb, old->pcb, sizeof(urb->pcb))) {
+      dirty |= ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS |
+               ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS;
+   }
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 34:
+    *
+    *     "3DSTATE_URB_HS, 3DSTATE_URB_DS, and 3DSTATE_URB_GS must also be
+    *      programmed in order for the programming of this state
+    *      (3DSTATE_URB_VS) to be valid."
+    *
+    * The same is true for the other three states.  Before Gen7, only the
+    * first two dwords are compared, and only VS/GS can become dirty.
+    */
+   if (memcmp(urb->urb, old->urb, urb_size)) {
+      dirty |= ILO_STATE_URB_3DSTATE_URB_VS |
+               ILO_STATE_URB_3DSTATE_URB_GS;
+      if (gen7_plus) {
+         dirty |= ILO_STATE_URB_3DSTATE_URB_HS |
+                  ILO_STATE_URB_3DSTATE_URB_DS;
+      }
+   }
+   delta->dirty = dirty;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_urb.h b/src/gallium/drivers/ilo/core/ilo_state_urb.h
new file mode 100644
index 0000000..9522b3b
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_urb.h
@@ -0,0 +1,103 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_URB_H
+#define ILO_STATE_URB_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+enum ilo_state_urb_dirty_bits {  /* bits of ilo_state_urb_delta.dirty */
+   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS = (1 << 0),
+   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS = (1 << 1),
+   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS = (1 << 2),
+   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS = (1 << 3),
+   ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS = (1 << 4),
+   ILO_STATE_URB_3DSTATE_URB_VS                 = (1 << 5),
+   ILO_STATE_URB_3DSTATE_URB_HS                 = (1 << 6),
+   ILO_STATE_URB_3DSTATE_URB_DS                 = (1 << 7),
+   ILO_STATE_URB_3DSTATE_URB_GS                 = (1 << 8),
+};
+
+/**
+ * Inputs for URB setup: per-stage URB entry sizes in bytes, and whether each
+ * stage has constant data extracted from PCBs to threads.
+ */
+struct ilo_state_urb_info {
+   bool gs_enable;
+
+   bool vs_const_data;
+   bool hs_const_data;
+   bool ds_const_data;
+   bool gs_const_data;
+   bool ps_const_data;
+
+   uint16_t ve_entry_size;
+   uint16_t vs_entry_size;
+   uint16_t hs_entry_size;
+   uint16_t ds_entry_size;
+   uint16_t gs_entry_size;
+};
+
+struct ilo_state_urb {
+   uint32_t pcb[5];  /* Gen7+: PCB allocation dwords for VS, HS, DS, GS, PS */
+   uint32_t urb[4];  /* Gen7+: VS, HS, DS, GS dwords; Gen6: DW1, DW2 */
+};
+
+struct ilo_state_urb_delta {
+   uint32_t dirty;   /* mask of enum ilo_state_urb_dirty_bits */
+};
+
+bool
+ilo_state_urb_init(struct ilo_state_urb *urb,
+                   const struct ilo_dev *dev,
+                   const struct ilo_state_urb_info *info);
+
+bool
+ilo_state_urb_init_for_rectlist(struct ilo_state_urb *urb,
+                                const struct ilo_dev *dev,
+                                uint8_t vf_attr_count);
+
+bool
+ilo_state_urb_set_info(struct ilo_state_urb *urb,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_urb_info *info);
+
+void
+ilo_state_urb_full_delta(const struct ilo_state_urb *urb,
+                         const struct ilo_dev *dev,
+                         struct ilo_state_urb_delta *delta);
+
+void
+ilo_state_urb_get_delta(const struct ilo_state_urb *urb,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_urb *old,
+                        struct ilo_state_urb_delta *delta);
+
+#endif /* ILO_STATE_URB_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.c b/src/gallium/drivers/ilo/core/ilo_state_vf.c
new file mode 100644
index 0000000..ddc7542
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.c
@@ -0,0 +1,984 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_buffer.h"
+#include "ilo_state_vf.h"
+
+static bool
+vf_validate_gen6_elements(const struct ilo_dev *dev,
+                          const struct ilo_state_vf_info *info)
+{
+   /*
+    * Debug-only checks: all of them are asserts, the result is always true.
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 95:
+    *
+    *     "(Source Element Offset (in bytes))
+    *      Format: U11
+    *      Range [0,2047]"
+    *
+    * From the Haswell PRM, volume 2d, page 415:
+    *
+    *     "(Source Element Offset)
+    *      Format: U12 byte offset
+    *      ...
+    *      [0,4095]"
+    *
+    * From the Broadwell PRM, volume 2d, page 469:
+    *
+    *     "(Source Element Offset)
+    *      Format: U12 byte offset
+    *      ...
+    *      [0,2047]"
+    */
+   const uint16_t max_vertex_offset =
+      (ilo_dev_gen(dev) == ILO_GEN(7.5)) ? 4096 : 2048;
+   const struct ilo_state_vf_element_info *elem;
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   assert(info->element_count <= ILO_STATE_VF_MAX_ELEMENT_COUNT);
+
+   for (i = 0; i < info->element_count; i++) {
+      elem = &info->elements[i];
+      assert(elem->buffer < ILO_STATE_VF_MAX_BUFFER_COUNT);
+      assert(elem->vertex_offset < max_vertex_offset);
+      assert(ilo_state_vf_valid_element_format(dev, elem->format));
+   }
+   return true;
+}
+
+static uint32_t
+get_gen6_component_controls(const struct ilo_dev *dev,
+                            enum gen_vf_component comp_x,
+                            enum gen_vf_component comp_y,
+                            enum gen_vf_component comp_z,
+                            enum gen_vf_component comp_w)
+{
+   /* pack the controls of components 0-3 (x, y, z, w) into one dword */
+   ILO_DEV_ASSERT(dev, 6, 8);
+   return comp_w << GEN6_VE_DW1_COMP3__SHIFT |
+          comp_z << GEN6_VE_DW1_COMP2__SHIFT |
+          comp_y << GEN6_VE_DW1_COMP1__SHIFT |
+          comp_x << GEN6_VE_DW1_COMP0__SHIFT;
+}
+
+static bool
+get_gen6_edge_flag_format(const struct ilo_dev *dev,
+                          const struct ilo_state_vf_element_info *elem,
+                          enum gen_surface_format *format)
+{
+   enum gen_surface_format uint_format;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * Choose the format used to fetch an edge flag from elem.  *format is
+    * written only when true is returned.
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 94:
+    *
+    *     "The Source Element Format must be set to the UINT format."
+    *
+    * From the Haswell PRM, volume 2d, page 413:
+    *
+    *     "The SourceElementFormat needs to be a single-component format with
+    *      an element which has edge flag enabled."
+    */
+   if (elem->component_count != 1)
+      return false;
+
+   /* pick the format we like */
+   if (elem->format_size == 1)
+      uint_format = GEN6_FORMAT_R8_UINT;
+   else if (elem->format_size == 2)
+      uint_format = GEN6_FORMAT_R16_UINT;
+   else if (elem->format_size == 4)
+      uint_format = GEN6_FORMAT_R32_UINT;
+   else
+      return false;
+
+   *format = uint_format;
+   return true;
+}
+
+static bool
+vf_set_gen6_3DSTATE_VERTEX_ELEMENTS(struct ilo_state_vf *vf,
+                                    const struct ilo_dev *dev,
+                                    const struct ilo_state_vf_info *info)
+{
+   const struct ilo_state_vf_element_info *last;
+   enum gen_surface_format edge_flag_format;
+   uint32_t dw0, dw1;
+   uint8_t i, c;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!vf_validate_gen6_elements(dev, info))
+      return false;
+
+   for (i = 0; i < info->element_count; i++) {
+      const struct ilo_state_vf_element_info *elem = &info->elements[i];
+      /* components missing from the source default to (0, 0, 0, 1) */
+      enum gen_vf_component components[4] = {
+         GEN6_VFCOMP_STORE_0,
+         GEN6_VFCOMP_STORE_0,
+         GEN6_VFCOMP_STORE_0,
+         (elem->is_integer) ? GEN6_VFCOMP_STORE_1_INT :
+                              GEN6_VFCOMP_STORE_1_FP,
+      };
+
+      /* the first component_count components come from the source */
+      if (elem->component_count >= 1 && elem->component_count <= 4) {
+         for (c = 0; c < elem->component_count; c++)
+            components[c] = GEN6_VFCOMP_STORE_SRC;
+      } else {
+         assert(!"unexpected component count");
+      }
+
+      dw0 = elem->buffer << GEN6_VE_DW0_VB_INDEX__SHIFT |
+            GEN6_VE_DW0_VALID |
+            elem->format << GEN6_VE_DW0_FORMAT__SHIFT |
+            elem->vertex_offset << GEN6_VE_DW0_VB_OFFSET__SHIFT;
+      dw1 = get_gen6_component_controls(dev,
+            components[0], components[1],
+            components[2], components[3]);
+
+      STATIC_ASSERT(ARRAY_SIZE(vf->user_ve[i]) >= 2);
+      vf->user_ve[i][0] = dw0;
+      vf->user_ve[i][1] = dw1;
+   }
+
+   vf->user_ve_count = i;
+
+   /* only the last element, if there is one, can carry the edge flag */
+   last = (i) ? &info->elements[i - 1] : NULL;
+   vf->edge_flag_supported =
+      (last && get_gen6_edge_flag_format(dev, last, &edge_flag_format));
+   if (!vf->edge_flag_supported)
+      return true;
+
+   /* without edge flag enable: dw0/dw1 still hold the last element */
+   vf->last_user_ve[0][0] = dw0;
+   vf->last_user_ve[0][1] = dw1;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 94:
+    *
+    *     "This bit (Edge Flag Enable) must only be ENABLED on the last
+    *      valid VERTEX_ELEMENT structure.
+    *
+    *      When set, Component 0 Control must be set to
+    *      VFCOMP_STORE_SRC, and Component 1-3 Control must be set to
+    *      VFCOMP_NOSTORE."
+    */
+   dw0 = last->buffer << GEN6_VE_DW0_VB_INDEX__SHIFT |
+         GEN6_VE_DW0_VALID |
+         edge_flag_format << GEN6_VE_DW0_FORMAT__SHIFT |
+         GEN6_VE_DW0_EDGE_FLAG_ENABLE |
+         last->vertex_offset << GEN6_VE_DW0_VB_OFFSET__SHIFT;
+   dw1 = get_gen6_component_controls(dev, GEN6_VFCOMP_STORE_SRC,
+         GEN6_VFCOMP_NOSTORE, GEN6_VFCOMP_NOSTORE, GEN6_VFCOMP_NOSTORE);
+
+   /* with edge flag enable */
+   vf->last_user_ve[1][0] = dw0;
+   vf->last_user_ve[1][1] = dw1;
+   return true;
+}
+
+static bool
+vf_set_gen6_vertex_buffer_state(struct ilo_state_vf *vf,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_vf_info *info)
+{
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 6, 7.5);
+
+   /* fill with 0xff bytes: a negative entry means "no first element yet" */
+   memset(vf->vb_to_first_elem, -1, sizeof(vf->vb_to_first_elem));
+   for (i = 0; i < info->element_count; i++) {
+      const struct ilo_state_vf_element_info *elem = &info->elements[i];
+      const struct ilo_state_vf_element_info *first;
+
+      STATIC_ASSERT(ARRAY_SIZE(vf->user_instancing[i]) >= 2);
+      /* instancing enable only */
+      vf->user_instancing[i][0] = (elem->instancing_enable) ?
+         GEN6_VB_DW0_ACCESS_INSTANCEDATA :
+         GEN6_VB_DW0_ACCESS_VERTEXDATA;
+      vf->user_instancing[i][1] = elem->instancing_step_rate;
+
+      /*
+       * Instancing is per VB, not per VE, before Gen8.  Set up a VB-to-VE
+       * mapping as well.
+       */
+      if (vf->vb_to_first_elem[elem->buffer] < 0) {
+         vf->vb_to_first_elem[elem->buffer] = i;
+         continue;
+      }
+
+      first = &info->elements[vf->vb_to_first_elem[elem->buffer]];
+      assert(elem->instancing_enable == first->instancing_enable &&
+             elem->instancing_step_rate == first->instancing_step_rate);
+   }
+
+   return true;
+}
+
+static bool
+vf_set_gen8_3DSTATE_VF_INSTANCING(struct ilo_state_vf *vf,
+                                  const struct ilo_dev *dev,
+                                  const struct ilo_state_vf_info *info)
+{
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   /* Gen8 keeps the instancing state per vertex element */
+   for (i = 0; i < info->element_count; i++) {
+      const struct ilo_state_vf_element_info *elem = &info->elements[i];
+
+      STATIC_ASSERT(ARRAY_SIZE(vf->user_instancing[i]) >= 2);
+      vf->user_instancing[i][1] = elem->instancing_step_rate;
+      vf->user_instancing[i][0] = (!elem->instancing_enable) ? 0 :
+         GEN8_INSTANCING_DW1_ENABLE;
+   }
+   return true;
+}
+
+static uint32_t
+get_gen6_component_zeros(const struct ilo_dev *dev)
+{
+   const enum gen_vf_component zero = GEN6_VFCOMP_STORE_0;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* every component control is GEN6_VFCOMP_STORE_0 */
+   return get_gen6_component_controls(dev,
+         zero, zero, zero, zero);
+}
+
+static uint32_t
+get_gen6_component_ids(const struct ilo_dev *dev,
+                       bool vertexid, bool instanceid)
+{
+   ILO_DEV_ASSERT(dev, 6, 7.5);
+
+   /* component 0 stores the vertex id, component 1 the instance id */
+   return get_gen6_component_controls(dev,
+      (!vertexid) ? GEN6_VFCOMP_STORE_0 : GEN6_VFCOMP_STORE_VID,
+      (!instanceid) ? GEN6_VFCOMP_STORE_0 : GEN6_VFCOMP_STORE_IID,
+      GEN6_VFCOMP_STORE_0, GEN6_VFCOMP_STORE_0);
+}
+
+static bool
+vf_params_set_gen6_internal_ve(struct ilo_state_vf *vf,
+                               const struct ilo_dev *dev,
+                               const struct ilo_state_vf_params_info *params,
+                               uint8_t user_ve_count)
+{
+   const bool prepend_ids =
+      (params->prepend_vertexid || params->prepend_instanceid);
+   const bool need_zeros =
+      (params->prepend_zeros || (!user_ve_count && !prepend_ids));
+   uint32_t dw1[2];
+   uint8_t count = 0, i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * Set up the internal vertex elements: zeros first, then the ids.
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 92:
+    *
+    *     "- At least one VERTEX_ELEMENT_STATE structure must be included.
+    *
+    *      - Inclusion of partial VERTEX_ELEMENT_STATE structures is
+    *        UNDEFINED.
+    *
+    *      - SW must ensure that at least one vertex element is defined prior
+    *        to issuing a 3DPRIMTIVE command, or operation is UNDEFINED.
+    *
+    *      - There are no "holes" allowed in the destination vertex: NOSTORE
+    *        components must be overwritten by subsequent components unless
+    *        they are the trailing DWords of the vertex.  Software must
+    *        explicitly chose some value (probably 0) to be written into
+    *        DWords that would otherwise be "holes".
+    *
+    *      - ...
+    *
+    *      - [DevILK+] Element[0] must be valid."
+    */
+   if (need_zeros)
+      dw1[count++] = get_gen6_component_zeros(dev);
+
+   if (prepend_ids) {
+      /* on Gen8+, just a placeholder for 3DSTATE_VF_SGVS */
+      dw1[count++] = (ilo_dev_gen(dev) >= ILO_GEN(8)) ?
+         get_gen6_component_zeros(dev) :
+         get_gen6_component_ids(dev,
+               params->prepend_vertexid, params->prepend_instanceid);
+   }
+
+   for (i = 0; i < count; i++) {
+      STATIC_ASSERT(ARRAY_SIZE(vf->internal_ve[i]) >= 2);
+      vf->internal_ve[i][0] = GEN6_VE_DW0_VALID;
+      vf->internal_ve[i][1] = dw1[i];
+   }
+
+   vf->internal_ve_count = count;
+   return true;
+}
+
+static bool
+vf_params_set_gen8_3DSTATE_VF_SGVS(struct ilo_state_vf *vf,
+                                   const struct ilo_dev *dev,
+                                   const struct ilo_state_vf_params_info *params)
+{
+   /* the ids use VE index 1 when zeros are prepended, index 0 otherwise */
+   const uint8_t attr = (params->prepend_zeros) ? 1 : 0;
+   uint32_t dw1 = 0;
+
+   ILO_DEV_ASSERT(dev, 8, 8);
+
+   /* the vertex id goes to component 0 */
+   if (params->prepend_vertexid) {
+      dw1 |= GEN8_SGVS_DW1_VID_ENABLE |
+             0 << GEN8_SGVS_DW1_VID_VE_COMP__SHIFT |
+             attr << GEN8_SGVS_DW1_VID_VE_INDEX__SHIFT;
+   }
+
+   /* the instance id goes to component 1 */
+   if (params->prepend_instanceid) {
+      dw1 |= GEN8_SGVS_DW1_IID_ENABLE |
+             1 << GEN8_SGVS_DW1_IID_VE_COMP__SHIFT |
+             attr << GEN8_SGVS_DW1_IID_VE_INDEX__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(vf->sgvs) >= 1);
+   vf->sgvs[0] = dw1;
+   return true;
+}
+
+static uint32_t
+get_gen6_fixed_cut_index(const struct ilo_dev *dev,
+                         enum gen_index_format format)
+{
+   ILO_DEV_ASSERT(dev, 6, 7);
+
+   /* the fixed cut index is all ones in the width of the index format */
+   switch (format) {
+   case GEN6_INDEX_BYTE:   return 0xffu;
+   case GEN6_INDEX_WORD:   return 0xffffu;
+   case GEN6_INDEX_DWORD:  return 0xffffffffu;
+   default:
+      /* not a valid index format; fall back to 32 bits */
+      assert(!"unknown index format");
+      return 0xffffffffu;
+   }
+}
+
+static bool
+get_gen6_cut_index_supported(const struct ilo_dev *dev,
+                             enum gen_3dprim_type topology)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * See the Sandy Bridge PRM, volume 2 part 1, page 80 and the Haswell PRM,
+    * volume 7, page 456.
+    */
+   switch (topology) {
+   case GEN6_3DPRIM_RECTLIST:
+   case GEN6_3DPRIM_TRIFAN_NOSTIPPLE:
+      return false; /* not on any supported gen */
+   case GEN6_3DPRIM_TRIFAN:
+   case GEN6_3DPRIM_QUADLIST:
+   case GEN6_3DPRIM_QUADSTRIP:
+   case GEN6_3DPRIM_POLYGON:
+   case GEN6_3DPRIM_LINELOOP:
+      return (ilo_dev_gen(dev) >= ILO_GEN(7.5)); /* Gen7.5+ only */
+   default:
+      return true;
+   }
+}
+
+static bool
+vf_params_set_gen6_3dstate_index_buffer(struct ilo_state_vf *vf,
+                                        const struct ilo_dev *dev,
+                                        const struct ilo_state_vf_params_info *params)
+{
+   const uint32_t dw0 =
+      (params->cut_index_enable) ? GEN6_IB_DW0_CUT_INDEX_ENABLE : 0;
+
+   ILO_DEV_ASSERT(dev, 6, 7);
+
+   /* cut index only, as in 3DSTATE_VF */
+   if (params->cut_index_enable) {
+      /* the cut index must be the fixed value of the index format */
+      assert(get_gen6_cut_index_supported(dev, params->cv_topology));
+      assert(get_gen6_fixed_cut_index(dev, params->cv_index_format) ==
+            params->cut_index);
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(vf->cut) >= 1);
+   vf->cut[0] = dw0;
+
+   return true;
+}
+
+static bool
+vf_params_set_gen75_3DSTATE_VF(struct ilo_state_vf *vf,
+                               const struct ilo_dev *dev,
+                               const struct ilo_state_vf_params_info *params)
+{
+   const uint32_t dw0 =
+      (params->cut_index_enable) ? GEN75_VF_DW0_CUT_INDEX_ENABLE : 0;
+
+   ILO_DEV_ASSERT(dev, 7.5, 8);
+
+   if (params->cut_index_enable)
+      assert(get_gen6_cut_index_supported(dev, params->cv_topology));
+
+   /* cut[0] is the enable bit, cut[1] the cut index value itself */
+   STATIC_ASSERT(ARRAY_SIZE(vf->cut) >= 2);
+   vf->cut[0] = dw0;
+   vf->cut[1] = params->cut_index;
+
+   return true;
+}
+
+static bool
+vertex_buffer_validate_gen6(const struct ilo_dev *dev,
+                            const struct ilo_state_vertex_buffer_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+   /* all checks below are asserts, so this returns true whenever it returns */
+   if (info->buf)
+      assert(info->offset < info->buf->bo_size && info->size);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 86:
+    *
+    *     "(Buffer Pitch)
+    *      Range  [DevCTG+]: [0,2048] Bytes"
+    */
+   assert(info->stride <= 2048);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 86:
+    *
+    *     "64-bit floating point values must be 64-bit aligned in memory, or
+    *      UNPREDICTABLE data will be fetched. When accessing an element
+    *      containing 64-bit floating point values, the Buffer Starting
+    *      Address and Source Element Offset values must add to a 64-bit
+    *      aligned address, and BufferPitch must be a multiple of 64-bits."
+    */
+   if (info->cv_has_double) {
+      assert(info->stride % 8 == 0);
+      assert((info->offset + info->cv_double_vertex_offset_mod_8) % 8 == 0);
+   }
+
+   return true;
+}
+
+static uint32_t
+vertex_buffer_get_gen6_size(const struct ilo_dev *dev,
+                            const struct ilo_state_vertex_buffer_info *info)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* clamp size to what is left after offset, avoiding offset + size wrap */
+   if (!info->buf || info->offset >= info->buf->bo_size)
+      return 0;
+   return (info->size <= info->buf->bo_size - info->offset) ? info->size :
+      info->buf->bo_size - info->offset;
+}
+
+/* Translate vertex buffer info into the three dwords kept in vb->vb. */
+static bool
+vertex_buffer_set_gen8_vertex_buffer_state(struct ilo_state_vertex_buffer *vb,
+                                           const struct ilo_dev *dev,
+                                           const struct ilo_state_vertex_buffer_info *info)
+{
+   const uint32_t valid_size = vertex_buffer_get_gen6_size(dev, info);
+   uint32_t pitch_and_flags;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!vertex_buffer_validate_gen6(dev, info))
+      return false;
+
+   /* pitch, plus the address-modified bit on Gen7+ and the null bit */
+   pitch_and_flags = info->stride << GEN6_VB_DW0_PITCH__SHIFT;
+   if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      pitch_and_flags |= GEN7_VB_DW0_ADDR_MODIFIED;
+   if (!info->buf)
+      pitch_and_flags |= GEN6_VB_DW0_IS_NULL;
+
+   STATIC_ASSERT(ARRAY_SIZE(vb->vb) >= 3);
+   vb->vb[0] = pitch_and_flags;
+   vb->vb[1] = info->offset;
+
+   /* Gen8+ stores the size; earlier gens the address of the last valid byte */
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))
+      vb->vb[2] = valid_size;
+   else
+      vb->vb[2] = (valid_size) ? info->offset + valid_size - 1 : 0;
+
+   vb->need_bo = (info->buf != NULL);
+
+   return true;
+}
+
+static uint32_t
+get_index_format_size(enum gen_index_format format)
+{
+   /* bytes per index; an unknown format asserts and is sized as a byte */
+   if (format == GEN6_INDEX_DWORD)
+      return 4;
+   if (format == GEN6_INDEX_WORD)
+      return 2;
+   if (format != GEN6_INDEX_BYTE)
+      assert(!"unknown index format");
+   return 1;
+}
+
+/* Assert that an index buffer's offset is index-aligned and inside its bo. */
+static bool
+index_buffer_validate_gen6(const struct ilo_dev *dev,
+                           const struct ilo_state_index_buffer_info *info)
+{
+   const uint32_t format_size = get_index_format_size(info->format);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 79:
+    *
+    *     "This field (Buffer Starting Address) contains the size-aligned (as
+    *      specified by Index Format) Graphics Address of the first element of
+    *      interest within the index buffer."
+    */
+   assert(info->offset % format_size == 0);
+
+   if (info->buf)
+      assert(info->offset < info->buf->bo_size && info->size);
+   return true; /* violations are only caught by the asserts above */
+}
+
+/* Return the usable index data size in bytes, clamped to the end of the bo. */
+static uint32_t
+index_buffer_get_gen6_size(const struct ilo_dev *dev,
+                           const struct ilo_state_index_buffer_info *info)
+{
+   uint32_t size;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* no bo or offset past its end; the subtraction form below cannot wrap */
+   if (!info->buf || info->offset >= info->buf->bo_size)
+      return 0;
+   size = (info->size <= info->buf->bo_size - info->offset) ? info->size :
+      info->buf->bo_size - info->offset;
+
+   /* before Gen8, drop a trailing partial index */
+   if (ilo_dev_gen(dev) < ILO_GEN(8))
+      size -= (size % get_index_format_size(info->format));
+
+   return size;
+}
+
+/* Fill ib->ib[0..2] and ib->need_bo from the index buffer info. */
+static bool
+index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(struct ilo_state_index_buffer *ib,
+                                           const struct ilo_dev *dev,
+                                           const struct ilo_state_index_buffer_info *info)
+{
+   const uint32_t valid_size = index_buffer_get_gen6_size(dev, info);
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (!index_buffer_validate_gen6(dev, info))
+      return false;
+
+   STATIC_ASSERT(ARRAY_SIZE(ib->ib) >= 3);
+   /* the start offset is stored the same way on every generation */
+   ib->ib[1] = info->offset;
+   if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+      ib->ib[0] = info->format << GEN6_IB_DW0_FORMAT__SHIFT;
+      /* address of the last valid byte, or 0 */
+      ib->ib[2] = (valid_size) ? info->offset + valid_size - 1 : 0;
+   } else {
+      ib->ib[0] = info->format << GEN8_IB_DW1_FORMAT__SHIFT;
+      ib->ib[2] = valid_size;
+   }
+
+   ib->need_bo = (info->buf != NULL);
+   return true;
+}
+
+/* Return true when the device generation can use format for vertex elements. */
+bool
+ilo_state_vf_valid_element_format(const struct ilo_dev *dev,
+                                  enum gen_surface_format format)
+{
+   /*
+    * Lowest ILO_GEN() accepting each format (0: never valid), based on:
+    *
+    *  - the Sandy Bridge PRM, volume 4 part 1, page 88-97
+    *  - the Ivy Bridge PRM, volume 2 part 1, page 97-99
+    *  - the Haswell PRM, volume 7, page 467-470
+    */
+   static const int vf_element_formats[] = {
+      [GEN6_FORMAT_R32G32B32A32_FLOAT]       = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32A32_SINT]        = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32A32_UINT]        = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32A32_UNORM]       = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32A32_SNORM]       = ILO_GEN(  1),
+      [GEN6_FORMAT_R64G64_FLOAT]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32A32_SSCALED]     = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32A32_USCALED]     = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32A32_SFIXED]      = ILO_GEN(7.5),
+      [GEN6_FORMAT_R32G32B32_FLOAT]          = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32_SINT]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32_UINT]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32_UNORM]          = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32_SNORM]          = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32_SSCALED]        = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32_USCALED]        = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32B32_SFIXED]         = ILO_GEN(7.5),
+      [GEN6_FORMAT_R16G16B16A16_UNORM]       = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16A16_SNORM]       = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16A16_SINT]        = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16A16_UINT]        = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16A16_FLOAT]       = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32_FLOAT]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32_SINT]              = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32_UINT]              = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32_UNORM]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32_SNORM]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R64_FLOAT]                = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16A16_SSCALED]     = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16A16_USCALED]     = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32_SSCALED]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32_USCALED]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R32G32_SFIXED]            = ILO_GEN(7.5),
+      [GEN6_FORMAT_B8G8R8A8_UNORM]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R10G10B10A2_UNORM]        = ILO_GEN(  1),
+      [GEN6_FORMAT_R10G10B10A2_UINT]         = ILO_GEN(  1),
+      [GEN6_FORMAT_R10G10B10_SNORM_A2_UNORM] = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8A8_UNORM]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8A8_SNORM]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8A8_SINT]            = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8A8_UINT]            = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16_UNORM]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16_SNORM]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16_SINT]              = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16_UINT]              = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16_FLOAT]             = ILO_GEN(  1),
+      [GEN6_FORMAT_B10G10R10A2_UNORM]        = ILO_GEN(7.5),
+      [GEN6_FORMAT_R11G11B10_FLOAT]          = ILO_GEN(  1),
+      [GEN6_FORMAT_R32_SINT]                 = ILO_GEN(  1),
+      [GEN6_FORMAT_R32_UINT]                 = ILO_GEN(  1),
+      [GEN6_FORMAT_R32_FLOAT]                = ILO_GEN(  1),
+      [GEN6_FORMAT_R32_UNORM]                = ILO_GEN(  1),
+      [GEN6_FORMAT_R32_SNORM]                = ILO_GEN(  1),
+      [GEN6_FORMAT_R10G10B10X2_USCALED]      = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8A8_SSCALED]         = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8A8_USCALED]         = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16_SSCALED]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16_USCALED]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R32_SSCALED]              = ILO_GEN(  1),
+      [GEN6_FORMAT_R32_USCALED]              = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8_UNORM]               = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8_SNORM]               = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8_SINT]                = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8_UINT]                = ILO_GEN(  1),
+      [GEN6_FORMAT_R16_UNORM]                = ILO_GEN(  1),
+      [GEN6_FORMAT_R16_SNORM]                = ILO_GEN(  1),
+      [GEN6_FORMAT_R16_SINT]                 = ILO_GEN(  1),
+      [GEN6_FORMAT_R16_UINT]                 = ILO_GEN(  1),
+      [GEN6_FORMAT_R16_FLOAT]                = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8_SSCALED]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8_USCALED]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R16_SSCALED]              = ILO_GEN(  1),
+      [GEN6_FORMAT_R16_USCALED]              = ILO_GEN(  1),
+      [GEN6_FORMAT_R8_UNORM]                 = ILO_GEN(  1),
+      [GEN6_FORMAT_R8_SNORM]                 = ILO_GEN(  1),
+      [GEN6_FORMAT_R8_SINT]                  = ILO_GEN(  1),
+      [GEN6_FORMAT_R8_UINT]                  = ILO_GEN(  1),
+      [GEN6_FORMAT_R8_SSCALED]               = ILO_GEN(  1),
+      [GEN6_FORMAT_R8_USCALED]               = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8_UNORM]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8_SNORM]             = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8_SSCALED]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R8G8B8_USCALED]           = ILO_GEN(  1),
+      [GEN6_FORMAT_R64G64B64A64_FLOAT]       = ILO_GEN(  1),
+      [GEN6_FORMAT_R64G64B64_FLOAT]          = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16_FLOAT]          = ILO_GEN(  6),
+      [GEN6_FORMAT_R16G16B16_UNORM]          = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16_SNORM]          = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16_SSCALED]        = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16_USCALED]        = ILO_GEN(  1),
+      [GEN6_FORMAT_R16G16B16_UINT]           = ILO_GEN(7.5),
+      [GEN6_FORMAT_R16G16B16_SINT]           = ILO_GEN(7.5),
+      [GEN6_FORMAT_R32_SFIXED]               = ILO_GEN(7.5),
+      [GEN6_FORMAT_R10G10B10A2_SNORM]        = ILO_GEN(7.5),
+      [GEN6_FORMAT_R10G10B10A2_USCALED]      = ILO_GEN(7.5),
+      [GEN6_FORMAT_R10G10B10A2_SSCALED]      = ILO_GEN(7.5),
+      [GEN6_FORMAT_R10G10B10A2_SINT]         = ILO_GEN(7.5),
+      [GEN6_FORMAT_B10G10R10A2_SNORM]        = ILO_GEN(7.5),
+      [GEN6_FORMAT_B10G10R10A2_USCALED]      = ILO_GEN(7.5),
+      [GEN6_FORMAT_B10G10R10A2_SSCALED]      = ILO_GEN(7.5),
+      [GEN6_FORMAT_B10G10R10A2_UINT]         = ILO_GEN(7.5),
+      [GEN6_FORMAT_B10G10R10A2_SINT]         = ILO_GEN(7.5),
+      [GEN6_FORMAT_R8G8B8_UINT]              = ILO_GEN(7.5),
+      [GEN6_FORMAT_R8G8B8_SINT]              = ILO_GEN(7.5),
+   };
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   return (format < ARRAY_SIZE(vf_element_formats) &&
+           vf_element_formats[format] && /* unlisted entries are 0 */
+           ilo_dev_gen(dev) >= vf_element_formats[format]);
+}
+
+/* Initialize a zeroed vf from info, using info->data as element storage. */
+bool
+ilo_state_vf_init(struct ilo_state_vf *vf,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_vf_info *info)
+{
+   bool ret = true;
+
+   assert(ilo_is_zeroed(vf, sizeof(*vf)));
+   assert(ilo_is_zeroed(info->data, info->data_size));
+   /* info->data holds element_count user_ve entries, then user_instancing */
+   assert(ilo_state_vf_data_size(dev, info->element_count) <=
+         info->data_size);
+   vf->user_ve = (uint32_t (*)[2]) info->data;
+   vf->user_instancing =
+      (uint32_t (*)[2]) (vf->user_ve + info->element_count);
+
+   ret &= vf_set_gen6_3DSTATE_VERTEX_ELEMENTS(vf, dev, info);
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))
+      ret &= vf_set_gen8_3DSTATE_VF_INSTANCING(vf, dev, info);
+   else
+      ret &= vf_set_gen6_vertex_buffer_state(vf, dev, info);
+
+   ret &= ilo_state_vf_set_params(vf, dev, &info->params);
+
+   assert(ret);
+   return ret;
+}
+
+/* Initialize vf for the given elements with an attribute of zeros prepended. */
+bool
+ilo_state_vf_init_for_rectlist(struct ilo_state_vf *vf,
+                               const struct ilo_dev *dev,
+                               void *data, size_t data_size,
+                               const struct ilo_state_vf_element_info *elements,
+                               uint8_t element_count)
+{
+   struct ilo_state_vf_info rect_info;
+
+   memset(&rect_info, 0, sizeof(rect_info));
+
+   rect_info.elements = elements;
+   rect_info.element_count = element_count;
+   rect_info.data = data;
+   rect_info.data_size = data_size;
+
+   /*
+    * The prepended zeros are for the VUE header:
+    *
+    *   DW0: Reserved: MBZ
+    *   DW1: Render Target Array Index
+    *   DW2: Viewport Index
+    *   DW3: Point Width
+    */
+   rect_info.params.prepend_zeros = true;
+
+   return ilo_state_vf_init(vf, dev, &rect_info);
+}
+
+/* Recompute the parts of vf that depend on params. */
+bool
+ilo_state_vf_set_params(struct ilo_state_vf *vf,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_vf_params_info *params)
+{
+   bool ret = true;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   ret &= vf_params_set_gen6_internal_ve(vf, dev, params, vf->user_ve_count);
+   if (ilo_dev_gen(dev) >= ILO_GEN(8))
+      ret &= vf_params_set_gen8_3DSTATE_VF_SGVS(vf, dev, params);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 94:
+    *
+    *     "Edge flags are supported for the following primitive topology types
+    *      only, otherwise EdgeFlagEnable must not be ENABLED.
+    *
+    *      - 3DPRIM_TRILIST*
+    *      - 3DPRIM_TRISTRIP*
+    *      - 3DPRIM_TRIFAN*
+    *      - 3DPRIM_POLYGON"
+    *
+    *     "[DevSNB]: Edge Flags are not supported for QUADLIST primitives.
+    *      Software may elect to convert QUADLIST primitives to some set of
+    *      corresponding edge-flag-supported primitive types (e.g., POLYGONs)
+    *      prior to submission to the 3D vf."
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 86:
+    *
+    *     "Edge flags are supported for all primitive topology types."
+    *
+    * Both PRMs are confusing...
+    */
+   if (params->last_element_edge_flag) {
+      assert(vf->edge_flag_supported);
+      if (ilo_dev_gen(dev) == ILO_GEN(6))
+         assert(params->cv_topology != GEN6_3DPRIM_QUADLIST);
+   }
+   /* select the copy of the last user VE that matches the edge flag setting */
+   if (vf->edge_flag_supported) {
+      assert(vf->user_ve_count);
+      memcpy(vf->user_ve[vf->user_ve_count - 1],
+            vf->last_user_ve[params->last_element_edge_flag],
+            sizeof(vf->user_ve[vf->user_ve_count - 1]));
+   }
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+      ret &= vf_params_set_gen75_3DSTATE_VF(vf, dev, params);
+   else
+      ret &= vf_params_set_gen6_3dstate_index_buffer(vf, dev, params);
+
+   assert(ret);
+   return ret;
+}
+
+/* Flag every VF command that applies to the device generation as dirty. */
+void
+ilo_state_vf_full_delta(const struct ilo_state_vf *vf,
+                        const struct ilo_dev *dev,
+                        struct ilo_state_vf_delta *delta)
+{
+   const bool gen8_plus = (ilo_dev_gen(dev) >= ILO_GEN(8));
+   const bool gen75_plus = (ilo_dev_gen(dev) >= ILO_GEN(7.5));
+   uint32_t dirty = ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS;
+
+   /* Gen8+ flags SGVS and VF_INSTANCING; older gens flag VERTEX_BUFFERS */
+   dirty |= (gen8_plus) ? (ILO_STATE_VF_3DSTATE_VF_SGVS |
+                           ILO_STATE_VF_3DSTATE_VF_INSTANCING) :
+                          ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS;
+   /* Gen7.5+ flags 3DSTATE_VF; older gens flag INDEX_BUFFER */
+   dirty |= (gen75_plus) ? ILO_STATE_VF_3DSTATE_VF :
+                           ILO_STATE_VF_3DSTATE_INDEX_BUFFER;
+   delta->dirty = dirty;
+}
+
+/* Set delta->dirty to the commands whose state differs between vf and old. */
+void
+ilo_state_vf_get_delta(const struct ilo_state_vf *vf,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_vf *old,
+                       struct ilo_state_vf_delta *delta)
+{
+   /* no shallow copying */
+   assert(vf->user_ve != old->user_ve &&
+          vf->user_instancing != old->user_instancing);
+   delta->dirty = 0;
+
+   if (vf->internal_ve_count != old->internal_ve_count ||
+       vf->user_ve_count != old->user_ve_count ||
+       memcmp(vf->internal_ve, old->internal_ve,
+          sizeof(vf->internal_ve[0]) * vf->internal_ve_count) ||
+       memcmp(vf->user_ve, old->user_ve,
+          sizeof(vf->user_ve[0]) * vf->user_ve_count))
+      delta->dirty |= ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS;
+
+   if (vf->user_ve_count != old->user_ve_count ||
+       memcmp(vf->user_instancing, old->user_instancing,
+          sizeof(vf->user_instancing[0]) * vf->user_ve_count)) {
+      if (ilo_dev_gen(dev) >= ILO_GEN(8))
+         delta->dirty |= ILO_STATE_VF_3DSTATE_VF_INSTANCING;
+      else
+         delta->dirty |= ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS;
+   }
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      if (vf->sgvs[0] != old->sgvs[0])
+         delta->dirty |= ILO_STATE_VF_3DSTATE_VF_SGVS;
+   }
+   /* Gen7.5+ compares both cut dwords; older gens compare cut[0] only */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5)) {
+      if (memcmp(vf->cut, old->cut, sizeof(vf->cut)))
+         delta->dirty |= ILO_STATE_VF_3DSTATE_VF;
+   } else {
+      if (vf->cut[0] != old->cut[0])
+         delta->dirty |= ILO_STATE_VF_3DSTATE_INDEX_BUFFER;
+   }
+}
+
+/**
+ * Set the vertex buffer state from info.  No need to initialize first.
+ */
+bool
+ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb,
+                                 const struct ilo_dev *dev,
+                                 const struct ilo_state_vertex_buffer_info *info)
+{
+   const bool ok = vertex_buffer_set_gen8_vertex_buffer_state(vb, dev, info);
+
+   /* a failure is not expected here; the assert makes it visible */
+   assert(ok);
+
+   return ok;
+}
+
+
+/**
+ * Set the index buffer state from info.  No need to initialize first.
+ */
+bool
+ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_index_buffer_info *info)
+{
+   const bool ok = index_buffer_set_gen8_3DSTATE_INDEX_BUFFER(ib, dev, info);
+
+   /* a failure is not expected here; the assert makes it visible */
+   assert(ok);
+
+   /* same value the helper reported */
+   return ok;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_vf.h b/src/gallium/drivers/ilo/core/ilo_state_vf.h
new file mode 100644
index 0000000..f15c63a
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_vf.h
@@ -0,0 +1,228 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_VF_H
+#define ILO_STATE_VF_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 93:
+ *
+ *     "Up to 34 (DevSNB+) vertex elements are supported."
+ *
+ *     "Up to 33 VBs are supported"
+ *
+ * Reserve two VEs and one VB for internal use.
+ */
+#define ILO_STATE_VF_MAX_ELEMENT_COUNT (34 - 2)
+#define ILO_STATE_VF_MAX_BUFFER_COUNT (33 - 1)
+
+enum ilo_state_vf_dirty_bits { /* bits of ilo_state_vf_delta::dirty */
+   ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS            = (1 << 0),
+   ILO_STATE_VF_3DSTATE_VF_SGVS                    = (1 << 1), /* Gen8+ */
+   ILO_STATE_VF_3DSTATE_VF_INSTANCING              = (1 << 2), /* Gen8+ */
+   ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS             = (1 << 3), /* before Gen8 */
+   ILO_STATE_VF_3DSTATE_VF                         = (1 << 4), /* Gen7.5+ */
+   ILO_STATE_VF_3DSTATE_INDEX_BUFFER               = (1 << 5), /* before Gen7.5 */
+};
+
+/**
+ * Fetch a 128-bit vertex attribute.
+ */
+struct ilo_state_vf_element_info {
+   uint8_t buffer; /* presumably the vertex buffer index -- TODO confirm */
+   uint16_t vertex_offset;
+   enum gen_surface_format format; /* see ilo_state_vf_valid_element_format() */
+
+   uint8_t format_size;
+   uint8_t component_count;
+   bool is_integer;
+
+   /* must be the same for all elements sharing a buffer before Gen8 */
+   bool instancing_enable;
+   uint32_t instancing_step_rate;
+};
+
+/**
+ * VF parameters, as consumed by ilo_state_vf_set_params().
+ */
+struct ilo_state_vf_params_info {
+   enum gen_3dprim_type cv_topology;
+
+   /* prepend an attribute of zeros */
+   bool prepend_zeros;
+
+   /* prepend an attribute of VertexID and/or InstanceID */
+   bool prepend_vertexid;
+   bool prepend_instanceid;
+
+   bool last_element_edge_flag; /* requires ilo_state_vf::edge_flag_supported */
+
+   enum gen_index_format cv_index_format;
+   bool cut_index_enable; /* cv_topology must support the cut index */
+   uint32_t cut_index; /* stored in ilo_state_vf::cut[1] on Gen7.5+ */
+};
+
+/**
+ * Vertex fetch: the input to ilo_state_vf_init().
+ */
+struct ilo_state_vf_info {
+   void *data; /* zeroed storage that the VF state keeps pointing into */
+   size_t data_size; /* at least ilo_state_vf_data_size() bytes */
+
+   const struct ilo_state_vf_element_info *elements;
+   uint8_t element_count;
+
+   struct ilo_state_vf_params_info params;
+};
+
+struct ilo_state_vf {
+   uint32_t (*user_ve)[2]; /* points at ilo_state_vf_info::data */
+   uint32_t (*user_instancing)[2]; /* placed right after user_ve */
+   int8_t vb_to_first_elem[ILO_STATE_VF_MAX_BUFFER_COUNT];
+   uint8_t user_ve_count;
+
+   bool edge_flag_supported;
+   uint32_t last_user_ve[2][2]; /* indexed by last_element_edge_flag */
+
+   /* two VEs are reserved for internal use */
+   uint32_t internal_ve[2][2];
+   uint8_t internal_ve_count;
+
+   uint32_t sgvs[1]; /* only compared on Gen8+ */
+
+   uint32_t cut[2]; /* cut[1] holds the cut index on Gen7.5+ */
+};
+
+struct ilo_state_vf_delta {
+   uint32_t dirty; /* mask of enum ilo_state_vf_dirty_bits */
+};
+
+struct ilo_buffer;
+
+struct ilo_state_vertex_buffer_info {
+   const struct ilo_buffer *buf; /* may be NULL */
+   uint32_t offset; /* must be inside the bo when buf is set */
+   uint32_t size; /* must be non-zero when buf is set */
+
+   uint16_t stride; /* at most 2048; a multiple of 8 with doubles */
+
+   /* doubles must be at 64-bit aligned addresses */
+   bool cv_has_double;
+   uint8_t cv_double_vertex_offset_mod_8;
+};
+
+struct ilo_state_vertex_buffer {
+   uint32_t vb[3]; /* pitch and flags, offset, size (Gen8+) or end address */
+
+   bool need_bo; /* true when the info had a non-NULL buf */
+
+   /* managed by users */
+   struct intel_bo *bo;
+};
+
+struct ilo_state_index_buffer_info {
+   const struct ilo_buffer *buf; /* may be NULL */
+   uint32_t offset; /* a multiple of the index size; inside the bo */
+   uint32_t size; /* must be non-zero when buf is set */
+
+   enum gen_index_format format;
+};
+
+struct ilo_state_index_buffer {
+   uint32_t ib[3]; /* format, offset, size (Gen8+) or end address */
+
+   bool need_bo; /* true when the info had a non-NULL buf */
+
+   /* managed by users */
+   struct intel_bo *bo;
+};
+
+/* Bytes of ilo_state_vf_info::data needed for element_count elements. */
+static inline size_t
+ilo_state_vf_data_size(const struct ilo_dev *dev, uint8_t element_count)
+{
+   const struct ilo_state_vf *st = NULL; /* only used inside sizeof */
+   return (sizeof(*st->user_ve) + sizeof(*st->user_instancing)) * element_count;
+}
+
+bool
+ilo_state_vf_valid_element_format(const struct ilo_dev *dev,
+                                  enum gen_surface_format format);
+
+bool
+ilo_state_vf_init(struct ilo_state_vf *vf,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_vf_info *info);
+
+bool
+ilo_state_vf_init_for_rectlist(struct ilo_state_vf *vf,
+                               const struct ilo_dev *dev,
+                               void *data, size_t data_size,
+                               const struct ilo_state_vf_element_info *elements,
+                               uint8_t element_count);
+
+bool
+ilo_state_vf_set_params(struct ilo_state_vf *vf,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_vf_params_info *params);
+
+/**
+ * Return the VUE attribute count: user elements plus internal elements.
+ */
+static inline uint8_t
+ilo_state_vf_get_attr_count(const struct ilo_state_vf *vf)
+{
+   return vf->user_ve_count + vf->internal_ve_count;
+}
+
+void
+ilo_state_vf_full_delta(const struct ilo_state_vf *vf,
+                        const struct ilo_dev *dev,
+                        struct ilo_state_vf_delta *delta);
+
+void
+ilo_state_vf_get_delta(const struct ilo_state_vf *vf,
+                       const struct ilo_dev *dev,
+                       const struct ilo_state_vf *old,
+                       struct ilo_state_vf_delta *delta);
+
+bool
+ilo_state_vertex_buffer_set_info(struct ilo_state_vertex_buffer *vb,
+                                 const struct ilo_dev *dev,
+                                 const struct ilo_state_vertex_buffer_info *info);
+
+bool
+ilo_state_index_buffer_set_info(struct ilo_state_index_buffer *ib,
+                                const struct ilo_dev *dev,
+                                const struct ilo_state_index_buffer_info *info);
+
+#endif /* ILO_STATE_VF_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_viewport.c b/src/gallium/drivers/ilo/core/ilo_state_viewport.c
new file mode 100644
index 0000000..aae5733
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_viewport.c
@@ -0,0 +1,378 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "ilo_debug.h"
+#include "ilo_state_viewport.h"
+
+static void
+viewport_matrix_get_gen6_guardband(const struct ilo_dev *dev,
+                                   const struct ilo_state_viewport_matrix_info *mat,
+                                   float *min_gbx, float *max_gbx,
+                                   float *min_gby, float *max_gby)
+{
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 234:
+    *
+    *     "Per-Device Guardband Extents
+    *
+    *       - Supported X,Y ScreenSpace "Guardband" Extent: [-16K,16K-1]
+    *       - Maximum Post-Clamp Delta (X or Y): 16K"
+    *
+    *     "In addition, in order to be correctly rendered, objects must have a
+    *      screenspace bounding box not exceeding 8K in the X or Y direction.
+    *      This additional restriction must also be comprehended by software,
+    *      i.e., enforced by use of clipping."
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 248:
+    *
+    *     "Per-Device Guardband Extents
+    *
+    *       - Supported X,Y ScreenSpace "Guardband" Extent: [-32K,32K-1]
+    *       - Maximum Post-Clamp Delta (X or Y): N/A"
+    *
+    *     "In addition, in order to be correctly rendered, objects must have a
+    *      screenspace bounding box not exceeding 8K in the X or Y direction.
+    *      This additional restriction must also be comprehended by software,
+    *      i.e., enforced by use of clipping."
+    *
+    * Combined, the bounding box of any object can not exceed 8K in both
+    * width and height.
+    *
+    * Below we set the guardband as a square of length 8K, centered at where
+    * the viewport is.  This makes sure all objects passing the GB test are
+    * valid to the renderer, and those failing the XY clipping have a
+    * better chance of passing the GB test.
+    */
+   const int max_extent = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 32768 : 16384;
+   const int half_len = 8192 / 2;
+   int center_x = (int) mat->translate[0];
+   int center_y = (int) mat->translate[1];
+   float scale_x, scale_y;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /* keep the guardband edges within [-max_extent, max_extent - 1] */
+   if (center_x - half_len < -max_extent)
+      center_x = -max_extent + half_len;
+   else if (center_x + half_len > max_extent - 1)
+      center_x = max_extent - 1 - half_len;
+
+   if (center_y - half_len < -max_extent)
+      center_y = -max_extent + half_len;
+   else if (center_y + half_len > max_extent - 1)
+      center_y = max_extent - 1 - half_len;
+
+   scale_x = fabsf(mat->scale[0]);
+   scale_y = fabsf(mat->scale[1]);
+   /*
+    * From the Haswell PRM, volume 2d, page 292-293:
+    *
+    *     "Note: Minimum allowed value for this field (X/Y Min Clip Guardband)
+    *      is -16384."
+    *
+    *     "Note: Maximum allowed value for this field (X/Y Max Clip Guardband)
+    *      is 16383."
+    *
+    * Avoid small scales.
+    */
+   if (scale_x < 1.0f)
+      scale_x = 1.0f;
+   if (scale_y < 1.0f)
+      scale_y = 1.0f;
+
+   /* in NDC space */
+   *min_gbx = ((float) (center_x - half_len) - mat->translate[0]) / scale_x;
+   *max_gbx = ((float) (center_x + half_len) - mat->translate[0]) / scale_x;
+   *min_gby = ((float) (center_y - half_len) - mat->translate[1]) / scale_y;
+   *max_gby = ((float) (center_y + half_len) - mat->translate[1]) / scale_y;
+}
+
+/* Map -1 and +1 through |scale| and translate of the given axis. */
+static void
+viewport_matrix_get_extent(const struct ilo_state_viewport_matrix_info *mat,
+                           int axis, float *min, float *max)
+{
+   const float half_extent = fabsf(mat->scale[axis]);
+   *min = mat->translate[axis] - half_extent;
+   *max = mat->translate[axis] + half_extent;
+}
+
+/* Build the 16-dword entry of each viewport matrix and copy it to vp->sf_clip. */
+static bool
+viewport_matrix_set_gen7_SF_CLIP_VIEWPORT(struct ilo_state_viewport *vp,
+                                          const struct ilo_dev *dev,
+                                          const struct ilo_state_viewport_matrix_info *matrices,
+                                          uint8_t count)
+{
+   uint8_t i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (i = 0; i < count; i++) {
+      const struct ilo_state_viewport_matrix_info *mat = &matrices[i];
+      float min_gbx, max_gbx, min_gby, max_gby;
+      uint32_t dw[16];
+
+      viewport_matrix_get_gen6_guardband(dev, mat,
+            &min_gbx, &max_gbx, &min_gby, &max_gby);
+
+      dw[0] = fui(mat->scale[0]);
+      dw[1] = fui(mat->scale[1]);
+      dw[2] = fui(mat->scale[2]);
+      dw[3] = fui(mat->translate[0]);
+      dw[4] = fui(mat->translate[1]);
+      dw[5] = fui(mat->translate[2]);
+      dw[6] = 0;
+      dw[7] = 0;
+      /* guardband limits from viewport_matrix_get_gen6_guardband() */
+      dw[8] = fui(min_gbx);
+      dw[9] = fui(max_gbx);
+      dw[10] = fui(min_gby);
+      dw[11] = fui(max_gby);
+
+      if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+         float min_x, max_x, min_y, max_y;
+
+         viewport_matrix_get_extent(mat, 0, &min_x, &max_x);
+         viewport_matrix_get_extent(mat, 1, &min_y, &max_y);
+
+         dw[12] = fui(min_x);
+         dw[13] = fui(max_x - 1.0f);
+         dw[14] = fui(min_y);
+         dw[15] = fui(max_y - 1.0f);
+      } else {
+         dw[12] = 0;
+         dw[13] = 0;
+         dw[14] = 0;
+         dw[15] = 0;
+      }
+
+      STATIC_ASSERT(ARRAY_SIZE(vp->sf_clip[i]) >= 16);
+      memcpy(vp->sf_clip[i], dw, sizeof(dw));
+   }
+   return true;
+}
+
+/* Store the min/max Z extent of each viewport, passed through fui(), in vp->cc. */
+static bool
+viewport_matrix_set_gen6_CC_VIEWPORT(struct ilo_state_viewport *vp,
+                                     const struct ilo_dev *dev,
+                                     const struct ilo_state_viewport_matrix_info *matrices,
+                                     uint8_t count)
+{
+   uint8_t idx;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (idx = 0; idx < count; idx++) {
+      float z_lo, z_hi;
+
+      /* axis 2 is Z */
+      viewport_matrix_get_extent(matrices + idx, 2, &z_lo, &z_hi);
+
+      STATIC_ASSERT(ARRAY_SIZE(vp->cc[idx]) >= 2);
+      vp->cc[idx][0] = fui(z_lo);
+      vp->cc[idx][1] = fui(z_hi);
+   }
+   return true;
+}
+
+/* Pack each scissor rectangle, clamped to the device limit, into vp->scissor. */
+static bool
+viewport_scissor_set_gen6_SCISSOR_RECT(struct ilo_state_viewport *vp,
+                                       const struct ilo_dev *dev,
+                                       const struct ilo_state_viewport_scissor_info *scissors,
+                                       uint8_t count)
+{
+   const uint16_t limit = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
+   uint8_t idx;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   for (idx = 0; idx < count; idx++) {
+      const struct ilo_state_viewport_scissor_info *rect = scissors + idx;
+
+      /* coordinates at or beyond the limit are pulled back to limit - 1 */
+      const uint16_t x0 = (rect->min_x < limit) ? rect->min_x : limit - 1;
+      const uint16_t y0 = (rect->min_y < limit) ? rect->min_y : limit - 1;
+      const uint16_t x1 = (rect->max_x < limit) ? rect->max_x : limit - 1;
+      const uint16_t y1 = (rect->max_y < limit) ? rect->max_y : limit - 1;
+
+      const uint32_t dw0 = y0 << GEN6_SCISSOR_DW0_MIN_Y__SHIFT |
+                           x0 << GEN6_SCISSOR_DW0_MIN_X__SHIFT;
+      const uint32_t dw1 = y1 << GEN6_SCISSOR_DW1_MAX_Y__SHIFT |
+                           x1 << GEN6_SCISSOR_DW1_MAX_X__SHIFT;
+
+      STATIC_ASSERT(ARRAY_SIZE(vp->scissor[idx]) >= 2);
+      vp->scissor[idx][0] = dw0;
+      vp->scissor[idx][1] = dw1;
+   }
+
+   return true;
+}
+
+/* Adopt info->data as storage, size the array, and apply info->params. */
+bool
+ilo_state_viewport_init(struct ilo_state_viewport *vp,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_viewport_info *info)
+{
+   /* how many viewports fit in the caller's buffer */
+   const size_t capacity =
+      info->data_size / ilo_state_viewport_data_size(dev, 1);
+
+   assert(ilo_is_zeroed(vp, sizeof(*vp)));
+   assert(ilo_is_zeroed(info->data, info->data_size));
+
+   vp->data = info->data;
+   vp->array_size = (capacity < ILO_STATE_VIEWPORT_MAX_COUNT) ?
+      capacity : ILO_STATE_VIEWPORT_MAX_COUNT;
+
+   return ilo_state_viewport_set_params(vp, dev, &info->params, false);
+}
+
+/* Initialize vp with storage only; the parameters are all zero (count 0). */
+bool
+ilo_state_viewport_init_data_only(struct ilo_state_viewport *vp,
+                                  const struct ilo_dev *dev,
+                                  void *data, size_t data_size)
+{
+   struct ilo_state_viewport_info storage_only;
+
+   memset(&storage_only, 0, sizeof(storage_only));
+   storage_only.data = data;
+   storage_only.data_size = data_size;
+   return ilo_state_viewport_init(vp, dev, &storage_only);
+}
+
+/* Initialize vp with one unit-scale viewport and an all-zero scissor. */
+bool
+ilo_state_viewport_init_for_rectlist(struct ilo_state_viewport *vp,
+                                     const struct ilo_dev *dev,
+                                     void *data, size_t data_size)
+{
+   struct ilo_state_viewport_matrix_info identity;
+   struct ilo_state_viewport_scissor_info zero_scissor;
+   struct ilo_state_viewport_info init_info;
+   int axis;
+
+   memset(&identity, 0, sizeof(identity));
+   for (axis = 0; axis < 3; axis++)
+      identity.scale[axis] = 1.0f;
+   memset(&zero_scissor, 0, sizeof(zero_scissor));
+
+   memset(&init_info, 0, sizeof(init_info));
+   init_info.data = data;
+   init_info.data_size = data_size;
+   init_info.params.matrices = &identity;
+   init_info.params.scissors = &zero_scissor;
+   init_info.params.count = 1;
+
+   return ilo_state_viewport_init(vp, dev, &init_info);
+}
+
+/* Carve vp->data into sf_clip[count], cc[count], scissor[count], in order. */
+static void
+viewport_set_count(struct ilo_state_viewport *vp,
+                   const struct ilo_dev *dev,
+                   uint8_t count)
+{
+   assert(count <= vp->array_size);
+   vp->count = count;
+   vp->sf_clip = (uint32_t (*)[16]) vp->data;
+   vp->cc = (uint32_t (*)[2]) &vp->sf_clip[count];
+   vp->scissor = (uint32_t (*)[2]) &vp->cc[count];
+}
+
+/*
+ * Update the viewport state from params.  With scissors_only, only the
+ * scissor rectangles are rewritten and the count must not change.
+ */
+bool
+ilo_state_viewport_set_params(struct ilo_state_viewport *vp,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_viewport_params_info *params,
+                              bool scissors_only)
+{
+   const uint8_t n = params->count;
+   bool ok = true;
+
+   if (!scissors_only) {
+      viewport_set_count(vp, dev, n);
+
+      ok &= viewport_matrix_set_gen7_SF_CLIP_VIEWPORT(vp, dev,
+            params->matrices, n);
+      ok &= viewport_matrix_set_gen6_CC_VIEWPORT(vp, dev, params->matrices, n);
+   } else {
+      assert(vp->count == n);
+   }
+
+   ok &= viewport_scissor_set_gen6_SCISSOR_RECT(vp, dev, params->scissors, n);
+
+   assert(ok);
+   return ok;
+}
+
+/* Mark every viewport state group dirty. */
+void
+ilo_state_viewport_full_delta(const struct ilo_state_viewport *vp,
+                              const struct ilo_dev *dev,
+                              struct ilo_state_viewport_delta *delta)
+{
+   delta->dirty = ILO_STATE_VIEWPORT_SCISSOR_RECT |
+      ILO_STATE_VIEWPORT_CC_VIEWPORT | ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT;
+}
+
+/* Compare vp against old and report which state groups differ. */
+void
+ilo_state_viewport_get_delta(const struct ilo_state_viewport *vp,
+                             const struct ilo_dev *dev,
+                             const struct ilo_state_viewport *old,
+                             struct ilo_state_viewport_delta *delta)
+{
+   const size_t n = vp->count;
+   uint32_t changed = 0;
+
+   /* the two states must not share storage */
+   assert(vp->data != old->data);
+
+   /* a different count marks everything dirty */
+   if (n != old->count) {
+      ilo_state_viewport_full_delta(vp, dev, delta);
+      return;
+   }
+
+   /* same count: compare each array byte for byte */
+   if (memcmp(vp->sf_clip, old->sf_clip, sizeof(vp->sf_clip[0]) * n) != 0)
+      changed |= ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT;
+   if (memcmp(vp->cc, old->cc, sizeof(vp->cc[0]) * n) != 0)
+      changed |= ILO_STATE_VIEWPORT_CC_VIEWPORT;
+   if (memcmp(vp->scissor, old->scissor, sizeof(vp->scissor[0]) * n) != 0)
+      changed |= ILO_STATE_VIEWPORT_SCISSOR_RECT;
+
+   delta->dirty = changed;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_viewport.h b/src/gallium/drivers/ilo/core/ilo_state_viewport.h
new file mode 100644
index 0000000..b42ad65
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_viewport.h
@@ -0,0 +1,132 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_VIEWPORT_H
+#define ILO_STATE_VIEWPORT_H
+
+#include "genhw/genhw.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+/*
+ * From the Sandy Bridge PRM, volume 2 part 1, page 38:
+ *
+ *     "... 16 sets of viewport (VP) state parameters in the Clip unit's
+ *      VertexClipTest function and in the SF unit's ViewportMapping and
+ *      Scissor functions."
+ */
+#define ILO_STATE_VIEWPORT_MAX_COUNT 16
+
+enum ilo_state_viewport_dirty_bits {
+   ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT             = (1 << 0), /* sf_clip[] */
+   ILO_STATE_VIEWPORT_CC_VIEWPORT                  = (1 << 1), /* cc[] */
+   ILO_STATE_VIEWPORT_SCISSOR_RECT                 = (1 << 2), /* scissor[] */
+};
+
+struct ilo_state_viewport_matrix_info {
+   float scale[3];      /* presumably per axis, 0/1/2 = x/y/z -- confirm */
+   float translate[3];  /* presumably indexed like scale[] -- confirm */
+};
+
+struct ilo_state_viewport_scissor_info {
+   /* all inclusive; clamped to a per-generation maximum when packed */
+   uint16_t min_x;
+   uint16_t min_y;
+   uint16_t max_x;
+   uint16_t max_y;
+};
+
+struct ilo_state_viewport_params_info {
+   const struct ilo_state_viewport_matrix_info *matrices;   /* count entries */
+   const struct ilo_state_viewport_scissor_info *scissors;  /* count entries */
+   uint8_t count;
+};
+
+struct ilo_state_viewport_info {
+   void *data;        /* storage for the state; asserted to be zeroed at init */
+   size_t data_size;  /* size of data in bytes */
+
+   struct ilo_state_viewport_params_info params;
+};
+
+struct ilo_state_viewport {
+   void *data;          /* caller-provided storage backing the arrays below */
+   uint8_t array_size;  /* viewports that fit in data, capped at MAX_COUNT */
+
+   uint8_t count;       /* viewports currently set */
+   uint32_t (*sf_clip)[16];   /* count entries at the start of data */
+   uint32_t (*cc)[2];         /* count entries, right after sf_clip */
+   uint32_t (*scissor)[2];    /* count entries, right after cc */
+};
+
+struct ilo_state_viewport_delta {
+   uint32_t dirty;   /* mask of enum ilo_state_viewport_dirty_bits */
+};
+
+/* Bytes of storage needed to hold array_size viewports (all three arrays). */
+static inline size_t
+ilo_state_viewport_data_size(const struct ilo_dev *dev, uint8_t array_size)
+{
+   const struct ilo_state_viewport *st = NULL;
+   return array_size *
+      (sizeof(*st->sf_clip) + sizeof(*st->cc) + sizeof(*st->scissor));
+}
+
+bool
+ilo_state_viewport_init(struct ilo_state_viewport *vp,
+                        const struct ilo_dev *dev,
+                        const struct ilo_state_viewport_info *info);
+
+bool
+ilo_state_viewport_init_data_only(struct ilo_state_viewport *vp,
+                                  const struct ilo_dev *dev,
+                                  void *data, size_t data_size);
+
+bool
+ilo_state_viewport_init_for_rectlist(struct ilo_state_viewport *vp,
+                                     const struct ilo_dev *dev,
+                                     void *data, size_t data_size);
+
+bool
+ilo_state_viewport_set_params(struct ilo_state_viewport *vp,
+                              const struct ilo_dev *dev,
+                              const struct ilo_state_viewport_params_info *params,
+                              bool scissors_only);
+
+void
+ilo_state_viewport_full_delta(const struct ilo_state_viewport *vp,
+                              const struct ilo_dev *dev,
+                              struct ilo_state_viewport_delta *delta);
+
+void
+ilo_state_viewport_get_delta(const struct ilo_state_viewport *vp,
+                             const struct ilo_dev *dev,
+                             const struct ilo_state_viewport *old,
+                             struct ilo_state_viewport_delta *delta);
+
+#endif /* ILO_STATE_VIEWPORT_H */
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.c b/src/gallium/drivers/ilo/core/ilo_state_zs.c
new file mode 100644
index 0000000..901fedb
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.c
@@ -0,0 +1,727 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "intel_winsys.h"
+
+#include "ilo_debug.h"
+#include "ilo_image.h"
+#include "ilo_state_zs.h"
+
+/* Program a null (SURFTYPE_NULL, D32_FLOAT) depth buffer into zs->depth[]. */
+static bool
+zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
+                                      const struct ilo_dev *dev)
+{
+   const enum gen_depth_format null_format = GEN6_ZFORMAT_D32_FLOAT;
+   int i;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   STATIC_ASSERT(ARRAY_SIZE(zs->depth) >= 5);
+
+   /* the surface type and format go in the first dword */
+   if (ilo_dev_gen(dev) >= ILO_GEN(7)) {
+      zs->depth[0] = GEN6_SURFTYPE_NULL << GEN7_DEPTH_DW1_TYPE__SHIFT |
+                     null_format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
+   } else {
+      zs->depth[0] = GEN6_SURFTYPE_NULL << GEN6_DEPTH_DW1_TYPE__SHIFT |
+                     GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT |
+                     null_format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
+   }
+
+   /* the remaining dwords are all zero */
+   for (i = 1; i < 5; i++)
+      zs->depth[i] = 0;
+
+   zs->depth_format = null_format;
+
+   return true;
+}
+
+/* Map a pipe texture target to a surface type; cubes and rects count as 2D. */
+static enum gen_surface_type
+get_gen6_surface_type(const struct ilo_dev *dev, const struct ilo_image *img)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (img->target) {
+   case PIPE_TEXTURE_3D:
+      return GEN6_SURFTYPE_3D;
+   case PIPE_TEXTURE_1D: case PIPE_TEXTURE_1D_ARRAY:
+      return GEN6_SURFTYPE_1D;
+   case PIPE_TEXTURE_2D: case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_CUBE: case PIPE_TEXTURE_CUBE_ARRAY:
+   case PIPE_TEXTURE_RECT:
+      return GEN6_SURFTYPE_2D;
+   default:
+      break;
+   }
+
+   assert(!"unknown texture target");
+   return GEN6_SURFTYPE_NULL;
+}
+
+/*
+ * Map a pipe depth format to the hardware depth format.  The combined
+ * depth/stencil formats are only accepted prior to Gen7.
+ */
+static enum gen_depth_format
+get_gen6_depth_format(const struct ilo_dev *dev, const struct ilo_image *img)
+{
+   bool allow_combined;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   allow_combined = (ilo_dev_gen(dev) < ILO_GEN(7));
+
+   switch (img->format) {
+   case PIPE_FORMAT_Z32_FLOAT:
+      return GEN6_ZFORMAT_D32_FLOAT;
+   case PIPE_FORMAT_Z24X8_UNORM:
+      return GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+   case PIPE_FORMAT_Z16_UNORM:
+      return GEN6_ZFORMAT_D16_UNORM;
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+      if (allow_combined)
+         return GEN6_ZFORMAT_D32_FLOAT_S8X24_UINT;
+      break;
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+      if (allow_combined)
+         return GEN6_ZFORMAT_D24_UNORM_S8_UINT;
+      break;
+   default:
+      break;
+   }
+
+   assert(!"unknown depth format");
+   return GEN6_ZFORMAT_D32_FLOAT;
+}
+
+/* Debug-only sanity checks on info; this always returns true. */
+static bool
+zs_validate_gen6(const struct ilo_dev *dev,
+                 const struct ilo_state_zs_info *info)
+{
+   const struct ilo_image *img = (info->z_img) ? info->z_img : info->s_img;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 315:
+    *
+    *      The stencil buffer has a format of S8_UINT, and shares Surface
+    *      Type, Height, Width, and Depth, Minimum Array Element, Render
+    *      Target View Extent, Depth Coordinate Offset X/Y, LOD, and Depth
+    *      Buffer Object Control State fields of the depth buffer.
+    *
+    * This only needs checking when two distinct images are bound.
+    */
+   if (info->z_img && info->s_img && info->z_img != info->s_img) {
+      assert(info->z_img->target == info->s_img->target &&
+             info->z_img->width0 == info->s_img->width0 &&
+             info->z_img->height0 == info->s_img->height0 &&
+             info->z_img->depth0 == info->s_img->depth0);
+   }
+
+   assert(info->level < img->level_count);
+   assert(img->bo_stride);
+
+   assert(!info->hiz_enable ||
+          (info->z_img && ilo_image_can_enable_aux(info->z_img, info->level)));
+
+   if (info->is_cube_map) {
+      assert(get_gen6_surface_type(dev, img) == GEN6_SURFTYPE_2D);
+
+      /*
+       * From the Sandy Bridge PRM, volume 2 part 1, page 323:
+       *
+       *     "For cube maps, Width must be set equal to Height."
+       */
+      assert(img->width0 == img->height0);
+   }
+
+   /* a depth image must be Y-tiled and a stencil image W-tiled */
+   assert(!info->z_img || info->z_img->tiling == GEN6_TILING_Y);
+   assert(!info->s_img || info->s_img->tiling == GEN8_TILING_W);
+
+   return true;
+}
+
+/* Report the largest width/height allowed for the image's surface type. */
+static void
+get_gen6_max_extent(const struct ilo_dev *dev,
+                    const struct ilo_image *img,
+                    uint16_t *max_w, uint16_t *max_h)
+{
+   const uint16_t max_2d = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 16384 : 8192;
+   uint16_t w = 1, h = 1;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   switch (get_gen6_surface_type(dev, img)) {
+   case GEN6_SURFTYPE_1D:
+      w = max_2d;
+      break;
+   case GEN6_SURFTYPE_2D:
+      w = h = max_2d;
+      break;
+   case GEN6_SURFTYPE_3D:
+      w = h = 2048;
+      break;
+   default:
+      assert(!"invalid surface type");
+      break;
+   }
+
+   *max_w = w;
+   *max_h = h;
+}
+
+static void
+get_gen6_hiz_alignments(const struct ilo_dev *dev,
+                        const struct ilo_image *img,
+                        uint16_t *align_w, uint16_t *align_h)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 313:
+    *
+    *     "A rectangle primitive representing the clear area is delivered. The
+    *      primitive must adhere to the following restrictions on size:
+    *
+    *      - If Number of Multisamples is NUMSAMPLES_1, the rectangle must be
+    *        aligned to an 8x4 pixel block relative to the upper left corner
+    *        of the depth buffer, and contain an integer number of these pixel
+    *        blocks, and all 8x4 pixels must be lit.
+    *      - If Number of Multisamples is NUMSAMPLES_4, the rectangle must be
+    *        aligned to a 4x2 pixel block (8x4 sample block) relative to the
+    *        upper left corner of the depth buffer, and contain an integer
+    *        number of these pixel blocks, and all samples of the 4x2 pixels
+    *        must be lit
+    *      - If Number of Multisamples is NUMSAMPLES_8, the rectangle must be
+    *        aligned to a 2x2 pixel block (8x4 sample block) relative to the
+    *        upper left corner of the depth buffer, and contain an integer
+    *        number of these pixel blocks, and all samples of the 2x2 pixels
+    *        must be list."
+    *
+    * Experiments on Gen7.5 show that HiZ resolve has the same 8x4 sample
+    * block requirement, so to be safe every HiZ-enabled level must be
+    * aligned.  The values returned below are that block size in pixels.
+    */
+   switch (img->sample_count) {   /* every case covers 32 samples */
+   case 1:
+      *align_w = 8;
+      *align_h = 4;
+      break;
+   case 2:   /* not listed in the PRM excerpt above */
+      *align_w = 4;
+      *align_h = 4;
+      break;
+   case 4:
+      *align_w = 4;
+      *align_h = 2;
+      break;
+   case 8:
+      *align_w = 2;
+      *align_h = 2;
+      break;
+   case 16:  /* not listed in the PRM excerpt above */
+      *align_w = 2;
+      *align_h = 1;
+      break;
+   default:
+      assert(!"unknown sample count");
+      *align_w = 1;
+      *align_h = 1;
+      break;
+   }
+}
+
+/* Compute the width and height (both minus one) for the depth buffer state. */
+static bool
+zs_get_gen6_depth_extent(const struct ilo_dev *dev,
+                         const struct ilo_state_zs_info *info,
+                         uint16_t *width, uint16_t *height)
+{
+   const struct ilo_image *img = (info->z_img) ? info->z_img : info->s_img;
+   uint16_t extent_w = img->width0;
+   uint16_t extent_h = img->height0;
+   uint16_t limit_w, limit_h;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   if (info->hiz_enable) {
+      uint16_t block_w, block_h;
+
+      get_gen6_hiz_alignments(dev, info->z_img, &block_w, &block_h);
+
+      /*
+       * We want to force 8x4 alignment, but we can do so only for level 0 and
+       * only when it is padded.  ilo_image should know all these.
+       */
+      assert(!info->level ||
+             (extent_w % block_w == 0 && extent_h % block_h == 0));
+
+      extent_w = align(extent_w, block_w);
+      extent_h = align(extent_h, block_h);
+   }
+
+   get_gen6_max_extent(dev, img, &limit_w, &limit_h);
+   assert(extent_w && extent_h);
+   assert(extent_w <= limit_w && extent_h <= limit_h);
+
+   *width = extent_w - 1;
+   *height = extent_h - 1;
+   return true;
+}
+
+static bool
+zs_get_gen6_depth_slices(const struct ilo_dev *dev,
+                         const struct ilo_state_zs_info *info,
+                         uint16_t *depth, uint16_t *min_array_elem,
+                         uint16_t *rt_view_extent)
+{
+   const struct ilo_image *img = (info->z_img) ? info->z_img : info->s_img;
+   uint16_t max_slice, d;   /* slice bound; value for the Depth field */
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 325:
+    *
+    *     "This field (Depth) specifies the total number of levels for a
+    *      volume texture or the number of array elements allowed to be
+    *      accessed starting at the Minimum Array Element for arrayed
+    *      surfaces. If the volume texture is MIP-mapped, this field specifies
+    *      the depth of the base MIP level."
+    */
+   switch (get_gen6_surface_type(dev, img)) {
+   case GEN6_SURFTYPE_1D:
+   case GEN6_SURFTYPE_2D:
+      max_slice = (ilo_dev_gen(dev) >= ILO_GEN(7)) ? 2048 : 512;
+
+      assert(img->array_size <= max_slice);
+      max_slice = img->array_size;   /* from here on, bound by the image */
+
+      d = info->slice_count;
+      if (info->is_cube_map) {
+         /*
+          * Minimum Array Element and Depth must be 0; Render Target View
+          * Extent is ignored.
+          */
+         if (info->slice_base || d != 6) {
+            ilo_warn("no cube array dpeth buffer\n");
+            return false;
+         }
+
+         d /= 6;   /* d was just checked to be 6, so this yields 1 */
+      }
+      break;
+   case GEN6_SURFTYPE_3D:
+      max_slice = 2048;
+
+      assert(img->depth0 <= max_slice);
+      max_slice = u_minify(img->depth0, info->level);
+
+      d = img->depth0;
+      break;
+   default:
+      assert(!"invalid surface type");
+      return false;
+      break;
+   }
+
+   if (!info->slice_count ||
+       info->slice_base + info->slice_count > max_slice) {
+      ilo_warn("invalid slice range\n");
+      return false;
+   }
+
+   assert(d);
+   *depth = d - 1;   /* stored minus one */
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 325:
+    *
+    *     "For 1D and 2D Surfaces:
+    *      This field (Minimum Array Element) indicates the minimum array
+    *      element that can be accessed as part of this surface. The delivered
+    *      array index is added to this field before being used to address the
+    *      surface.
+    *
+    *      For 3D Surfaces:
+    *      This field indicates the minimum `R' coordinate on the LOD
+    *      currently being rendered to.  This field is added to the delivered
+    *      array index before it is used to address the surface.
+    *
+    *      For Other Surfaces:
+    *      This field is ignored."
+    */
+   *min_array_elem = info->slice_base;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 326:
+    *
+    *     "For 3D Surfaces:
+    *      This field (Render Target View Extent) indicates the extent of the
+    *      accessible `R' coordinates minus 1 on the LOD currently being
+    *      rendered to.
+    *
+    *      For 1D and 2D Surfaces:
+    *      This field must be set to the same value as the Depth field.
+    *
+    *      For Other Surfaces:
+    *      This field is ignored."
+    */
+   *rt_view_extent = info->slice_count - 1;
+
+   return true;
+}
+
+static bool
+zs_set_gen6_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
+                                 const struct ilo_dev *dev,
+                                 const struct ilo_state_zs_info *info)
+{
+   uint16_t width, height, depth, array_base, view_extent;
+   enum gen_surface_type type;
+   enum gen_depth_format format;
+   uint32_t dw1, dw2, dw3, dw4;
+
+   ILO_DEV_ASSERT(dev, 6, 6);   /* Gen6-only packing into zs->depth[] */
+
+   if (!zs_validate_gen6(dev, info) ||
+       !zs_get_gen6_depth_extent(dev, info, &width, &height) ||
+       !zs_get_gen6_depth_slices(dev, info, &depth, &array_base,
+                                 &view_extent))
+      return false;
+
+   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
+          (info->z_img) ? get_gen6_surface_type(dev, info->z_img) :
+                          get_gen6_surface_type(dev, info->s_img);
+
+   format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
+      GEN6_ZFORMAT_D32_FLOAT;   /* used when there is no depth image */
+
+   /*
+    * From the Ironlake PRM, volume 2 part 1, page 330:
+    *
+    *     "If this field (Separate Stencil Buffer Enable) is disabled, the
+    *      Surface Format of the depth buffer cannot be D24_UNORM_X8_UINT."
+    *
+    * From the Sandy Bridge PRM, volume 2 part 1, page 321:
+    *
+    *     "[DevSNB]: This field (Separate Stencil Buffer Enable) must be set
+    *      to the same value (enabled or disabled) as Hierarchical Depth
+    *      Buffer Enable."
+    */
+   if (!info->hiz_enable && format == GEN6_ZFORMAT_D24_UNORM_X8_UINT)
+      format = GEN6_ZFORMAT_D24_UNORM_S8_UINT;
+
+   /* info->z_readonly and info->s_readonly are ignored on Gen6 */
+   dw1 = type << GEN6_DEPTH_DW1_TYPE__SHIFT |
+         GEN6_TILING_Y << GEN6_DEPTH_DW1_TILING__SHIFT |
+         format << GEN6_DEPTH_DW1_FORMAT__SHIFT;
+
+   if (info->z_img)
+      dw1 |= (info->z_img->bo_stride - 1) << GEN6_DEPTH_DW1_PITCH__SHIFT;
+
+   if (info->hiz_enable || !info->z_img) {   /* the two bits go together */
+      dw1 |= GEN6_DEPTH_DW1_HIZ_ENABLE |
+             GEN6_DEPTH_DW1_SEPARATE_STENCIL;
+   }
+
+   dw2 = 0;
+   dw3 = height << GEN6_DEPTH_DW3_HEIGHT__SHIFT |
+         width << GEN6_DEPTH_DW3_WIDTH__SHIFT |
+         info->level << GEN6_DEPTH_DW3_LOD__SHIFT |
+         GEN6_DEPTH_DW3_MIPLAYOUT_BELOW;
+   dw4 = depth << GEN6_DEPTH_DW4_DEPTH__SHIFT |
+         array_base << GEN6_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT |
+         view_extent << GEN6_DEPTH_DW4_RT_VIEW_EXTENT__SHIFT;
+
+   STATIC_ASSERT(ARRAY_SIZE(zs->depth) >= 5);
+   zs->depth[0] = dw1;
+   zs->depth[1] = dw2;
+   zs->depth[2] = dw3;
+   zs->depth[3] = dw4;
+   zs->depth[4] = 0;   /* nothing is packed into the last slot on Gen6 */
+
+   zs->depth_format = format;
+
+   return true;
+}
+
+static bool
+zs_set_gen7_3DSTATE_DEPTH_BUFFER(struct ilo_state_zs *zs,
+                                 const struct ilo_dev *dev,
+                                 const struct ilo_state_zs_info *info)
+{
+   enum gen_surface_type type;
+   enum gen_depth_format format;
+   uint16_t width, height, depth;
+   uint16_t array_base, view_extent;
+   uint32_t dw1, dw2, dw3, dw4, dw6;
+
+   ILO_DEV_ASSERT(dev, 7, 8);   /* Gen7+ packing into zs->depth[] */
+
+   if (!zs_validate_gen6(dev, info) ||
+       !zs_get_gen6_depth_extent(dev, info, &width, &height) ||
+       !zs_get_gen6_depth_slices(dev, info, &depth, &array_base,
+                                 &view_extent))
+      return false;
+
+   type = (info->is_cube_map) ? GEN6_SURFTYPE_CUBE :
+          (info->z_img) ? get_gen6_surface_type(dev, info->z_img) :
+                          get_gen6_surface_type(dev, info->s_img);
+
+   format = (info->z_img) ? get_gen6_depth_format(dev, info->z_img) :
+      GEN6_ZFORMAT_D32_FLOAT;   /* used when there is no depth image */
+
+   dw1 = type << GEN7_DEPTH_DW1_TYPE__SHIFT |
+         format << GEN7_DEPTH_DW1_FORMAT__SHIFT;
+
+   if (info->z_img) {
+      if (!info->z_readonly)
+         dw1 |= GEN7_DEPTH_DW1_DEPTH_WRITE_ENABLE;
+      if (info->hiz_enable)
+         dw1 |= GEN7_DEPTH_DW1_HIZ_ENABLE;
+
+      dw1 |= (info->z_img->bo_stride - 1) << GEN7_DEPTH_DW1_PITCH__SHIFT;
+   }
+
+   if (info->s_img && !info->s_readonly)
+      dw1 |= GEN7_DEPTH_DW1_STENCIL_WRITE_ENABLE;
+
+   dw2 = 0;
+   dw3 = height << GEN7_DEPTH_DW3_HEIGHT__SHIFT |
+         width << GEN7_DEPTH_DW3_WIDTH__SHIFT |
+         info->level << GEN7_DEPTH_DW3_LOD__SHIFT;
+   dw4 = depth << GEN7_DEPTH_DW4_DEPTH__SHIFT |
+         array_base << GEN7_DEPTH_DW4_MIN_ARRAY_ELEMENT__SHIFT;
+   dw6 = view_extent << GEN7_DEPTH_DW6_RT_VIEW_EXTENT__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8) && info->z_img) {
+      assert(info->z_img->walk_layer_height % 4 == 0);
+      /* note that DW is off-by-one for Gen8+ */
+      dw6 |= (info->z_img->walk_layer_height / 4) <<
+         GEN8_DEPTH_DW7_QPITCH__SHIFT;
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(zs->depth) >= 5);
+   zs->depth[0] = dw1;
+   zs->depth[1] = dw2;
+   zs->depth[2] = dw3;
+   zs->depth[3] = dw4;
+   zs->depth[4] = dw6;   /* no dw5 is stored; the last slot holds dw6 */
+
+   zs->depth_format = format;
+
+   return true;
+}
+
+/* Zero the stencil dwords; the third one is only written on Gen8+. */
+static bool
+zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(struct ilo_state_zs *zs,
+                                        const struct ilo_dev *dev)
+{
+   const int dw_count = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 3 : 2;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   STATIC_ASSERT(ARRAY_SIZE(zs->stencil) >= 3);
+
+   memset(zs->stencil, 0, sizeof(zs->stencil[0]) * dw_count);
+
+   return true;
+}
+
+static bool
+zs_set_gen6_3DSTATE_STENCIL_BUFFER(struct ilo_state_zs *zs,
+                                   const struct ilo_dev *dev,
+                                   const struct ilo_state_zs_info *info)
+{
+   const struct ilo_image *img = info->s_img;   /* caller ensures non-NULL */
+   uint32_t dw1, dw2;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   assert(img->bo_stride);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 329:
+    *
+    *     "The pitch must be set to 2x the value computed based on width, as
+    *      the stencil buffer is stored with two rows interleaved."
+    *
+    * For Gen7+, we still double the stride because we did not double the
+    * slice widths when initializing ilo_image.
+    */
+   dw1 = (img->bo_stride * 2 - 1) << GEN6_STENCIL_DW1_PITCH__SHIFT;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(7.5))
+      dw1 |= GEN75_STENCIL_DW1_STENCIL_BUFFER_ENABLE;
+
+   dw2 = 0;
+   /* offset to the level as Gen6 does not support mipmapped stencil */
+   if (ilo_dev_gen(dev) == ILO_GEN(6)) {
+      unsigned x, y;
+
+      ilo_image_get_slice_pos(img, info->level, 0, &x, &y);
+      ilo_image_pos_to_mem(img, x, y, &x, &y);
+      dw2 |= ilo_image_mem_to_raw(img, x, y);
+   }
+
+   STATIC_ASSERT(ARRAY_SIZE(zs->stencil) >= 3);
+   zs->stencil[0] = dw1;
+   zs->stencil[1] = dw2;
+
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {   /* third dword is Gen8+ only */
+      uint32_t dw4;
+
+      assert(img->walk_layer_height % 4 == 0);
+      dw4 = (img->walk_layer_height / 4) << GEN8_STENCIL_DW4_QPITCH__SHIFT;
+
+      zs->stencil[2] = dw4;
+   }
+
+   return true;
+}
+
+/* Zero the HiZ dwords; the third one is only written on Gen8+. */
+static bool
+zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_state_zs *zs,
+                                           const struct ilo_dev *dev)
+{
+   const int dw_count = (ilo_dev_gen(dev) >= ILO_GEN(8)) ? 3 : 2;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   STATIC_ASSERT(ARRAY_SIZE(zs->hiz) >= 3);
+
+   memset(zs->hiz, 0, sizeof(zs->hiz[0]) * dw_count);
+
+   return true;
+}
+
+/*
+ * Pack the HiZ buffer state from info->z_img: the pitch, the offset to
+ * info->level on Gen6, and the QPitch on Gen8+.
+ */
+static bool
+zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(struct ilo_state_zs *zs,
+                                      const struct ilo_dev *dev,
+                                      const struct ilo_state_zs_info *info)
+{
+   const struct ilo_image *z = info->z_img;
+
+   ILO_DEV_ASSERT(dev, 6, 8);
+   STATIC_ASSERT(ARRAY_SIZE(zs->hiz) >= 3);
+
+   assert(z->aux.bo_stride);
+
+   /* the pitch is stored as the stride minus one */
+   zs->hiz[0] = (z->aux.bo_stride - 1) << GEN6_HIZ_DW1_PITCH__SHIFT;
+
+   /* offset to the level as Gen6 does not support mipmapped HiZ */
+   if (ilo_dev_gen(dev) == ILO_GEN(6))
+      zs->hiz[1] = z->aux.walk_lod_offsets[info->level];
+   else
+      zs->hiz[1] = 0;
+
+   /* the layer height is stored divided by four */
+   if (ilo_dev_gen(dev) >= ILO_GEN(8)) {
+      assert(z->aux.walk_layer_height % 4 == 0);
+      zs->hiz[2] =
+         (z->aux.walk_layer_height / 4) << GEN8_HIZ_DW4_QPITCH__SHIFT;
+   }
+
+   return true;
+}
+
+/* Build the depth, stencil and HiZ state described by info into zs. */
+bool
+ilo_state_zs_init(struct ilo_state_zs *zs, const struct ilo_dev *dev,
+                  const struct ilo_state_zs_info *info)
+{
+   const bool has_hiz = (info->z_img && info->hiz_enable);
+   bool depth_ok, stencil_ok, hiz_ok, ok;
+
+   assert(ilo_is_zeroed(zs, sizeof(*zs)));
+
+   /* all three states are always written, as null variants when unused */
+   if (!info->z_img && !info->s_img)
+      depth_ok = zs_set_gen6_null_3DSTATE_DEPTH_BUFFER(zs, dev);
+   else if (ilo_dev_gen(dev) >= ILO_GEN(7))
+      depth_ok = zs_set_gen7_3DSTATE_DEPTH_BUFFER(zs, dev, info);
+   else
+      depth_ok = zs_set_gen6_3DSTATE_DEPTH_BUFFER(zs, dev, info);
+
+   stencil_ok = (info->s_img) ?
+      zs_set_gen6_3DSTATE_STENCIL_BUFFER(zs, dev, info) :
+      zs_set_gen6_null_3DSTATE_STENCIL_BUFFER(zs, dev);
+
+   hiz_ok = (has_hiz) ?
+      zs_set_gen6_3DSTATE_HIER_DEPTH_BUFFER(zs, dev, info) :
+      zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
+
+   zs->z_readonly = info->z_readonly;
+   zs->s_readonly = info->s_readonly;
+
+   ok = depth_ok && stencil_ok && hiz_ok;
+   assert(ok);
+
+   return ok;
+}
+
+/* Initialize zs with no depth image and no stencil image. */
+bool
+ilo_state_zs_init_for_null(struct ilo_state_zs *zs,
+                           const struct ilo_dev *dev)
+{
+   struct ilo_state_zs_info no_images;
+
+   memset(&no_images, 0, sizeof(no_images));
+   return ilo_state_zs_init(zs, dev, &no_images);
+}
+
+/* Turn HiZ off in an already-initialized state (Gen7+ only). */
+bool
+ilo_state_zs_disable_hiz(struct ilo_state_zs *zs,
+                         const struct ilo_dev *dev)
+{
+   ILO_DEV_ASSERT(dev, 6, 8);
+
+   /*
+    * Gen6 would need separate stencil disabled at the same time.  That could
+    * work when there is no stencil buffer, but it is probably not worth it.
+    */
+   assert(ilo_dev_gen(dev) >= ILO_GEN(7));
+
+   zs->depth[0] = zs->depth[0] & ~GEN7_DEPTH_DW1_HIZ_ENABLE;
+   zs_set_gen6_null_3DSTATE_HIER_DEPTH_BUFFER(zs, dev);
+
+   return true;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_state_zs.h b/src/gallium/drivers/ilo/core/ilo_state_zs.h
new file mode 100644
index 0000000..98212da
--- /dev/null
+++ b/src/gallium/drivers/ilo/core/ilo_state_zs.h
@@ -0,0 +1,93 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2015 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#ifndef ILO_STATE_ZS_H
+#define ILO_STATE_ZS_H
+
+#include "genhw/genhw.h"
+#include "intel_winsys.h"
+
+#include "ilo_core.h"
+#include "ilo_dev.h"
+
+struct ilo_image;
+
+struct ilo_state_zs_info {
+   /* depth and stencil images; both are optional (may be NULL) */
+   const struct ilo_image *z_img;
+   const struct ilo_image *s_img;
+
+   /* copied into ilo_state_zs by ilo_state_zs_init(); ignored prior to Gen7 */
+   bool z_readonly;
+   bool s_readonly;
+
+   bool hiz_enable;        /* only honored when z_img is also set */
+   bool is_cube_map;
+
+   uint8_t level;          /* presumably the mip level -- TODO confirm */
+   uint16_t slice_base;    /* presumably the first array slice -- TODO confirm */
+   uint16_t slice_count;   /* presumably the number of slices -- TODO confirm */
+};
+
+struct ilo_state_zs {
+   uint32_t depth[5];      /* depth[0] carries the GEN7_DEPTH_DW1_* bits */
+   uint32_t stencil[3];    /* presumably 3DSTATE_STENCIL_BUFFER dwords */
+   uint32_t hiz[3];        /* presumably 3DSTATE_HIER_DEPTH_BUFFER dwords */
+
+   /* TODO move this to ilo_image */
+   enum gen_depth_format depth_format;
+
+   /* copies of the same-named ilo_state_zs_info fields */
+   bool z_readonly;
+   bool s_readonly;
+
+   /* managed by users */
+   struct intel_bo *depth_bo;
+   struct intel_bo *stencil_bo;
+   struct intel_bo *hiz_bo;
+};
+
+bool
+ilo_state_zs_init(struct ilo_state_zs *zs,
+                  const struct ilo_dev *dev,
+                  const struct ilo_state_zs_info *info);
+
+bool
+ilo_state_zs_init_for_null(struct ilo_state_zs *zs,
+                           const struct ilo_dev *dev);
+
+bool
+ilo_state_zs_disable_hiz(struct ilo_state_zs *zs,
+                         const struct ilo_dev *dev);
+
+static inline enum gen_depth_format
+ilo_state_zs_get_depth_format(const struct ilo_state_zs *zs,
+                              const struct ilo_dev *dev)
+{
+   return zs->depth_format;   /* stored format; dev is not consulted */
+}
+
+#endif /* ILO_STATE_ZS_H */
diff --git a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h
index 24d726a..5a0bb4f 100644
--- a/src/gallium/drivers/ilo/genhw/gen_mi.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_mi.xml.h
@@ -97,6 +97,9 @@ enum gen_mi_alu_operand {
 #define GEN6_MI_LENGTH__MASK					0x0000003f
 #define GEN6_MI_LENGTH__SHIFT					0
 #define GEN6_MI_NOOP__SIZE					1
+#define GEN6_MI_NOOP_DW0_WRITE_NOPID				(0x1 << 22)
+#define GEN6_MI_NOOP_DW0_VALUE__MASK				0x003fffff
+#define GEN6_MI_NOOP_DW0_VALUE__SHIFT				0
 
 #define GEN75_MI_SET_PREDICATE__SIZE				1
 #define GEN75_MI_SET_PREDICATE_DW0_PREDICATE__MASK		0x00000003
diff --git a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h
index 2bdd72b..c51e4f7 100644
--- a/src/gallium/drivers/ilo/genhw/gen_regs.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_regs.xml.h
@@ -35,6 +35,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define GEN6_REG_MASK__MASK					0xffff0000
 #define GEN6_REG_MASK__SHIFT					16
 #define GEN6_REG__SIZE						0x400000
+#define GEN6_REG_NOPID						0x2094
+
 #define GEN7_REG_HS_INVOCATION_COUNT				0x2300
 
 #define GEN7_REG_DS_INVOCATION_COUNT				0x2308
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h
index d25542e..52173fe 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_3d.xml.h
@@ -32,7 +32,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 */
 
 
-enum gen_prim_type {
+enum gen_3dprim_type {
     GEN6_3DPRIM_POINTLIST				      = 0x1,
     GEN6_3DPRIM_LINELIST				      = 0x2,
     GEN6_3DPRIM_LINESTRIP				      = 0x3,
@@ -105,6 +105,12 @@ enum gen_state_alignment {
     GEN8_ALIGNMENT_SURFACE_STATE			      = 0x40,
 };
 
+enum gen_index_format {
+    GEN6_INDEX_BYTE					      = 0x0,
+    GEN6_INDEX_WORD					      = 0x1,
+    GEN6_INDEX_DWORD					      = 0x2,
+};
+
 enum gen_vf_component {
     GEN6_VFCOMP_NOSTORE					      = 0x0,
     GEN6_VFCOMP_STORE_SRC				      = 0x1,
@@ -123,6 +129,87 @@ enum gen_depth_format {
     GEN6_ZFORMAT_D16_UNORM				      = 0x5,
 };
 
+enum gen_reorder_mode {
+    GEN7_REORDER_LEADING				      = 0x0,
+    GEN7_REORDER_TRAILING				      = 0x1,
+};
+
+enum gen_clip_mode {
+    GEN6_CLIPMODE_NORMAL				      = 0x0,
+    GEN6_CLIPMODE_REJECT_ALL				      = 0x3,
+    GEN6_CLIPMODE_ACCEPT_ALL				      = 0x4,
+};
+
+enum gen_front_winding {
+    GEN6_FRONTWINDING_CW				      = 0x0,
+    GEN6_FRONTWINDING_CCW				      = 0x1,
+};
+
+enum gen_fill_mode {
+    GEN6_FILLMODE_SOLID					      = 0x0,
+    GEN6_FILLMODE_WIREFRAME				      = 0x1,
+    GEN6_FILLMODE_POINT					      = 0x2,
+};
+
+enum gen_cull_mode {
+    GEN6_CULLMODE_BOTH					      = 0x0,
+    GEN6_CULLMODE_NONE					      = 0x1,
+    GEN6_CULLMODE_FRONT					      = 0x2,
+    GEN6_CULLMODE_BACK					      = 0x3,
+};
+
+enum gen_pixel_location {
+    GEN6_PIXLOC_CENTER					      = 0x0,
+    GEN6_PIXLOC_UL_CORNER				      = 0x1,
+};
+
+enum gen_sample_count {
+    GEN6_NUMSAMPLES_1					      = 0x0,
+    GEN8_NUMSAMPLES_2					      = 0x1,
+    GEN6_NUMSAMPLES_4					      = 0x2,
+    GEN7_NUMSAMPLES_8					      = 0x3,
+    GEN8_NUMSAMPLES_16					      = 0x4,
+};
+
+enum gen_inputattr_select {
+    GEN6_INPUTATTR_NORMAL				      = 0x0,
+    GEN6_INPUTATTR_FACING				      = 0x1,
+    GEN6_INPUTATTR_W					      = 0x2,
+    GEN6_INPUTATTR_FACING_W				      = 0x3,
+};
+
+enum gen_zw_interp {
+    GEN6_ZW_INTERP_PIXEL				      = 0x0,
+    GEN6_ZW_INTERP_CENTROID				      = 0x2,
+    GEN6_ZW_INTERP_SAMPLE				      = 0x3,
+};
+
+enum gen_position_offset {
+    GEN6_POSOFFSET_NONE					      = 0x0,
+    GEN6_POSOFFSET_CENTROID				      = 0x2,
+    GEN6_POSOFFSET_SAMPLE				      = 0x3,
+};
+
+enum gen_edsc_mode {
+    GEN7_EDSC_NORMAL					      = 0x0,
+    GEN7_EDSC_PSEXEC					      = 0x1,
+    GEN7_EDSC_PREPS					      = 0x2,
+};
+
+enum gen_pscdepth_mode {
+    GEN7_PSCDEPTH_OFF					      = 0x0,
+    GEN7_PSCDEPTH_ON					      = 0x1,
+    GEN7_PSCDEPTH_ON_GE					      = 0x2,
+    GEN7_PSCDEPTH_ON_LE					      = 0x3,
+};
+
+enum gen_msrast_mode {
+    GEN6_MSRASTMODE_OFF_PIXEL				      = 0x0,
+    GEN6_MSRASTMODE_OFF_PATTERN				      = 0x1,
+    GEN6_MSRASTMODE_ON_PIXEL				      = 0x2,
+    GEN6_MSRASTMODE_ON_PATTERN				      = 0x3,
+};
+
 #define GEN6_INTERP_NONPERSPECTIVE_SAMPLE			(0x1 << 5)
 #define GEN6_INTERP_NONPERSPECTIVE_CENTROID			(0x1 << 4)
 #define GEN6_INTERP_NONPERSPECTIVE_PIXEL			(0x1 << 3)
@@ -285,9 +372,6 @@ enum gen_depth_format {
 #define GEN6_IB_DW0_CUT_INDEX_ENABLE				(0x1 << 10)
 #define GEN6_IB_DW0_FORMAT__MASK				0x00000300
 #define GEN6_IB_DW0_FORMAT__SHIFT				8
-#define GEN6_IB_DW0_FORMAT_BYTE					(0x0 << 8)
-#define GEN6_IB_DW0_FORMAT_WORD					(0x1 << 8)
-#define GEN6_IB_DW0_FORMAT_DWORD				(0x2 << 8)
 
 
 
@@ -295,9 +379,6 @@ enum gen_depth_format {
 
 #define GEN8_IB_DW1_FORMAT__MASK				0x00000300
 #define GEN8_IB_DW1_FORMAT__SHIFT				8
-#define GEN8_IB_DW1_FORMAT_BYTE					(0x0 << 8)
-#define GEN8_IB_DW1_FORMAT_WORD					(0x1 << 8)
-#define GEN8_IB_DW1_FORMAT_DWORD				(0x2 << 8)
 #define GEN8_IB_DW1_MOCS__MASK					0x0000007f
 #define GEN8_IB_DW1_MOCS__SHIFT					0
 
@@ -313,8 +394,8 @@ enum gen_depth_format {
 
 
 #define GEN8_INSTANCING_DW1_ENABLE				(0x1 << 8)
-#define GEN8_INSTANCING_DW1_VB_INDEX__MASK			0x0000003f
-#define GEN8_INSTANCING_DW1_VB_INDEX__SHIFT			0
+#define GEN8_INSTANCING_DW1_VE_INDEX__MASK			0x0000003f
+#define GEN8_INSTANCING_DW1_VE_INDEX__SHIFT			0
 
 
 #define GEN8_3DSTATE_VF_SGVS__SIZE				2
@@ -614,7 +695,7 @@ enum gen_depth_format {
 #define GEN6_GS_DW5_SO_STATISTICS				(0x1 << 9)
 #define GEN6_GS_DW5_RENDER_ENABLE				(0x1 << 8)
 
-#define GEN6_GS_DW6_REORDER_ENABLE				(0x1 << 30)
+#define GEN6_GS_DW6_REORDER_LEADING_ENABLE			(0x1 << 30)
 #define GEN6_GS_DW6_DISCARD_ADJACENCY				(0x1 << 29)
 #define GEN6_GS_DW6_SVBI_PAYLOAD_ENABLE				(0x1 << 28)
 #define GEN6_GS_DW6_SVBI_POST_INC_ENABLE			(0x1 << 27)
@@ -666,11 +747,9 @@ enum gen_depth_format {
 #define GEN7_GS_DW5_INVOCATION_INCR__SHIFT			5
 #define GEN7_GS_DW5_INCLUDE_PRIMITIVE_ID			(0x1 << 4)
 #define GEN7_GS_DW5_HINT					(0x1 << 3)
-#define GEN7_GS_DW5_REORDER_ENABLE				(0x1 << 2)
-#define GEN75_GS_DW5_REORDER__MASK				0x00000004
-#define GEN75_GS_DW5_REORDER__SHIFT				2
-#define GEN75_GS_DW5_REORDER_LEADING				(0x0 << 2)
-#define GEN75_GS_DW5_REORDER_TRAILING				(0x1 << 2)
+#define GEN7_GS_DW5_REORDER_LEADING_ENABLE			(0x1 << 2)
+#define GEN75_GS_DW5_REORDER_MODE__MASK				0x00000004
+#define GEN75_GS_DW5_REORDER_MODE__SHIFT			2
 #define GEN7_GS_DW5_DISCARD_ADJACENCY				(0x1 << 1)
 #define GEN7_GS_DW5_GS_ENABLE					(0x1 << 0)
 
@@ -727,10 +806,8 @@ enum gen_depth_format {
 #define GEN8_GS_DW7_INVOCATION_INCR__SHIFT			5
 #define GEN8_GS_DW7_INCLUDE_PRIMITIVE_ID			(0x1 << 4)
 #define GEN8_GS_DW7_HINT					(0x1 << 3)
-#define GEN8_GS_DW7_REORDER__MASK				0x00000004
-#define GEN8_GS_DW7_REORDER__SHIFT				2
-#define GEN8_GS_DW7_REORDER_LEADING				(0x0 << 2)
-#define GEN8_GS_DW7_REORDER_TRAILING				(0x1 << 2)
+#define GEN8_GS_DW7_REORDER_MODE__MASK				0x00000004
+#define GEN8_GS_DW7_REORDER_MODE__SHIFT				2
 #define GEN8_GS_DW7_DISCARD_ADJACENCY				(0x1 << 1)
 #define GEN8_GS_DW7_GS_ENABLE					(0x1 << 0)
 
@@ -758,10 +835,8 @@ enum gen_depth_format {
 #define GEN7_SO_DW1_RENDER_DISABLE				(0x1 << 30)
 #define GEN7_SO_DW1_RENDER_STREAM_SELECT__MASK			0x18000000
 #define GEN7_SO_DW1_RENDER_STREAM_SELECT__SHIFT			27
-#define GEN7_SO_DW1_REORDER__MASK				0x04000000
-#define GEN7_SO_DW1_REORDER__SHIFT				26
-#define GEN7_SO_DW1_REORDER_LEADING				(0x0 << 26)
-#define GEN7_SO_DW1_REORDER_TRAILING				(0x1 << 26)
+#define GEN7_SO_DW1_REORDER_MODE__MASK				0x04000000
+#define GEN7_SO_DW1_REORDER_MODE__SHIFT				26
 #define GEN7_SO_DW1_STATISTICS					(0x1 << 25)
 #define GEN7_SO_DW1_BUFFER_ENABLES__MASK			0x00000f00
 #define GEN7_SO_DW1_BUFFER_ENABLES__SHIFT			8
@@ -862,21 +937,15 @@ enum gen_depth_format {
 #define GEN6_3DSTATE_CLIP__SIZE					4
 
 
-#define GEN7_CLIP_DW1_FRONTWINDING__MASK			0x00100000
-#define GEN7_CLIP_DW1_FRONTWINDING__SHIFT			20
-#define GEN7_CLIP_DW1_FRONTWINDING_CW				(0x0 << 20)
-#define GEN7_CLIP_DW1_FRONTWINDING_CCW				(0x1 << 20)
+#define GEN7_CLIP_DW1_FRONT_WINDING__MASK			0x00100000
+#define GEN7_CLIP_DW1_FRONT_WINDING__SHIFT			20
 #define GEN7_CLIP_DW1_SUBPIXEL__MASK				0x00080000
 #define GEN7_CLIP_DW1_SUBPIXEL__SHIFT				19
 #define GEN7_CLIP_DW1_SUBPIXEL_8BITS				(0x0 << 19)
 #define GEN7_CLIP_DW1_SUBPIXEL_4BITS				(0x1 << 19)
 #define GEN7_CLIP_DW1_EARLY_CULL_ENABLE				(0x1 << 18)
-#define GEN7_CLIP_DW1_CULLMODE__MASK				0x00030000
-#define GEN7_CLIP_DW1_CULLMODE__SHIFT				16
-#define GEN7_CLIP_DW1_CULLMODE_BOTH				(0x0 << 16)
-#define GEN7_CLIP_DW1_CULLMODE_NONE				(0x1 << 16)
-#define GEN7_CLIP_DW1_CULLMODE_FRONT				(0x2 << 16)
-#define GEN7_CLIP_DW1_CULLMODE_BACK				(0x3 << 16)
+#define GEN7_CLIP_DW1_CULL_MODE__MASK				0x00030000
+#define GEN7_CLIP_DW1_CULL_MODE__SHIFT				16
 #define GEN6_CLIP_DW1_STATISTICS				(0x1 << 10)
 #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__MASK			0x000000ff
 #define GEN6_CLIP_DW1_UCP_CULL_ENABLES__SHIFT			0
@@ -891,11 +960,8 @@ enum gen_depth_format {
 #define GEN6_CLIP_DW2_GB_TEST_ENABLE				(0x1 << 26)
 #define GEN6_CLIP_DW2_UCP_CLIP_ENABLES__MASK			0x00ff0000
 #define GEN6_CLIP_DW2_UCP_CLIP_ENABLES__SHIFT			16
-#define GEN6_CLIP_DW2_CLIPMODE__MASK				0x0000e000
-#define GEN6_CLIP_DW2_CLIPMODE__SHIFT				13
-#define GEN6_CLIP_DW2_CLIPMODE_NORMAL				(0x0 << 13)
-#define GEN6_CLIP_DW2_CLIPMODE_REJECT_ALL			(0x3 << 13)
-#define GEN6_CLIP_DW2_CLIPMODE_ACCEPT_ALL			(0x4 << 13)
+#define GEN6_CLIP_DW2_CLIP_MODE__MASK				0x0000e000
+#define GEN6_CLIP_DW2_CLIP_MODE__SHIFT				13
 #define GEN6_CLIP_DW2_PERSPECTIVE_DIVIDE_DISABLE		(0x1 << 9)
 #define GEN6_CLIP_DW2_NONPERSPECTIVE_BARYCENTRIC_ENABLE		(0x1 << 8)
 #define GEN6_CLIP_DW2_TRI_PROVOKE__MASK				0x00000030
@@ -911,7 +977,7 @@ enum gen_depth_format {
 #define GEN6_CLIP_DW3_MAX_POINT_WIDTH__MASK			0x0001ffc0
 #define GEN6_CLIP_DW3_MAX_POINT_WIDTH__SHIFT			6
 #define GEN6_CLIP_DW3_MAX_POINT_WIDTH__RADIX			3
-#define GEN6_CLIP_DW3_RTAINDEX_FORCED_ZERO			(0x1 << 5)
+#define GEN6_CLIP_DW3_FORCE_RTAINDEX_ZERO			(0x1 << 5)
 #define GEN6_CLIP_DW3_MAX_VPINDEX__MASK				0x0000000f
 #define GEN6_CLIP_DW3_MAX_VPINDEX__SHIFT			0
 
@@ -927,29 +993,17 @@ enum gen_depth_format {
 #define GEN7_SF_DW1_DEPTH_OFFSET_SOLID				(0x1 << 9)
 #define GEN7_SF_DW1_DEPTH_OFFSET_WIREFRAME			(0x1 << 8)
 #define GEN7_SF_DW1_DEPTH_OFFSET_POINT				(0x1 << 7)
-#define GEN7_SF_DW1_FRONTFACE__MASK				0x00000060
-#define GEN7_SF_DW1_FRONTFACE__SHIFT				5
-#define GEN7_SF_DW1_FRONTFACE_SOLID				(0x0 << 5)
-#define GEN7_SF_DW1_FRONTFACE_WIREFRAME				(0x1 << 5)
-#define GEN7_SF_DW1_FRONTFACE_POINT				(0x2 << 5)
-#define GEN7_SF_DW1_BACKFACE__MASK				0x00000018
-#define GEN7_SF_DW1_BACKFACE__SHIFT				3
-#define GEN7_SF_DW1_BACKFACE_SOLID				(0x0 << 3)
-#define GEN7_SF_DW1_BACKFACE_WIREFRAME				(0x1 << 3)
-#define GEN7_SF_DW1_BACKFACE_POINT				(0x2 << 3)
-#define GEN7_SF_DW1_VIEWPORT_ENABLE				(0x1 << 1)
-#define GEN7_SF_DW1_FRONTWINDING__MASK				0x00000001
-#define GEN7_SF_DW1_FRONTWINDING__SHIFT				0
-#define GEN7_SF_DW1_FRONTWINDING_CW				0x0
-#define GEN7_SF_DW1_FRONTWINDING_CCW				0x1
+#define GEN7_SF_DW1_FILL_MODE_FRONT__MASK			0x00000060
+#define GEN7_SF_DW1_FILL_MODE_FRONT__SHIFT			5
+#define GEN7_SF_DW1_FILL_MODE_BACK__MASK			0x00000018
+#define GEN7_SF_DW1_FILL_MODE_BACK__SHIFT			3
+#define GEN7_SF_DW1_VIEWPORT_TRANSFORM				(0x1 << 1)
+#define GEN7_SF_DW1_FRONT_WINDING__MASK				0x00000001
+#define GEN7_SF_DW1_FRONT_WINDING__SHIFT			0
 
 #define GEN7_SF_DW2_AA_LINE_ENABLE				(0x1 << 31)
-#define GEN7_SF_DW2_CULLMODE__MASK				0x60000000
-#define GEN7_SF_DW2_CULLMODE__SHIFT				29
-#define GEN7_SF_DW2_CULLMODE_BOTH				(0x0 << 29)
-#define GEN7_SF_DW2_CULLMODE_NONE				(0x1 << 29)
-#define GEN7_SF_DW2_CULLMODE_FRONT				(0x2 << 29)
-#define GEN7_SF_DW2_CULLMODE_BACK				(0x3 << 29)
+#define GEN7_SF_DW2_CULL_MODE__MASK				0x60000000
+#define GEN7_SF_DW2_CULL_MODE__SHIFT				29
 #define GEN7_SF_DW2_LINE_WIDTH__MASK				0x0ffc0000
 #define GEN7_SF_DW2_LINE_WIDTH__SHIFT				18
 #define GEN7_SF_DW2_LINE_WIDTH__RADIX				7
@@ -963,10 +1017,6 @@ enum gen_depth_format {
 #define GEN7_SF_DW2_SCISSOR_ENABLE				(0x1 << 11)
 #define GEN7_SF_DW2_MSRASTMODE__MASK				0x00000300
 #define GEN7_SF_DW2_MSRASTMODE__SHIFT				8
-#define GEN7_SF_DW2_MSRASTMODE_OFF_PIXEL			(0x0 << 8)
-#define GEN7_SF_DW2_MSRASTMODE_OFF_PATTERN			(0x1 << 8)
-#define GEN7_SF_DW2_MSRASTMODE_ON_PIXEL				(0x2 << 8)
-#define GEN7_SF_DW2_MSRASTMODE_ON_PATTERN			(0x3 << 8)
 
 #define GEN7_SF_DW3_LINE_LAST_PIXEL_ENABLE			(0x1 << 31)
 #define GEN7_SF_DW3_TRI_PROVOKE__MASK				0x60000000
@@ -1021,14 +1071,10 @@ enum gen_depth_format {
 #define GEN8_SBE_SWIZ_CONST_0001_FLOAT				(0x1 << 9)
 #define GEN8_SBE_SWIZ_CONST_1111_FLOAT				(0x2 << 9)
 #define GEN8_SBE_SWIZ_CONST_PRIM_ID				(0x3 << 9)
-#define GEN8_SBE_SWIZ_INPUTATTR__MASK				0x000000c0
-#define GEN8_SBE_SWIZ_INPUTATTR__SHIFT				6
-#define GEN8_SBE_SWIZ_INPUTATTR_NORMAL				(0x0 << 6)
-#define GEN8_SBE_SWIZ_INPUTATTR_FACING				(0x1 << 6)
-#define GEN8_SBE_SWIZ_INPUTATTR_W				(0x2 << 6)
-#define GEN8_SBE_SWIZ_INPUTATTR_FACING_W			(0x3 << 6)
-#define GEN8_SBE_SWIZ_URB_ENTRY_OFFSET__MASK			0x0000001f
-#define GEN8_SBE_SWIZ_URB_ENTRY_OFFSET__SHIFT			0
+#define GEN8_SBE_SWIZ_SWIZZLE_SELECT__MASK			0x000000c0
+#define GEN8_SBE_SWIZ_SWIZZLE_SELECT__SHIFT			6
+#define GEN8_SBE_SWIZ_SRC_ATTR__MASK				0x0000001f
+#define GEN8_SBE_SWIZ_SRC_ATTR__SHIFT				0
 
 #define GEN6_3DSTATE_SF__SIZE					20
 
@@ -1080,31 +1126,19 @@ enum gen_depth_format {
 
 
 #define GEN9_RASTER_DW1_Z_TEST_FAR_ENABLE			(0x1 << 26)
-#define GEN8_RASTER_DW1_FRONTWINDING__MASK			0x00200000
-#define GEN8_RASTER_DW1_FRONTWINDING__SHIFT			21
-#define GEN8_RASTER_DW1_FRONTWINDING_CW				(0x0 << 21)
-#define GEN8_RASTER_DW1_FRONTWINDING_CCW			(0x1 << 21)
-#define GEN8_RASTER_DW1_CULLMODE__MASK				0x00030000
-#define GEN8_RASTER_DW1_CULLMODE__SHIFT				16
-#define GEN8_RASTER_DW1_CULLMODE_BOTH				(0x0 << 16)
-#define GEN8_RASTER_DW1_CULLMODE_NONE				(0x1 << 16)
-#define GEN8_RASTER_DW1_CULLMODE_FRONT				(0x2 << 16)
-#define GEN8_RASTER_DW1_CULLMODE_BACK				(0x3 << 16)
+#define GEN8_RASTER_DW1_FRONT_WINDING__MASK			0x00200000
+#define GEN8_RASTER_DW1_FRONT_WINDING__SHIFT			21
+#define GEN8_RASTER_DW1_CULL_MODE__MASK				0x00030000
+#define GEN8_RASTER_DW1_CULL_MODE__SHIFT			16
 #define GEN8_RASTER_DW1_SMOOTH_POINT_ENABLE			(0x1 << 13)
 #define GEN8_RASTER_DW1_API_MULTISAMPLE_ENABLE			(0x1 << 12)
 #define GEN8_RASTER_DW1_DEPTH_OFFSET_SOLID			(0x1 << 9)
 #define GEN8_RASTER_DW1_DEPTH_OFFSET_WIREFRAME			(0x1 << 8)
 #define GEN8_RASTER_DW1_DEPTH_OFFSET_POINT			(0x1 << 7)
-#define GEN8_RASTER_DW1_FRONTFACE__MASK				0x00000060
-#define GEN8_RASTER_DW1_FRONTFACE__SHIFT			5
-#define GEN8_RASTER_DW1_FRONTFACE_SOLID				(0x0 << 5)
-#define GEN8_RASTER_DW1_FRONTFACE_WIREFRAME			(0x1 << 5)
-#define GEN8_RASTER_DW1_FRONTFACE_POINT				(0x2 << 5)
-#define GEN8_RASTER_DW1_BACKFACE__MASK				0x00000018
-#define GEN8_RASTER_DW1_BACKFACE__SHIFT				3
-#define GEN8_RASTER_DW1_BACKFACE_SOLID				(0x0 << 3)
-#define GEN8_RASTER_DW1_BACKFACE_WIREFRAME			(0x1 << 3)
-#define GEN8_RASTER_DW1_BACKFACE_POINT				(0x2 << 3)
+#define GEN8_RASTER_DW1_FILL_MODE_FRONT__MASK			0x00000060
+#define GEN8_RASTER_DW1_FILL_MODE_FRONT__SHIFT			5
+#define GEN8_RASTER_DW1_FILL_MODE_BACK__MASK			0x00000018
+#define GEN8_RASTER_DW1_FILL_MODE_BACK__SHIFT			3
 #define GEN8_RASTER_DW1_AA_LINE_ENABLE				(0x1 << 2)
 #define GEN8_RASTER_DW1_SCISSOR_ENABLE				(0x1 << 1)
 #define GEN8_RASTER_DW1_Z_TEST_ENABLE				(0x1 << 0)
@@ -1164,14 +1198,8 @@ enum gen_depth_format {
 #define GEN6_WM_DW6_SF_ATTR_COUNT__SHIFT			20
 #define GEN6_WM_DW6_PS_POSOFFSET__MASK				0x000c0000
 #define GEN6_WM_DW6_PS_POSOFFSET__SHIFT				18
-#define GEN6_WM_DW6_PS_POSOFFSET_NONE				(0x0 << 18)
-#define GEN6_WM_DW6_PS_POSOFFSET_CENTROID			(0x2 << 18)
-#define GEN6_WM_DW6_PS_POSOFFSET_SAMPLE				(0x3 << 18)
 #define GEN6_WM_DW6_ZW_INTERP__MASK				0x00030000
 #define GEN6_WM_DW6_ZW_INTERP__SHIFT				16
-#define GEN6_WM_DW6_ZW_INTERP_PIXEL				(0x0 << 16)
-#define GEN6_WM_DW6_ZW_INTERP_CENTROID				(0x2 << 16)
-#define GEN6_WM_DW6_ZW_INTERP_SAMPLE				(0x3 << 16)
 #define GEN6_WM_DW6_BARYCENTRIC_INTERP__MASK			0x0000fc00
 #define GEN6_WM_DW6_BARYCENTRIC_INTERP__SHIFT			10
 #define GEN6_WM_DW6_POINT_RASTRULE__MASK			0x00000200
@@ -1180,10 +1208,6 @@ enum gen_depth_format {
 #define GEN6_WM_DW6_POINT_RASTRULE_UPPER_RIGHT			(0x1 << 9)
 #define GEN6_WM_DW6_MSRASTMODE__MASK				0x00000006
 #define GEN6_WM_DW6_MSRASTMODE__SHIFT				1
-#define GEN6_WM_DW6_MSRASTMODE_OFF_PIXEL			(0x0 << 1)
-#define GEN6_WM_DW6_MSRASTMODE_OFF_PATTERN			(0x1 << 1)
-#define GEN6_WM_DW6_MSRASTMODE_ON_PIXEL				(0x2 << 1)
-#define GEN6_WM_DW6_MSRASTMODE_ON_PATTERN			(0x3 << 1)
 #define GEN6_WM_DW6_MSDISPMODE__MASK				0x00000001
 #define GEN6_WM_DW6_MSDISPMODE__SHIFT				0
 #define GEN6_WM_DW6_MSDISPMODE_PERSAMPLE			0x0
@@ -1207,22 +1231,12 @@ enum gen_depth_format {
 #define GEN7_WM_DW1_PS_KILL_PIXEL				(0x1 << 25)
 #define GEN7_WM_DW1_PSCDEPTH__MASK				0x01800000
 #define GEN7_WM_DW1_PSCDEPTH__SHIFT				23
-#define GEN7_WM_DW1_PSCDEPTH_OFF				(0x0 << 23)
-#define GEN7_WM_DW1_PSCDEPTH_ON					(0x1 << 23)
-#define GEN7_WM_DW1_PSCDEPTH_ON_GE				(0x2 << 23)
-#define GEN7_WM_DW1_PSCDEPTH_ON_LE				(0x3 << 23)
 #define GEN7_WM_DW1_EDSC__MASK					0x00600000
 #define GEN7_WM_DW1_EDSC__SHIFT					21
-#define GEN7_WM_DW1_EDSC_NORMAL					(0x0 << 21)
-#define GEN7_WM_DW1_EDSC_PSEXEC					(0x1 << 21)
-#define GEN7_WM_DW1_EDSC_PREPS					(0x2 << 21)
 #define GEN7_WM_DW1_PS_USE_DEPTH				(0x1 << 20)
 #define GEN7_WM_DW1_PS_USE_W					(0x1 << 19)
 #define GEN7_WM_DW1_ZW_INTERP__MASK				0x00060000
 #define GEN7_WM_DW1_ZW_INTERP__SHIFT				17
-#define GEN7_WM_DW1_ZW_INTERP_PIXEL				(0x0 << 17)
-#define GEN7_WM_DW1_ZW_INTERP_CENTROID				(0x2 << 17)
-#define GEN7_WM_DW1_ZW_INTERP_SAMPLE				(0x3 << 17)
 #define GEN7_WM_DW1_BARYCENTRIC_INTERP__MASK			0x0001f800
 #define GEN7_WM_DW1_BARYCENTRIC_INTERP__SHIFT			11
 #define GEN7_WM_DW1_PS_USE_COVERAGE_MASK			(0x1 << 10)
@@ -1247,10 +1261,6 @@ enum gen_depth_format {
 #define GEN7_WM_DW1_POINT_RASTRULE_UPPER_RIGHT			(0x1 << 2)
 #define GEN7_WM_DW1_MSRASTMODE__MASK				0x00000003
 #define GEN7_WM_DW1_MSRASTMODE__SHIFT				0
-#define GEN7_WM_DW1_MSRASTMODE_OFF_PIXEL			0x0
-#define GEN7_WM_DW1_MSRASTMODE_OFF_PATTERN			0x1
-#define GEN7_WM_DW1_MSRASTMODE_ON_PIXEL				0x2
-#define GEN7_WM_DW1_MSRASTMODE_ON_PATTERN			0x3
 
 #define GEN7_WM_DW2_MSDISPMODE__MASK				0x80000000
 #define GEN7_WM_DW2_MSDISPMODE__SHIFT				31
@@ -1265,12 +1275,12 @@ enum gen_depth_format {
 #define GEN8_3DSTATE_WM_DEPTH_STENCIL__SIZE			4
 
 
-#define GEN8_ZS_DW1_STENCIL0_FAIL_OP__MASK			0xe0000000
-#define GEN8_ZS_DW1_STENCIL0_FAIL_OP__SHIFT			29
-#define GEN8_ZS_DW1_STENCIL0_ZFAIL_OP__MASK			0x1c000000
-#define GEN8_ZS_DW1_STENCIL0_ZFAIL_OP__SHIFT			26
-#define GEN8_ZS_DW1_STENCIL0_ZPASS_OP__MASK			0x03800000
-#define GEN8_ZS_DW1_STENCIL0_ZPASS_OP__SHIFT			23
+#define GEN8_ZS_DW1_STENCIL_FAIL_OP__MASK			0xe0000000
+#define GEN8_ZS_DW1_STENCIL_FAIL_OP__SHIFT			29
+#define GEN8_ZS_DW1_STENCIL_ZFAIL_OP__MASK			0x1c000000
+#define GEN8_ZS_DW1_STENCIL_ZFAIL_OP__SHIFT			26
+#define GEN8_ZS_DW1_STENCIL_ZPASS_OP__MASK			0x03800000
+#define GEN8_ZS_DW1_STENCIL_ZPASS_OP__SHIFT			23
 #define GEN8_ZS_DW1_STENCIL1_FUNC__MASK				0x00700000
 #define GEN8_ZS_DW1_STENCIL1_FUNC__SHIFT			20
 #define GEN8_ZS_DW1_STENCIL1_FAIL_OP__MASK			0x000e0000
@@ -1279,8 +1289,8 @@ enum gen_depth_format {
 #define GEN8_ZS_DW1_STENCIL1_ZFAIL_OP__SHIFT			14
 #define GEN8_ZS_DW1_STENCIL1_ZPASS_OP__MASK			0x00003800
 #define GEN8_ZS_DW1_STENCIL1_ZPASS_OP__SHIFT			11
-#define GEN8_ZS_DW1_STENCIL0_FUNC__MASK				0x00000700
-#define GEN8_ZS_DW1_STENCIL0_FUNC__SHIFT			8
+#define GEN8_ZS_DW1_STENCIL_FUNC__MASK				0x00000700
+#define GEN8_ZS_DW1_STENCIL_FUNC__SHIFT				8
 #define GEN8_ZS_DW1_DEPTH_FUNC__MASK				0x000000e0
 #define GEN8_ZS_DW1_DEPTH_FUNC__SHIFT				5
 #define GEN8_ZS_DW1_STENCIL1_ENABLE				(0x1 << 4)
@@ -1289,17 +1299,17 @@ enum gen_depth_format {
 #define GEN8_ZS_DW1_DEPTH_TEST_ENABLE				(0x1 << 1)
 #define GEN8_ZS_DW1_DEPTH_WRITE_ENABLE				(0x1 << 0)
 
-#define GEN8_ZS_DW2_STENCIL0_VALUEMASK__MASK			0xff000000
-#define GEN8_ZS_DW2_STENCIL0_VALUEMASK__SHIFT			24
-#define GEN8_ZS_DW2_STENCIL0_WRITEMASK__MASK			0x00ff0000
-#define GEN8_ZS_DW2_STENCIL0_WRITEMASK__SHIFT			16
-#define GEN8_ZS_DW2_STENCIL1_VALUEMASK__MASK			0x0000ff00
-#define GEN8_ZS_DW2_STENCIL1_VALUEMASK__SHIFT			8
-#define GEN8_ZS_DW2_STENCIL1_WRITEMASK__MASK			0x000000ff
-#define GEN8_ZS_DW2_STENCIL1_WRITEMASK__SHIFT			0
-
-#define GEN9_ZS_DW3_STENCIL0_REF__MASK				0x0000ff00
-#define GEN9_ZS_DW3_STENCIL0_REF__SHIFT				8
+#define GEN8_ZS_DW2_STENCIL_TEST_MASK__MASK			0xff000000
+#define GEN8_ZS_DW2_STENCIL_TEST_MASK__SHIFT			24
+#define GEN8_ZS_DW2_STENCIL_WRITE_MASK__MASK			0x00ff0000
+#define GEN8_ZS_DW2_STENCIL_WRITE_MASK__SHIFT			16
+#define GEN8_ZS_DW2_STENCIL1_TEST_MASK__MASK			0x0000ff00
+#define GEN8_ZS_DW2_STENCIL1_TEST_MASK__SHIFT			8
+#define GEN8_ZS_DW2_STENCIL1_WRITE_MASK__MASK			0x000000ff
+#define GEN8_ZS_DW2_STENCIL1_WRITE_MASK__SHIFT			0
+
+#define GEN9_ZS_DW3_STENCIL_REF__MASK				0x0000ff00
+#define GEN9_ZS_DW3_STENCIL_REF__SHIFT				8
 #define GEN9_ZS_DW3_STENCIL1_REF__MASK				0x000000ff
 #define GEN9_ZS_DW3_STENCIL1_REF__SHIFT				0
 
@@ -1314,13 +1324,8 @@ enum gen_depth_format {
 #define GEN8_WM_HZ_DW1_FULL_SURFACE_DEPTH_CLEAR			(0x1 << 25)
 #define GEN8_WM_HZ_DW1_STENCIL_CLEAR_VALUE__MASK		0x00ff0000
 #define GEN8_WM_HZ_DW1_STENCIL_CLEAR_VALUE__SHIFT		16
-#define GEN8_WM_HZ_DW1_NUMSAMPLES__MASK				0x0000e000
-#define GEN8_WM_HZ_DW1_NUMSAMPLES__SHIFT			13
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_1				(0x0 << 13)
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_2				(0x1 << 13)
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_4				(0x2 << 13)
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_8				(0x3 << 13)
-#define GEN8_WM_HZ_DW1_NUMSAMPLES_16				(0x4 << 13)
+#define GEN8_WM_HZ_DW1_NUM_SAMPLES__MASK			0x0000e000
+#define GEN8_WM_HZ_DW1_NUM_SAMPLES__SHIFT			13
 
 #define GEN8_WM_HZ_DW2_RECT_MIN_Y__MASK				0xffff0000
 #define GEN8_WM_HZ_DW2_RECT_MIN_Y__SHIFT			16
@@ -1359,9 +1364,6 @@ enum gen_depth_format {
 #define GEN75_PS_DW4_ACCESS_UAV					(0x1 << 5)
 #define GEN7_PS_DW4_POSOFFSET__MASK				0x00000018
 #define GEN7_PS_DW4_POSOFFSET__SHIFT				3
-#define GEN7_PS_DW4_POSOFFSET_NONE				(0x0 << 3)
-#define GEN7_PS_DW4_POSOFFSET_CENTROID				(0x2 << 3)
-#define GEN7_PS_DW4_POSOFFSET_SAMPLE				(0x3 << 3)
 #define GEN7_PS_DW4_DISPATCH_MODE__MASK				0x00000007
 #define GEN7_PS_DW4_DISPATCH_MODE__SHIFT			0
 
@@ -1397,9 +1399,6 @@ enum gen_depth_format {
 #define GEN8_PS_DW6_RT_RESOLVE					(0x1 << 6)
 #define GEN8_PS_DW6_POSOFFSET__MASK				0x00000018
 #define GEN8_PS_DW6_POSOFFSET__SHIFT				3
-#define GEN8_PS_DW6_POSOFFSET_NONE				(0x0 << 3)
-#define GEN8_PS_DW6_POSOFFSET_CENTROID				(0x2 << 3)
-#define GEN8_PS_DW6_POSOFFSET_SAMPLE				(0x3 << 3)
 #define GEN8_PS_DW6_DISPATCH_MODE__MASK				0x00000007
 #define GEN8_PS_DW6_DISPATCH_MODE__SHIFT			0
 
@@ -1423,16 +1422,12 @@ enum gen_depth_format {
 #define GEN8_3DSTATE_PS_EXTRA__SIZE				2
 
 
-#define GEN8_PSX_DW1_DISPATCH_ENABLE				(0x1 << 31)
+#define GEN8_PSX_DW1_VALID					(0x1 << 31)
 #define GEN8_PSX_DW1_UAV_ONLY					(0x1 << 30)
 #define GEN8_PSX_DW1_COMPUTE_OMASK				(0x1 << 29)
 #define GEN8_PSX_DW1_KILL_PIXEL					(0x1 << 28)
 #define GEN8_PSX_DW1_PSCDEPTH__MASK				0x0c000000
 #define GEN8_PSX_DW1_PSCDEPTH__SHIFT				26
-#define GEN8_PSX_DW1_PSCDEPTH_OFF				(0x0 << 26)
-#define GEN8_PSX_DW1_PSCDEPTH_ON				(0x1 << 26)
-#define GEN8_PSX_DW1_PSCDEPTH_ON_GE				(0x2 << 26)
-#define GEN8_PSX_DW1_PSCDEPTH_ON_LE				(0x3 << 26)
 #define GEN8_PSX_DW1_FORCE_COMPUTE_DEPTH			(0x1 << 25)
 #define GEN8_PSX_DW1_USE_DEPTH					(0x1 << 24)
 #define GEN8_PSX_DW1_USE_W					(0x1 << 23)
@@ -1696,17 +1691,10 @@ enum gen_depth_format {
 
 
 #define GEN75_MULTISAMPLE_DW1_DX9_MULTISAMPLE_ENABLE		(0x1 << 5)
-#define GEN6_MULTISAMPLE_DW1_PIXLOC__MASK			0x00000010
-#define GEN6_MULTISAMPLE_DW1_PIXLOC__SHIFT			4
-#define GEN6_MULTISAMPLE_DW1_PIXLOC_CENTER			(0x0 << 4)
-#define GEN6_MULTISAMPLE_DW1_PIXLOC_UL_CORNER			(0x1 << 4)
-#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES__MASK			0x0000000e
-#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES__SHIFT			1
-#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES_1			(0x0 << 1)
-#define GEN8_MULTISAMPLE_DW1_NUMSAMPLES_2			(0x1 << 1)
-#define GEN6_MULTISAMPLE_DW1_NUMSAMPLES_4			(0x2 << 1)
-#define GEN7_MULTISAMPLE_DW1_NUMSAMPLES_8			(0x3 << 1)
-#define GEN8_MULTISAMPLE_DW1_NUMSAMPLES_16			(0x4 << 1)
+#define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__MASK		0x00000010
+#define GEN6_MULTISAMPLE_DW1_PIXEL_LOCATION__SHIFT		4
+#define GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__MASK			0x0000000e
+#define GEN6_MULTISAMPLE_DW1_NUM_SAMPLES__SHIFT			1
 
 
 
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h
index 6d815be..b65b704a 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_dynamic.xml.h
@@ -84,7 +84,7 @@ enum gen_blend_function {
     GEN6_BLENDFUNCTION_MAX				      = 0x4,
 };
 
-enum gen_logicop_function {
+enum gen_logic_op {
     GEN6_LOGICOP_CLEAR					      = 0x0,
     GEN6_LOGICOP_NOR					      = 0x1,
     GEN6_LOGICOP_AND_INVERTED				      = 0x2,
@@ -103,20 +103,31 @@ enum gen_logicop_function {
     GEN6_LOGICOP_SET					      = 0xf,
 };
 
-enum gen_sampler_mip_filter {
+enum gen_mip_filter {
     GEN6_MIPFILTER_NONE					      = 0x0,
     GEN6_MIPFILTER_NEAREST				      = 0x1,
     GEN6_MIPFILTER_LINEAR				      = 0x3,
 };
 
-enum gen_sampler_map_filter {
+enum gen_map_filter {
     GEN6_MAPFILTER_NEAREST				      = 0x0,
     GEN6_MAPFILTER_LINEAR				      = 0x1,
     GEN6_MAPFILTER_ANISOTROPIC				      = 0x2,
     GEN6_MAPFILTER_MONO					      = 0x6,
 };
 
-enum gen_sampler_aniso_ratio {
+enum gen_prefilter_op {
+    GEN6_PREFILTEROP_ALWAYS				      = 0x0,
+    GEN6_PREFILTEROP_NEVER				      = 0x1,
+    GEN6_PREFILTEROP_LESS				      = 0x2,
+    GEN6_PREFILTEROP_EQUAL				      = 0x3,
+    GEN6_PREFILTEROP_LEQUAL				      = 0x4,
+    GEN6_PREFILTEROP_GREATER				      = 0x5,
+    GEN6_PREFILTEROP_NOTEQUAL				      = 0x6,
+    GEN6_PREFILTEROP_GEQUAL				      = 0x7,
+};
+
+enum gen_aniso_ratio {
     GEN6_ANISORATIO_2					      = 0x0,
     GEN6_ANISORATIO_4					      = 0x1,
     GEN6_ANISORATIO_6					      = 0x2,
@@ -127,7 +138,7 @@ enum gen_sampler_aniso_ratio {
     GEN6_ANISORATIO_16					      = 0x7,
 };
 
-enum gen_sampler_texcoord_mode {
+enum gen_texcoord_mode {
     GEN6_TEXCOORDMODE_WRAP				      = 0x0,
     GEN6_TEXCOORDMODE_MIRROR				      = 0x1,
     GEN6_TEXCOORDMODE_CLAMP				      = 0x2,
@@ -137,15 +148,15 @@ enum gen_sampler_texcoord_mode {
     GEN8_TEXCOORDMODE_HALF_BORDER			      = 0x6,
 };
 
-enum gen_sampler_key_filter {
+enum gen_key_filter {
     GEN6_KEYFILTER_KILL_ON_ANY_MATCH			      = 0x0,
     GEN6_KEYFILTER_REPLACE_BLACK			      = 0x1,
 };
 
 #define GEN6_COLOR_CALC_STATE__SIZE				6
 
-#define GEN6_CC_DW0_STENCIL0_REF__MASK				0xff000000
-#define GEN6_CC_DW0_STENCIL0_REF__SHIFT				24
+#define GEN6_CC_DW0_STENCIL_REF__MASK				0xff000000
+#define GEN6_CC_DW0_STENCIL_REF__SHIFT				24
 #define GEN6_CC_DW0_STENCIL1_REF__MASK				0x00ff0000
 #define GEN6_CC_DW0_STENCIL1_REF__SHIFT				16
 #define GEN6_CC_DW0_ROUND_DISABLE_DISABLE			(0x1 << 15)
@@ -162,14 +173,14 @@ enum gen_sampler_key_filter {
 #define GEN6_DEPTH_STENCIL_STATE__SIZE				3
 
 #define GEN6_ZS_DW0_STENCIL_TEST_ENABLE				(0x1 << 31)
-#define GEN6_ZS_DW0_STENCIL0_FUNC__MASK				0x70000000
-#define GEN6_ZS_DW0_STENCIL0_FUNC__SHIFT			28
-#define GEN6_ZS_DW0_STENCIL0_FAIL_OP__MASK			0x0e000000
-#define GEN6_ZS_DW0_STENCIL0_FAIL_OP__SHIFT			25
-#define GEN6_ZS_DW0_STENCIL0_ZFAIL_OP__MASK			0x01c00000
-#define GEN6_ZS_DW0_STENCIL0_ZFAIL_OP__SHIFT			22
-#define GEN6_ZS_DW0_STENCIL0_ZPASS_OP__MASK			0x00380000
-#define GEN6_ZS_DW0_STENCIL0_ZPASS_OP__SHIFT			19
+#define GEN6_ZS_DW0_STENCIL_FUNC__MASK				0x70000000
+#define GEN6_ZS_DW0_STENCIL_FUNC__SHIFT				28
+#define GEN6_ZS_DW0_STENCIL_FAIL_OP__MASK			0x0e000000
+#define GEN6_ZS_DW0_STENCIL_FAIL_OP__SHIFT			25
+#define GEN6_ZS_DW0_STENCIL_ZFAIL_OP__MASK			0x01c00000
+#define GEN6_ZS_DW0_STENCIL_ZFAIL_OP__SHIFT			22
+#define GEN6_ZS_DW0_STENCIL_ZPASS_OP__MASK			0x00380000
+#define GEN6_ZS_DW0_STENCIL_ZPASS_OP__SHIFT			19
 #define GEN6_ZS_DW0_STENCIL_WRITE_ENABLE			(0x1 << 18)
 #define GEN6_ZS_DW0_STENCIL1_ENABLE				(0x1 << 15)
 #define GEN6_ZS_DW0_STENCIL1_FUNC__MASK				0x00007000
@@ -181,14 +192,14 @@ enum gen_sampler_key_filter {
 #define GEN6_ZS_DW0_STENCIL1_ZPASS_OP__MASK			0x00000038
 #define GEN6_ZS_DW0_STENCIL1_ZPASS_OP__SHIFT			3
 
-#define GEN6_ZS_DW1_STENCIL0_VALUEMASK__MASK			0xff000000
-#define GEN6_ZS_DW1_STENCIL0_VALUEMASK__SHIFT			24
-#define GEN6_ZS_DW1_STENCIL0_WRITEMASK__MASK			0x00ff0000
-#define GEN6_ZS_DW1_STENCIL0_WRITEMASK__SHIFT			16
-#define GEN6_ZS_DW1_STENCIL1_VALUEMASK__MASK			0x0000ff00
-#define GEN6_ZS_DW1_STENCIL1_VALUEMASK__SHIFT			8
-#define GEN6_ZS_DW1_STENCIL1_WRITEMASK__MASK			0x000000ff
-#define GEN6_ZS_DW1_STENCIL1_WRITEMASK__SHIFT			0
+#define GEN6_ZS_DW1_STENCIL_TEST_MASK__MASK			0xff000000
+#define GEN6_ZS_DW1_STENCIL_TEST_MASK__SHIFT			24
+#define GEN6_ZS_DW1_STENCIL_WRITE_MASK__MASK			0x00ff0000
+#define GEN6_ZS_DW1_STENCIL_WRITE_MASK__SHIFT			16
+#define GEN6_ZS_DW1_STENCIL1_TEST_MASK__MASK			0x0000ff00
+#define GEN6_ZS_DW1_STENCIL1_TEST_MASK__SHIFT			8
+#define GEN6_ZS_DW1_STENCIL1_WRITE_MASK__MASK			0x000000ff
+#define GEN6_ZS_DW1_STENCIL1_WRITE_MASK__SHIFT			0
 
 #define GEN6_ZS_DW2_DEPTH_TEST_ENABLE				(0x1 << 31)
 #define GEN6_ZS_DW2_DEPTH_FUNC__MASK				0x38000000
@@ -216,10 +227,12 @@ enum gen_sampler_key_filter {
 #define GEN6_RT_DW1_ALPHA_TO_COVERAGE				(0x1 << 31)
 #define GEN6_RT_DW1_ALPHA_TO_ONE				(0x1 << 30)
 #define GEN6_RT_DW1_ALPHA_TO_COVERAGE_DITHER			(0x1 << 29)
-#define GEN6_RT_DW1_WRITE_DISABLE_A				(0x1 << 27)
-#define GEN6_RT_DW1_WRITE_DISABLE_R				(0x1 << 26)
-#define GEN6_RT_DW1_WRITE_DISABLE_G				(0x1 << 25)
-#define GEN6_RT_DW1_WRITE_DISABLE_B				(0x1 << 24)
+#define GEN6_RT_DW1_WRITE_DISABLES__MASK			0x0f000000
+#define GEN6_RT_DW1_WRITE_DISABLES__SHIFT			24
+#define GEN6_RT_DW1_WRITE_DISABLES_A				(0x1 << 27)
+#define GEN6_RT_DW1_WRITE_DISABLES_R				(0x1 << 26)
+#define GEN6_RT_DW1_WRITE_DISABLES_G				(0x1 << 25)
+#define GEN6_RT_DW1_WRITE_DISABLES_B				(0x1 << 24)
 #define GEN6_RT_DW1_LOGICOP_ENABLE				(0x1 << 22)
 #define GEN6_RT_DW1_LOGICOP_FUNC__MASK				0x003c0000
 #define GEN6_RT_DW1_LOGICOP_FUNC__SHIFT				18
@@ -267,10 +280,12 @@ enum gen_sampler_key_filter {
 #define GEN8_RT_DW0_DST_ALPHA_FACTOR__SHIFT			8
 #define GEN8_RT_DW0_ALPHA_FUNC__MASK				0x000000e0
 #define GEN8_RT_DW0_ALPHA_FUNC__SHIFT				5
-#define GEN8_RT_DW0_WRITE_DISABLE_A				(0x1 << 3)
-#define GEN8_RT_DW0_WRITE_DISABLE_R				(0x1 << 2)
-#define GEN8_RT_DW0_WRITE_DISABLE_G				(0x1 << 1)
-#define GEN8_RT_DW0_WRITE_DISABLE_B				(0x1 << 0)
+#define GEN8_RT_DW0_WRITE_DISABLES__MASK			0x0000000f
+#define GEN8_RT_DW0_WRITE_DISABLES__SHIFT			0
+#define GEN8_RT_DW0_WRITE_DISABLES_A				(0x1 << 3)
+#define GEN8_RT_DW0_WRITE_DISABLES_R				(0x1 << 2)
+#define GEN8_RT_DW0_WRITE_DISABLES_G				(0x1 << 1)
+#define GEN8_RT_DW0_WRITE_DISABLES_B				(0x1 << 0)
 
 #define GEN8_RT_DW1_LOGICOP_ENABLE				(0x1 << 31)
 #define GEN8_RT_DW1_LOGICOP_FUNC__MASK				0x78000000
@@ -419,6 +434,7 @@ enum gen_sampler_key_filter {
 #define GEN8_SAMPLER_DW0_LOD_PRECLAMP_ENABLE__SHIFT		27
 #define GEN6_SAMPLER_DW0_BASE_LOD__MASK				0x07c00000
 #define GEN6_SAMPLER_DW0_BASE_LOD__SHIFT			22
+#define GEN6_SAMPLER_DW0_BASE_LOD__RADIX			1
 #define GEN6_SAMPLER_DW0_MIP_FILTER__MASK			0x00300000
 #define GEN6_SAMPLER_DW0_MIP_FILTER__SHIFT			20
 #define GEN6_SAMPLER_DW0_MAG_FILTER__MASK			0x000e0000
diff --git a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h
index 7c2349f..b5d09f6 100644
--- a/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h
+++ b/src/gallium/drivers/ilo/genhw/gen_render_surface.xml.h
@@ -299,7 +299,10 @@ enum gen_surface_scs {
 #define GEN6_SURFACE_DW0_MIPLAYOUT__SHIFT			10
 #define GEN6_SURFACE_DW0_MIPLAYOUT_BELOW			(0x0 << 10)
 #define GEN6_SURFACE_DW0_MIPLAYOUT_RIGHT			(0x1 << 10)
-#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE			(0x1 << 9)
+#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE__MASK		0x00000200
+#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE__SHIFT		9
+#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_REPLICATE		(0x0 << 9)
+#define GEN6_SURFACE_DW0_CUBE_MAP_CORNER_MODE_AVERAGE		(0x1 << 9)
 #define GEN6_SURFACE_DW0_RENDER_CACHE_RW			(0x1 << 8)
 #define GEN6_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__MASK	0x000000c0
 #define GEN6_SURFACE_DW0_MEDIA_BOUNDARY_PIXEL_MODE__SHIFT	6
@@ -485,6 +488,8 @@ enum gen_surface_scs {
 #define GEN7_SURFACE_DW7_CC_B__SHIFT				29
 #define GEN7_SURFACE_DW7_CC_A__MASK				0x10000000
 #define GEN7_SURFACE_DW7_CC_A__SHIFT				28
+#define GEN75_SURFACE_DW7_SCS__MASK				0x0fff0000
+#define GEN75_SURFACE_DW7_SCS__SHIFT				16
 #define GEN75_SURFACE_DW7_SCS_R__MASK				0x0e000000
 #define GEN75_SURFACE_DW7_SCS_R__SHIFT				25
 #define GEN75_SURFACE_DW7_SCS_G__MASK				0x01c00000
diff --git a/src/gallium/drivers/ilo/genhw/genhw.h b/src/gallium/drivers/ilo/genhw/genhw.h
index 9e05bf5..3a777a1 100644
--- a/src/gallium/drivers/ilo/genhw/genhw.h
+++ b/src/gallium/drivers/ilo/genhw/genhw.h
@@ -1,6 +1,4 @@
 /*
- * Mesa 3-D graphics library
- *
  * Copyright (C) 2014 LunarG, Inc.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,8 +23,9 @@
 #ifndef GENHW_H
 #define GENHW_H
 
-#include "pipe/p_compiler.h"
-#include "util/u_debug.h"
+#include <stdbool.h>
+#include <stdint.h>
+#include <assert.h>
 
 #include "gen_regs.xml.h"
 #include "gen_mi.xml.h"
diff --git a/src/gallium/drivers/ilo/ilo_blitter.h b/src/gallium/drivers/ilo/ilo_blitter.h
index 4284f41..4eba848 100644
--- a/src/gallium/drivers/ilo/ilo_blitter.h
+++ b/src/gallium/drivers/ilo/ilo_blitter.h
@@ -39,12 +39,6 @@ enum ilo_blitter_uses {
    ILO_BLITTER_USE_FB_STENCIL    = 1 << 4,
 };
 
-enum ilo_blitter_rectlist_op {
-   ILO_BLITTER_RECTLIST_CLEAR_ZS,
-   ILO_BLITTER_RECTLIST_RESOLVE_Z,
-   ILO_BLITTER_RECTLIST_RESOLVE_HIZ,
-};
-
 struct blitter_context;
 struct pipe_resource;
 struct pipe_surface;
@@ -57,30 +51,42 @@ struct ilo_blitter {
    /*
     * A minimal context with the goal to send RECTLISTs down the pipeline.
     */
-   enum ilo_blitter_rectlist_op op;
+   enum ilo_state_raster_earlyz_op earlyz_op;
+   bool earlyz_stencil_clear;
    uint32_t uses;
 
    bool initialized;
 
    float vertices[3][2];
-   struct ilo_ve_state ve;
-   struct pipe_draw_info draw;
+   struct gen6_3dprimitive_info draw_info;
 
-   struct ilo_viewport_cso viewport;
-   struct ilo_dsa_state dsa;
+   uint32_t vf_data[4];
+   struct ilo_state_vf vf;
 
-   struct {
-      struct pipe_stencil_ref stencil_ref;
-      ubyte alpha_ref;
-      struct pipe_blend_color blend_color;
-   } cc;
+   struct ilo_state_vs vs;
+   struct ilo_state_hs hs;
+   struct ilo_state_ds ds;
+   struct ilo_state_gs gs;
+
+   struct ilo_state_sol sol;
+
+   struct ilo_state_viewport vp;
+   uint32_t vp_data[20];
+
+   struct ilo_state_sbe sbe;
+   struct ilo_state_ps ps;
+   struct ilo_state_cc cc;
 
    uint32_t depth_clear_value;
 
+   struct ilo_state_urb urb;
+
    struct {
       struct ilo_surface_cso dst;
       unsigned width, height;
       unsigned num_samples;
+
+      struct ilo_state_raster rs;
    } fb;
 };
 
diff --git a/src/gallium/drivers/ilo/ilo_blitter_pipe.c b/src/gallium/drivers/ilo/ilo_blitter_pipe.c
index c4c02bd..0bfe782 100644
--- a/src/gallium/drivers/ilo/ilo_blitter_pipe.c
+++ b/src/gallium/drivers/ilo/ilo_blitter_pipe.c
@@ -63,7 +63,7 @@ ilo_blitter_pipe_begin(struct ilo_blitter *blitter,
    util_blitter_save_viewport(b, &vec->viewport.viewport0);
 
    if (scissor_enable)
-      util_blitter_save_scissor(b, &vec->scissor.scissor0);
+      util_blitter_save_scissor(b, &vec->viewport.scissor0);
 
    switch (op) {
    case ILO_BLITTER_PIPE_BLIT:
diff --git a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
index 6d8afed..13c8f50 100644
--- a/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
+++ b/src/gallium/drivers/ilo/ilo_blitter_rectlist.c
@@ -25,7 +25,6 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
-#include "core/ilo_state_3d.h"
 #include "util/u_draw.h"
 #include "util/u_pack_color.h"
 
@@ -40,45 +39,48 @@
 static bool
 ilo_blitter_set_invariants(struct ilo_blitter *blitter)
 {
-   struct pipe_vertex_element velem;
-   struct pipe_viewport_state vp;
+   struct ilo_state_vf_element_info elem;
 
    if (blitter->initialized)
       return true;
 
+   /* a rectangle has 3 vertices in a RECTLIST */
+   blitter->draw_info.topology = GEN6_3DPRIM_RECTLIST;
+   blitter->draw_info.vertex_count = 3;
+   blitter->draw_info.instance_count = 1;
+
+   memset(&elem, 0, sizeof(elem));
    /* only vertex X and Y */
-   memset(&velem, 0, sizeof(velem));
-   velem.src_format = PIPE_FORMAT_R32G32_FLOAT;
-   ilo_gpe_init_ve(blitter->ilo->dev, 1, &velem, &blitter->ve);
-
-   /* generate VUE header */
-   ilo_gpe_init_ve_nosrc(blitter->ilo->dev,
-         GEN6_VFCOMP_STORE_0, /* Reserved */
-         GEN6_VFCOMP_STORE_0, /* Render Target Array Index */
-         GEN6_VFCOMP_STORE_0, /* Viewport Index */
-         GEN6_VFCOMP_STORE_0, /* Point Width */
-         &blitter->ve.nosrc_cso);
-   blitter->ve.prepend_nosrc_cso = true;
+   elem.format = GEN6_FORMAT_R32G32_FLOAT;
+   elem.format_size = 8;
+   elem.component_count = 2;
 
-   /* a rectangle has 3 vertices in a RECTLIST */
-   util_draw_init_info(&blitter->draw);
-   blitter->draw.mode = ILO_PRIM_RECTANGLES;
-   blitter->draw.count = 3;
+   ilo_state_vf_init_for_rectlist(&blitter->vf, blitter->ilo->dev,
+         blitter->vf_data, sizeof(blitter->vf_data), &elem, 1);
+
+   ilo_state_vs_init_disabled(&blitter->vs, blitter->ilo->dev);
+   ilo_state_hs_init_disabled(&blitter->hs, blitter->ilo->dev);
+   ilo_state_ds_init_disabled(&blitter->ds, blitter->ilo->dev);
+   ilo_state_gs_init_disabled(&blitter->gs, blitter->ilo->dev);
+   ilo_state_sol_init_disabled(&blitter->sol, blitter->ilo->dev, false);
 
    /**
     * From the Haswell PRM, volume 7, page 615:
     *
     *     "The clear value must be between the min and max depth values
-    *     (inclusive) defined in the CC_VIEWPORT."
+    *      (inclusive) defined in the CC_VIEWPORT."
     *
     * Even though clipping and viewport transformation will be disabled, we
     * still need to set up the viewport states.
     */
-   memset(&vp, 0, sizeof(vp));
-   vp.scale[0] = 1.0f;
-   vp.scale[1] = 1.0f;
-   vp.scale[2] = 1.0f;
-   ilo_gpe_set_viewport_cso(blitter->ilo->dev, &vp, &blitter->viewport);
+   ilo_state_viewport_init_for_rectlist(&blitter->vp, blitter->ilo->dev,
+         blitter->vp_data, sizeof(blitter->vp_data));
+
+   ilo_state_sbe_init_for_rectlist(&blitter->sbe, blitter->ilo->dev, 0, 0);
+   ilo_state_ps_init_disabled(&blitter->ps, blitter->ilo->dev);
+
+   ilo_state_urb_init_for_rectlist(&blitter->urb, blitter->ilo->dev,
+         ilo_state_vf_get_attr_count(&blitter->vf));
 
    blitter->initialized = true;
 
@@ -86,10 +88,12 @@ ilo_blitter_set_invariants(struct ilo_blitter *blitter)
 }
 
 static void
-ilo_blitter_set_op(struct ilo_blitter *blitter,
-                   enum ilo_blitter_rectlist_op op)
+ilo_blitter_set_earlyz_op(struct ilo_blitter *blitter,
+                          enum ilo_state_raster_earlyz_op op,
+                          bool earlyz_stencil_clear)
 {
-   blitter->op = op;
+   blitter->earlyz_op = op;
+   blitter->earlyz_stencil_clear = earlyz_stencil_clear;
 }
 
 /**
@@ -117,18 +121,27 @@ ilo_blitter_set_rectlist(struct ilo_blitter *blitter,
 }
 
 static void
-ilo_blitter_set_clear_values(struct ilo_blitter *blitter,
-                             uint32_t depth, ubyte stencil)
+ilo_blitter_set_depth_clear_value(struct ilo_blitter *blitter,
+                                  uint32_t depth)
 {
    blitter->depth_clear_value = depth;
-   blitter->cc.stencil_ref.ref_value[0] = stencil;
 }
 
 static void
-ilo_blitter_set_dsa(struct ilo_blitter *blitter,
-                    const struct pipe_depth_stencil_alpha_state *state)
+ilo_blitter_set_cc(struct ilo_blitter *blitter,
+                   const struct ilo_state_cc_info *info)
+{
+   memset(&blitter->cc, 0, sizeof(blitter->cc));
+   ilo_state_cc_init(&blitter->cc, blitter->ilo->dev, info);
+}
+
+static void
+ilo_blitter_set_fb_rs(struct ilo_blitter *blitter)
 {
-   ilo_gpe_init_dsa(blitter->ilo->dev, state, &blitter->dsa);
+   memset(&blitter->fb.rs, 0, sizeof(blitter->fb.rs));
+   ilo_state_raster_init_for_rectlist(&blitter->fb.rs, blitter->ilo->dev,
+         blitter->fb.num_samples, blitter->earlyz_op,
+         blitter->earlyz_stencil_clear);
 }
 
 static void
@@ -146,6 +159,8 @@ ilo_blitter_set_fb(struct ilo_blitter *blitter,
       blitter->fb.num_samples = 1;
 
    memcpy(&blitter->fb.dst, cso, sizeof(*cso));
+
+   ilo_blitter_set_fb_rs(blitter);
 }
 
 static void
@@ -191,9 +206,9 @@ hiz_align_fb(struct ilo_blitter *blitter)
 {
    unsigned align_w, align_h;
 
-   switch (blitter->op) {
-   case ILO_BLITTER_RECTLIST_CLEAR_ZS:
-   case ILO_BLITTER_RECTLIST_RESOLVE_Z:
+   switch (blitter->earlyz_op) {
+   case ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR:
+   case ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE:
       break;
    default:
       return;
@@ -328,7 +343,7 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter,
                               double depth, unsigned stencil)
 {
    struct ilo_texture *tex = ilo_texture(zs->texture);
-   struct pipe_depth_stencil_alpha_state dsa_state;
+   struct ilo_state_cc_info info;
    uint32_t uses, clear_value;
 
    if (!ilo_image_can_enable_aux(&tex->image, zs->u.tex.level))
@@ -368,17 +383,20 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter,
     *      - [DevSNB] errata: For stencil buffer only clear, the previous
     *        depth clear value must be delivered during the clear."
     */
-   memset(&dsa_state, 0, sizeof(dsa_state));
+   memset(&info, 0, sizeof(info));
 
-   if (clear_flags & PIPE_CLEAR_DEPTH)
-      dsa_state.depth.writemask = true;
+   if (clear_flags & PIPE_CLEAR_DEPTH) {
+      info.depth.cv_has_buffer = true;
+      info.depth.write_enable = true;
+   }
 
    if (clear_flags & PIPE_CLEAR_STENCIL) {
-      dsa_state.stencil[0].enabled = true;
-      dsa_state.stencil[0].func = PIPE_FUNC_ALWAYS;
-      dsa_state.stencil[0].fail_op = PIPE_STENCIL_OP_KEEP;
-      dsa_state.stencil[0].zpass_op = PIPE_STENCIL_OP_REPLACE;
-      dsa_state.stencil[0].zfail_op = PIPE_STENCIL_OP_KEEP;
+      info.stencil.cv_has_buffer = true;
+      info.stencil.test_enable = true;
+      info.stencil.front.test_func = GEN6_COMPAREFUNCTION_ALWAYS;
+      info.stencil.front.fail_op = GEN6_STENCILOP_KEEP;
+      info.stencil.front.zfail_op = GEN6_STENCILOP_KEEP;
+      info.stencil.front.zpass_op = GEN6_STENCILOP_REPLACE;
 
       /*
        * From the Ivy Bridge PRM, volume 2 part 1, page 277:
@@ -389,18 +407,21 @@ ilo_blitter_rectlist_clear_zs(struct ilo_blitter *blitter,
        *      - DEPTH_STENCIL_STATE::Stencil Test Mask must be 0xFF
        *      - DEPTH_STENCIL_STATE::Back Face Stencil Write Mask must be 0xFF
        *      - DEPTH_STENCIL_STATE::Back Face Stencil Test Mask must be 0xFF"
+       *
+       * Back face masks will be copied from front face masks.
        */
-      dsa_state.stencil[0].valuemask = 0xff;
-      dsa_state.stencil[0].writemask = 0xff;
-      dsa_state.stencil[1].valuemask = 0xff;
-      dsa_state.stencil[1].writemask = 0xff;
+      info.params.stencil_front.test_ref = (uint8_t) stencil;
+      info.params.stencil_front.test_mask = 0xff;
+      info.params.stencil_front.write_mask = 0xff;
    }
 
    ilo_blitter_set_invariants(blitter);
-   ilo_blitter_set_op(blitter, ILO_BLITTER_RECTLIST_CLEAR_ZS);
+   ilo_blitter_set_earlyz_op(blitter,
+         ILO_STATE_RASTER_EARLYZ_DEPTH_CLEAR,
+         clear_flags & PIPE_CLEAR_STENCIL);
 
-   ilo_blitter_set_dsa(blitter, &dsa_state);
-   ilo_blitter_set_clear_values(blitter, clear_value, (ubyte) stencil);
+   ilo_blitter_set_cc(blitter, &info);
+   ilo_blitter_set_depth_clear_value(blitter, clear_value);
    ilo_blitter_set_fb_from_surface(blitter, zs);
 
    uses = ILO_BLITTER_USE_DSA;
@@ -421,7 +442,7 @@ ilo_blitter_rectlist_resolve_z(struct ilo_blitter *blitter,
                                unsigned level, unsigned slice)
 {
    struct ilo_texture *tex = ilo_texture(res);
-   struct pipe_depth_stencil_alpha_state dsa_state;
+   struct ilo_state_cc_info info;
    const struct ilo_texture_slice *s =
       ilo_texture_get_slice(tex, level, slice);
 
@@ -435,16 +456,18 @@ ilo_blitter_rectlist_resolve_z(struct ilo_blitter *blitter,
     *      to NEVER. Depth Buffer Write Enable must be enabled. Stencil Test
     *      Enable and Stencil Buffer Write Enable must be disabled."
     */
-   memset(&dsa_state, 0, sizeof(dsa_state));
-   dsa_state.depth.writemask = true;
-   dsa_state.depth.enabled = true;
-   dsa_state.depth.func = PIPE_FUNC_NEVER;
+   memset(&info, 0, sizeof(info));
+   info.depth.cv_has_buffer = true;
+   info.depth.test_enable = true;
+   info.depth.write_enable = true;
+   info.depth.test_func = GEN6_COMPAREFUNCTION_NEVER;
 
    ilo_blitter_set_invariants(blitter);
-   ilo_blitter_set_op(blitter, ILO_BLITTER_RECTLIST_RESOLVE_Z);
+   ilo_blitter_set_earlyz_op(blitter,
+         ILO_STATE_RASTER_EARLYZ_DEPTH_RESOLVE, false);
 
-   ilo_blitter_set_dsa(blitter, &dsa_state);
-   ilo_blitter_set_clear_values(blitter, s->clear_value, 0);
+   ilo_blitter_set_cc(blitter, &info);
+   ilo_blitter_set_depth_clear_value(blitter, s->clear_value);
    ilo_blitter_set_fb_from_resource(blitter, res, res->format, level, slice);
    ilo_blitter_set_uses(blitter,
          ILO_BLITTER_USE_DSA | ILO_BLITTER_USE_FB_DEPTH);
@@ -458,7 +481,7 @@ ilo_blitter_rectlist_resolve_hiz(struct ilo_blitter *blitter,
                                  unsigned level, unsigned slice)
 {
    struct ilo_texture *tex = ilo_texture(res);
-   struct pipe_depth_stencil_alpha_state dsa_state;
+   struct ilo_state_cc_info info;
 
    if (!ilo_image_can_enable_aux(&tex->image, level))
       return;
@@ -470,13 +493,15 @@ ilo_blitter_rectlist_resolve_hiz(struct ilo_blitter *blitter,
     *      disabled. Depth Buffer Write Enable must be enabled. Stencil Test
     *      Enable and Stencil Buffer Write Enable must be disabled."
     */
-   memset(&dsa_state, 0, sizeof(dsa_state));
-   dsa_state.depth.writemask = true;
+   memset(&info, 0, sizeof(info));
+   info.depth.cv_has_buffer = true;
+   info.depth.write_enable = true;
 
    ilo_blitter_set_invariants(blitter);
-   ilo_blitter_set_op(blitter, ILO_BLITTER_RECTLIST_RESOLVE_HIZ);
+   ilo_blitter_set_earlyz_op(blitter,
+         ILO_STATE_RASTER_EARLYZ_HIZ_RESOLVE, false);
 
-   ilo_blitter_set_dsa(blitter, &dsa_state);
+   ilo_blitter_set_cc(blitter, &info);
    ilo_blitter_set_fb_from_resource(blitter, res, res->format, level, slice);
    ilo_blitter_set_uses(blitter,
          ILO_BLITTER_USE_DSA | ILO_BLITTER_USE_FB_DEPTH);
diff --git a/src/gallium/drivers/ilo/ilo_draw.c b/src/gallium/drivers/ilo/ilo_draw.c
index fc91fd3..e8e1a4c 100644
--- a/src/gallium/drivers/ilo/ilo_draw.c
+++ b/src/gallium/drivers/ilo/ilo_draw.c
@@ -452,12 +452,12 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
    } u;
 
    /* we will draw with IB mapped */
-   if (ib->buffer) {
-      u.ptr = intel_bo_map(ilo_buffer(ib->buffer)->bo, false);
+   if (ib->state.buffer) {
+      u.ptr = intel_bo_map(ilo_buffer(ib->state.buffer)->bo, false);
       if (u.ptr)
-         u.u8 += ib->offset;
+         u.u8 += ib->state.offset;
    } else {
-      u.ptr = ib->user_buffer;
+      u.ptr = ib->state.user_buffer;
    }
 
    if (!u.ptr)
@@ -483,7 +483,7 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
       (pipe)->draw_vbo(pipe, &subinfo);                  \
 } while (0)
 
-   switch (ib->index_size) {
+   switch (ib->state.index_size) {
    case 1:
       DRAW_VBO_WITH_SW_RESTART(&ilo->base, info, u.u8);
       break;
@@ -500,8 +500,8 @@ draw_vbo_with_sw_restart(struct ilo_context *ilo,
 
 #undef DRAW_VBO_WITH_SW_RESTART
 
-   if (ib->buffer)
-      intel_bo_unmap(ilo_buffer(ib->buffer)->bo);
+   if (ib->state.buffer)
+      intel_bo_unmap(ilo_buffer(ib->state.buffer)->bo);
 }
 
 static bool
@@ -511,9 +511,9 @@ draw_vbo_need_sw_restart(const struct ilo_context *ilo,
    /* the restart index is fixed prior to GEN7.5 */
    if (ilo_dev_gen(ilo->dev) < ILO_GEN(7.5)) {
       const unsigned cut_index =
-         (ilo->state_vector.ib.index_size == 1) ? 0xff :
-         (ilo->state_vector.ib.index_size == 2) ? 0xffff :
-         (ilo->state_vector.ib.index_size == 4) ? 0xffffffff : 0;
+         (ilo->state_vector.ib.state.index_size == 1) ? 0xff :
+         (ilo->state_vector.ib.state.index_size == 2) ? 0xffff :
+         (ilo->state_vector.ib.state.index_size == 4) ? 0xffffffff : 0;
 
       if (info->restart_index < cut_index)
          return true;
diff --git a/src/gallium/drivers/ilo/ilo_format.c b/src/gallium/drivers/ilo/ilo_format.c
new file mode 100644
index 0000000..ca7e6b5
--- /dev/null
+++ b/src/gallium/drivers/ilo/ilo_format.c
@@ -0,0 +1,356 @@
+/*
+ * Mesa 3-D graphics library
+ *
+ * Copyright (C) 2012-2013 LunarG, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Chia-I Wu <olv@lunarg.com>
+ */
+
+#include "genhw/genhw.h"
+#include "core/ilo_state_surface.h"
+#include "core/ilo_state_vf.h"
+#include "ilo_format.h"
+
+bool
+ilo_format_support_vb(const struct ilo_dev *dev,
+                      enum pipe_format format)
+{
+   const int idx = ilo_format_translate(dev, format, PIPE_BIND_VERTEX_BUFFER);
+
+   return (idx >= 0 && ilo_state_vf_valid_element_format(dev, idx));
+}
+
+bool
+ilo_format_support_sol(const struct ilo_dev *dev,
+                       enum pipe_format format)
+{
+   const int idx = ilo_format_translate(dev, format, PIPE_BIND_STREAM_OUTPUT);
+
+   return (idx >= 0 && ilo_state_surface_valid_format(dev,
+            ILO_STATE_SURFACE_ACCESS_DP_SVB, idx));
+}
+
+bool
+ilo_format_support_sampler(const struct ilo_dev *dev,
+                           enum pipe_format format)
+{
+   const int idx = ilo_format_translate(dev, format, PIPE_BIND_SAMPLER_VIEW);
+
+   return (idx >= 0 && ilo_state_surface_valid_format(dev,
+            ILO_STATE_SURFACE_ACCESS_SAMPLER, idx));
+}
+
+bool
+ilo_format_support_rt(const struct ilo_dev *dev,
+                      enum pipe_format format)
+{
+   const int idx = ilo_format_translate(dev, format, PIPE_BIND_RENDER_TARGET);
+
+   return (idx >= 0 && ilo_state_surface_valid_format(dev,
+            ILO_STATE_SURFACE_ACCESS_DP_RENDER, idx));
+}
+
+bool
+ilo_format_support_zs(const struct ilo_dev *dev,
+                      enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_Z16_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z32_FLOAT:
+   case PIPE_FORMAT_Z24_UNORM_S8_UINT:
+   case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT:
+      return true;
+   case PIPE_FORMAT_S8_UINT:
+      /* TODO separate stencil */
+   default:
+      return false;
+   }
+}
+
+/**
+ * Translate a color (non-depth/stencil) pipe format to the matching hardware
+ * format.  Return -1 on errors.
+ */
+int
+ilo_format_translate_color(const struct ilo_dev *dev,
+                           enum pipe_format format)
+{
+   static const int format_mapping[PIPE_FORMAT_COUNT] = {
+      [PIPE_FORMAT_NONE]                  = 0,
+      [PIPE_FORMAT_B8G8R8A8_UNORM]        = GEN6_FORMAT_B8G8R8A8_UNORM,
+      [PIPE_FORMAT_B8G8R8X8_UNORM]        = GEN6_FORMAT_B8G8R8X8_UNORM,
+      [PIPE_FORMAT_A8R8G8B8_UNORM]        = 0,
+      [PIPE_FORMAT_X8R8G8B8_UNORM]        = 0,
+      [PIPE_FORMAT_B5G5R5A1_UNORM]        = GEN6_FORMAT_B5G5R5A1_UNORM,
+      [PIPE_FORMAT_B4G4R4A4_UNORM]        = GEN6_FORMAT_B4G4R4A4_UNORM,
+      [PIPE_FORMAT_B5G6R5_UNORM]          = GEN6_FORMAT_B5G6R5_UNORM,
+      [PIPE_FORMAT_R10G10B10A2_UNORM]     = GEN6_FORMAT_R10G10B10A2_UNORM,
+      [PIPE_FORMAT_L8_UNORM]              = GEN6_FORMAT_L8_UNORM,
+      [PIPE_FORMAT_A8_UNORM]              = GEN6_FORMAT_A8_UNORM,
+      [PIPE_FORMAT_I8_UNORM]              = GEN6_FORMAT_I8_UNORM,
+      [PIPE_FORMAT_L8A8_UNORM]            = GEN6_FORMAT_L8A8_UNORM,
+      [PIPE_FORMAT_L16_UNORM]             = GEN6_FORMAT_L16_UNORM,
+      [PIPE_FORMAT_UYVY]                  = GEN6_FORMAT_YCRCB_SWAPUVY,
+      [PIPE_FORMAT_YUYV]                  = GEN6_FORMAT_YCRCB_NORMAL,
+      [PIPE_FORMAT_Z16_UNORM]             = 0,
+      [PIPE_FORMAT_Z32_UNORM]             = 0,
+      [PIPE_FORMAT_Z32_FLOAT]             = 0,
+      [PIPE_FORMAT_Z24_UNORM_S8_UINT]     = 0,
+      [PIPE_FORMAT_S8_UINT_Z24_UNORM]     = 0,
+      [PIPE_FORMAT_Z24X8_UNORM]           = 0,
+      [PIPE_FORMAT_X8Z24_UNORM]           = 0,
+      [PIPE_FORMAT_S8_UINT]               = 0,
+      [PIPE_FORMAT_R64_FLOAT]             = GEN6_FORMAT_R64_FLOAT,
+      [PIPE_FORMAT_R64G64_FLOAT]          = GEN6_FORMAT_R64G64_FLOAT,
+      [PIPE_FORMAT_R64G64B64_FLOAT]       = GEN6_FORMAT_R64G64B64_FLOAT,
+      [PIPE_FORMAT_R64G64B64A64_FLOAT]    = GEN6_FORMAT_R64G64B64A64_FLOAT,
+      [PIPE_FORMAT_R32_FLOAT]             = GEN6_FORMAT_R32_FLOAT,
+      [PIPE_FORMAT_R32G32_FLOAT]          = GEN6_FORMAT_R32G32_FLOAT,
+      [PIPE_FORMAT_R32G32B32_FLOAT]       = GEN6_FORMAT_R32G32B32_FLOAT,
+      [PIPE_FORMAT_R32G32B32A32_FLOAT]    = GEN6_FORMAT_R32G32B32A32_FLOAT,
+      [PIPE_FORMAT_R32_UNORM]             = GEN6_FORMAT_R32_UNORM,
+      [PIPE_FORMAT_R32G32_UNORM]          = GEN6_FORMAT_R32G32_UNORM,
+      [PIPE_FORMAT_R32G32B32_UNORM]       = GEN6_FORMAT_R32G32B32_UNORM,
+      [PIPE_FORMAT_R32G32B32A32_UNORM]    = GEN6_FORMAT_R32G32B32A32_UNORM,
+      [PIPE_FORMAT_R32_USCALED]           = GEN6_FORMAT_R32_USCALED,
+      [PIPE_FORMAT_R32G32_USCALED]        = GEN6_FORMAT_R32G32_USCALED,
+      [PIPE_FORMAT_R32G32B32_USCALED]     = GEN6_FORMAT_R32G32B32_USCALED,
+      [PIPE_FORMAT_R32G32B32A32_USCALED]  = GEN6_FORMAT_R32G32B32A32_USCALED,
+      [PIPE_FORMAT_R32_SNORM]             = GEN6_FORMAT_R32_SNORM,
+      [PIPE_FORMAT_R32G32_SNORM]          = GEN6_FORMAT_R32G32_SNORM,
+      [PIPE_FORMAT_R32G32B32_SNORM]       = GEN6_FORMAT_R32G32B32_SNORM,
+      [PIPE_FORMAT_R32G32B32A32_SNORM]    = GEN6_FORMAT_R32G32B32A32_SNORM,
+      [PIPE_FORMAT_R32_SSCALED]           = GEN6_FORMAT_R32_SSCALED,
+      [PIPE_FORMAT_R32G32_SSCALED]        = GEN6_FORMAT_R32G32_SSCALED,
+      [PIPE_FORMAT_R32G32B32_SSCALED]     = GEN6_FORMAT_R32G32B32_SSCALED,
+      [PIPE_FORMAT_R32G32B32A32_SSCALED]  = GEN6_FORMAT_R32G32B32A32_SSCALED,
+      [PIPE_FORMAT_R16_UNORM]             = GEN6_FORMAT_R16_UNORM,
+      [PIPE_FORMAT_R16G16_UNORM]          = GEN6_FORMAT_R16G16_UNORM,
+      [PIPE_FORMAT_R16G16B16_UNORM]       = GEN6_FORMAT_R16G16B16_UNORM,
+      [PIPE_FORMAT_R16G16B16A16_UNORM]    = GEN6_FORMAT_R16G16B16A16_UNORM,
+      [PIPE_FORMAT_R16_USCALED]           = GEN6_FORMAT_R16_USCALED,
+      [PIPE_FORMAT_R16G16_USCALED]        = GEN6_FORMAT_R16G16_USCALED,
+      [PIPE_FORMAT_R16G16B16_USCALED]     = GEN6_FORMAT_R16G16B16_USCALED,
+      [PIPE_FORMAT_R16G16B16A16_USCALED]  = GEN6_FORMAT_R16G16B16A16_USCALED,
+      [PIPE_FORMAT_R16_SNORM]             = GEN6_FORMAT_R16_SNORM,
+      [PIPE_FORMAT_R16G16_SNORM]          = GEN6_FORMAT_R16G16_SNORM,
+      [PIPE_FORMAT_R16G16B16_SNORM]       = GEN6_FORMAT_R16G16B16_SNORM,
+      [PIPE_FORMAT_R16G16B16A16_SNORM]    = GEN6_FORMAT_R16G16B16A16_SNORM,
+      [PIPE_FORMAT_R16_SSCALED]           = GEN6_FORMAT_R16_SSCALED,
+      [PIPE_FORMAT_R16G16_SSCALED]        = GEN6_FORMAT_R16G16_SSCALED,
+      [PIPE_FORMAT_R16G16B16_SSCALED]     = GEN6_FORMAT_R16G16B16_SSCALED,
+      [PIPE_FORMAT_R16G16B16A16_SSCALED]  = GEN6_FORMAT_R16G16B16A16_SSCALED,
+      [PIPE_FORMAT_R8_UNORM]              = GEN6_FORMAT_R8_UNORM,
+      [PIPE_FORMAT_R8G8_UNORM]            = GEN6_FORMAT_R8G8_UNORM,
+      [PIPE_FORMAT_R8G8B8_UNORM]          = GEN6_FORMAT_R8G8B8_UNORM,
+      [PIPE_FORMAT_R8G8B8A8_UNORM]        = GEN6_FORMAT_R8G8B8A8_UNORM,
+      [PIPE_FORMAT_X8B8G8R8_UNORM]        = 0,
+      [PIPE_FORMAT_R8_USCALED]            = GEN6_FORMAT_R8_USCALED,
+      [PIPE_FORMAT_R8G8_USCALED]          = GEN6_FORMAT_R8G8_USCALED,
+      [PIPE_FORMAT_R8G8B8_USCALED]        = GEN6_FORMAT_R8G8B8_USCALED,
+      [PIPE_FORMAT_R8G8B8A8_USCALED]      = GEN6_FORMAT_R8G8B8A8_USCALED,
+      [PIPE_FORMAT_R8_SNORM]              = GEN6_FORMAT_R8_SNORM,
+      [PIPE_FORMAT_R8G8_SNORM]            = GEN6_FORMAT_R8G8_SNORM,
+      [PIPE_FORMAT_R8G8B8_SNORM]          = GEN6_FORMAT_R8G8B8_SNORM,
+      [PIPE_FORMAT_R8G8B8A8_SNORM]        = GEN6_FORMAT_R8G8B8A8_SNORM,
+      [PIPE_FORMAT_R8_SSCALED]            = GEN6_FORMAT_R8_SSCALED,
+      [PIPE_FORMAT_R8G8_SSCALED]          = GEN6_FORMAT_R8G8_SSCALED,
+      [PIPE_FORMAT_R8G8B8_SSCALED]        = GEN6_FORMAT_R8G8B8_SSCALED,
+      [PIPE_FORMAT_R8G8B8A8_SSCALED]      = GEN6_FORMAT_R8G8B8A8_SSCALED,
+      [PIPE_FORMAT_R32_FIXED]             = GEN6_FORMAT_R32_SFIXED,
+      [PIPE_FORMAT_R32G32_FIXED]          = GEN6_FORMAT_R32G32_SFIXED,
+      [PIPE_FORMAT_R32G32B32_FIXED]       = GEN6_FORMAT_R32G32B32_SFIXED,
+      [PIPE_FORMAT_R32G32B32A32_FIXED]    = GEN6_FORMAT_R32G32B32A32_SFIXED,
+      [PIPE_FORMAT_R16_FLOAT]             = GEN6_FORMAT_R16_FLOAT,
+      [PIPE_FORMAT_R16G16_FLOAT]          = GEN6_FORMAT_R16G16_FLOAT,
+      [PIPE_FORMAT_R16G16B16_FLOAT]       = GEN6_FORMAT_R16G16B16_FLOAT,
+      [PIPE_FORMAT_R16G16B16A16_FLOAT]    = GEN6_FORMAT_R16G16B16A16_FLOAT,
+      [PIPE_FORMAT_L8_SRGB]               = GEN6_FORMAT_L8_UNORM_SRGB,
+      [PIPE_FORMAT_L8A8_SRGB]             = GEN6_FORMAT_L8A8_UNORM_SRGB,
+      [PIPE_FORMAT_R8G8B8_SRGB]           = GEN6_FORMAT_R8G8B8_UNORM_SRGB,
+      [PIPE_FORMAT_A8B8G8R8_SRGB]         = 0,
+      [PIPE_FORMAT_X8B8G8R8_SRGB]         = 0,
+      [PIPE_FORMAT_B8G8R8A8_SRGB]         = GEN6_FORMAT_B8G8R8A8_UNORM_SRGB,
+      [PIPE_FORMAT_B8G8R8X8_SRGB]         = GEN6_FORMAT_B8G8R8X8_UNORM_SRGB,
+      [PIPE_FORMAT_A8R8G8B8_SRGB]         = 0,
+      [PIPE_FORMAT_X8R8G8B8_SRGB]         = 0,
+      [PIPE_FORMAT_R8G8B8A8_SRGB]         = GEN6_FORMAT_R8G8B8A8_UNORM_SRGB,
+      [PIPE_FORMAT_DXT1_RGB]              = GEN6_FORMAT_DXT1_RGB,
+      [PIPE_FORMAT_DXT1_RGBA]             = GEN6_FORMAT_BC1_UNORM,
+      [PIPE_FORMAT_DXT3_RGBA]             = GEN6_FORMAT_BC2_UNORM,
+      [PIPE_FORMAT_DXT5_RGBA]             = GEN6_FORMAT_BC3_UNORM,
+      [PIPE_FORMAT_DXT1_SRGB]             = GEN6_FORMAT_DXT1_RGB_SRGB,
+      [PIPE_FORMAT_DXT1_SRGBA]            = GEN6_FORMAT_BC1_UNORM_SRGB,
+      [PIPE_FORMAT_DXT3_SRGBA]            = GEN6_FORMAT_BC2_UNORM_SRGB,
+      [PIPE_FORMAT_DXT5_SRGBA]            = GEN6_FORMAT_BC3_UNORM_SRGB,
+      [PIPE_FORMAT_RGTC1_UNORM]           = GEN6_FORMAT_BC4_UNORM,
+      [PIPE_FORMAT_RGTC1_SNORM]           = GEN6_FORMAT_BC4_SNORM,
+      [PIPE_FORMAT_RGTC2_UNORM]           = GEN6_FORMAT_BC5_UNORM,
+      [PIPE_FORMAT_RGTC2_SNORM]           = GEN6_FORMAT_BC5_SNORM,
+      [PIPE_FORMAT_R8G8_B8G8_UNORM]       = 0,
+      [PIPE_FORMAT_G8R8_G8B8_UNORM]       = 0,
+      [PIPE_FORMAT_R8SG8SB8UX8U_NORM]     = 0,
+      [PIPE_FORMAT_R5SG5SB6U_NORM]        = 0,
+      [PIPE_FORMAT_A8B8G8R8_UNORM]        = 0,
+      [PIPE_FORMAT_B5G5R5X1_UNORM]        = GEN6_FORMAT_B5G5R5X1_UNORM,
+      [PIPE_FORMAT_R10G10B10A2_USCALED]   = GEN6_FORMAT_R10G10B10A2_USCALED,
+      [PIPE_FORMAT_R11G11B10_FLOAT]       = GEN6_FORMAT_R11G11B10_FLOAT,
+      [PIPE_FORMAT_R9G9B9E5_FLOAT]        = GEN6_FORMAT_R9G9B9E5_SHAREDEXP,
+      [PIPE_FORMAT_Z32_FLOAT_S8X24_UINT]  = 0,
+      [PIPE_FORMAT_R1_UNORM]              = GEN6_FORMAT_R1_UNORM,
+      [PIPE_FORMAT_R10G10B10X2_USCALED]   = GEN6_FORMAT_R10G10B10X2_USCALED,
+      [PIPE_FORMAT_R10G10B10X2_SNORM]     = 0,
+      [PIPE_FORMAT_L4A4_UNORM]            = 0,
+      [PIPE_FORMAT_B10G10R10A2_UNORM]     = GEN6_FORMAT_B10G10R10A2_UNORM,
+      [PIPE_FORMAT_R10SG10SB10SA2U_NORM]  = 0,
+      [PIPE_FORMAT_R8G8Bx_SNORM]          = 0,
+      [PIPE_FORMAT_R8G8B8X8_UNORM]        = GEN6_FORMAT_R8G8B8X8_UNORM,
+      [PIPE_FORMAT_B4G4R4X4_UNORM]        = 0,
+      [PIPE_FORMAT_X24S8_UINT]            = 0,
+      [PIPE_FORMAT_S8X24_UINT]            = 0,
+      [PIPE_FORMAT_X32_S8X24_UINT]        = 0,
+      [PIPE_FORMAT_B2G3R3_UNORM]          = 0,
+      [PIPE_FORMAT_L16A16_UNORM]          = GEN6_FORMAT_L16A16_UNORM,
+      [PIPE_FORMAT_A16_UNORM]             = GEN6_FORMAT_A16_UNORM,
+      [PIPE_FORMAT_I16_UNORM]             = GEN6_FORMAT_I16_UNORM,
+      [PIPE_FORMAT_LATC1_UNORM]           = 0,
+      [PIPE_FORMAT_LATC1_SNORM]           = 0,
+      [PIPE_FORMAT_LATC2_UNORM]           = 0,
+      [PIPE_FORMAT_LATC2_SNORM]           = 0,
+      [PIPE_FORMAT_A8_SNORM]              = 0,
+      [PIPE_FORMAT_L8_SNORM]              = 0,
+      [PIPE_FORMAT_L8A8_SNORM]            = 0,
+      [PIPE_FORMAT_I8_SNORM]              = 0,
+      [PIPE_FORMAT_A16_SNORM]             = 0,
+      [PIPE_FORMAT_L16_SNORM]             = 0,
+      [PIPE_FORMAT_L16A16_SNORM]          = 0,
+      [PIPE_FORMAT_I16_SNORM]             = 0,
+      [PIPE_FORMAT_A16_FLOAT]             = GEN6_FORMAT_A16_FLOAT,
+      [PIPE_FORMAT_L16_FLOAT]             = GEN6_FORMAT_L16_FLOAT,
+      [PIPE_FORMAT_L16A16_FLOAT]          = GEN6_FORMAT_L16A16_FLOAT,
+      [PIPE_FORMAT_I16_FLOAT]             = GEN6_FORMAT_I16_FLOAT,
+      [PIPE_FORMAT_A32_FLOAT]             = GEN6_FORMAT_A32_FLOAT,
+      [PIPE_FORMAT_L32_FLOAT]             = GEN6_FORMAT_L32_FLOAT,
+      [PIPE_FORMAT_L32A32_FLOAT]          = GEN6_FORMAT_L32A32_FLOAT,
+      [PIPE_FORMAT_I32_FLOAT]             = GEN6_FORMAT_I32_FLOAT,
+      [PIPE_FORMAT_YV12]                  = 0,
+      [PIPE_FORMAT_YV16]                  = 0,
+      [PIPE_FORMAT_IYUV]                  = 0,
+      [PIPE_FORMAT_NV12]                  = 0,
+      [PIPE_FORMAT_NV21]                  = 0,
+      [PIPE_FORMAT_A4R4_UNORM]            = 0,
+      [PIPE_FORMAT_R4A4_UNORM]            = 0,
+      [PIPE_FORMAT_R8A8_UNORM]            = 0,
+      [PIPE_FORMAT_A8R8_UNORM]            = 0,
+      [PIPE_FORMAT_R10G10B10A2_SSCALED]   = GEN6_FORMAT_R10G10B10A2_SSCALED,
+      [PIPE_FORMAT_R10G10B10A2_SNORM]     = GEN6_FORMAT_R10G10B10A2_SNORM,
+      [PIPE_FORMAT_B10G10R10A2_USCALED]   = GEN6_FORMAT_B10G10R10A2_USCALED,
+      [PIPE_FORMAT_B10G10R10A2_SSCALED]   = GEN6_FORMAT_B10G10R10A2_SSCALED,
+      [PIPE_FORMAT_B10G10R10A2_SNORM]     = GEN6_FORMAT_B10G10R10A2_SNORM,
+      [PIPE_FORMAT_R8_UINT]               = GEN6_FORMAT_R8_UINT,
+      [PIPE_FORMAT_R8G8_UINT]             = GEN6_FORMAT_R8G8_UINT,
+      [PIPE_FORMAT_R8G8B8_UINT]           = GEN6_FORMAT_R8G8B8_UINT,
+      [PIPE_FORMAT_R8G8B8A8_UINT]         = GEN6_FORMAT_R8G8B8A8_UINT,
+      [PIPE_FORMAT_R8_SINT]               = GEN6_FORMAT_R8_SINT,
+      [PIPE_FORMAT_R8G8_SINT]             = GEN6_FORMAT_R8G8_SINT,
+      [PIPE_FORMAT_R8G8B8_SINT]           = GEN6_FORMAT_R8G8B8_SINT,
+      [PIPE_FORMAT_R8G8B8A8_SINT]         = GEN6_FORMAT_R8G8B8A8_SINT,
+      [PIPE_FORMAT_R16_UINT]              = GEN6_FORMAT_R16_UINT,
+      [PIPE_FORMAT_R16G16_UINT]           = GEN6_FORMAT_R16G16_UINT,
+      [PIPE_FORMAT_R16G16B16_UINT]        = GEN6_FORMAT_R16G16B16_UINT,
+      [PIPE_FORMAT_R16G16B16A16_UINT]     = GEN6_FORMAT_R16G16B16A16_UINT,
+      [PIPE_FORMAT_R16_SINT]              = GEN6_FORMAT_R16_SINT,
+      [PIPE_FORMAT_R16G16_SINT]           = GEN6_FORMAT_R16G16_SINT,
+      [PIPE_FORMAT_R16G16B16_SINT]        = GEN6_FORMAT_R16G16B16_SINT,
+      [PIPE_FORMAT_R16G16B16A16_SINT]     = GEN6_FORMAT_R16G16B16A16_SINT,
+      [PIPE_FORMAT_R32_UINT]              = GEN6_FORMAT_R32_UINT,
+      [PIPE_FORMAT_R32G32_UINT]           = GEN6_FORMAT_R32G32_UINT,
+      [PIPE_FORMAT_R32G32B32_UINT]        = GEN6_FORMAT_R32G32B32_UINT,
+      [PIPE_FORMAT_R32G32B32A32_UINT]     = GEN6_FORMAT_R32G32B32A32_UINT,
+      [PIPE_FORMAT_R32_SINT]              = GEN6_FORMAT_R32_SINT,
+      [PIPE_FORMAT_R32G32_SINT]           = GEN6_FORMAT_R32G32_SINT,
+      [PIPE_FORMAT_R32G32B32_SINT]        = GEN6_FORMAT_R32G32B32_SINT,
+      [PIPE_FORMAT_R32G32B32A32_SINT]     = GEN6_FORMAT_R32G32B32A32_SINT,
+      [PIPE_FORMAT_A8_UINT]               = 0,
+      [PIPE_FORMAT_I8_UINT]               = GEN6_FORMAT_I8_UINT,
+      [PIPE_FORMAT_L8_UINT]               = GEN6_FORMAT_L8_UINT,
+      [PIPE_FORMAT_L8A8_UINT]             = GEN6_FORMAT_L8A8_UINT,
+      [PIPE_FORMAT_A8_SINT]               = 0,
+      [PIPE_FORMAT_I8_SINT]               = GEN6_FORMAT_I8_SINT,
+      [PIPE_FORMAT_L8_SINT]               = GEN6_FORMAT_L8_SINT,
+      [PIPE_FORMAT_L8A8_SINT]             = GEN6_FORMAT_L8A8_SINT,
+      [PIPE_FORMAT_A16_UINT]              = 0,
+      [PIPE_FORMAT_I16_UINT]              = 0,
+      [PIPE_FORMAT_L16_UINT]              = 0,
+      [PIPE_FORMAT_L16A16_UINT]           = 0,
+      [PIPE_FORMAT_A16_SINT]              = 0,
+      [PIPE_FORMAT_I16_SINT]              = 0,
+      [PIPE_FORMAT_L16_SINT]              = 0,
+      [PIPE_FORMAT_L16A16_SINT]           = 0,
+      [PIPE_FORMAT_A32_UINT]              = 0,
+      [PIPE_FORMAT_I32_UINT]              = 0,
+      [PIPE_FORMAT_L32_UINT]              = 0,
+      [PIPE_FORMAT_L32A32_UINT]           = 0,
+      [PIPE_FORMAT_A32_SINT]              = 0,
+      [PIPE_FORMAT_I32_SINT]              = 0,
+      [PIPE_FORMAT_L32_SINT]              = 0,
+      [PIPE_FORMAT_L32A32_SINT]           = 0,
+      [PIPE_FORMAT_B10G10R10A2_UINT]      = GEN6_FORMAT_B10G10R10A2_UINT,
+      [PIPE_FORMAT_ETC1_RGB8]             = GEN6_FORMAT_ETC1_RGB8,
+      [PIPE_FORMAT_R8G8_R8B8_UNORM]       = 0,
+      [PIPE_FORMAT_G8R8_B8R8_UNORM]       = 0,
+      [PIPE_FORMAT_R8G8B8X8_SNORM]        = 0,
+      [PIPE_FORMAT_R8G8B8X8_SRGB]         = 0,
+      [PIPE_FORMAT_R8G8B8X8_UINT]         = 0,
+      [PIPE_FORMAT_R8G8B8X8_SINT]         = 0,
+      [PIPE_FORMAT_B10G10R10X2_UNORM]     = GEN6_FORMAT_B10G10R10X2_UNORM,
+      [PIPE_FORMAT_R16G16B16X16_UNORM]    = GEN6_FORMAT_R16G16B16X16_UNORM,
+      [PIPE_FORMAT_R16G16B16X16_SNORM]    = 0,
+      [PIPE_FORMAT_R16G16B16X16_FLOAT]    = GEN6_FORMAT_R16G16B16X16_FLOAT,
+      [PIPE_FORMAT_R16G16B16X16_UINT]     = 0,
+      [PIPE_FORMAT_R16G16B16X16_SINT]     = 0,
+      [PIPE_FORMAT_R32G32B32X32_FLOAT]    = GEN6_FORMAT_R32G32B32X32_FLOAT,
+      [PIPE_FORMAT_R32G32B32X32_UINT]     = 0,
+      [PIPE_FORMAT_R32G32B32X32_SINT]     = 0,
+      [PIPE_FORMAT_R8A8_SNORM]            = 0,
+      [PIPE_FORMAT_R16A16_UNORM]          = 0,
+      [PIPE_FORMAT_R16A16_SNORM]          = 0,
+      [PIPE_FORMAT_R16A16_FLOAT]          = 0,
+      [PIPE_FORMAT_R32A32_FLOAT]          = 0,
+      [PIPE_FORMAT_R8A8_UINT]             = 0,
+      [PIPE_FORMAT_R8A8_SINT]             = 0,
+      [PIPE_FORMAT_R16A16_UINT]           = 0,
+      [PIPE_FORMAT_R16A16_SINT]           = 0,
+      [PIPE_FORMAT_R32A32_UINT]           = 0,
+      [PIPE_FORMAT_R32A32_SINT]           = 0,
+      [PIPE_FORMAT_R10G10B10A2_UINT]      = GEN6_FORMAT_R10G10B10A2_UINT,
+      [PIPE_FORMAT_B5G6R5_SRGB]           = GEN6_FORMAT_B5G6R5_UNORM_SRGB,
+   };
+   int sfmt = format_mapping[format];
+
+   /* GEN6_FORMAT_R32G32B32A32_FLOAT happens to be 0 */
+   if (!sfmt && format != PIPE_FORMAT_R32G32B32A32_FLOAT)
+      sfmt = -1;
+
+   return sfmt;
+}
diff --git a/src/gallium/drivers/ilo/core/ilo_format.h b/src/gallium/drivers/ilo/ilo_format.h
index 6b73ea1..4e955c0 100644
--- a/src/gallium/drivers/ilo/core/ilo_format.h
+++ b/src/gallium/drivers/ilo/ilo_format.h
@@ -29,8 +29,8 @@
 #define ILO_FORMAT_H
 
 #include "genhw/genhw.h"
-#include "ilo_core.h"
-#include "ilo_dev.h"
+
+#include "ilo_common.h"
 
 bool
 ilo_format_support_vb(const struct ilo_dev *dev,
diff --git a/src/gallium/drivers/ilo/ilo_render.c b/src/gallium/drivers/ilo/ilo_render.c
index f5be336..21f75de 100644
--- a/src/gallium/drivers/ilo/ilo_render.c
+++ b/src/gallium/drivers/ilo/ilo_render.c
@@ -35,76 +35,10 @@
 #include "ilo_query.h"
 #include "ilo_render_gen.h"
 
-/* in S1.3 */
-struct sample_position {
-   int8_t x, y;
-};
-
-static const struct sample_position ilo_sample_pattern_1x[1] = {
-   {  0,  0 },
-};
-
-static const struct sample_position ilo_sample_pattern_2x[2] = {
-   { -4, -4 },
-   {  4,  4 },
-};
-
-static const struct sample_position ilo_sample_pattern_4x[4] = {
-   { -2, -6 },
-   {  6, -2 },
-   { -6,  2 },
-   {  2,  6 },
-};
-
-/* \see brw_multisample_positions_8x */
-static const struct sample_position ilo_sample_pattern_8x[8] = {
-   { -1,  1 },
-   {  1,  5 },
-   {  3, -5 },
-   {  5,  3 },
-   { -7, -1 },
-   { -3, -7 },
-   {  7, -3 },
-   { -5,  7 },
-};
-
-static const struct sample_position ilo_sample_pattern_16x[16] = {
-   {  0,  2 },
-   {  3,  0 },
-   { -3, -2 },
-   { -2, -4 },
-   {  4,  3 },
-   {  5,  1 },
-   {  6, -1 },
-   {  2, -6 },
-   { -4,  5 },
-   { -5, -5 },
-   { -1, -7 },
-   {  7, -3 },
-   { -7,  4 },
-   {  1, -8 },
-   { -6,  6 },
-   { -8,  7 },
-};
-
-static uint8_t
-pack_sample_position(const struct sample_position *pos)
-{
-   return (pos->x + 8) << 4 | (pos->y + 8);
-}
-
-static void
-get_sample_position(const struct sample_position *pos, float *x, float *y)
-{
-   *x = (float) (pos->x + 8) / 16.0f;
-   *y = (float) (pos->y + 8) / 16.0f;
-}
-
 struct ilo_render *
 ilo_render_create(struct ilo_builder *builder)
 {
    struct ilo_render *render;
-   int i;
 
    render = CALLOC_STRUCT(ilo_render);
    if (!render)
@@ -121,29 +55,8 @@ ilo_render_create(struct ilo_builder *builder)
       return NULL;
    }
 
-   /* pack into dwords */
-   render->sample_pattern_1x = pack_sample_position(ilo_sample_pattern_1x);
-   render->sample_pattern_2x =
-      pack_sample_position(&ilo_sample_pattern_2x[1]) << 8 |
-      pack_sample_position(&ilo_sample_pattern_2x[0]);
-   for (i = 0; i < 4; i++) {
-      render->sample_pattern_4x |=
-         pack_sample_position(&ilo_sample_pattern_4x[i]) << (8 * i);
-
-      render->sample_pattern_8x[0] |=
-         pack_sample_position(&ilo_sample_pattern_8x[i]) << (8 * i);
-      render->sample_pattern_8x[1] |=
-         pack_sample_position(&ilo_sample_pattern_8x[i + 4]) << (8 * i);
-
-      render->sample_pattern_16x[0] |=
-         pack_sample_position(&ilo_sample_pattern_16x[i]) << (8 * i);
-      render->sample_pattern_16x[1] |=
-         pack_sample_position(&ilo_sample_pattern_16x[i + 4]) << (8 * i);
-      render->sample_pattern_16x[2] |=
-         pack_sample_position(&ilo_sample_pattern_16x[i + 8]) << (8 * i);
-      render->sample_pattern_16x[3] |=
-         pack_sample_position(&ilo_sample_pattern_16x[i + 12]) << (8 * i);
-   }
+   ilo_state_sample_pattern_init_default(&render->sample_pattern,
+         render->dev);
 
    ilo_render_invalidate_hw(render);
    ilo_render_invalidate_builder(render);
@@ -164,38 +77,13 @@ ilo_render_get_sample_position(const struct ilo_render *render,
                                unsigned sample_index,
                                float *x, float *y)
 {
-   const struct sample_position *pattern;
+   uint8_t off_x, off_y;
 
-   switch (sample_count) {
-   case 1:
-      assert(sample_index < Elements(ilo_sample_pattern_1x));
-      pattern = ilo_sample_pattern_1x;
-      break;
-   case 2:
-      assert(sample_index < Elements(ilo_sample_pattern_2x));
-      pattern = ilo_sample_pattern_2x;
-      break;
-   case 4:
-      assert(sample_index < Elements(ilo_sample_pattern_4x));
-      pattern = ilo_sample_pattern_4x;
-      break;
-   case 8:
-      assert(sample_index < Elements(ilo_sample_pattern_8x));
-      pattern = ilo_sample_pattern_8x;
-      break;
-   case 16:
-      assert(sample_index < Elements(ilo_sample_pattern_16x));
-      pattern = ilo_sample_pattern_16x;
-      break;
-   default:
-      assert(!"unknown sample count");
-      *x = 0.5f;
-      *y = 0.5f;
-      return;
-      break;
-   }
+   ilo_state_sample_pattern_get_offset(&render->sample_pattern, render->dev,
+         sample_count, sample_index, &off_x, &off_y);
 
-   get_sample_position(&pattern[sample_index], x, y);
+   *x = (float) off_x / 16.0f;
+   *y = (float) off_y / 16.0f;
 }
 
 void
@@ -446,12 +334,44 @@ draw_session_prepare(struct ilo_render *render,
       render->instruction_bo_changed = true;
 
       session->prim_changed = true;
-      session->primitive_restart_changed = true;
+
+      ilo_state_urb_full_delta(&vec->urb, render->dev, &session->urb_delta);
+      ilo_state_vf_full_delta(&vec->ve->vf, render->dev, &session->vf_delta);
+
+      ilo_state_raster_full_delta(&vec->rasterizer->rs, render->dev,
+            &session->rs_delta);
+
+      ilo_state_viewport_full_delta(&vec->viewport.vp, render->dev,
+            &session->vp_delta);
+
+      ilo_state_cc_full_delta(&vec->blend->cc, render->dev,
+            &session->cc_delta);
    } else {
       session->prim_changed =
          (render->state.reduced_prim != session->reduced_prim);
-      session->primitive_restart_changed =
-         (render->state.primitive_restart != vec->draw->primitive_restart);
+
+      ilo_state_urb_get_delta(&vec->urb, render->dev,
+            &render->state.urb, &session->urb_delta);
+
+      if (vec->dirty & ILO_DIRTY_VE) {
+         ilo_state_vf_full_delta(&vec->ve->vf, render->dev,
+               &session->vf_delta);
+      }
+
+      if (vec->dirty & ILO_DIRTY_RASTERIZER) {
+         ilo_state_raster_get_delta(&vec->rasterizer->rs, render->dev,
+               &render->state.rs, &session->rs_delta);
+      }
+
+      if (vec->dirty & ILO_DIRTY_VIEWPORT) {
+         ilo_state_viewport_full_delta(&vec->viewport.vp, render->dev,
+               &session->vp_delta);
+      }
+
+      if (vec->dirty & ILO_DIRTY_BLEND) {
+         ilo_state_cc_get_delta(&vec->blend->cc, render->dev,
+               &render->state.cc, &session->cc_delta);
+      }
    }
 }
 
@@ -467,7 +387,10 @@ draw_session_end(struct ilo_render *render,
    render->instruction_bo_changed = false;
 
    render->state.reduced_prim = session->reduced_prim;
-   render->state.primitive_restart = vec->draw->primitive_restart;
+
+   render->state.urb = vec->urb;
+   render->state.rs = vec->rasterizer->rs;
+   render->state.cc = vec->blend->cc;
 }
 
 void
diff --git a/src/gallium/drivers/ilo/ilo_render.h b/src/gallium/drivers/ilo/ilo_render.h
index a85b280..098af73 100644
--- a/src/gallium/drivers/ilo/ilo_render.h
+++ b/src/gallium/drivers/ilo/ilo_render.h
@@ -43,9 +43,6 @@ ilo_render_create(struct ilo_builder *builder);
 void
 ilo_render_destroy(struct ilo_render *render);
 
-/**
- * Estimate the size of an action.
- */
 void
 ilo_render_get_sample_position(const struct ilo_render *render,
                                unsigned sample_count,
diff --git a/src/gallium/drivers/ilo/ilo_render_dynamic.c b/src/gallium/drivers/ilo/ilo_render_dynamic.c
index ef92b12..3b4c802 100644
--- a/src/gallium/drivers/ilo/ilo_render_dynamic.c
+++ b/src/gallium/drivers/ilo/ilo_render_dynamic.c
@@ -30,6 +30,7 @@
 
 #include "ilo_common.h"
 #include "ilo_blitter.h"
+#include "ilo_shader.h"
 #include "ilo_state.h"
 #include "ilo_render_gen.h"
 
@@ -42,16 +43,14 @@ gen6_emit_draw_dynamic_viewports(struct ilo_render *r,
 {
    ILO_DEV_ASSERT(r->dev, 6, 6);
 
-   /* SF_VIEWPORT, CLIP_VIEWPORT, and CC_VIEWPORT */
-   if (DIRTY(VIEWPORT)) {
+   /* CLIP_VIEWPORT, SF_VIEWPORT, and CC_VIEWPORT */
+   if ((session->vp_delta.dirty & (ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT |
+                                   ILO_STATE_VIEWPORT_CC_VIEWPORT)) ||
+       r->state_bo_changed) {
       r->state.CLIP_VIEWPORT = gen6_CLIP_VIEWPORT(r->builder,
-            vec->viewport.cso, vec->viewport.count);
-
-      r->state.SF_VIEWPORT = gen6_SF_VIEWPORT(r->builder,
-            vec->viewport.cso, vec->viewport.count);
-
-      r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder,
-            vec->viewport.cso, vec->viewport.count);
+            &vec->viewport.vp);
+      r->state.SF_VIEWPORT = gen6_SF_VIEWPORT(r->builder, &vec->viewport.vp);
+      r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder, &vec->viewport.vp);
 
       session->viewport_changed = true;
    }
@@ -65,12 +64,12 @@ gen7_emit_draw_dynamic_viewports(struct ilo_render *r,
    ILO_DEV_ASSERT(r->dev, 7, 8);
 
    /* SF_CLIP_VIEWPORT and CC_VIEWPORT */
-   if (DIRTY(VIEWPORT)) {
+   if ((session->vp_delta.dirty & (ILO_STATE_VIEWPORT_SF_CLIP_VIEWPORT |
+                                   ILO_STATE_VIEWPORT_CC_VIEWPORT)) ||
+       r->state_bo_changed) {
       r->state.SF_CLIP_VIEWPORT = gen7_SF_CLIP_VIEWPORT(r->builder,
-            vec->viewport.cso, vec->viewport.count);
-
-      r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder,
-            vec->viewport.cso, vec->viewport.count);
+            &vec->viewport.vp);
+      r->state.CC_VIEWPORT = gen6_CC_VIEWPORT(r->builder, &vec->viewport.vp);
 
       session->viewport_changed = true;
    }
@@ -84,10 +83,10 @@ gen6_emit_draw_dynamic_scissors(struct ilo_render *r,
    ILO_DEV_ASSERT(r->dev, 6, 8);
 
    /* SCISSOR_RECT */
-   if (DIRTY(SCISSOR) || DIRTY(VIEWPORT)) {
-      /* there should be as many scissors as there are viewports */
+   if ((session->vp_delta.dirty & ILO_STATE_VIEWPORT_SCISSOR_RECT) ||
+       r->state_bo_changed) {
       r->state.SCISSOR_RECT = gen6_SCISSOR_RECT(r->builder,
-            &vec->scissor, vec->viewport.count);
+            &vec->viewport.vp);
 
       session->scissor_changed = true;
    }
@@ -101,32 +100,30 @@ gen6_emit_draw_dynamic_cc(struct ilo_render *r,
    ILO_DEV_ASSERT(r->dev, 6, 8);
 
    /* BLEND_STATE */
-   if (DIRTY(BLEND) || DIRTY(FB) || DIRTY(DSA)) {
-      if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) {
-         r->state.BLEND_STATE = gen8_BLEND_STATE(r->builder,
-               vec->blend, &vec->fb, vec->dsa);
-      } else {
-         r->state.BLEND_STATE = gen6_BLEND_STATE(r->builder,
-               vec->blend, &vec->fb, vec->dsa);
-      }
+   if ((session->cc_delta.dirty & ILO_STATE_CC_BLEND_STATE) ||
+        r->state_bo_changed) {
+      if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+         r->state.BLEND_STATE = gen8_BLEND_STATE(r->builder, &vec->blend->cc);
+      else
+         r->state.BLEND_STATE = gen6_BLEND_STATE(r->builder, &vec->blend->cc);
 
       session->blend_changed = true;
    }
 
    /* COLOR_CALC_STATE */
-   if (DIRTY(DSA) || DIRTY(STENCIL_REF) || DIRTY(BLEND_COLOR)) {
+   if ((session->cc_delta.dirty & ILO_STATE_CC_COLOR_CALC_STATE) ||
+       r->state_bo_changed) {
       r->state.COLOR_CALC_STATE =
-         gen6_COLOR_CALC_STATE(r->builder, &vec->stencil_ref,
-               vec->dsa->alpha_ref, &vec->blend_color);
-
+         gen6_COLOR_CALC_STATE(r->builder, &vec->blend->cc);
       session->cc_changed = true;
    }
 
    /* DEPTH_STENCIL_STATE */
-   if (ilo_dev_gen(r->dev) < ILO_GEN(8) && DIRTY(DSA)) {
+   if (ilo_dev_gen(r->dev) < ILO_GEN(8) &&
+       ((session->cc_delta.dirty & ILO_STATE_CC_DEPTH_STENCIL_STATE) ||
+        r->state_bo_changed)) {
       r->state.DEPTH_STENCIL_STATE =
-         gen6_DEPTH_STENCIL_STATE(r->builder, vec->dsa);
-
+         gen6_DEPTH_STENCIL_STATE(r->builder, &vec->blend->cc);
       session->dsa_changed = true;
    }
 }
@@ -137,12 +134,11 @@ gen6_emit_draw_dynamic_samplers(struct ilo_render *r,
                                 int shader_type,
                                 struct ilo_render_draw_session *session)
 {
-   const struct ilo_sampler_cso * const *samplers =
-      vec->sampler[shader_type].cso;
-   const struct pipe_sampler_view * const *views =
-      (const struct pipe_sampler_view **) vec->view[shader_type].states;
+   const struct ilo_view_cso * const *views =
+      (const struct ilo_view_cso **) vec->view[shader_type].states;
+   struct ilo_state_sampler samplers[ILO_MAX_SAMPLERS];
    uint32_t *sampler_state, *border_color_state;
-   int sampler_count;
+   int sampler_count, i;
    bool emit_border_color = false;
    bool skip = false;
 
@@ -194,16 +190,28 @@ gen6_emit_draw_dynamic_samplers(struct ilo_render *r,
           sampler_count <= Elements(vec->sampler[shader_type].cso));
 
    if (emit_border_color) {
-      int i;
-
       for (i = 0; i < sampler_count; i++) {
-         border_color_state[i] = (samplers[i]) ?
-            gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, samplers[i]) : 0;
+         const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i];
+
+         border_color_state[i] = (cso) ?
+            gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, &cso->border) : 0;
+      }
+   }
+
+   for (i = 0; i < sampler_count; i++) {
+      const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i];
+
+      if (cso && views[i]) {
+         samplers[i] = cso->sampler;
+         ilo_state_sampler_set_surface(&samplers[i],
+               r->dev, &views[i]->surface);
+      } else {
+         samplers[i] = vec->disabled_sampler;
       }
    }
 
-   *sampler_state = gen6_SAMPLER_STATE(r->builder,
-         samplers, views, border_color_state, sampler_count);
+   *sampler_state = gen6_SAMPLER_STATE(r->builder, samplers,
+         border_color_state, sampler_count);
 }
 
 static void
@@ -234,13 +242,13 @@ gen6_emit_draw_dynamic_pcb(struct ilo_render *r,
             const struct ilo_cbuf_state *cbuf =
                &vec->cbuf[PIPE_SHADER_VERTEX];
 
-            if (cbuf0_size <= cbuf->cso[0].user_buffer_size) {
+            if (cbuf0_size <= cbuf->cso[0].info.size) {
                memcpy(pcb, cbuf->cso[0].user_buffer, cbuf0_size);
             } else {
                memcpy(pcb, cbuf->cso[0].user_buffer,
-                     cbuf->cso[0].user_buffer_size);
-               memset(pcb + cbuf->cso[0].user_buffer_size, 0,
-                     cbuf0_size - cbuf->cso[0].user_buffer_size);
+                     cbuf->cso[0].info.size);
+               memset(pcb + cbuf->cso[0].info.size, 0,
+                     cbuf0_size - cbuf->cso[0].info.size);
             }
 
             pcb += cbuf0_size;
@@ -271,13 +279,13 @@ gen6_emit_draw_dynamic_pcb(struct ilo_render *r,
             gen6_push_constant_buffer(r->builder, cbuf0_size, &pcb);
          r->state.wm.PUSH_CONSTANT_BUFFER_size = cbuf0_size;
 
-         if (cbuf0_size <= cbuf->cso[0].user_buffer_size) {
+         if (cbuf0_size <= cbuf->cso[0].info.size) {
             memcpy(pcb, cbuf->cso[0].user_buffer, cbuf0_size);
          } else {
             memcpy(pcb, cbuf->cso[0].user_buffer,
-                  cbuf->cso[0].user_buffer_size);
-            memset(pcb + cbuf->cso[0].user_buffer_size, 0,
-                  cbuf0_size - cbuf->cso[0].user_buffer_size);
+                  cbuf->cso[0].info.size);
+            memset(pcb + cbuf->cso[0].info.size, 0,
+                  cbuf0_size - cbuf->cso[0].info.size);
          }
 
          session->pcb_fs_changed = true;
@@ -441,18 +449,17 @@ ilo_render_emit_rectlist_dynamic_states(struct ilo_render *render,
 
    if (blitter->uses & ILO_BLITTER_USE_DSA) {
       render->state.DEPTH_STENCIL_STATE =
-         gen6_DEPTH_STENCIL_STATE(render->builder, &blitter->dsa);
+         gen6_DEPTH_STENCIL_STATE(render->builder, &blitter->cc);
    }
 
    if (blitter->uses & ILO_BLITTER_USE_CC) {
       render->state.COLOR_CALC_STATE =
-         gen6_COLOR_CALC_STATE(render->builder, &blitter->cc.stencil_ref,
-               blitter->cc.alpha_ref, &blitter->cc.blend_color);
+         gen6_COLOR_CALC_STATE(render->builder, &blitter->cc);
    }
 
    if (blitter->uses & ILO_BLITTER_USE_VIEWPORT) {
       render->state.CC_VIEWPORT =
-         gen6_CC_VIEWPORT(render->builder, &blitter->viewport, 1);
+         gen6_CC_VIEWPORT(render->builder, &blitter->vp);
    }
 
    assert(ilo_builder_dynamic_used(render->builder) <= dynamic_used +
@@ -466,10 +473,9 @@ gen6_emit_launch_grid_dynamic_samplers(struct ilo_render *r,
 {
    const unsigned shader_type = PIPE_SHADER_COMPUTE;
    const struct ilo_shader_state *cs = vec->cs;
-   const struct ilo_sampler_cso * const *samplers =
-      vec->sampler[shader_type].cso;
-   const struct pipe_sampler_view * const *views =
-      (const struct pipe_sampler_view **) vec->view[shader_type].states;
+   const struct ilo_view_cso * const *views =
+      (const struct ilo_view_cso **) vec->view[shader_type].states;
+   struct ilo_state_sampler samplers[ILO_MAX_SAMPLERS];
    int sampler_count, i;
 
    ILO_DEV_ASSERT(r->dev, 7, 7.5);
@@ -480,11 +486,25 @@ gen6_emit_launch_grid_dynamic_samplers(struct ilo_render *r,
           sampler_count <= Elements(vec->sampler[shader_type].cso));
 
    for (i = 0; i < sampler_count; i++) {
-      r->state.cs.SAMPLER_BORDER_COLOR_STATE[i] = (samplers[i]) ?
-         gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, samplers[i]) : 0;
+      const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i];
+
+      r->state.cs.SAMPLER_BORDER_COLOR_STATE[i] = (cso) ?
+         gen6_SAMPLER_BORDER_COLOR_STATE(r->builder, &cso->border) : 0;
    }
 
-   r->state.cs.SAMPLER_STATE = gen6_SAMPLER_STATE(r->builder, samplers, views,
+   for (i = 0; i < sampler_count; i++) {
+      const struct ilo_sampler_cso *cso = vec->sampler[shader_type].cso[i];
+
+      if (cso && views[i]) {
+         samplers[i] = cso->sampler;
+         ilo_state_sampler_set_surface(&samplers[i],
+               r->dev, &views[i]->surface);
+      } else {
+         samplers[i] = vec->disabled_sampler;
+      }
+   }
+
+   r->state.cs.SAMPLER_STATE = gen6_SAMPLER_STATE(r->builder, samplers,
          r->state.cs.SAMPLER_BORDER_COLOR_STATE, sampler_count);
 }
 
@@ -503,20 +523,39 @@ gen6_emit_launch_grid_dynamic_idrt(struct ilo_render *r,
                                    struct ilo_render_launch_grid_session *session)
 {
    const struct ilo_shader_state *cs = vec->cs;
-   struct gen6_idrt_data data;
+   struct ilo_state_compute_interface_info interface;
+   struct ilo_state_compute_info info;
+   uint32_t kernel_offset;
 
    ILO_DEV_ASSERT(r->dev, 7, 7.5);
 
-   memset(&data, 0, sizeof(data));
+   memset(&interface, 0, sizeof(interface));
+
+   interface.sampler_count =
+      ilo_shader_get_kernel_param(cs, ILO_KERNEL_SAMPLER_COUNT);
+   interface.surface_count =
+      ilo_shader_get_kernel_param(cs, ILO_KERNEL_SURFACE_TOTAL_COUNT);
+   interface.thread_group_size = session->thread_group_size;
+   interface.slm_size =
+      ilo_shader_get_kernel_param(cs, ILO_KERNEL_CS_LOCAL_SIZE);
+   interface.curbe_read_length = r->state.cs.PUSH_CONSTANT_BUFFER_size;
+
+   memset(&info, 0, sizeof(info));
+   info.data = session->compute_data;
+   info.data_size = sizeof(session->compute_data);
+   info.interfaces = &interface;
+   info.interface_count = 1;
+   info.cv_urb_alloc_size = r->dev->urb_size;
+   info.curbe_alloc_size = r->state.cs.PUSH_CONSTANT_BUFFER_size;
+
+   ilo_state_compute_init(&session->compute, r->dev, &info);
 
-   data.cs = cs;
-   data.sampler_offset = r->state.cs.SAMPLER_STATE;
-   data.binding_table_offset = r->state.cs.BINDING_TABLE_STATE;
+   kernel_offset = ilo_shader_get_kernel_offset(cs);
 
-   data.curbe_size = r->state.cs.PUSH_CONSTANT_BUFFER_size;
-   data.thread_group_size = session->thread_group_size;
+   session->idrt = gen6_INTERFACE_DESCRIPTOR_DATA(r->builder,
+         &session->compute, &kernel_offset,
+         &r->state.cs.SAMPLER_STATE, &r->state.cs.BINDING_TABLE_STATE);
 
-   session->idrt = gen6_INTERFACE_DESCRIPTOR_DATA(r->builder, &data, 1);
    session->idrt_size = 32;
 }
 
diff --git a/src/gallium/drivers/ilo/ilo_render_gen.h b/src/gallium/drivers/ilo/ilo_render_gen.h
index acfe8be..6b13375 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen.h
+++ b/src/gallium/drivers/ilo/ilo_render_gen.h
@@ -31,6 +31,7 @@
 #include "core/ilo_builder.h"
 #include "core/ilo_builder_3d.h"
 #include "core/ilo_builder_render.h"
+#include "core/ilo_state_raster.h"
 
 #include "ilo_common.h"
 #include "ilo_state.h"
@@ -50,11 +51,7 @@ struct ilo_render {
 
    struct intel_bo *workaround_bo;
 
-   uint32_t sample_pattern_1x;
-   uint32_t sample_pattern_2x;
-   uint32_t sample_pattern_4x;
-   uint32_t sample_pattern_8x[2];
-   uint32_t sample_pattern_16x[4];
+   struct ilo_state_sample_pattern sample_pattern;
 
    bool hw_ctx_changed;
 
@@ -85,10 +82,13 @@ struct ilo_render {
        */
       uint32_t deferred_pipe_control_dw1;
 
-      bool primitive_restart;
       int reduced_prim;
       int so_max_vertices;
 
+      struct ilo_state_urb urb;
+      struct ilo_state_raster rs;
+      struct ilo_state_cc cc;
+
       uint32_t SF_VIEWPORT;
       uint32_t CLIP_VIEWPORT;
       uint32_t SF_CLIP_VIEWPORT; /* GEN7+ */
@@ -142,7 +142,12 @@ struct ilo_render_draw_session {
    int reduced_prim;
 
    bool prim_changed;
-   bool primitive_restart_changed;
+
+   struct ilo_state_urb_delta urb_delta;
+   struct ilo_state_vf_delta vf_delta;
+   struct ilo_state_raster_delta rs_delta;
+   struct ilo_state_viewport_delta vp_delta;
+   struct ilo_state_cc_delta cc_delta;
 
    /* dynamic states */
    bool viewport_changed;
@@ -180,6 +185,9 @@ struct ilo_render_launch_grid_session {
 
    uint32_t idrt;
    int idrt_size;
+
+   uint32_t compute_data[6];
+   struct ilo_state_compute compute;
 };
 
 int
@@ -381,8 +389,7 @@ ilo_render_pipe_control(struct ilo_render *r, uint32_t dw1)
  */
 static inline void
 ilo_render_3dprimitive(struct ilo_render *r,
-                       const struct pipe_draw_info *info,
-                       const struct ilo_ib_state *ib)
+                       const struct gen6_3dprimitive_info *info)
 {
    ILO_DEV_ASSERT(r->dev, 6, 8);
 
@@ -391,9 +398,9 @@ ilo_render_3dprimitive(struct ilo_render *r,
 
    /* 3DPRIMITIVE */
    if (ilo_dev_gen(r->dev) >= ILO_GEN(7))
-      gen7_3DPRIMITIVE(r->builder, info, ib);
+      gen7_3DPRIMITIVE(r->builder, info);
    else
-      gen6_3DPRIMITIVE(r->builder, info, ib);
+      gen6_3DPRIMITIVE(r->builder, info);
 
    r->state.current_pipe_control_dw1 = 0;
    assert(!r->state.deferred_pipe_control_dw1);
diff --git a/src/gallium/drivers/ilo/ilo_render_gen6.c b/src/gallium/drivers/ilo/ilo_render_gen6.c
index 47f711e..c1f759f 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen6.c
+++ b/src/gallium/drivers/ilo/ilo_render_gen6.c
@@ -29,11 +29,11 @@
 #include "core/ilo_builder_3d.h"
 #include "core/ilo_builder_mi.h"
 #include "core/ilo_builder_render.h"
-#include "util/u_dual_blend.h"
 #include "util/u_prim.h"
 
 #include "ilo_blitter.h"
 #include "ilo_query.h"
+#include "ilo_resource.h"
 #include "ilo_shader.h"
 #include "ilo_state.h"
 #include "ilo_render_gen.h"
@@ -330,64 +330,19 @@ gen6_draw_common_urb(struct ilo_render *r,
                      const struct ilo_state_vector *vec,
                      struct ilo_render_draw_session *session)
 {
-   /* 3DSTATE_URB */
-   if (DIRTY(VE) || DIRTY(VS) || DIRTY(GS)) {
-      const bool gs_active = (vec->gs || (vec->vs &&
-               ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)));
-      int vs_entry_size, gs_entry_size;
-      int vs_total_size, gs_total_size;
-
-      vs_entry_size = (vec->vs) ?
-         ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_OUTPUT_COUNT) : 0;
+   const bool gs_active = (vec->gs || (vec->vs &&
+            ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)));
 
-      /*
-       * As indicated by 2e712e41db0c0676e9f30fc73172c0e8de8d84d4, VF and VS
-       * share VUE handles.  The VUE allocation size must be large enough to
-       * store either VF outputs (number of VERTEX_ELEMENTs) and VS outputs.
-       *
-       * I am not sure if the PRM explicitly states that VF and VS share VUE
-       * handles.  But here is a citation that implies so:
-       *
-       * From the Sandy Bridge PRM, volume 2 part 1, page 44:
-       *
-       *     "Once a FF stage that spawn threads has sufficient input to
-       *      initiate a thread, it must guarantee that it is safe to request
-       *      the thread initiation. For all these FF stages, this check is
-       *      based on :
-       *
-       *      - The availability of output URB entries:
-       *        - VS: As the input URB entries are overwritten with the
-       *          VS-generated output data, output URB availability isn't a
-       *          factor."
-       */
-      if (vs_entry_size < vec->ve->count + vec->ve->prepend_nosrc_cso)
-         vs_entry_size = vec->ve->count + vec->ve->prepend_nosrc_cso;
-
-      gs_entry_size = (vec->gs) ?
-         ilo_shader_get_kernel_param(vec->gs, ILO_KERNEL_OUTPUT_COUNT) :
-         (gs_active) ? vs_entry_size : 0;
-
-      /* in bytes */
-      vs_entry_size *= sizeof(float) * 4;
-      gs_entry_size *= sizeof(float) * 4;
-      vs_total_size = r->dev->urb_size;
-
-      if (gs_active) {
-         vs_total_size /= 2;
-         gs_total_size = vs_total_size;
-      }
-      else {
-         gs_total_size = 0;
-      }
-
-      gen6_3DSTATE_URB(r->builder, vs_total_size, gs_total_size,
-            vs_entry_size, gs_entry_size);
+   /* 3DSTATE_URB */
+   if (session->urb_delta.dirty & (ILO_STATE_URB_3DSTATE_URB_VS |
+                                   ILO_STATE_URB_3DSTATE_URB_GS)) {
+      gen6_3DSTATE_URB(r->builder, &vec->urb);
 
       if (r->state.gs.active && !gs_active)
          gen6_wa_post_3dstate_urb_no_gs(r);
-
-      r->state.gs.active = gs_active;
    }
+
+   r->state.gs.active = gs_active;
 }
 
 static void
@@ -459,33 +414,30 @@ gen6_draw_vf(struct ilo_render *r,
 {
    if (ilo_dev_gen(r->dev) >= ILO_GEN(7.5)) {
       /* 3DSTATE_INDEX_BUFFER */
-      if (DIRTY(IB) || r->batch_bo_changed) {
-         gen6_3DSTATE_INDEX_BUFFER(r->builder,
-               &vec->ib, false);
-      }
+      if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_INDEX_BUFFER) ||
+          DIRTY(IB) || r->batch_bo_changed)
+         gen6_3DSTATE_INDEX_BUFFER(r->builder, &vec->ve->vf, &vec->ib.ib);
 
       /* 3DSTATE_VF */
-      if (session->primitive_restart_changed) {
-         gen75_3DSTATE_VF(r->builder, vec->draw->primitive_restart,
-               vec->draw->restart_index);
-      }
-   }
-   else {
+      if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF)
+         gen75_3DSTATE_VF(r->builder, &vec->ve->vf);
+   } else {
       /* 3DSTATE_INDEX_BUFFER */
-      if (DIRTY(IB) || session->primitive_restart_changed ||
-          r->batch_bo_changed) {
-         gen6_3DSTATE_INDEX_BUFFER(r->builder,
-               &vec->ib, vec->draw->primitive_restart);
-      }
+      if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_INDEX_BUFFER) ||
+          DIRTY(IB) || r->batch_bo_changed)
+         gen6_3DSTATE_INDEX_BUFFER(r->builder, &vec->ve->vf, &vec->ib.ib);
    }
 
    /* 3DSTATE_VERTEX_BUFFERS */
-   if (DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed)
-      gen6_3DSTATE_VERTEX_BUFFERS(r->builder, vec->ve, &vec->vb);
+   if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS) ||
+       DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed) {
+      gen6_3DSTATE_VERTEX_BUFFERS(r->builder, &vec->ve->vf,
+            vec->vb.vb, vec->ve->vb_count);
+   }
 
    /* 3DSTATE_VERTEX_ELEMENTS */
-   if (DIRTY(VE))
-      gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, vec->ve);
+   if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS)
+      gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &vec->ve->vf);
 }
 
 void
@@ -516,10 +468,17 @@ gen6_draw_vs(struct ilo_render *r,
 
    /* 3DSTATE_VS */
    if (DIRTY(VS) || r->instruction_bo_changed) {
+      const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->vs);
+      const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->vs);
+
       if (ilo_dev_gen(r->dev) == ILO_GEN(6))
          gen6_wa_pre_3dstate_vs_toggle(r);
 
-      gen6_3DSTATE_VS(r->builder, vec->vs);
+      if (ilo_dev_gen(r->dev) == ILO_GEN(6) &&
+          ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO))
+         gen6_3DSTATE_VS(r->builder, &cso->vs_sol.vs, kernel_offset);
+      else
+         gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
    }
 }
 
@@ -535,14 +494,39 @@ gen6_draw_gs(struct ilo_render *r,
    /* 3DSTATE_GS */
    if (DIRTY(GS) || DIRTY(VS) ||
        session->prim_changed || r->instruction_bo_changed) {
+      const union ilo_shader_cso *cso;
+      uint32_t kernel_offset;
+
       if (vec->gs) {
-         gen6_3DSTATE_GS(r->builder, vec->gs);
-      } else if (vec->vs &&
+         cso = ilo_shader_get_kernel_cso(vec->gs);
+         kernel_offset = ilo_shader_get_kernel_offset(vec->gs);
+
+         gen6_3DSTATE_GS(r->builder, &cso->gs, kernel_offset);
+      } else if (ilo_dev_gen(r->dev) == ILO_GEN(6) &&
             ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_GEN6_SO)) {
-         const int verts_per_prim = u_vertices_per_prim(session->reduced_prim);
-         gen6_so_3DSTATE_GS(r->builder, vec->vs, verts_per_prim);
+         const int verts_per_prim =
+            u_vertices_per_prim(session->reduced_prim);
+         enum ilo_kernel_param param;
+
+         switch (verts_per_prim) {
+         case 1:
+            param = ILO_KERNEL_VS_GEN6_SO_POINT_OFFSET;
+            break;
+         case 2:
+            param = ILO_KERNEL_VS_GEN6_SO_LINE_OFFSET;
+            break;
+         default:
+            param = ILO_KERNEL_VS_GEN6_SO_TRI_OFFSET;
+            break;
+         }
+
+         cso = ilo_shader_get_kernel_cso(vec->vs);
+         kernel_offset = ilo_shader_get_kernel_offset(vec->vs) +
+            ilo_shader_get_kernel_param(vec->vs, param);
+
+         gen6_3DSTATE_GS(r->builder, &cso->vs_sol.sol, kernel_offset);
       } else {
-         gen6_disable_3DSTATE_GS(r->builder);
+         gen6_3DSTATE_GS(r->builder, &vec->disabled_gs, 0);
       }
    }
 }
@@ -633,30 +617,8 @@ gen6_draw_clip(struct ilo_render *r,
                struct ilo_render_draw_session *session)
 {
    /* 3DSTATE_CLIP */
-   if (DIRTY(RASTERIZER) || DIRTY(FS) || DIRTY(VIEWPORT) || DIRTY(FB)) {
-      bool enable_guardband = true;
-      unsigned i;
-
-      /*
-       * Gen8+ has viewport extent test.  Guard band test can be enabled on
-       * prior Gens only when the viewport is larger than the framebuffer,
-       * unless we emulate viewport extent test on them.
-       */
-      if (ilo_dev_gen(r->dev) < ILO_GEN(8)) {
-         for (i = 0; i < vec->viewport.count; i++) {
-            const struct ilo_viewport_cso *vp = &vec->viewport.cso[i];
-
-            if (vp->min_x > 0.0f || vp->max_x < vec->fb.state.width ||
-                vp->min_y > 0.0f || vp->max_y < vec->fb.state.height) {
-               enable_guardband = false;
-               break;
-            }
-         }
-      }
-
-      gen6_3DSTATE_CLIP(r->builder, vec->rasterizer,
-            vec->fs, enable_guardband, 1);
-   }
+   if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_CLIP)
+      gen6_3DSTATE_CLIP(r->builder, &vec->rasterizer->rs);
 }
 
 static void
@@ -665,9 +627,9 @@ gen6_draw_sf(struct ilo_render *r,
              struct ilo_render_draw_session *session)
 {
    /* 3DSTATE_SF */
-   if (DIRTY(RASTERIZER) || DIRTY(FS) || DIRTY(FB)) {
-      gen6_3DSTATE_SF(r->builder, vec->rasterizer, vec->fs,
-            vec->fb.num_samples);
+   if ((session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SF) || DIRTY(FS)) {
+      const struct ilo_state_sbe *sbe = ilo_shader_get_kernel_sbe(vec->fs);
+      gen6_3DSTATE_SF(r->builder, &vec->rasterizer->rs, sbe);
    }
 }
 
@@ -700,17 +662,17 @@ gen6_draw_wm(struct ilo_render *r,
    }
 
    /* 3DSTATE_WM */
-   if (DIRTY(FS) || DIRTY(BLEND) || DIRTY(DSA) ||
-       DIRTY(RASTERIZER) || r->instruction_bo_changed) {
-      const bool dual_blend = vec->blend->dual_blend;
-      const bool cc_may_kill = (vec->dsa->dw_blend_alpha ||
-                                vec->blend->alpha_to_coverage);
+   if (DIRTY(FS) ||
+       (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_WM) ||
+       r->instruction_bo_changed) {
+      const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->fs);
+      const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->fs);
 
       if (ilo_dev_gen(r->dev) == ILO_GEN(6) && r->hw_ctx_changed)
          gen6_wa_pre_3dstate_wm_max_threads(r);
 
-      gen6_3DSTATE_WM(r->builder, vec->fs,
-            vec->rasterizer, dual_blend, cc_may_kill);
+      gen6_3DSTATE_WM(r->builder, &vec->rasterizer->rs,
+            &cso->ps, kernel_offset);
    }
 }
 
@@ -719,25 +681,23 @@ gen6_draw_wm_multisample(struct ilo_render *r,
                          const struct ilo_state_vector *vec,
                          struct ilo_render_draw_session *session)
 {
-   /* 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */
-   if (DIRTY(SAMPLE_MASK) || DIRTY(FB)) {
-      const uint32_t *pattern;
-
-      pattern = (vec->fb.num_samples > 1) ?
-         &r->sample_pattern_4x : &r->sample_pattern_1x;
+   /* 3DSTATE_MULTISAMPLE */
+   if (DIRTY(FB) || (session->rs_delta.dirty &
+            ILO_STATE_RASTER_3DSTATE_MULTISAMPLE)) {
+      const uint8_t sample_count = (vec->fb.num_samples > 1) ? 4 : 1;
 
       if (ilo_dev_gen(r->dev) == ILO_GEN(6)) {
          gen6_wa_pre_non_pipelined(r);
          gen6_wa_pre_3dstate_multisample(r);
       }
 
-      gen6_3DSTATE_MULTISAMPLE(r->builder,
-            vec->fb.num_samples, pattern,
-            vec->rasterizer->state.half_pixel_center);
-
-      gen6_3DSTATE_SAMPLE_MASK(r->builder,
-            (vec->fb.num_samples > 1) ? vec->sample_mask : 0x1);
+      gen6_3DSTATE_MULTISAMPLE(r->builder, &vec->rasterizer->rs,
+            &r->sample_pattern, sample_count);
    }
+
+   /* 3DSTATE_SAMPLE_MASK */
+   if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK)
+      gen6_3DSTATE_SAMPLE_MASK(r->builder, &vec->rasterizer->rs);
 }
 
 static void
@@ -747,7 +707,7 @@ gen6_draw_wm_depth(struct ilo_render *r,
 {
    /* 3DSTATE_DEPTH_BUFFER and 3DSTATE_CLEAR_PARAMS */
    if (DIRTY(FB) || r->batch_bo_changed) {
-      const struct ilo_zs_surface *zs;
+      const struct ilo_state_zs *zs;
       uint32_t clear_params;
 
       if (vec->fb.state.zsbuf) {
@@ -772,7 +732,7 @@ gen6_draw_wm_depth(struct ilo_render *r,
          gen6_wa_pre_depth(r);
       }
 
-      gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs, false);
+      gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs);
       gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, zs);
       gen6_3DSTATE_STENCIL_BUFFER(r->builder, zs);
       gen6_3DSTATE_CLEAR_PARAMS(r->builder, clear_params);
@@ -790,10 +750,8 @@ gen6_draw_wm_raster(struct ilo_render *r,
       if (ilo_dev_gen(r->dev) == ILO_GEN(6))
          gen6_wa_pre_non_pipelined(r);
 
-      gen6_3DSTATE_POLY_STIPPLE_PATTERN(r->builder,
-            &vec->poly_stipple);
-
-      gen6_3DSTATE_POLY_STIPPLE_OFFSET(r->builder, 0, 0);
+      gen6_3DSTATE_POLY_STIPPLE_PATTERN(r->builder, &vec->poly_stipple);
+      gen6_3DSTATE_POLY_STIPPLE_OFFSET(r->builder, &vec->poly_stipple);
    }
 
    /* 3DSTATE_LINE_STIPPLE */
@@ -801,17 +759,16 @@ gen6_draw_wm_raster(struct ilo_render *r,
       if (ilo_dev_gen(r->dev) == ILO_GEN(6))
          gen6_wa_pre_non_pipelined(r);
 
-      gen6_3DSTATE_LINE_STIPPLE(r->builder,
-            vec->rasterizer->state.line_stipple_pattern,
-            vec->rasterizer->state.line_stipple_factor + 1);
+      gen6_3DSTATE_LINE_STIPPLE(r->builder, &vec->line_stipple);
    }
 
    /* 3DSTATE_AA_LINE_PARAMETERS */
-   if (DIRTY(RASTERIZER) && vec->rasterizer->state.line_smooth) {
+   if (session->rs_delta.dirty &
+         ILO_STATE_RASTER_3DSTATE_AA_LINE_PARAMETERS) {
       if (ilo_dev_gen(r->dev) == ILO_GEN(6))
          gen6_wa_pre_non_pipelined(r);
 
-      gen6_3DSTATE_AA_LINE_PARAMETERS(r->builder);
+      gen6_3DSTATE_AA_LINE_PARAMETERS(r->builder, &vec->rasterizer->rs);
    }
 }
 
@@ -849,7 +806,7 @@ ilo_render_emit_draw_commands_gen6(struct ilo_render *render,
    gen6_draw_sf_rect(render, vec, session);
    gen6_draw_vf(render, vec, session);
 
-   ilo_render_3dprimitive(render, vec->draw, &vec->ib);
+   ilo_render_3dprimitive(render, &vec->draw_info);
 }
 
 static void
@@ -860,40 +817,23 @@ gen6_rectlist_vs_to_sf(struct ilo_render *r,
    gen6_wa_post_3dstate_constant_vs(r);
 
    gen6_wa_pre_3dstate_vs_toggle(r);
-   gen6_disable_3DSTATE_VS(r->builder);
+   gen6_3DSTATE_VS(r->builder, &blitter->vs, 0);
 
    gen6_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0);
-   gen6_disable_3DSTATE_GS(r->builder);
+   gen6_3DSTATE_GS(r->builder, &blitter->gs, 0);
 
-   gen6_disable_3DSTATE_CLIP(r->builder);
-   gen6_3DSTATE_SF(r->builder, NULL, NULL, blitter->fb.num_samples);
+   gen6_3DSTATE_CLIP(r->builder, &blitter->fb.rs);
+   gen6_3DSTATE_SF(r->builder, &blitter->fb.rs, &blitter->sbe);
 }
 
 static void
 gen6_rectlist_wm(struct ilo_render *r,
                  const struct ilo_blitter *blitter)
 {
-   uint32_t hiz_op;
-
-   switch (blitter->op) {
-   case ILO_BLITTER_RECTLIST_CLEAR_ZS:
-      hiz_op = GEN6_WM_DW4_DEPTH_CLEAR;
-      break;
-   case ILO_BLITTER_RECTLIST_RESOLVE_Z:
-      hiz_op = GEN6_WM_DW4_DEPTH_RESOLVE;
-      break;
-   case ILO_BLITTER_RECTLIST_RESOLVE_HIZ:
-      hiz_op = GEN6_WM_DW4_HIZ_RESOLVE;
-      break;
-   default:
-      hiz_op = 0;
-      break;
-   }
-
    gen6_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0);
 
    gen6_wa_pre_3dstate_wm_max_threads(r);
-   gen6_hiz_3DSTATE_WM(r->builder, hiz_op);
+   gen6_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps, 0);
 }
 
 static void
@@ -903,10 +843,8 @@ gen6_rectlist_wm_depth(struct ilo_render *r,
    gen6_wa_pre_depth(r);
 
    if (blitter->uses & (ILO_BLITTER_USE_FB_DEPTH |
-                        ILO_BLITTER_USE_FB_STENCIL)) {
-      gen6_3DSTATE_DEPTH_BUFFER(r->builder,
-            &blitter->fb.dst.u.zs, true);
-   }
+                        ILO_BLITTER_USE_FB_STENCIL))
+      gen6_3DSTATE_DEPTH_BUFFER(r->builder, &blitter->fb.dst.u.zs);
 
    if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) {
       gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder,
@@ -926,16 +864,12 @@ static void
 gen6_rectlist_wm_multisample(struct ilo_render *r,
                              const struct ilo_blitter *blitter)
 {
-   const uint32_t *pattern = (blitter->fb.num_samples > 1) ?
-      &r->sample_pattern_4x : &r->sample_pattern_1x;
+   const uint8_t sample_count = (blitter->fb.num_samples > 1) ? 4 : 1;
 
    gen6_wa_pre_3dstate_multisample(r);
 
-   gen6_3DSTATE_MULTISAMPLE(r->builder, blitter->fb.num_samples,
-         pattern, true);
-
-   gen6_3DSTATE_SAMPLE_MASK(r->builder,
-         (1 << blitter->fb.num_samples) - 1);
+   gen6_3DSTATE_MULTISAMPLE(r->builder, &blitter->fb.rs, &r->sample_pattern, sample_count);
+   gen6_3DSTATE_SAMPLE_MASK(r->builder, &blitter->fb.rs);
 }
 
 int
@@ -964,11 +898,9 @@ ilo_render_emit_rectlist_commands_gen6(struct ilo_render *r,
          session->vb_start, session->vb_end,
          sizeof(blitter->vertices[0]));
 
-   gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->ve);
+   gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->vf);
 
-   gen6_3DSTATE_URB(r->builder, r->dev->urb_size, 0,
-         (blitter->ve.count + blitter->ve.prepend_nosrc_cso) * 4 * sizeof(float),
-         0);
+   gen6_3DSTATE_URB(r->builder, &blitter->urb);
 
    if (r->state.gs.active) {
       gen6_wa_post_3dstate_urb_no_gs(r);
@@ -994,7 +926,7 @@ ilo_render_emit_rectlist_commands_gen6(struct ilo_render *r,
    gen6_3DSTATE_DRAWING_RECTANGLE(r->builder, 0, 0,
          blitter->fb.width, blitter->fb.height);
 
-   ilo_render_3dprimitive(r, &blitter->draw, NULL);
+   ilo_render_3dprimitive(r, &blitter->draw_info);
 }
 
 int
diff --git a/src/gallium/drivers/ilo/ilo_render_gen7.c b/src/gallium/drivers/ilo/ilo_render_gen7.c
index 07fe7c8..6623a8b 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen7.c
+++ b/src/gallium/drivers/ilo/ilo_render_gen7.c
@@ -28,9 +28,9 @@
 #include "genhw/genhw.h"
 #include "core/ilo_builder_3d.h"
 #include "core/ilo_builder_render.h"
-#include "util/u_dual_blend.h"
 
 #include "ilo_blitter.h"
+#include "ilo_resource.h"
 #include "ilo_shader.h"
 #include "ilo_state.h"
 #include "ilo_render_gen.h"
@@ -201,40 +201,17 @@ gen7_draw_common_urb(struct ilo_render *r,
                      struct ilo_render_draw_session *session)
 {
    /* 3DSTATE_URB_{VS,GS,HS,DS} */
-   if (DIRTY(VE) || DIRTY(VS)) {
-      /* the first 16KB are reserved for VS and PS PCBs */
-      const int offset =
-         (ilo_dev_gen(r->dev) >= ILO_GEN(8)) ||
-          (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ?
-          32768 : 16384;
-      int vs_entry_size, vs_total_size;
-
-      vs_entry_size = (vec->vs) ?
-         ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_OUTPUT_COUNT) : 0;
-
-      /*
-       * From the Ivy Bridge PRM, volume 2 part 1, page 35:
-       *
-       *     "Programming Restriction: As the VS URB entry serves as both the
-       *      per-vertex input and output of the VS shader, the VS URB
-       *      Allocation Size must be sized to the maximum of the vertex input
-       *      and output structures."
-       */
-      if (vs_entry_size < vec->ve->count + vec->ve->prepend_nosrc_cso)
-         vs_entry_size = vec->ve->count + vec->ve->prepend_nosrc_cso;
-
-      vs_entry_size *= sizeof(float) * 4;
-      vs_total_size = r->dev->urb_size - offset;
-
+   if (session->urb_delta.dirty & (ILO_STATE_URB_3DSTATE_URB_VS |
+                                   ILO_STATE_URB_3DSTATE_URB_HS |
+                                   ILO_STATE_URB_3DSTATE_URB_DS |
+                                   ILO_STATE_URB_3DSTATE_URB_GS)) {
       if (ilo_dev_gen(r->dev) == ILO_GEN(7))
          gen7_wa_pre_vs(r);
 
-      gen7_3DSTATE_URB_VS(r->builder,
-            offset, vs_total_size, vs_entry_size);
-
-      gen7_3DSTATE_URB_GS(r->builder, offset, 0, 0);
-      gen7_3DSTATE_URB_HS(r->builder, offset, 0, 0);
-      gen7_3DSTATE_URB_DS(r->builder, offset, 0, 0);
+      gen7_3DSTATE_URB_VS(r->builder, &vec->urb);
+      gen7_3DSTATE_URB_GS(r->builder, &vec->urb);
+      gen7_3DSTATE_URB_HS(r->builder, &vec->urb);
+      gen7_3DSTATE_URB_DS(r->builder, &vec->urb);
    }
 }
 
@@ -244,22 +221,15 @@ gen7_draw_common_pcb_alloc(struct ilo_render *r,
                            struct ilo_render_draw_session *session)
 {
    /* 3DSTATE_PUSH_CONSTANT_ALLOC_{VS,PS} */
-   if (r->hw_ctx_changed) {
-      /*
-       * Push constant buffers are only allowed to take up at most the first
-       * 16KB of the URB.  Split the space evenly for VS and FS.
-       */
-      const int max_size =
-         (ilo_dev_gen(r->dev) >= ILO_GEN(8)) ||
-          (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ?
-          32768 : 16384;
-      const int size = max_size / 2;
-      int offset = 0;
-
-      gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, offset, size);
-      offset += size;
-
-      gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, offset, size);
+   if (session->urb_delta.dirty &
+         (ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_VS |
+          ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_HS |
+          ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_DS |
+          ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_GS |
+          ILO_STATE_URB_3DSTATE_PUSH_CONSTANT_ALLOC_PS)) {
+      gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, &vec->urb);
+      gen7_3DSTATE_PUSH_CONSTANT_ALLOC_GS(r->builder, &vec->urb);
+      gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, &vec->urb);
 
       if (ilo_dev_gen(r->dev) == ILO_GEN(7))
          gen7_wa_post_3dstate_push_constant_alloc_ps(r);
@@ -344,14 +314,14 @@ gen7_draw_vs(struct ilo_render *r,
    }
 
    /* 3DSTATE_VS */
-   if (ilo_dev_gen(r->dev) >= ILO_GEN(8)) {
-      if (emit_3dstate_vs || DIRTY(RASTERIZER)) {
-         gen8_3DSTATE_VS(r->builder, vec->vs,
-               vec->rasterizer->state.clip_plane_enable);
-      }
-   } else {
-      if (emit_3dstate_vs)
-         gen6_3DSTATE_VS(r->builder, vec->vs);
+   if (emit_3dstate_vs) {
+      const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->vs);
+      const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->vs);
+
+      if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+         gen8_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
+      else
+         gen6_3DSTATE_VS(r->builder, &cso->vs, kernel_offset);
    }
 }
 
@@ -362,8 +332,15 @@ gen7_draw_hs(struct ilo_render *r,
 {
    /* 3DSTATE_CONSTANT_HS and 3DSTATE_HS */
    if (r->hw_ctx_changed) {
+      const struct ilo_state_hs *hs = &vec->disabled_hs;
+      const uint32_t kernel_offset = 0;
+
       gen7_3DSTATE_CONSTANT_HS(r->builder, 0, 0, 0);
-      gen7_disable_3DSTATE_HS(r->builder);
+
+      if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+         gen8_3DSTATE_HS(r->builder, hs, kernel_offset);
+      else
+         gen7_3DSTATE_HS(r->builder, hs, kernel_offset);
    }
 
    /* 3DSTATE_BINDING_TABLE_POINTERS_HS */
@@ -377,8 +354,10 @@ gen7_draw_te(struct ilo_render *r,
              struct ilo_render_draw_session *session)
 {
    /* 3DSTATE_TE */
-   if (r->hw_ctx_changed)
-      gen7_3DSTATE_TE(r->builder);
+   if (r->hw_ctx_changed) {
+      const struct ilo_state_ds *ds = &vec->disabled_ds;
+      gen7_3DSTATE_TE(r->builder, ds);
+   }
 }
 
 void
@@ -388,8 +367,15 @@ gen7_draw_ds(struct ilo_render *r,
 {
    /* 3DSTATE_CONSTANT_DS and 3DSTATE_DS */
    if (r->hw_ctx_changed) {
+      const struct ilo_state_ds *ds = &vec->disabled_ds;
+      const uint32_t kernel_offset = 0;
+
       gen7_3DSTATE_CONSTANT_DS(r->builder, 0, 0, 0);
-      gen7_disable_3DSTATE_DS(r->builder);
+
+      if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+         gen8_3DSTATE_DS(r->builder, ds, kernel_offset);
+      else
+         gen7_3DSTATE_DS(r->builder, ds, kernel_offset);
    }
 
    /* 3DSTATE_BINDING_TABLE_POINTERS_DS */
@@ -405,8 +391,15 @@ gen7_draw_gs(struct ilo_render *r,
 {
    /* 3DSTATE_CONSTANT_GS and 3DSTATE_GS */
    if (r->hw_ctx_changed) {
+      const struct ilo_state_gs *gs = &vec->disabled_gs;
+      const uint32_t kernel_offset = 0;
+
       gen7_3DSTATE_CONSTANT_GS(r->builder, 0, 0, 0);
-      gen7_disable_3DSTATE_GS(r->builder);
+
+      if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+         gen8_3DSTATE_GS(r->builder, gs, kernel_offset);
+      else
+         gen7_3DSTATE_GS(r->builder, gs, kernel_offset);
    }
 
    /* 3DSTATE_BINDING_TABLE_POINTERS_GS */
@@ -421,7 +414,7 @@ gen7_draw_sol(struct ilo_render *r,
               const struct ilo_state_vector *vec,
               struct ilo_render_draw_session *session)
 {
-   const struct pipe_stream_output_info *so_info;
+   const struct ilo_state_sol *sol;
    const struct ilo_shader_state *shader;
    bool dirty_sh = false;
 
@@ -434,41 +427,54 @@ gen7_draw_sol(struct ilo_render *r,
       dirty_sh = DIRTY(VS);
    }
 
-   so_info = ilo_shader_get_kernel_so_info(shader);
+   sol = ilo_shader_get_kernel_sol(shader);
 
    /* 3DSTATE_SO_BUFFER */
    if ((DIRTY(SO) || dirty_sh || r->batch_bo_changed) &&
        vec->so.enabled) {
       int i;
 
-      for (i = 0; i < vec->so.count; i++) {
-         const int stride = so_info->stride[i] * 4; /* in bytes */
-
-         gen7_3DSTATE_SO_BUFFER(r->builder, i, stride, vec->so.states[i]);
+      for (i = 0; i < ILO_STATE_SOL_MAX_BUFFER_COUNT; i++) {
+         const struct pipe_stream_output_target *target =
+            (i < vec->so.count && vec->so.states[i]) ?
+            vec->so.states[i] : NULL;
+         const struct ilo_state_sol_buffer *sb = (target) ?
+            &((const struct ilo_stream_output_target *) target)->sb :
+            &vec->so.dummy_sb;
+
+         if (ilo_dev_gen(r->dev) >= ILO_GEN(8))
+            gen8_3DSTATE_SO_BUFFER(r->builder, sol, sb, i);
+         else
+            gen7_3DSTATE_SO_BUFFER(r->builder, sol, sb, i);
       }
-
-      for (; i < 4; i++)
-         gen7_disable_3DSTATE_SO_BUFFER(r->builder, i);
    }
 
    /* 3DSTATE_SO_DECL_LIST */
    if (dirty_sh && vec->so.enabled)
-      gen7_3DSTATE_SO_DECL_LIST(r->builder, so_info);
-
-   /* 3DSTATE_STREAMOUT */
-   if (DIRTY(SO) || DIRTY(RASTERIZER) || dirty_sh) {
-      const int output_count = ilo_shader_get_kernel_param(shader,
-            ILO_KERNEL_OUTPUT_COUNT);
-      int buf_strides[4] = { 0, 0, 0, 0 };
-      int i;
+      gen7_3DSTATE_SO_DECL_LIST(r->builder, sol);
 
-      for (i = 0; i < vec->so.count; i++)
-         buf_strides[i] = so_info->stride[i] * 4;
+   /*
+    * From the Ivy Bridge PRM, volume 2 part 1, page 196-197:
+    *
+    *     "Anytime the SOL unit MMIO registers or non-pipeline state are
+    *      written, the SOL unit needs to receive a pipeline state update with
+    *      SOL unit dirty state for information programmed in MMIO/NP to get
+    *      loaded into the SOL unit.
+    *
+    *      The SOL unit incorrectly double buffers MMIO/NP registers and only
+    *      moves them into the design for usage when control topology is
+    *      received with the SOL unit dirty state.
+    *
+    *      If the state does not change, need to resend the same state.
+    *
+    *      Because of corruption, software must flush the whole fixed function
+    *      pipeline when 3DSTATE_STREAMOUT changes state."
+    *
+    * The first and fourth paragraphs are gone on Gen7.5+.
+    */
 
-      gen7_3DSTATE_STREAMOUT(r->builder, 0,
-            vec->rasterizer->state.rasterizer_discard,
-            output_count, buf_strides);
-   }
+   /* 3DSTATE_STREAMOUT */
+   gen7_3DSTATE_STREAMOUT(r->builder, sol);
 }
 
 static void
@@ -477,22 +483,17 @@ gen7_draw_sf(struct ilo_render *r,
              struct ilo_render_draw_session *session)
 {
    /* 3DSTATE_SBE */
-   if (DIRTY(RASTERIZER) || DIRTY(FS)) {
-      gen7_3DSTATE_SBE(r->builder, vec->fs, (vec->rasterizer) ?
-            vec->rasterizer->state.sprite_coord_mode : 0);
+   if (DIRTY(FS)) {
+      const struct ilo_state_sbe *sbe = ilo_shader_get_kernel_sbe(vec->fs);
+      gen7_3DSTATE_SBE(r->builder, sbe);
    }
 
    /* 3DSTATE_SF */
-   if (DIRTY(RASTERIZER) || DIRTY(FB)) {
-      struct pipe_surface *zs = vec->fb.state.zsbuf;
-
+   if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SF) {
       if (ilo_dev_gen(r->dev) == ILO_GEN(7))
          gen7_wa_pre_3dstate_sf_depth_bias(r);
 
-      gen7_3DSTATE_SF(r->builder,
-            (vec->rasterizer) ? &vec->rasterizer->sf : NULL,
-            (zs) ? zs->format : PIPE_FORMAT_NONE,
-            vec->fb.num_samples);
+      gen7_3DSTATE_SF(r->builder, &vec->rasterizer->rs);
    }
 }
 
@@ -501,13 +502,12 @@ gen7_draw_wm(struct ilo_render *r,
              const struct ilo_state_vector *vec,
              struct ilo_render_draw_session *session)
 {
-   /* 3DSTATE_WM */
-   if (DIRTY(FS) || DIRTY(BLEND) || DIRTY(DSA) || DIRTY(RASTERIZER)) {
-      const bool cc_may_kill = (vec->dsa->dw_blend_alpha ||
-                                vec->blend->alpha_to_coverage);
+   const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->fs);
+   const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->fs);
 
-      gen7_3DSTATE_WM(r->builder, vec->fs, vec->rasterizer, cc_may_kill);
-   }
+   /* 3DSTATE_WM */
+   if (DIRTY(FS) || (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_WM))
+      gen7_3DSTATE_WM(r->builder, &vec->rasterizer->rs, &cso->ps);
 
    /* 3DSTATE_BINDING_TABLE_POINTERS_PS */
    if (session->binding_table_fs_changed) {
@@ -530,13 +530,11 @@ gen7_draw_wm(struct ilo_render *r,
    }
 
    /* 3DSTATE_PS */
-   if (DIRTY(FS) || DIRTY(BLEND) || r->instruction_bo_changed) {
-      const bool dual_blend = vec->blend->dual_blend;
-
+   if (DIRTY(FS) || r->instruction_bo_changed) {
       if (r->hw_ctx_changed)
          gen7_wa_pre_3dstate_ps_max_threads(r);
 
-      gen7_3DSTATE_PS(r->builder, vec->fs, dual_blend);
+      gen7_3DSTATE_PS(r->builder, &cso->ps, kernel_offset);
    }
 
    /* 3DSTATE_SCISSOR_STATE_POINTERS */
@@ -569,7 +567,7 @@ gen7_draw_wm(struct ilo_render *r,
 
    /* 3DSTATE_DEPTH_BUFFER and 3DSTATE_CLEAR_PARAMS */
    if (DIRTY(FB) || r->batch_bo_changed) {
-      const struct ilo_zs_surface *zs;
+      const struct ilo_state_zs *zs;
       uint32_t clear_params;
 
       if (vec->fb.state.zsbuf) {
@@ -588,7 +586,7 @@ gen7_draw_wm(struct ilo_render *r,
          clear_params = 0;
       }
 
-      gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs, false);
+      gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs);
       gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, zs);
       gen6_3DSTATE_STENCIL_BUFFER(r->builder, zs);
       gen7_3DSTATE_CLEAR_PARAMS(r->builder, clear_params);
@@ -600,24 +598,21 @@ gen7_draw_wm_multisample(struct ilo_render *r,
                          const struct ilo_state_vector *vec,
                          struct ilo_render_draw_session *session)
 {
-   /* 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */
-   if (DIRTY(SAMPLE_MASK) || DIRTY(FB)) {
-      const uint32_t *pattern;
+   /* 3DSTATE_MULTISAMPLE */
+   if (DIRTY(FB) || (session->rs_delta.dirty &
+            ILO_STATE_RASTER_3DSTATE_MULTISAMPLE)) {
+      const uint8_t sample_count = (vec->fb.num_samples > 4) ? 8 :
+                                   (vec->fb.num_samples > 1) ? 4 : 1;
 
       gen7_wa_pre_3dstate_multisample(r);
 
-      pattern = (vec->fb.num_samples > 4) ? r->sample_pattern_8x :
-                (vec->fb.num_samples > 1) ? &r->sample_pattern_4x :
-                &r->sample_pattern_1x;
-
-      gen6_3DSTATE_MULTISAMPLE(r->builder,
-            vec->fb.num_samples, pattern,
-            vec->rasterizer->state.half_pixel_center);
-
-      gen7_3DSTATE_SAMPLE_MASK(r->builder,
-            (vec->fb.num_samples > 1) ? vec->sample_mask : 0x1,
-            vec->fb.num_samples);
+      gen6_3DSTATE_MULTISAMPLE(r->builder, &vec->rasterizer->rs,
+            &r->sample_pattern, sample_count);
    }
+
+   /* 3DSTATE_SAMPLE_MASK */
+   if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK)
+      gen6_3DSTATE_SAMPLE_MASK(r->builder, &vec->rasterizer->rs);
 }
 
 void
@@ -654,28 +649,15 @@ ilo_render_emit_draw_commands_gen7(struct ilo_render *render,
    gen6_draw_sf_rect(render, vec, session);
    gen6_draw_vf(render, vec, session);
 
-   ilo_render_3dprimitive(render, vec->draw, &vec->ib);
+   ilo_render_3dprimitive(render, &vec->draw_info);
 }
 
 static void
 gen7_rectlist_pcb_alloc(struct ilo_render *r,
                         const struct ilo_blitter *blitter)
 {
-   /*
-    * Push constant buffers are only allowed to take up at most the first
-    * 16KB of the URB.  Split the space evenly for VS and FS.
-    */
-   const int max_size =
-      (ilo_dev_gen(r->dev) >= ILO_GEN(8)) ||
-       (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ?
-       32768 : 16384;
-   const int size = max_size / 2;
-   int offset = 0;
-
-   gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, offset, size);
-   offset += size;
-
-   gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, offset, size);
+   gen7_3DSTATE_PUSH_CONSTANT_ALLOC_VS(r->builder, &blitter->urb);
+   gen7_3DSTATE_PUSH_CONSTANT_ALLOC_PS(r->builder, &blitter->urb);
 
    if (ilo_dev_gen(r->dev) == ILO_GEN(7))
       gen7_wa_post_3dstate_push_constant_alloc_ps(r);
@@ -685,19 +667,10 @@ static void
 gen7_rectlist_urb(struct ilo_render *r,
                   const struct ilo_blitter *blitter)
 {
-   /* the first 16KB are reserved for VS and PS PCBs */
-   const int offset =
-      (ilo_dev_gen(r->dev) >= ILO_GEN(8)) ||
-       (ilo_dev_gen(r->dev) == ILO_GEN(7.5) && r->dev->gt == 3) ?
-       32768 : 16384;
-
-   gen7_3DSTATE_URB_VS(r->builder, offset, r->dev->urb_size - offset,
-         (blitter->ve.count + blitter->ve.prepend_nosrc_cso) *
-         4 * sizeof(float));
-
-   gen7_3DSTATE_URB_GS(r->builder, offset, 0, 0);
-   gen7_3DSTATE_URB_HS(r->builder, offset, 0, 0);
-   gen7_3DSTATE_URB_DS(r->builder, offset, 0, 0);
+   gen7_3DSTATE_URB_VS(r->builder, &blitter->urb);
+   gen7_3DSTATE_URB_GS(r->builder, &blitter->urb);
+   gen7_3DSTATE_URB_HS(r->builder, &blitter->urb);
+   gen7_3DSTATE_URB_DS(r->builder, &blitter->urb);
 }
 
 static void
@@ -705,58 +678,40 @@ gen7_rectlist_vs_to_sf(struct ilo_render *r,
                        const struct ilo_blitter *blitter)
 {
    gen7_3DSTATE_CONSTANT_VS(r->builder, NULL, NULL, 0);
-   gen6_disable_3DSTATE_VS(r->builder);
+   gen6_3DSTATE_VS(r->builder, &blitter->vs, 0);
 
    gen7_3DSTATE_CONSTANT_HS(r->builder, NULL, NULL, 0);
-   gen7_disable_3DSTATE_HS(r->builder);
+   gen7_3DSTATE_HS(r->builder, &blitter->hs, 0);
 
-   gen7_3DSTATE_TE(r->builder);
+   gen7_3DSTATE_TE(r->builder, &blitter->ds);
 
    gen7_3DSTATE_CONSTANT_DS(r->builder, NULL, NULL, 0);
-   gen7_disable_3DSTATE_DS(r->builder);
+   gen7_3DSTATE_DS(r->builder, &blitter->ds, 0);
 
    gen7_3DSTATE_CONSTANT_GS(r->builder, NULL, NULL, 0);
-   gen7_disable_3DSTATE_GS(r->builder);
+   gen7_3DSTATE_GS(r->builder, &blitter->gs, 0);
 
-   gen7_3DSTATE_STREAMOUT(r->builder, 0, false, 0x0, 0);
+   gen7_3DSTATE_STREAMOUT(r->builder, &blitter->sol);
 
-   gen6_disable_3DSTATE_CLIP(r->builder);
+   gen6_3DSTATE_CLIP(r->builder, &blitter->fb.rs);
 
    if (ilo_dev_gen(r->dev) == ILO_GEN(7))
       gen7_wa_pre_3dstate_sf_depth_bias(r);
 
-   gen7_3DSTATE_SF(r->builder, NULL, blitter->fb.dst.base.format,
-         blitter->fb.num_samples);
-   gen7_3DSTATE_SBE(r->builder, NULL, 0);
+   gen7_3DSTATE_SF(r->builder, &blitter->fb.rs);
+   gen7_3DSTATE_SBE(r->builder, &blitter->sbe);
 }
 
 static void
 gen7_rectlist_wm(struct ilo_render *r,
                  const struct ilo_blitter *blitter)
 {
-   uint32_t hiz_op;
-
-   switch (blitter->op) {
-   case ILO_BLITTER_RECTLIST_CLEAR_ZS:
-      hiz_op = GEN7_WM_DW1_DEPTH_CLEAR;
-      break;
-   case ILO_BLITTER_RECTLIST_RESOLVE_Z:
-      hiz_op = GEN7_WM_DW1_DEPTH_RESOLVE;
-      break;
-   case ILO_BLITTER_RECTLIST_RESOLVE_HIZ:
-      hiz_op = GEN7_WM_DW1_HIZ_RESOLVE;
-      break;
-   default:
-      hiz_op = 0;
-      break;
-   }
-
-   gen7_hiz_3DSTATE_WM(r->builder, hiz_op);
+   gen7_3DSTATE_WM(r->builder, &blitter->fb.rs, &blitter->ps);
 
    gen7_3DSTATE_CONSTANT_PS(r->builder, NULL, NULL, 0);
 
    gen7_wa_pre_3dstate_ps_max_threads(r);
-   gen7_disable_3DSTATE_PS(r->builder);
+   gen7_3DSTATE_PS(r->builder, &blitter->ps, 0);
 }
 
 static void
@@ -766,10 +721,8 @@ gen7_rectlist_wm_depth(struct ilo_render *r,
    gen7_wa_pre_depth(r);
 
    if (blitter->uses & (ILO_BLITTER_USE_FB_DEPTH |
-                        ILO_BLITTER_USE_FB_STENCIL)) {
-      gen6_3DSTATE_DEPTH_BUFFER(r->builder,
-            &blitter->fb.dst.u.zs, true);
-   }
+                        ILO_BLITTER_USE_FB_STENCIL))
+      gen6_3DSTATE_DEPTH_BUFFER(r->builder, &blitter->fb.dst.u.zs);
 
    if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) {
       gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder,
@@ -789,18 +742,15 @@ static void
 gen7_rectlist_wm_multisample(struct ilo_render *r,
                              const struct ilo_blitter *blitter)
 {
-   const uint32_t *pattern =
-      (blitter->fb.num_samples > 4) ? r->sample_pattern_8x :
-      (blitter->fb.num_samples > 1) ? &r->sample_pattern_4x :
-      &r->sample_pattern_1x;
+   const uint8_t sample_count = (blitter->fb.num_samples > 4) ? 8 :
+                                (blitter->fb.num_samples > 1) ? 4 : 1;
 
    gen7_wa_pre_3dstate_multisample(r);
 
-   gen6_3DSTATE_MULTISAMPLE(r->builder, blitter->fb.num_samples,
-         pattern, true);
+   gen6_3DSTATE_MULTISAMPLE(r->builder, &blitter->fb.rs,
+         &r->sample_pattern, sample_count);
 
-   gen7_3DSTATE_SAMPLE_MASK(r->builder,
-         (1 << blitter->fb.num_samples) - 1, blitter->fb.num_samples);
+   gen6_3DSTATE_SAMPLE_MASK(r->builder, &blitter->fb.rs);
 }
 
 void
@@ -818,7 +768,7 @@ ilo_render_emit_rectlist_commands_gen7(struct ilo_render *r,
          session->vb_start, session->vb_end,
          sizeof(blitter->vertices[0]));
 
-   gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->ve);
+   gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &blitter->vf);
 
    gen7_rectlist_pcb_alloc(r, blitter);
 
@@ -854,7 +804,7 @@ ilo_render_emit_rectlist_commands_gen7(struct ilo_render *r,
    if (ilo_dev_gen(r->dev) == ILO_GEN(7))
       gen7_wa_post_ps_and_later(r);
 
-   ilo_render_3dprimitive(r, &blitter->draw, NULL);
+   ilo_render_3dprimitive(r, &blitter->draw_info);
 }
 
 int
diff --git a/src/gallium/drivers/ilo/ilo_render_gen8.c b/src/gallium/drivers/ilo/ilo_render_gen8.c
index 715b936..65494b4 100644
--- a/src/gallium/drivers/ilo/ilo_render_gen8.c
+++ b/src/gallium/drivers/ilo/ilo_render_gen8.c
@@ -28,9 +28,9 @@
 #include "genhw/genhw.h"
 #include "core/ilo_builder_3d.h"
 #include "core/ilo_builder_render.h"
-#include "util/u_dual_blend.h"
 
 #include "ilo_blitter.h"
+#include "ilo_resource.h"
 #include "ilo_shader.h"
 #include "ilo_state.h"
 #include "ilo_render_gen.h"
@@ -66,26 +66,20 @@ gen8_draw_sf(struct ilo_render *r,
              struct ilo_render_draw_session *session)
 {
    /* 3DSTATE_RASTER */
-   if (DIRTY(RASTERIZER)) {
-      gen8_3DSTATE_RASTER(r->builder, (vec->rasterizer) ?
-            &vec->rasterizer->sf : NULL);
-   }
+   if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_RASTER)
+      gen8_3DSTATE_RASTER(r->builder, &vec->rasterizer->rs);
 
-   /* 3DSTATE_SBE */
-   if (DIRTY(RASTERIZER) || DIRTY(FS)) {
-      gen8_3DSTATE_SBE(r->builder, vec->fs, (vec->rasterizer) ?
-            vec->rasterizer->state.sprite_coord_mode : 0);
-   }
+   /* 3DSTATE_SBE and 3DSTATE_SBE_SWIZ */
+   if (DIRTY(FS)) {
+      const struct ilo_state_sbe *sbe = ilo_shader_get_kernel_sbe(vec->fs);
 
-   /* 3DSTATE_SBE_SWIZ */
-   if (DIRTY(FS))
-      gen8_3DSTATE_SBE_SWIZ(r->builder, vec->fs);
+      gen8_3DSTATE_SBE(r->builder, sbe);
+      gen8_3DSTATE_SBE_SWIZ(r->builder, sbe);
+   }
 
    /* 3DSTATE_SF */
-   if (DIRTY(RASTERIZER)) {
-      gen8_3DSTATE_SF(r->builder, (vec->rasterizer) ?
-            &vec->rasterizer->sf : NULL);
-   }
+   if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SF)
+      gen7_3DSTATE_SF(r->builder, &vec->rasterizer->rs);
 }
 
 static void
@@ -93,12 +87,15 @@ gen8_draw_wm(struct ilo_render *r,
              const struct ilo_state_vector *vec,
              struct ilo_render_draw_session *session)
 {
+   const union ilo_shader_cso *cso = ilo_shader_get_kernel_cso(vec->fs);
+   const uint32_t kernel_offset = ilo_shader_get_kernel_offset(vec->fs);
+
    /* 3DSTATE_WM */
-   if (DIRTY(FS) || DIRTY(RASTERIZER))
-      gen8_3DSTATE_WM(r->builder, vec->fs, vec->rasterizer);
+   if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_WM)
+      gen8_3DSTATE_WM(r->builder, &vec->rasterizer->rs);
 
-   if (DIRTY(DSA))
-      gen8_3DSTATE_WM_DEPTH_STENCIL(r->builder, vec->dsa);
+   if (session->cc_delta.dirty & ILO_STATE_CC_3DSTATE_WM_DEPTH_STENCIL)
+      gen8_3DSTATE_WM_DEPTH_STENCIL(r->builder, &vec->blend->cc);
 
    /* 3DSTATE_WM_HZ_OP and 3DSTATE_WM_CHROMAKEY */
    if (r->hw_ctx_changed) {
@@ -128,18 +125,15 @@ gen8_draw_wm(struct ilo_render *r,
 
    /* 3DSTATE_PS */
    if (DIRTY(FS) || r->instruction_bo_changed)
-      gen8_3DSTATE_PS(r->builder, vec->fs);
+      gen8_3DSTATE_PS(r->builder, &cso->ps, kernel_offset);
 
    /* 3DSTATE_PS_EXTRA */
-   if (DIRTY(FS) || DIRTY(DSA) || DIRTY(BLEND)) {
-      const bool cc_may_kill = (vec->dsa->dw_blend_alpha ||
-                                vec->blend->alpha_to_coverage);
-      gen8_3DSTATE_PS_EXTRA(r->builder, vec->fs, cc_may_kill, false);
-   }
+   if (DIRTY(FS))
+      gen8_3DSTATE_PS_EXTRA(r->builder, &cso->ps);
 
    /* 3DSTATE_PS_BLEND */
-   if (DIRTY(BLEND) || DIRTY(FB) || DIRTY(DSA))
-      gen8_3DSTATE_PS_BLEND(r->builder, vec->blend, &vec->fb, vec->dsa);
+   if (session->cc_delta.dirty & ILO_STATE_CC_3DSTATE_PS_BLEND)
+      gen8_3DSTATE_PS_BLEND(r->builder, &vec->blend->cc);
 
    /* 3DSTATE_SCISSOR_STATE_POINTERS */
    if (session->scissor_changed) {
@@ -149,7 +143,7 @@ gen8_draw_wm(struct ilo_render *r,
 
    /* 3DSTATE_DEPTH_BUFFER and 3DSTATE_CLEAR_PARAMS */
    if (DIRTY(FB) || r->batch_bo_changed) {
-      const struct ilo_zs_surface *zs;
+      const struct ilo_state_zs *zs;
       uint32_t clear_params;
 
       if (vec->fb.state.zsbuf) {
@@ -170,7 +164,7 @@ gen8_draw_wm(struct ilo_render *r,
 
       gen8_wa_pre_depth(r);
 
-      gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs, false);
+      gen6_3DSTATE_DEPTH_BUFFER(r->builder, zs);
       gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder, zs);
       gen6_3DSTATE_STENCIL_BUFFER(r->builder, zs);
       gen7_3DSTATE_CLEAR_PARAMS(r->builder, clear_params);
@@ -183,14 +177,8 @@ gen8_draw_wm_sample_pattern(struct ilo_render *r,
                             struct ilo_render_draw_session *session)
 {
    /* 3DSTATE_SAMPLE_PATTERN */
-   if (r->hw_ctx_changed) {
-      gen8_3DSTATE_SAMPLE_PATTERN(r->builder,
-            &r->sample_pattern_1x,
-            &r->sample_pattern_2x,
-            &r->sample_pattern_4x,
-            r->sample_pattern_8x,
-            r->sample_pattern_16x);
-   }
+   if (r->hw_ctx_changed)
+      gen8_3DSTATE_SAMPLE_PATTERN(r->builder, &r->sample_pattern);
 }
 
 static void
@@ -198,15 +186,13 @@ gen8_draw_wm_multisample(struct ilo_render *r,
                          const struct ilo_state_vector *vec,
                          struct ilo_render_draw_session *session)
 {
-   /* 3DSTATE_MULTISAMPLE and 3DSTATE_SAMPLE_MASK */
-   if (DIRTY(SAMPLE_MASK) || DIRTY(FB) || DIRTY(RASTERIZER)) {
-      gen8_3DSTATE_MULTISAMPLE(r->builder, vec->fb.num_samples,
-            vec->rasterizer->state.half_pixel_center);
-
-      gen7_3DSTATE_SAMPLE_MASK(r->builder,
-            (vec->fb.num_samples > 1) ? vec->sample_mask : 0x1,
-            vec->fb.num_samples);
-   }
+   /* 3DSTATE_MULTISAMPLE */
+   if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_MULTISAMPLE)
+      gen8_3DSTATE_MULTISAMPLE(r->builder, &vec->rasterizer->rs);
+
+   /* 3DSTATE_SAMPLE_MASK */
+   if (session->rs_delta.dirty & ILO_STATE_RASTER_3DSTATE_SAMPLE_MASK)
+      gen6_3DSTATE_SAMPLE_MASK(r->builder, &vec->rasterizer->rs);
 }
 
 static void
@@ -214,36 +200,38 @@ gen8_draw_vf(struct ilo_render *r,
              const struct ilo_state_vector *vec,
              struct ilo_render_draw_session *session)
 {
-   int i;
-
    /* 3DSTATE_INDEX_BUFFER */
-   if (DIRTY(IB) || r->batch_bo_changed)
-      gen8_3DSTATE_INDEX_BUFFER(r->builder, &vec->ib);
+   if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_INDEX_BUFFER) ||
+       DIRTY(IB) || r->batch_bo_changed)
+      gen8_3DSTATE_INDEX_BUFFER(r->builder, &vec->ve->vf, &vec->ib.ib);
 
    /* 3DSTATE_VF */
-   if (session->primitive_restart_changed) {
-      gen75_3DSTATE_VF(r->builder, vec->draw->primitive_restart,
-            vec->draw->restart_index);
-   }
+   if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF)
+      gen75_3DSTATE_VF(r->builder, &vec->ve->vf);
 
    /* 3DSTATE_VERTEX_BUFFERS */
-   if (DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed)
-      gen6_3DSTATE_VERTEX_BUFFERS(r->builder, vec->ve, &vec->vb);
+   if ((session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_BUFFERS) ||
+       DIRTY(VB) || DIRTY(VE) || r->batch_bo_changed) {
+      gen6_3DSTATE_VERTEX_BUFFERS(r->builder, &vec->ve->vf,
+            vec->vb.vb, vec->ve->vb_count);
+   }
 
    /* 3DSTATE_VERTEX_ELEMENTS */
-   if (DIRTY(VE))
-      gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, vec->ve);
+   if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VERTEX_ELEMENTS)
+      gen6_3DSTATE_VERTEX_ELEMENTS(r->builder, &vec->ve->vf);
+
+   gen8_3DSTATE_VF_TOPOLOGY(r->builder, vec->draw_info.topology);
 
-   gen8_3DSTATE_VF_TOPOLOGY(r->builder, vec->draw->mode);
+   if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF_INSTANCING) {
+      const uint8_t attr_count = ilo_state_vf_get_attr_count(&vec->ve->vf);
+      uint8_t i;
 
-   for (i = 0; i < vec->ve->vb_count; i++) {
-      gen8_3DSTATE_VF_INSTANCING(r->builder, i,
-            vec->ve->instance_divisors[i]);
+      for (i = 0; i < attr_count; i++)
+         gen8_3DSTATE_VF_INSTANCING(r->builder, &vec->ve->vf, i);
    }
 
-   gen8_3DSTATE_VF_SGVS(r->builder,
-         false, 0, 0,
-         false, 0, 0);
+   if (session->vf_delta.dirty & ILO_STATE_VF_3DSTATE_VF_SGVS)
+      gen8_3DSTATE_VF_SGVS(r->builder, &vec->ve->vf);
 }
 
 void
@@ -281,7 +269,7 @@ ilo_render_emit_draw_commands_gen8(struct ilo_render *render,
    gen6_draw_sf_rect(render, vec, session);
    gen8_draw_vf(render, vec, session);
 
-   ilo_render_3dprimitive(render, vec->draw, &vec->ib);
+   ilo_render_3dprimitive(render, &vec->draw_info);
 }
 
 int
@@ -365,17 +353,13 @@ ilo_render_emit_rectlist_commands_gen8(struct ilo_render *r,
                                        const struct ilo_blitter *blitter,
                                        const struct ilo_render_rectlist_session *session)
 {
-   uint32_t op;
-
    ILO_DEV_ASSERT(r->dev, 8, 8);
 
    gen8_wa_pre_depth(r);
 
    if (blitter->uses & (ILO_BLITTER_USE_FB_DEPTH |
-                        ILO_BLITTER_USE_FB_STENCIL)) {
-      gen6_3DSTATE_DEPTH_BUFFER(r->builder,
-            &blitter->fb.dst.u.zs, true);
-   }
+                        ILO_BLITTER_USE_FB_STENCIL))
+      gen6_3DSTATE_DEPTH_BUFFER(r->builder, &blitter->fb.dst.u.zs);
 
    if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH) {
       gen6_3DSTATE_HIER_DEPTH_BUFFER(r->builder,
@@ -393,27 +377,8 @@ ilo_render_emit_rectlist_commands_gen8(struct ilo_render *r,
    gen6_3DSTATE_DRAWING_RECTANGLE(r->builder, 0, 0,
          blitter->fb.width, blitter->fb.height);
 
-   switch (blitter->op) {
-   case ILO_BLITTER_RECTLIST_CLEAR_ZS:
-      op = 0;
-      if (blitter->uses & ILO_BLITTER_USE_FB_DEPTH)
-         op |= GEN8_WM_HZ_DW1_DEPTH_CLEAR;
-      if (blitter->uses & ILO_BLITTER_USE_FB_STENCIL)
-         op |= GEN8_WM_HZ_DW1_STENCIL_CLEAR;
-      break;
-   case ILO_BLITTER_RECTLIST_RESOLVE_Z:
-      op = GEN8_WM_HZ_DW1_DEPTH_RESOLVE;
-      break;
-   case ILO_BLITTER_RECTLIST_RESOLVE_HIZ:
-      op = GEN8_WM_HZ_DW1_HIZ_RESOLVE;
-      break;
-   default:
-      op = 0;
-      break;
-   }
-
-   gen8_3DSTATE_WM_HZ_OP(r->builder, op, blitter->fb.width,
-         blitter->fb.height, blitter->fb.num_samples);
+   gen8_3DSTATE_WM_HZ_OP(r->builder, &blitter->fb.rs,
+         blitter->fb.width, blitter->fb.height);
 
    ilo_render_pipe_control(r, GEN6_PIPE_CONTROL_WRITE_IMM);
 
diff --git a/src/gallium/drivers/ilo/ilo_render_media.c b/src/gallium/drivers/ilo/ilo_render_media.c
index 387920a..a0de002 100644
--- a/src/gallium/drivers/ilo/ilo_render_media.c
+++ b/src/gallium/drivers/ilo/ilo_render_media.c
@@ -30,6 +30,7 @@
 #include "core/ilo_builder_mi.h"
 #include "core/ilo_builder_render.h"
 
+#include "ilo_shader.h"
 #include "ilo_state.h"
 #include "ilo_render_gen.h"
 
@@ -206,7 +207,7 @@ ilo_render_emit_launch_grid_commands(struct ilo_render *render,
 
    gen6_state_base_address(render->builder, true);
 
-   gen6_MEDIA_VFE_STATE(render->builder, pcb_size, use_slm);
+   gen6_MEDIA_VFE_STATE(render->builder, &session->compute);
 
    if (pcb_size)
       gen6_MEDIA_CURBE_LOAD(render->builder, pcb, pcb_size);
diff --git a/src/gallium/drivers/ilo/ilo_render_surface.c b/src/gallium/drivers/ilo/ilo_render_surface.c
index b345dfb..ad05356 100644
--- a/src/gallium/drivers/ilo/ilo_render_surface.c
+++ b/src/gallium/drivers/ilo/ilo_render_surface.c
@@ -29,11 +29,65 @@
 
 #include "ilo_common.h"
 #include "ilo_blitter.h"
+#include "ilo_resource.h"
+#include "ilo_shader.h"
 #include "ilo_state.h"
 #include "ilo_render_gen.h"
 
 #define DIRTY(state) (session->pipe_dirty & ILO_DIRTY_ ## state)
 
+static inline uint32_t
+gen6_so_SURFACE_STATE(struct ilo_builder *builder,
+                      const struct pipe_stream_output_target *so,
+                      const struct pipe_stream_output_info *so_info,
+                      int so_index)
+{
+   struct ilo_buffer *buf = ilo_buffer(so->buffer);
+   struct ilo_state_surface_buffer_info info;
+   struct ilo_state_surface surf;
+
+   ILO_DEV_ASSERT(builder->dev, 6, 6);
+   /* describe the part of so->buffer written by SO output so_index */
+   memset(&info, 0, sizeof(info));
+   info.buf = buf;
+   info.access = ILO_STATE_SURFACE_ACCESS_DP_SVB;
+   /* 1..4 components select R32..R32G32B32A32_FLOAT; format_size is 4 each */
+   switch (so_info->output[so_index].num_components) {
+   case 1:
+      info.format = GEN6_FORMAT_R32_FLOAT;
+      info.format_size = 4;
+      break;
+   case 2:
+      info.format = GEN6_FORMAT_R32G32_FLOAT;
+      info.format_size = 8;
+      break;
+   case 3:
+      info.format = GEN6_FORMAT_R32G32B32_FLOAT;
+      info.format_size = 12;
+      break;
+   case 4:
+      info.format = GEN6_FORMAT_R32G32B32A32_FLOAT;
+      info.format_size = 16;
+      break;
+   default:
+      assert(!"unexpected SO components length");
+      info.format = GEN6_FORMAT_R32_FLOAT;
+      info.format_size = 4;
+      break;
+   }
+   /* NOTE(review): the "* 4" presumably converts dwords to bytes -- confirm */
+   info.struct_size =
+      so_info->stride[so_info->output[so_index].output_buffer] * 4;
+   info.offset = so->buffer_offset + so_info->output[so_index].dst_offset * 4;
+   info.size = so->buffer_size - so_info->output[so_index].dst_offset * 4;
+   /* build the surface from info, then attach the buffer's bo to it */
+   memset(&surf, 0, sizeof(surf));
+   ilo_state_surface_init_for_buffer(&surf, builder->dev, &info);
+   surf.bo = info.buf->bo;
+
+   return gen6_SURFACE_STATE(builder, &surf);
+}
+
 static void
 gen6_emit_draw_surface_rt(struct ilo_render *r,
                           const struct ilo_state_vector *vec,
@@ -64,11 +118,9 @@ gen6_emit_draw_surface_rt(struct ilo_render *r,
             (const struct ilo_surface_cso *) fb->state.cbufs[i];
 
          assert(surface->is_rt);
-         surface_state[i] =
-            gen6_SURFACE_STATE(r->builder, &surface->u.rt, true);
+         surface_state[i] = gen6_SURFACE_STATE(r->builder, &surface->u.rt);
       } else {
-         surface_state[i] =
-            gen6_SURFACE_STATE(r->builder, &fb->null_rt, true);
+         surface_state[i] = gen6_SURFACE_STATE(r->builder, &fb->null_rt);
       }
    }
 }
@@ -173,8 +225,7 @@ gen6_emit_draw_surface_view(struct ilo_render *r,
          const struct ilo_view_cso *cso =
             (const struct ilo_view_cso *) view->states[i];
 
-         surface_state[i] =
-            gen6_SURFACE_STATE(r->builder, &cso->surface, false);
+         surface_state[i] = gen6_SURFACE_STATE(r->builder, &cso->surface);
       } else {
          surface_state[i] = 0;
       }
@@ -228,12 +279,10 @@ gen6_emit_draw_surface_const(struct ilo_render *r,
    for (i = 0; i < count; i++) {
       const struct ilo_cbuf_cso *cso = &cbuf->cso[i];
 
-      if (cso->resource) {
-         surface_state[i] = gen6_SURFACE_STATE(r->builder,
-               &cso->surface, false);
-      } else {
+      if (cso->resource)
+         surface_state[i] = gen6_SURFACE_STATE(r->builder, &cso->surface);
+      else
          surface_state[i] = 0;
-      }
    }
 }
 
@@ -406,8 +455,7 @@ gen6_emit_launch_grid_surface_view(struct ilo_render *r,
          const struct ilo_view_cso *cso =
             (const struct ilo_view_cso *) view->states[i];
 
-         surface_state[i] =
-            gen6_SURFACE_STATE(r->builder, &cso->surface, false);
+         surface_state[i] = gen6_SURFACE_STATE(r->builder, &cso->surface);
       } else {
          surface_state[i] = 0;
       }
@@ -421,7 +469,8 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r,
 {
    const struct ilo_shader_state *cs = vec->cs;
    uint32_t *surface_state = r->state.cs.SURFACE_STATE;
-   struct ilo_view_surface view;
+   struct ilo_state_surface_buffer_info info;
+   struct ilo_state_surface surf;
    int base, count;
 
    ILO_DEV_ASSERT(r->dev, 7, 7.5);
@@ -432,15 +481,22 @@ gen6_emit_launch_grid_surface_const(struct ilo_render *r,
    if (!count)
       return;
 
-   ilo_gpe_init_view_surface_for_buffer(r->dev,
-         ilo_buffer(session->input->buffer),
-         session->input->buffer_offset,
-         session->input->buffer_size,
-         1, PIPE_FORMAT_NONE,
-         false, false, &view);
+   memset(&info, 0, sizeof(info));
+   info.buf = ilo_buffer(session->input->buffer);
+   info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED;
+   info.format = GEN6_FORMAT_RAW;
+   info.format_size = 1;
+   info.struct_size = 1;
+   info.readonly = true;
+   info.offset = session->input->buffer_offset;
+   info.size = session->input->buffer_size;
+
+   memset(&surf, 0, sizeof(surf));
+   ilo_state_surface_init_for_buffer(&surf, r->dev, &info);
+   surf.bo = info.buf->bo;
 
    assert(count == 1 && session->input->buffer);
-   surface_state[base] = gen6_SURFACE_STATE(r->builder, &view, false);
+   surface_state[base] = gen6_SURFACE_STATE(r->builder, &surf);
 }
 
 static void
@@ -483,14 +539,24 @@ gen6_emit_launch_grid_surface_global(struct ilo_render *r,
    for (i = 0; i < count; i++) {
       if (i < vec->global_binding.count && bindings[i].resource) {
          const struct ilo_buffer *buf = ilo_buffer(bindings[i].resource);
-         struct ilo_view_surface view;
+         struct ilo_state_surface_buffer_info info;
+         struct ilo_state_surface surf;
 
          assert(bindings[i].resource->target == PIPE_BUFFER);
 
-         ilo_gpe_init_view_surface_for_buffer(r->dev, buf, 0, buf->bo_size,
-               1, PIPE_FORMAT_NONE, true, true, &view);
-         surface_state[i] =
-            gen6_SURFACE_STATE(r->builder, &view, true);
+         memset(&info, 0, sizeof(info));
+         info.buf = buf;
+         info.access = ILO_STATE_SURFACE_ACCESS_DP_UNTYPED;
+         info.format = GEN6_FORMAT_RAW;
+         info.format_size = 1;
+         info.struct_size = 1;
+         info.size = buf->bo_size;
+
+         memset(&surf, 0, sizeof(surf));
+         ilo_state_surface_init_for_buffer(&surf, r->dev, &info);
+         surf.bo = info.buf->bo;
+
+         surface_state[i] = gen6_SURFACE_STATE(r->builder, &surf);
       } else {
          surface_state[i] = 0;
       }
diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c
index ad48522..be9fd10 100644
--- a/src/gallium/drivers/ilo/ilo_resource.c
+++ b/src/gallium/drivers/ilo/ilo_resource.c
@@ -178,8 +178,8 @@ tex_create_bo(struct ilo_texture *tex)
    if (!bo)
       return false;
 
-   ilo_image_set_bo(&tex->image, bo);
-   intel_bo_unref(bo);
+   intel_bo_unref(tex->image.bo);
+   tex->image.bo = bo;
 
    return true;
 }
@@ -223,7 +223,7 @@ tex_create_hiz(struct ilo_texture *tex)
    if (!bo)
       return false;
 
-   ilo_image_set_aux_bo(&tex->image, bo);
+   tex->image.aux.bo = bo;
 
    if (tex->imported) {
       unsigned lv;
@@ -256,7 +256,7 @@ tex_create_mcs(struct ilo_texture *tex)
    if (!bo)
       return false;
 
-   ilo_image_set_aux_bo(&tex->image, bo);
+   tex->image.aux.bo = bo;
 
    return true;
 }
@@ -267,7 +267,8 @@ tex_destroy(struct ilo_texture *tex)
    if (tex->separate_s8)
       tex_destroy(tex->separate_s8);
 
-   ilo_image_cleanup(&tex->image);
+   intel_bo_unref(tex->image.bo);
+   intel_bo_unref(tex->image.aux.bo);
 
    tex_free_slices(tex);
    FREE(tex);
@@ -287,15 +288,13 @@ tex_alloc_bos(struct ilo_texture *tex)
 
    switch (tex->image.aux.type) {
    case ILO_IMAGE_AUX_HIZ:
-      if (!tex_create_hiz(tex)) {
-         /* Separate Stencil Buffer requires HiZ to be enabled */
-         if (ilo_dev_gen(&is->dev) == ILO_GEN(6) &&
-             tex->image.separate_stencil)
-            return false;
-      }
+      if (!tex_create_hiz(tex) &&
+          !ilo_image_disable_aux(&tex->image, &is->dev))
+         return false;
       break;
    case ILO_IMAGE_AUX_MCS:
-      if (!tex_create_mcs(tex))
+      if (!tex_create_mcs(tex) &&
+          !ilo_image_disable_aux(&tex->image, &is->dev))
          return false;
       break;
    default:
@@ -328,8 +327,7 @@ tex_import_handle(struct ilo_texture *tex,
       return false;
    }
 
-   ilo_image_set_bo(&tex->image, bo);
-   intel_bo_unref(bo);
+   tex->image.bo = bo;
 
    tex->imported = true;
 
@@ -427,8 +425,8 @@ buf_create_bo(struct ilo_buffer_resource *buf)
    if (!bo)
       return false;
 
-   ilo_buffer_set_bo(&buf->buffer, bo);
-   intel_bo_unref(bo);
+   intel_bo_unref(buf->buffer.bo);
+   buf->buffer.bo = bo;
 
    return true;
 }
@@ -436,7 +434,7 @@ buf_create_bo(struct ilo_buffer_resource *buf)
 static void
 buf_destroy(struct ilo_buffer_resource *buf)
 {
-   ilo_buffer_cleanup(&buf->buffer);
+   intel_bo_unref(buf->buffer.bo);
    FREE(buf);
 }
 
@@ -445,6 +443,7 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ)
 {
    const struct ilo_screen *is = ilo_screen(screen);
    struct ilo_buffer_resource *buf;
+   unsigned size;
 
    buf = CALLOC_STRUCT(ilo_buffer_resource);
    if (!buf)
@@ -454,8 +453,25 @@ buf_create(struct pipe_screen *screen, const struct pipe_resource *templ)
    buf->base.screen = screen;
    pipe_reference_init(&buf->base.reference, 1);
 
-   ilo_buffer_init(&buf->buffer, &is->dev,
-         templ->width0, templ->bind, templ->flags);
+   size = templ->width0;
+
+   /*
+    * As noted in ilo_format_translate(), we treat some 3-component formats as
+    * 4-component formats to work around hardware limitations.  Imagine the
+    * case where the vertex buffer holds a single PIPE_FORMAT_R16G16B16_FLOAT
+    * vertex and the bo size is 6.  The fetch would fail the hardware's
+    * bounds check because the vertex buffer is expected to hold a
+    * PIPE_FORMAT_R16G16B16A16_FLOAT vertex, which takes at least 8 bytes.
+    *
+    * For the workaround to work, we should add 2 to the bo size.  But that
+    * would waste a page when the bo size is already page aligned.  Let's
+    * round the size up to the page size for now and revisit this when needed.
+    */
+   if ((templ->bind & PIPE_BIND_VERTEX_BUFFER) &&
+       ilo_dev_gen(&is->dev) < ILO_GEN(7.5))
+      size = align(size, 4096);
+
+   ilo_buffer_init(&buf->buffer, &is->dev, size, templ->bind, templ->flags);
 
    if (buf->buffer.bo_size < templ->width0 ||
        buf->buffer.bo_size > ilo_max_resource_size ||
diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c
index 918af08..9410555 100644
--- a/src/gallium/drivers/ilo/ilo_screen.c
+++ b/src/gallium/drivers/ilo/ilo_screen.c
@@ -31,11 +31,10 @@
 #include "vl/vl_decoder.h"
 #include "vl/vl_video_buffer.h"
 #include "genhw/genhw.h" /* for GEN6_REG_TIMESTAMP */
-#include "core/ilo_fence.h"
-#include "core/ilo_format.h"
 #include "core/intel_winsys.h"
 
 #include "ilo_context.h"
+#include "ilo_format.h"
 #include "ilo_resource.h"
 #include "ilo_transfer.h" /* for ILO_TRANSFER_MAP_BUFFER_ALIGNMENT */
 #include "ilo_public.h"
@@ -43,8 +42,7 @@
 
 struct pipe_fence_handle {
    struct pipe_reference reference;
-
-   struct ilo_fence fence;
+   struct intel_bo *seqno_bo;
 };
 
 static float
@@ -347,7 +345,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_INDEP_BLEND_FUNC:
       return true;
    case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS:
-      return (ilo_dev_gen(&is->dev) >= ILO_GEN(7)) ? 2048 : 512;
+      return (ilo_dev_gen(&is->dev) >= ILO_GEN(7.5)) ? 2048 : 512;
    case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT:
    case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
    case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
@@ -458,6 +456,7 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_SAMPLER_VIEW_TARGET:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -641,7 +640,7 @@ ilo_screen_fence_reference(struct pipe_screen *screen,
 
    STATIC_ASSERT(&((struct pipe_fence_handle *) NULL)->reference == NULL);
    if (pipe_reference(&old->reference, &fence->reference)) {
-      ilo_fence_cleanup(&old->fence);
+      intel_bo_unref(old->seqno_bo);
       FREE(old);
    }
 }
@@ -654,10 +653,14 @@ ilo_screen_fence_finish(struct pipe_screen *screen,
    const int64_t wait_timeout = (timeout > INT64_MAX) ? -1 : timeout;
    bool signaled;
 
-   signaled = ilo_fence_wait(&fence->fence, wait_timeout);
+   signaled = (!fence->seqno_bo ||
+         intel_bo_wait(fence->seqno_bo, wait_timeout) == 0);
+
    /* XXX not thread safe */
-   if (signaled)
-      ilo_fence_set_seq_bo(&fence->fence, NULL);
+   if (signaled && fence->seqno_bo) {
+      intel_bo_unref(fence->seqno_bo);
+      fence->seqno_bo = NULL;
+   }
 
    return signaled;
 }
@@ -676,7 +679,6 @@ ilo_screen_fence_signalled(struct pipe_screen *screen,
 struct pipe_fence_handle *
 ilo_screen_fence_create(struct pipe_screen *screen, struct intel_bo *bo)
 {
-   struct ilo_screen *is = ilo_screen(screen);
    struct pipe_fence_handle *fence;
 
    fence = CALLOC_STRUCT(pipe_fence_handle);
@@ -685,8 +687,7 @@ ilo_screen_fence_create(struct pipe_screen *screen, struct intel_bo *bo)
 
    pipe_reference_init(&fence->reference, 1);
 
-   ilo_fence_init(&fence->fence, &is->dev);
-   ilo_fence_set_seq_bo(&fence->fence, bo);
+   fence->seqno_bo = intel_bo_ref(bo);
 
    return fence;
 }
@@ -696,7 +697,7 @@ ilo_screen_destroy(struct pipe_screen *screen)
 {
    struct ilo_screen *is = ilo_screen(screen);
 
-   ilo_dev_cleanup(&is->dev);
+   intel_winsys_destroy(is->dev.winsys);
 
    FREE(is);
 }
diff --git a/src/gallium/drivers/ilo/ilo_shader.c b/src/gallium/drivers/ilo/ilo_shader.c
index 799db2c..5f2b010 100644
--- a/src/gallium/drivers/ilo/ilo_shader.c
+++ b/src/gallium/drivers/ilo/ilo_shader.c
@@ -27,7 +27,6 @@
 
 #include "genhw/genhw.h" /* for SBE setup */
 #include "core/ilo_builder.h"
-#include "core/ilo_state_3d.h"
 #include "core/intel_winsys.h"
 #include "shader/ilo_shader_internal.h"
 #include "tgsi/tgsi_parse.h"
@@ -557,39 +556,255 @@ ilo_shader_state_search_variant(struct ilo_shader_state *state,
 }
 
 static void
-copy_so_info(struct ilo_shader *sh,
-             const struct pipe_stream_output_info *so_info)
+init_shader_urb(const struct ilo_shader *kernel,
+                const struct ilo_shader_state *state,
+                struct ilo_state_shader_urb_info *urb)
 {
-   unsigned i, attr;
+   urb->cv_input_attr_count = kernel->in.count;
+   urb->read_base = 0;
+   urb->read_count = kernel->in.count;
 
-   if (!so_info->num_outputs)
+   urb->output_attr_count = kernel->out.count;
+   urb->user_cull_enables = 0x0;
+   urb->user_clip_enables = 0x0;
+}
+/* Fill *kern from kernel; the summed PCB sizes are rounded up to 16s. */
+static void
+init_shader_kernel(const struct ilo_shader *kernel,
+                   const struct ilo_shader_state *state,
+                   struct ilo_state_shader_kernel_info *kern)
+{
+   kern->offset = 0;
+   kern->grf_start = kernel->in.start_grf;
+   kern->pcb_attr_count =
+      (kernel->pcb.cbuf0_size + kernel->pcb.clip_state_size + 15) / 16;
+   kern->scratch_size = 0;
+}
+/* Fill *resource: sampler count from state, zero surfaces, no UAV. */
+static void
+init_shader_resource(const struct ilo_shader *kernel,
+                     const struct ilo_shader_state *state,
+                     struct ilo_state_shader_resource_info *resource)
+{
+   resource->sampler_count = state->info.num_samplers;
+   resource->surface_count = 0;
+   resource->has_uav = false;
+}
+
+static void
+init_vs(struct ilo_shader *kernel,
+        const struct ilo_shader_state *state)
+{
+   struct ilo_state_vs_info info;
+   /* set up the VS state in kernel->cso (cso.vs, or cso.vs_sol on Gen6 SO) */
+   memset(&info, 0, sizeof(info));
+
+   init_shader_urb(kernel, state, &info.urb);
+   init_shader_kernel(kernel, state, &info.kernel);
+   init_shader_resource(kernel, state, &info.resource);
+   info.dispatch_enable = true;
+   info.stats_enable = true;
+   /* Gen6 with stream output: pair the VS with a GS state that has SOL on */
+   if (ilo_dev_gen(state->info.dev) == ILO_GEN(6) && kernel->stream_output) {
+      struct ilo_state_gs_info gs_info;
+
+      memset(&gs_info, 0, sizeof(gs_info));
+      /* both GS URB input counts are the kernel's output attribute count */
+      gs_info.urb.cv_input_attr_count = kernel->out.count;
+      gs_info.urb.read_count = kernel->out.count;
+      gs_info.kernel.grf_start = kernel->gs_start_grf;
+      gs_info.sol.sol_enable = true;
+      gs_info.sol.stats_enable = true;
+      gs_info.sol.render_disable = kernel->variant.u.vs.rasterizer_discard;
+      gs_info.sol.svbi_post_inc = kernel->svbi_post_inc;
+      gs_info.sol.tristrip_reorder = GEN7_REORDER_LEADING;
+      gs_info.dispatch_enable = true;
+      gs_info.stats_enable = true;
+      /* this path initializes cso.vs_sol.{vs,sol}; the other only cso.vs */
+      ilo_state_vs_init(&kernel->cso.vs_sol.vs, state->info.dev, &info);
+      ilo_state_gs_init(&kernel->cso.vs_sol.sol, state->info.dev, &gs_info);
+   } else {
+      ilo_state_vs_init(&kernel->cso.vs, state->info.dev, &info);
+   }
+}
+
+static void
+init_gs(struct ilo_shader *kernel,
+        const struct ilo_shader_state *state)
+{
+   const struct pipe_stream_output_info *so_info = &state->info.stream_output;
+   struct ilo_state_gs_info info;
+   /* set up kernel->cso.gs from the compiled GS kernel */
+   memset(&info, 0, sizeof(info));
+
+   init_shader_urb(kernel, state, &info.urb);
+   init_shader_kernel(kernel, state, &info.kernel);
+   init_shader_resource(kernel, state, &info.resource);
+   info.dispatch_enable = true;
+   info.stats_enable = true;
+   /* turn on the SOL fields only if the shader has stream outputs */
+   if (so_info->num_outputs > 0) {
+      info.sol.sol_enable = true;
+      info.sol.stats_enable = true;
+      info.sol.render_disable = kernel->variant.u.gs.rasterizer_discard;
+      info.sol.tristrip_reorder = GEN7_REORDER_LEADING;
+   }
+
+   ilo_state_gs_init(&kernel->cso.gs, state->info.dev, &info);
+}
+
+static void
+init_ps(struct ilo_shader *kernel,
+        const struct ilo_shader_state *state)
+{
+   struct ilo_state_ps_info info;
+   /* set up kernel->cso.ps and record the parameters used in ps_params */
+   memset(&info, 0, sizeof(info));
+
+   init_shader_kernel(kernel, state, &info.kernel_8);
+   init_shader_resource(kernel, state, &info.resource);
+   /* shader I/O: partly fixed, partly taken from the kernel's in/out info */
+   info.io.has_rt_write = true;
+   info.io.posoffset = GEN6_POSOFFSET_NONE;
+   info.io.attr_count = kernel->in.count;
+   info.io.use_z = kernel->in.has_pos;
+   info.io.use_w = kernel->in.has_pos;
+   info.io.use_coverage_mask = false;
+   info.io.pscdepth = (kernel->out.has_pos) ?
+      GEN7_PSCDEPTH_ON : GEN7_PSCDEPTH_OFF;
+   info.io.write_pixel_mask = kernel->has_kill;
+   info.io.write_omask = false;
+   /* initial PS parameters; copied to kernel->ps_params at the end */
+   info.params.sample_mask = 0x1;
+   info.params.earlyz_control_psexec = false;
+   info.params.alpha_may_kill = false;
+   info.params.dual_source_blending = false;
+   info.params.has_writeable_rt = true;
+   /* only info.kernel_8 was filled in above, so advertise DISPATCH_8 only */
+   info.valid_kernels = GEN6_PS_DISPATCH_8;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 284:
+    *
+    *     "(MSDISPMODE_PERSAMPLE) This is the high-quality multisample mode
+    *      where (over and above PERPIXEL mode) the PS is run for each covered
+    *      sample. This mode is also used for "normal" non-multisample
+    *      rendering (aka 1X), given Number of Multisamples is programmed to
+    *      NUMSAMPLES_1."
+    */
+   info.per_sample_dispatch = true;
+
+   info.rt_clear_enable = false;
+   info.rt_resolve_enable = false;
+   info.cv_per_sample_interp = false;
+   info.cv_has_earlyz_op = false;
+   info.sample_count_one = true;
+   info.cv_has_depth_buffer = true;
+
+   ilo_state_ps_init(&kernel->cso.ps, state->info.dev, &info);
+
+   /* remember current parameters */
+   kernel->ps_params = info.params;
+}
+
+static void
+init_sol(struct ilo_shader *kernel,
+         const struct ilo_dev *dev,
+         const struct pipe_stream_output_info *so_info,
+         bool rasterizer_discard)
+{
+   struct ilo_state_sol_decl_info decls[4][PIPE_MAX_SO_OUTPUTS];
+   unsigned buf_offsets[PIPE_MAX_SO_BUFFERS];
+   struct ilo_state_sol_info info;
+   unsigned i;
+
+   if (!so_info->num_outputs) {
+      ilo_state_sol_init_disabled(&kernel->sol, dev, rasterizer_discard);
       return;
+   }
+
+   memset(&info, 0, sizeof(info));
+   info.data = kernel->sol_data;
+   info.data_size = sizeof(kernel->sol_data);
+   info.sol_enable = true;
+   info.stats_enable = true;
+   info.tristrip_reorder = GEN7_REORDER_TRAILING;
+   info.render_disable = rasterizer_discard;
+   info.render_stream = 0;
+
+   for (i = 0; i < 4; i++) {
+      info.buffer_strides[i] = so_info->stride[i] * 4;
 
-   sh->so_info = *so_info;
+      info.streams[i].cv_vue_attr_count = kernel->out.count;
+      info.streams[i].decls = decls[i];
+   }
 
+   memset(decls, 0, sizeof(decls));
+   memset(buf_offsets, 0, sizeof(buf_offsets));
    for (i = 0; i < so_info->num_outputs; i++) {
+      const unsigned stream = so_info->output[i].stream;
+      const unsigned buffer = so_info->output[i].output_buffer;
+      struct ilo_state_sol_decl_info *decl;
+      unsigned attr;
+
       /* figure out which attribute is sourced */
-      for (attr = 0; attr < sh->out.count; attr++) {
-         const int reg_idx = sh->out.register_indices[attr];
+      for (attr = 0; attr < kernel->out.count; attr++) {
+         const int reg_idx = kernel->out.register_indices[attr];
          if (reg_idx == so_info->output[i].register_index)
             break;
       }
-
-      if (attr < sh->out.count) {
-         sh->so_info.output[i].register_index = attr;
-      }
-      else {
+      if (attr >= kernel->out.count) {
          assert(!"stream output an undefined register");
-         sh->so_info.output[i].register_index = 0;
+         attr = 0;
       }
 
+      if (info.streams[stream].vue_read_count < attr + 1)
+         info.streams[stream].vue_read_count = attr + 1;
+
+      /* pad with holes first */
+      while (buf_offsets[buffer] < so_info->output[i].dst_offset) {
+         int num_dwords;
+
+         num_dwords = so_info->output[i].dst_offset - buf_offsets[buffer];
+         if (num_dwords > 4)
+            num_dwords = 4;
+
+         assert(info.streams[stream].decl_count < ARRAY_SIZE(decls[stream]));
+         decl = &decls[stream][info.streams[stream].decl_count];
+
+         decl->attr = 0;
+         decl->is_hole = true;
+         decl->component_base = 0;
+         decl->component_count = num_dwords;
+         decl->buffer = buffer;
+
+         info.streams[stream].decl_count++;
+         buf_offsets[buffer] += num_dwords;
+      }
+      assert(buf_offsets[buffer] == so_info->output[i].dst_offset);
+
+      assert(info.streams[stream].decl_count < ARRAY_SIZE(decls[stream]));
+      decl = &decls[stream][info.streams[stream].decl_count];
+
+      decl->attr = attr;
+      decl->is_hole = false;
       /* PSIZE is at W channel */
-      if (sh->out.semantic_names[attr] == TGSI_SEMANTIC_PSIZE) {
+      if (kernel->out.semantic_names[attr] == TGSI_SEMANTIC_PSIZE) {
          assert(so_info->output[i].start_component == 0);
          assert(so_info->output[i].num_components == 1);
-         sh->so_info.output[i].start_component = 3;
+         decl->component_base = 3;
+         decl->component_count = 1;
+      } else {
+         decl->component_base = so_info->output[i].start_component;
+         decl->component_count = so_info->output[i].num_components;
       }
+      decl->buffer = buffer;
+
+      info.streams[stream].decl_count++;
+      buf_offsets[buffer] += so_info->output[i].num_components;
    }
+
+   ilo_state_sol_init(&kernel->sol, dev, &info);
 }
 
 /**
@@ -599,17 +814,20 @@ static struct ilo_shader *
 ilo_shader_state_add_variant(struct ilo_shader_state *state,
                              const struct ilo_shader_variant *variant)
 {
+   bool rasterizer_discard = false;
    struct ilo_shader *sh;
 
    switch (state->info.type) {
    case PIPE_SHADER_VERTEX:
       sh = ilo_shader_compile_vs(state, variant);
+      rasterizer_discard = variant->u.vs.rasterizer_discard;
       break;
    case PIPE_SHADER_FRAGMENT:
       sh = ilo_shader_compile_fs(state, variant);
       break;
    case PIPE_SHADER_GEOMETRY:
       sh = ilo_shader_compile_gs(state, variant);
+      rasterizer_discard = variant->u.gs.rasterizer_discard;
       break;
    case PIPE_SHADER_COMPUTE:
       sh = ilo_shader_compile_cs(state, variant);
@@ -625,7 +843,8 @@ ilo_shader_state_add_variant(struct ilo_shader_state *state,
 
    sh->variant = *variant;
 
-   copy_so_info(sh, &state->info.stream_output);
+   init_sol(sh, state->info.dev, &state->info.stream_output,
+         rasterizer_discard);
 
    ilo_shader_state_add_shader(state, sh);
 
@@ -665,13 +884,13 @@ ilo_shader_state_use_variant(struct ilo_shader_state *state,
    if (construct_cso) {
       switch (state->info.type) {
       case PIPE_SHADER_VERTEX:
-         ilo_gpe_init_vs_cso(state->info.dev, state, &sh->cso);
+         init_vs(sh, state);
          break;
       case PIPE_SHADER_GEOMETRY:
-         ilo_gpe_init_gs_cso(state->info.dev, state, &sh->cso);
+         init_gs(sh, state);
          break;
       case PIPE_SHADER_FRAGMENT:
-         ilo_gpe_init_fs_cso(state->info.dev, state, &sh->cso);
+         init_ps(sh, state);
          break;
       default:
          break;
@@ -789,16 +1008,33 @@ ilo_shader_select_kernel(struct ilo_shader_state *shader,
                          const struct ilo_state_vector *vec,
                          uint32_t dirty)
 {
-   const struct ilo_shader * const cur = shader->shader;
    struct ilo_shader_variant variant;
+   bool changed = false;
 
-   if (!(shader->info.non_orthogonal_states & dirty))
-      return false;
+   if (shader->info.non_orthogonal_states & dirty) {
+      const struct ilo_shader * const old = shader->shader;
+
+      ilo_shader_variant_init(&variant, &shader->info, vec);
+      ilo_shader_state_use_variant(shader, &variant);
+      changed = (shader->shader != old);
+   }
 
-   ilo_shader_variant_init(&variant, &shader->info, vec);
-   ilo_shader_state_use_variant(shader, &variant);
+   if (shader->info.type == PIPE_SHADER_FRAGMENT) {
+      struct ilo_shader *kernel = shader->shader;
 
-   return (shader->shader != cur);
+      if (kernel->ps_params.sample_mask != vec->sample_mask ||
+          kernel->ps_params.alpha_may_kill != vec->blend->alpha_may_kill) {
+         kernel->ps_params.sample_mask = vec->sample_mask;
+         kernel->ps_params.alpha_may_kill = vec->blend->alpha_may_kill;
+
+         ilo_state_ps_set_params(&kernel->cso.ps, shader->info.dev,
+               &kernel->ps_params);
+
+         changed = true;
+      }
+   }
+
+   return changed;
 }
 
 static int
@@ -829,82 +1065,104 @@ route_attr(const int *semantics, const int *indices, int len,
  * \return true if a different routing is selected
  */
 bool
-ilo_shader_select_kernel_routing(struct ilo_shader_state *shader,
-                                 const struct ilo_shader_state *source,
-                                 const struct ilo_rasterizer_state *rasterizer)
+ilo_shader_select_kernel_sbe(struct ilo_shader_state *shader,
+                             const struct ilo_shader_state *source,
+                             const struct ilo_rasterizer_state *rasterizer)
 {
-   const uint32_t sprite_coord_enable = rasterizer->state.sprite_coord_enable;
+   const bool is_point = true;
    const bool light_twoside = rasterizer->state.light_twoside;
+   const uint32_t sprite_coord_enable = rasterizer->state.sprite_coord_enable;
+   const int sprite_coord_mode = rasterizer->state.sprite_coord_mode;
    struct ilo_shader *kernel = shader->shader;
    struct ilo_kernel_routing *routing = &kernel->routing;
+   struct ilo_state_sbe_swizzle_info swizzles[ILO_STATE_SBE_MAX_SWIZZLE_COUNT];
+   struct ilo_state_sbe_info info;
    const int *src_semantics, *src_indices;
-   int src_len, max_src_slot;
+   int src_skip, src_len, src_slot;
    int dst_len, dst_slot;
 
-   /* we are constructing 3DSTATE_SBE here */
-   ILO_DEV_ASSERT(shader->info.dev, 6, 8);
-
    assert(kernel);
 
    if (source) {
       assert(source->shader);
+
       src_semantics = source->shader->out.semantic_names;
       src_indices = source->shader->out.semantic_indices;
       src_len = source->shader->out.count;
-   }
-   else {
+      src_skip = 0;
+
+      assert(src_len >= 2 &&
+             src_semantics[0] == TGSI_SEMANTIC_PSIZE &&
+             src_semantics[1] == TGSI_SEMANTIC_POSITION);
+
+      /*
+       * skip PSIZE and POSITION (how about the optional CLIPDISTs?), unless
+       * they are all the source shader has and FS needs to read some
+       * attributes.
+       */
+      if (src_len > 2 || !kernel->in.count) {
+         src_semantics += 2;
+         src_indices += 2;
+         src_len -= 2;
+         src_skip = 2;
+      }
+   } else {
       src_semantics = kernel->in.semantic_names;
       src_indices = kernel->in.semantic_indices;
       src_len = kernel->in.count;
+      src_skip = 0;
    }
 
    /* no change */
-   if (kernel->routing_initialized &&
-       routing->source_skip + routing->source_len <= src_len &&
-       kernel->routing_sprite_coord_enable == sprite_coord_enable &&
-       !memcmp(kernel->routing_src_semantics,
-          &src_semantics[routing->source_skip],
-          sizeof(kernel->routing_src_semantics[0]) * routing->source_len) &&
-       !memcmp(kernel->routing_src_indices,
-          &src_indices[routing->source_skip],
-          sizeof(kernel->routing_src_indices[0]) * routing->source_len))
+   if (routing->initialized &&
+       routing->is_point == is_point &&
+       routing->light_twoside == light_twoside &&
+       routing->sprite_coord_enable == sprite_coord_enable &&
+       routing->sprite_coord_mode == sprite_coord_mode &&
+       routing->src_len <= src_len &&
+       !memcmp(routing->src_semantics, src_semantics,
+          sizeof(src_semantics[0]) * routing->src_len) &&
+       !memcmp(routing->src_indices, src_indices,
+          sizeof(src_indices[0]) * routing->src_len))
       return false;
 
-   if (source) {
-      /* skip PSIZE and POSITION (how about the optional CLIPDISTs?) */
-      assert(src_semantics[0] == TGSI_SEMANTIC_PSIZE);
-      assert(src_semantics[1] == TGSI_SEMANTIC_POSITION);
-      routing->source_skip = 2;
-
-      routing->source_len = src_len - routing->source_skip;
-      src_semantics += routing->source_skip;
-      src_indices += routing->source_skip;
-   }
-   else {
-      routing->source_skip = 0;
-      routing->source_len = src_len;
-   }
-
-   routing->const_interp_enable = kernel->in.const_interp_enable;
-   routing->point_sprite_enable = 0;
-   routing->swizzle_enable = false;
-
-   assert(kernel->in.count <= Elements(routing->swizzles));
-   dst_len = MIN2(kernel->in.count, Elements(routing->swizzles));
-   max_src_slot = -1;
+   routing->is_point = is_point;
+   routing->light_twoside = light_twoside;
+   routing->sprite_coord_enable = sprite_coord_enable;
+   routing->sprite_coord_mode = sprite_coord_mode;
+
+   assert(kernel->in.count <= Elements(swizzles));
+   dst_len = MIN2(kernel->in.count, Elements(swizzles));
+
+   memset(&swizzles, 0, sizeof(swizzles));
+   memset(&info, 0, sizeof(info));
+
+   info.attr_count = dst_len;
+   info.cv_vue_attr_count = src_skip + src_len;
+   info.vue_read_base = src_skip;
+   info.vue_read_count = 0;
+   info.has_min_read_count = true;
+   info.swizzle_enable = false;
+   info.swizzle_16_31 = false;
+   info.swizzle_count = 0;
+   info.swizzles = swizzles;
+   info.const_interp_enables = kernel->in.const_interp_enable;
+   info.point_sprite_enables = 0x0;
+   info.point_sprite_origin_lower_left =
+      (sprite_coord_mode == PIPE_SPRITE_COORD_LOWER_LEFT);
+   info.cv_is_point = is_point;
 
    for (dst_slot = 0; dst_slot < dst_len; dst_slot++) {
       const int semantic = kernel->in.semantic_names[dst_slot];
       const int index = kernel->in.semantic_indices[dst_slot];
-      int src_slot;
 
       if (semantic == TGSI_SEMANTIC_GENERIC &&
           (sprite_coord_enable & (1 << index)))
-         routing->point_sprite_enable |= 1 << dst_slot;
+         info.point_sprite_enables |= 1 << dst_slot;
 
       if (source) {
-         src_slot = route_attr(src_semantics, src_indices,
-               routing->source_len, semantic, index);
+         src_slot = route_attr(src_semantics, src_indices, src_len,
+               semantic, index);
 
          /*
           * The source shader stage does not output this attribute.  The value
@@ -918,58 +1176,47 @@ ilo_shader_select_kernel_routing(struct ilo_shader_state *shader,
           */
          if (src_slot < 0)
             src_slot = 0;
-      }
-      else {
+      } else {
          src_slot = dst_slot;
       }
 
-      routing->swizzles[dst_slot] = src_slot;
-
       /* use the following slot for two-sided lighting */
       if (semantic == TGSI_SEMANTIC_COLOR && light_twoside &&
-          src_slot + 1 < routing->source_len &&
+          src_slot + 1 < src_len &&
           src_semantics[src_slot + 1] == TGSI_SEMANTIC_BCOLOR &&
           src_indices[src_slot + 1] == index) {
-         routing->swizzles[dst_slot] |= GEN8_SBE_SWIZ_INPUTATTR_FACING;
+         swizzles[dst_slot].attr_select = GEN6_INPUTATTR_FACING;
+         swizzles[dst_slot].attr = src_slot;
+         info.swizzle_enable = true;
          src_slot++;
+      } else {
+         swizzles[dst_slot].attr_select = GEN6_INPUTATTR_NORMAL;
+         swizzles[dst_slot].attr = src_slot;
+         if (src_slot != dst_slot)
+            info.swizzle_enable = true;
       }
 
-      if (routing->swizzles[dst_slot] != dst_slot)
-         routing->swizzle_enable = true;
+      swizzles[dst_slot].force_zeros = false;
 
-      if (max_src_slot < src_slot)
-         max_src_slot = src_slot;
+      if (info.vue_read_count < src_slot + 1)
+         info.vue_read_count = src_slot + 1;
    }
 
-   memset(&routing->swizzles[dst_slot], 0, sizeof(routing->swizzles) -
-         sizeof(routing->swizzles[0]) * dst_slot);
+   if (info.swizzle_enable)
+      info.swizzle_count = dst_len;
 
-   /*
-    * From the Sandy Bridge PRM, volume 2 part 1, page 248:
-    *
-    *     "It is UNDEFINED to set this field (Vertex URB Entry Read Length) to
-    *      0 indicating no Vertex URB data to be read.
-    *
-    *      This field should be set to the minimum length required to read the
-    *      maximum source attribute. The maximum source attribute is indicated
-    *      by the maximum value of the enabled Attribute # Source Attribute if
-    *      Attribute Swizzle Enable is set, Number of Output Attributes-1 if
-    *      enable is not set.
-    *
-    *        read_length = ceiling((max_source_attr+1)/2)
-    *
-    *      [errata] Corruption/Hang possible if length programmed larger than
-    *      recommended"
-    */
-   routing->source_len = max_src_slot + 1;
+   if (routing->initialized)
+      ilo_state_sbe_set_info(&routing->sbe, shader->info.dev, &info);
+   else
+      ilo_state_sbe_init(&routing->sbe, shader->info.dev, &info);
+
+   routing->src_len = info.vue_read_count;
+   memcpy(routing->src_semantics, src_semantics,
+         sizeof(src_semantics[0]) * routing->src_len);
+   memcpy(routing->src_indices, src_indices,
+         sizeof(src_indices[0]) * routing->src_len);
 
-   /* remember the states of the source */
-   kernel->routing_initialized = true;
-   kernel->routing_sprite_coord_enable = sprite_coord_enable;
-   memcpy(kernel->routing_src_semantics, src_semantics,
-         sizeof(kernel->routing_src_semantics[0]) * routing->source_len);
-   memcpy(kernel->routing_src_indices, src_indices,
-         sizeof(kernel->routing_src_indices[0]) * routing->source_len);
+   routing->initialized = true;
 
    return true;
 }
@@ -1147,7 +1394,7 @@ ilo_shader_get_kernel_param(const struct ilo_shader_state *shader,
 /**
  * Return the CSO of the selected kernel.
  */
-const struct ilo_shader_cso *
+const union ilo_shader_cso *
 ilo_shader_get_kernel_cso(const struct ilo_shader_state *shader)
 {
    const struct ilo_shader *kernel = shader->shader;
@@ -1163,22 +1410,28 @@ ilo_shader_get_kernel_cso(const struct ilo_shader_state *shader)
 const struct pipe_stream_output_info *
 ilo_shader_get_kernel_so_info(const struct ilo_shader_state *shader)
 {
+   return &shader->info.stream_output;
+}
+
+const struct ilo_state_sol *
+ilo_shader_get_kernel_sol(const struct ilo_shader_state *shader)
+{
    const struct ilo_shader *kernel = shader->shader;
 
    assert(kernel);
 
-   return &kernel->so_info;
+   return &kernel->sol;
 }
 
 /**
  * Return the routing info of the selected kernel.
  */
-const struct ilo_kernel_routing *
-ilo_shader_get_kernel_routing(const struct ilo_shader_state *shader)
+const struct ilo_state_sbe *
+ilo_shader_get_kernel_sbe(const struct ilo_shader_state *shader)
 {
    const struct ilo_shader *kernel = shader->shader;
 
    assert(kernel);
 
-   return &kernel->routing;
+   return &kernel->routing.sbe;
 }
diff --git a/src/gallium/drivers/ilo/ilo_shader.h b/src/gallium/drivers/ilo/ilo_shader.h
index 8a35900..d9f02a4 100644
--- a/src/gallium/drivers/ilo/ilo_shader.h
+++ b/src/gallium/drivers/ilo/ilo_shader.h
@@ -28,6 +28,8 @@
 #ifndef ILO_SHADER_H
 #define ILO_SHADER_H
 
+#include "core/ilo_state_shader.h"
+
 #include "ilo_common.h"
 
 enum ilo_kernel_param {
@@ -81,23 +83,28 @@ enum ilo_kernel_param {
    ILO_KERNEL_PARAM_COUNT,
 };
 
-struct ilo_kernel_routing {
-   uint32_t const_interp_enable;
-   uint32_t point_sprite_enable;
-   unsigned source_skip, source_len;
-
-   bool swizzle_enable;
-   uint16_t swizzles[16];
-};
-
 struct intel_bo;
 struct ilo_builder;
 struct ilo_rasterizer_state;
 struct ilo_shader_cache;
 struct ilo_shader_state;
-struct ilo_shader_cso;
+struct ilo_state_sbe;
+struct ilo_state_sol;
 struct ilo_state_vector;
 
+union ilo_shader_cso { /* hardware state of a kernel; member chosen by stage */
+   struct ilo_state_vs vs;
+   struct ilo_state_hs hs;
+   struct ilo_state_ds ds;
+   struct ilo_state_gs gs;
+   struct ilo_state_ps ps;
+
+   struct {
+      struct ilo_state_vs vs;
+      struct ilo_state_gs sol; /* note: GS-typed state, named "sol" */
+   } vs_sol; /* NOTE(review): presumably a VS doing stream output -- confirm */
+};
+
 struct ilo_shader_cache *
 ilo_shader_cache_create(void);
 
@@ -151,9 +158,9 @@ ilo_shader_select_kernel(struct ilo_shader_state *shader,
                          uint32_t dirty);
 
 bool
-ilo_shader_select_kernel_routing(struct ilo_shader_state *shader,
-                                 const struct ilo_shader_state *source,
-                                 const struct ilo_rasterizer_state *rasterizer);
+ilo_shader_select_kernel_sbe(struct ilo_shader_state *shader,
+                             const struct ilo_shader_state *source,
+                             const struct ilo_rasterizer_state *rasterizer);
 
 uint32_t
 ilo_shader_get_kernel_offset(const struct ilo_shader_state *shader);
@@ -162,13 +169,16 @@ int
 ilo_shader_get_kernel_param(const struct ilo_shader_state *shader,
                             enum ilo_kernel_param param);
 
-const struct ilo_shader_cso *
+const union ilo_shader_cso *
 ilo_shader_get_kernel_cso(const struct ilo_shader_state *shader);
 
 const struct pipe_stream_output_info *
 ilo_shader_get_kernel_so_info(const struct ilo_shader_state *shader);
 
-const struct ilo_kernel_routing *
-ilo_shader_get_kernel_routing(const struct ilo_shader_state *shader);
+const struct ilo_state_sol *
+ilo_shader_get_kernel_sol(const struct ilo_shader_state *shader);
+
+const struct ilo_state_sbe *
+ilo_shader_get_kernel_sbe(const struct ilo_shader_state *shader);
 
 #endif /* ILO_SHADER_H */
diff --git a/src/gallium/drivers/ilo/ilo_state.c b/src/gallium/drivers/ilo/ilo_state.c
index b1bd49a..63534f3 100644
--- a/src/gallium/drivers/ilo/ilo_state.c
+++ b/src/gallium/drivers/ilo/ilo_state.c
@@ -25,16 +25,288 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
-#include "core/ilo_state_3d.h"
+#include "util/u_dual_blend.h"
 #include "util/u_dynarray.h"
+#include "util/u_framebuffer.h"
 #include "util/u_helpers.h"
+#include "util/u_resource.h"
 #include "util/u_upload_mgr.h"
 
 #include "ilo_context.h"
+#include "ilo_format.h"
 #include "ilo_resource.h"
 #include "ilo_shader.h"
 #include "ilo_state.h"
 
+/**
+ * Translate a pipe primitive type (PIPE_PRIM_x) to the hardware primitive type.
+ */
+static enum gen_3dprim_type
+ilo_translate_draw_mode(unsigned mode)
+{
+   static const enum gen_3dprim_type prim_mapping[PIPE_PRIM_MAX] = {
+      [PIPE_PRIM_POINTS]                     = GEN6_3DPRIM_POINTLIST,
+      [PIPE_PRIM_LINES]                      = GEN6_3DPRIM_LINELIST,
+      [PIPE_PRIM_LINE_LOOP]                  = GEN6_3DPRIM_LINELOOP,
+      [PIPE_PRIM_LINE_STRIP]                 = GEN6_3DPRIM_LINESTRIP,
+      [PIPE_PRIM_TRIANGLES]                  = GEN6_3DPRIM_TRILIST,
+      [PIPE_PRIM_TRIANGLE_STRIP]             = GEN6_3DPRIM_TRISTRIP,
+      [PIPE_PRIM_TRIANGLE_FAN]               = GEN6_3DPRIM_TRIFAN,
+      [PIPE_PRIM_QUADS]                      = GEN6_3DPRIM_QUADLIST,
+      [PIPE_PRIM_QUAD_STRIP]                 = GEN6_3DPRIM_QUADSTRIP,
+      [PIPE_PRIM_POLYGON]                    = GEN6_3DPRIM_POLYGON,
+      [PIPE_PRIM_LINES_ADJACENCY]            = GEN6_3DPRIM_LINELIST_ADJ,
+      [PIPE_PRIM_LINE_STRIP_ADJACENCY]       = GEN6_3DPRIM_LINESTRIP_ADJ,
+      [PIPE_PRIM_TRIANGLES_ADJACENCY]        = GEN6_3DPRIM_TRILIST_ADJ,
+      [PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]   = GEN6_3DPRIM_TRISTRIP_ADJ,
+   };
+
+   /* check the range before indexing; entries not listed above are zero */
+   assert(mode < PIPE_PRIM_MAX && prim_mapping[mode]);
+   return prim_mapping[mode];
+}
+
+static enum gen_index_format
+ilo_translate_index_size(unsigned index_size)
+{
+   switch (index_size) {
+   case 1:                             return GEN6_INDEX_BYTE;
+   case 2:                             return GEN6_INDEX_WORD;
+   case 4:                             return GEN6_INDEX_DWORD;
+   }
+   /* unexpected size: assert, then fall back to byte indices */
+   assert(!"unknown index size");
+   return GEN6_INDEX_BYTE;
+}
+
+static enum gen_mip_filter
+ilo_translate_mip_filter(unsigned filter)
+{
+   switch (filter) {
+   case PIPE_TEX_MIPFILTER_NEAREST:    return GEN6_MIPFILTER_NEAREST;
+   case PIPE_TEX_MIPFILTER_LINEAR:     return GEN6_MIPFILTER_LINEAR;
+   case PIPE_TEX_MIPFILTER_NONE:       return GEN6_MIPFILTER_NONE;
+   }
+   /* unexpected filter: assert, then fall back to no mip filtering */
+   assert(!"unknown mipfilter");
+   return GEN6_MIPFILTER_NONE;
+}
+
+static int
+ilo_translate_img_filter(unsigned filter)
+{
+   switch (filter) {
+   case PIPE_TEX_FILTER_NEAREST:       return GEN6_MAPFILTER_NEAREST;
+   case PIPE_TEX_FILTER_LINEAR:        return GEN6_MAPFILTER_LINEAR;
+   }
+   /* unexpected filter: assert, then fall back to nearest filtering */
+   assert(!"unknown sampler filter");
+   return GEN6_MAPFILTER_NEAREST;
+}
+
+static enum gen_texcoord_mode
+ilo_translate_address_wrap(unsigned wrap)
+{
+   switch (wrap) {
+   case PIPE_TEX_WRAP_CLAMP:           return GEN8_TEXCOORDMODE_HALF_BORDER; /* NOTE(review): GEN8-prefixed name used unconditionally -- confirm */
+   case PIPE_TEX_WRAP_REPEAT:          return GEN6_TEXCOORDMODE_WRAP;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:   return GEN6_TEXCOORDMODE_CLAMP;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER: return GEN6_TEXCOORDMODE_CLAMP_BORDER;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:   return GEN6_TEXCOORDMODE_MIRROR;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:    /* mirror-clamp modes have no mapping */
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+   default:                            /* assert, then fall back to WRAP */
+      assert(!"unknown sampler wrap mode");
+      return GEN6_TEXCOORDMODE_WRAP;
+   }
+}
+
+static enum gen_aniso_ratio
+ilo_translate_max_anisotropy(unsigned max_anisotropy)
+{
+   switch (max_anisotropy) {
+   case 0: case 1: case 2:             return GEN6_ANISORATIO_2; /* smallest ratio returned */
+   case 3: case 4:                     return GEN6_ANISORATIO_4; /* odd values round up */
+   case 5: case 6:                     return GEN6_ANISORATIO_6;
+   case 7: case 8:                     return GEN6_ANISORATIO_8;
+   case 9: case 10:                    return GEN6_ANISORATIO_10;
+   case 11: case 12:                   return GEN6_ANISORATIO_12;
+   case 13: case 14:                   return GEN6_ANISORATIO_14;
+   default:                            return GEN6_ANISORATIO_16; /* 15 and above */
+   }
+}
+
+static enum gen_prefilter_op
+ilo_translate_shadow_func(unsigned func)
+{
+   /*
+    * For PIPE_FUNC_x, the reference value is on the left-hand side of the
+    * comparison, and 1.0 is returned when the comparison is true.  For
+    * GEN6_PREFILTEROP_x, the reference value is on the right-hand side of
+    * the comparison, and 0.0 is returned when the comparison is true.  Thus
+    * "ref < x gives 1.0" (LESS) is expressed as "x <= ref gives 0.0" (LEQUAL).
+    */
+   switch (func) {
+   case PIPE_FUNC_NEVER:               return GEN6_PREFILTEROP_ALWAYS;
+   case PIPE_FUNC_LESS:                return GEN6_PREFILTEROP_LEQUAL;
+   case PIPE_FUNC_EQUAL:               return GEN6_PREFILTEROP_NOTEQUAL;
+   case PIPE_FUNC_LEQUAL:              return GEN6_PREFILTEROP_LESS;
+   case PIPE_FUNC_GREATER:             return GEN6_PREFILTEROP_GEQUAL;
+   case PIPE_FUNC_NOTEQUAL:            return GEN6_PREFILTEROP_EQUAL;
+   case PIPE_FUNC_GEQUAL:              return GEN6_PREFILTEROP_GREATER;
+   case PIPE_FUNC_ALWAYS:              return GEN6_PREFILTEROP_NEVER;
+   default:
+      assert(!"unknown shadow compare function");
+      return GEN6_PREFILTEROP_NEVER;
+   }
+}
+
+static enum gen_front_winding
+ilo_translate_front_ccw(unsigned front_ccw)
+{
+   return (!front_ccw) ? GEN6_FRONTWINDING_CW : GEN6_FRONTWINDING_CCW;
+}
+
+static enum gen_cull_mode
+ilo_translate_cull_face(unsigned cull_face)
+{
+   switch (cull_face) {
+   case PIPE_FACE_NONE:                return GEN6_CULLMODE_NONE;
+   case PIPE_FACE_FRONT:               return GEN6_CULLMODE_FRONT;
+   case PIPE_FACE_BACK:                return GEN6_CULLMODE_BACK;
+   case PIPE_FACE_FRONT_AND_BACK:      return GEN6_CULLMODE_BOTH;
+   }
+   /* unexpected face selection: assert, then fall back to no culling */
+   assert(!"unknown face culling");
+   return GEN6_CULLMODE_NONE;
+}
+
+static enum gen_fill_mode
+ilo_translate_poly_mode(unsigned poly_mode)
+{
+   switch (poly_mode) {
+   case PIPE_POLYGON_MODE_FILL:        return GEN6_FILLMODE_SOLID;
+   case PIPE_POLYGON_MODE_LINE:        return GEN6_FILLMODE_WIREFRAME;
+   case PIPE_POLYGON_MODE_POINT:       return GEN6_FILLMODE_POINT;
+   }
+   /* unexpected polygon mode: assert, then fall back to solid fill */
+   assert(!"unknown polygon mode");
+   return GEN6_FILLMODE_SOLID;
+}
+
+static enum gen_pixel_location
+ilo_translate_half_pixel_center(bool half_pixel_center)
+{
+   return (!half_pixel_center) ? GEN6_PIXLOC_UL_CORNER : GEN6_PIXLOC_CENTER;
+}
+
+static enum gen_compare_function
+ilo_translate_compare_func(unsigned func)
+{
+   switch (func) {
+   case PIPE_FUNC_NEVER:               return GEN6_COMPAREFUNCTION_NEVER;
+   case PIPE_FUNC_LESS:                return GEN6_COMPAREFUNCTION_LESS;
+   case PIPE_FUNC_EQUAL:               return GEN6_COMPAREFUNCTION_EQUAL;
+   case PIPE_FUNC_LEQUAL:              return GEN6_COMPAREFUNCTION_LEQUAL;
+   case PIPE_FUNC_GREATER:             return GEN6_COMPAREFUNCTION_GREATER;
+   case PIPE_FUNC_NOTEQUAL:            return GEN6_COMPAREFUNCTION_NOTEQUAL;
+   case PIPE_FUNC_GEQUAL:              return GEN6_COMPAREFUNCTION_GEQUAL;
+   case PIPE_FUNC_ALWAYS:              return GEN6_COMPAREFUNCTION_ALWAYS;
+   }
+   /* unexpected function: assert, then fall back to NEVER */
+   assert(!"unknown compare function");
+   return GEN6_COMPAREFUNCTION_NEVER;
+}
+
+static enum gen_stencil_op
+ilo_translate_stencil_op(unsigned stencil_op)
+{
+   switch (stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:          return GEN6_STENCILOP_KEEP;
+   case PIPE_STENCIL_OP_ZERO:          return GEN6_STENCILOP_ZERO;
+   case PIPE_STENCIL_OP_REPLACE:       return GEN6_STENCILOP_REPLACE;
+   case PIPE_STENCIL_OP_INCR:          return GEN6_STENCILOP_INCRSAT; /* not GEN6 INCR */
+   case PIPE_STENCIL_OP_DECR:          return GEN6_STENCILOP_DECRSAT; /* not GEN6 DECR */
+   case PIPE_STENCIL_OP_INCR_WRAP:     return GEN6_STENCILOP_INCR; /* pipe _WRAP is the plain GEN6 op */
+   case PIPE_STENCIL_OP_DECR_WRAP:     return GEN6_STENCILOP_DECR;
+   case PIPE_STENCIL_OP_INVERT:        return GEN6_STENCILOP_INVERT;
+   default:                            /* assert, then fall back to KEEP */
+      assert(!"unknown stencil op");
+      return GEN6_STENCILOP_KEEP;
+   }
+}
+
+static enum gen_logic_op
+ilo_translate_logicop(unsigned logicop)
+{
+   switch (logicop) {
+   case PIPE_LOGICOP_CLEAR:            return GEN6_LOGICOP_CLEAR;
+   case PIPE_LOGICOP_NOR:              return GEN6_LOGICOP_NOR;
+   case PIPE_LOGICOP_AND_INVERTED:     return GEN6_LOGICOP_AND_INVERTED;
+   case PIPE_LOGICOP_COPY_INVERTED:    return GEN6_LOGICOP_COPY_INVERTED;
+   case PIPE_LOGICOP_AND_REVERSE:      return GEN6_LOGICOP_AND_REVERSE;
+   case PIPE_LOGICOP_INVERT:           return GEN6_LOGICOP_INVERT;
+   case PIPE_LOGICOP_XOR:              return GEN6_LOGICOP_XOR;
+   case PIPE_LOGICOP_NAND:             return GEN6_LOGICOP_NAND;
+   case PIPE_LOGICOP_AND:              return GEN6_LOGICOP_AND;
+   case PIPE_LOGICOP_EQUIV:            return GEN6_LOGICOP_EQUIV;
+   case PIPE_LOGICOP_NOOP:             return GEN6_LOGICOP_NOOP;
+   case PIPE_LOGICOP_OR_INVERTED:      return GEN6_LOGICOP_OR_INVERTED;
+   case PIPE_LOGICOP_COPY:             return GEN6_LOGICOP_COPY;
+   case PIPE_LOGICOP_OR_REVERSE:       return GEN6_LOGICOP_OR_REVERSE;
+   case PIPE_LOGICOP_OR:               return GEN6_LOGICOP_OR;
+   case PIPE_LOGICOP_SET:              return GEN6_LOGICOP_SET;
+   }
+   /* unexpected logic op: assert, then fall back to CLEAR */
+   assert(!"unknown logicop function");
+   return GEN6_LOGICOP_CLEAR;
+}
+
+static int
+ilo_translate_blend_func(unsigned blend)
+{
+   switch (blend) {
+   case PIPE_BLEND_ADD:                return GEN6_BLENDFUNCTION_ADD;
+   case PIPE_BLEND_SUBTRACT:           return GEN6_BLENDFUNCTION_SUBTRACT;
+   case PIPE_BLEND_REVERSE_SUBTRACT:   return GEN6_BLENDFUNCTION_REVERSE_SUBTRACT;
+   case PIPE_BLEND_MIN:                return GEN6_BLENDFUNCTION_MIN;
+   case PIPE_BLEND_MAX:                return GEN6_BLENDFUNCTION_MAX;
+   }
+   /* unexpected blend function: assert, then fall back to ADD */
+   assert(!"unknown blend function");
+   return GEN6_BLENDFUNCTION_ADD;
+}
+
+static int
+ilo_translate_blend_factor(unsigned factor)
+{
+   switch (factor) {
+   case PIPE_BLENDFACTOR_ONE:                return GEN6_BLENDFACTOR_ONE;
+   case PIPE_BLENDFACTOR_SRC_COLOR:          return GEN6_BLENDFACTOR_SRC_COLOR;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:          return GEN6_BLENDFACTOR_SRC_ALPHA;
+   case PIPE_BLENDFACTOR_DST_ALPHA:          return GEN6_BLENDFACTOR_DST_ALPHA;
+   case PIPE_BLENDFACTOR_DST_COLOR:          return GEN6_BLENDFACTOR_DST_COLOR;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: return GEN6_BLENDFACTOR_SRC_ALPHA_SATURATE;
+   case PIPE_BLENDFACTOR_CONST_COLOR:        return GEN6_BLENDFACTOR_CONST_COLOR;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:        return GEN6_BLENDFACTOR_CONST_ALPHA;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:         return GEN6_BLENDFACTOR_SRC1_COLOR;
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:         return GEN6_BLENDFACTOR_SRC1_ALPHA;
+   case PIPE_BLENDFACTOR_ZERO:               return GEN6_BLENDFACTOR_ZERO;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:      return GEN6_BLENDFACTOR_INV_SRC_COLOR;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:      return GEN6_BLENDFACTOR_INV_SRC_ALPHA;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:      return GEN6_BLENDFACTOR_INV_DST_ALPHA;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:      return GEN6_BLENDFACTOR_INV_DST_COLOR;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:    return GEN6_BLENDFACTOR_INV_CONST_COLOR;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:    return GEN6_BLENDFACTOR_INV_CONST_ALPHA;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:     return GEN6_BLENDFACTOR_INV_SRC1_COLOR;
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:     return GEN6_BLENDFACTOR_INV_SRC1_ALPHA;
+   }
+   /* unexpected blend factor: assert, then fall back to ONE */
+   assert(!"unknown blend factor");
+   return GEN6_BLENDFACTOR_ONE;
+}
+
 static void
 finalize_shader_states(struct ilo_state_vector *vec)
 {
@@ -78,7 +350,7 @@ finalize_shader_states(struct ilo_state_vector *vec)
       /* need to setup SBE for FS */
       if (type == PIPE_SHADER_FRAGMENT && vec->dirty &
             (state | ILO_DIRTY_GS | ILO_DIRTY_VS | ILO_DIRTY_RASTERIZER)) {
-         if (ilo_shader_select_kernel_routing(shader,
+         if (ilo_shader_select_kernel_sbe(shader,
                (vec->gs) ? vec->gs : vec->vs, vec->rasterizer))
             vec->dirty |= state;
       }
@@ -97,7 +369,6 @@ finalize_cbuf_state(struct ilo_context *ilo,
       ~ilo_shader_get_kernel_param(sh, ILO_KERNEL_SKIP_CBUF0_UPLOAD);
 
    while (upload_mask) {
-      const enum pipe_format elem_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
       unsigned offset, i;
 
       i = u_bit_scan(&upload_mask);
@@ -105,14 +376,16 @@ finalize_cbuf_state(struct ilo_context *ilo,
       if (cbuf->cso[i].resource)
          continue;
 
-      u_upload_data(ilo->uploader, 0, cbuf->cso[i].user_buffer_size,
+      u_upload_data(ilo->uploader, 0, cbuf->cso[i].info.size,
             cbuf->cso[i].user_buffer, &offset, &cbuf->cso[i].resource);
 
-      ilo_gpe_init_view_surface_for_buffer(ilo->dev,
-            ilo_buffer(cbuf->cso[i].resource),
-            offset, cbuf->cso[i].user_buffer_size,
-            util_format_get_blocksize(elem_format), elem_format,
-            false, false, &cbuf->cso[i].surface);
+      cbuf->cso[i].info.buf = ilo_buffer(cbuf->cso[i].resource);
+      cbuf->cso[i].info.offset = offset;
+
+      memset(&cbuf->cso[i].surface, 0, sizeof(cbuf->cso[i].surface));
+      ilo_state_surface_init_for_buffer(&cbuf->cso[i].surface,
+            ilo->dev, &cbuf->cso[i].info);
+      cbuf->cso[i].surface.bo = cbuf->cso[i].info.buf->bo;
 
       ilo->state_vector.dirty |= ILO_DIRTY_CBUF;
    }
@@ -133,114 +406,380 @@ finalize_constant_buffers(struct ilo_context *ilo)
 static void
 finalize_index_buffer(struct ilo_context *ilo)
 {
+   const struct ilo_dev *dev = ilo->dev;
    struct ilo_state_vector *vec = &ilo->state_vector;
    const bool need_upload = (vec->draw->indexed &&
-         (vec->ib.user_buffer || vec->ib.offset % vec->ib.index_size));
+         (vec->ib.state.user_buffer ||
+          vec->ib.state.offset % vec->ib.state.index_size));
    struct pipe_resource *current_hw_res = NULL;
+   struct ilo_state_index_buffer_info info;
+   int64_t vertex_start_bias = 0;
 
    if (!(vec->dirty & ILO_DIRTY_IB) && !need_upload)
       return;
 
+   /* make sure vec->ib.hw_resource changes when reallocated */
    pipe_resource_reference(&current_hw_res, vec->ib.hw_resource);
 
    if (need_upload) {
-      const unsigned offset = vec->ib.index_size * vec->draw->start;
-      const unsigned size = vec->ib.index_size * vec->draw->count;
+      const unsigned offset = vec->ib.state.index_size * vec->draw->start;
+      const unsigned size = vec->ib.state.index_size * vec->draw->count;
       unsigned hw_offset;
 
-      if (vec->ib.user_buffer) {
+      if (vec->ib.state.user_buffer) {
          u_upload_data(ilo->uploader, 0, size,
-               vec->ib.user_buffer + offset, &hw_offset, &vec->ib.hw_resource);
-      }
-      else {
-         u_upload_buffer(ilo->uploader, 0, vec->ib.offset + offset, size,
-               vec->ib.buffer, &hw_offset, &vec->ib.hw_resource);
+               vec->ib.state.user_buffer + offset,
+               &hw_offset, &vec->ib.hw_resource);
+      } else {
+         u_upload_buffer(ilo->uploader, 0,
+               vec->ib.state.offset + offset, size, vec->ib.state.buffer,
+               &hw_offset, &vec->ib.hw_resource);
       }
 
       /* the HW offset should be aligned */
-      assert(hw_offset % vec->ib.index_size == 0);
-      vec->ib.draw_start_offset = hw_offset / vec->ib.index_size;
+      assert(hw_offset % vec->ib.state.index_size == 0);
+      vertex_start_bias = hw_offset / vec->ib.state.index_size;
 
       /*
        * INDEX[vec->draw->start] in the original buffer is INDEX[0] in the HW
        * resource
        */
-      vec->ib.draw_start_offset -= vec->draw->start;
-   }
-   else {
-      pipe_resource_reference(&vec->ib.hw_resource, vec->ib.buffer);
+      vertex_start_bias -= vec->draw->start;
+   } else {
+      pipe_resource_reference(&vec->ib.hw_resource, vec->ib.state.buffer);
 
       /* note that index size may be zero when the draw is not indexed */
       if (vec->draw->indexed)
-         vec->ib.draw_start_offset = vec->ib.offset / vec->ib.index_size;
-      else
-         vec->ib.draw_start_offset = 0;
+         vertex_start_bias = vec->ib.state.offset / vec->ib.state.index_size;
    }
 
+   vec->draw_info.vertex_start += vertex_start_bias;
+
    /* treat the IB as clean if the HW states do not change */
    if (vec->ib.hw_resource == current_hw_res &&
-       vec->ib.hw_index_size == vec->ib.index_size)
+       vec->ib.hw_index_size == vec->ib.state.index_size)
       vec->dirty &= ~ILO_DIRTY_IB;
    else
-      vec->ib.hw_index_size = vec->ib.index_size;
+      vec->ib.hw_index_size = vec->ib.state.index_size;
 
    pipe_resource_reference(&current_hw_res, NULL);
+
+   memset(&info, 0, sizeof(info));
+   if (vec->ib.hw_resource) {
+      info.buf = ilo_buffer(vec->ib.hw_resource);
+      info.size = info.buf->bo_size;
+      info.format = ilo_translate_index_size(vec->ib.hw_index_size);
+
+      vec->ib.ib.bo = info.buf->bo;
+   }
+
+   ilo_state_index_buffer_set_info(&vec->ib.ib, dev, &info);
 }
 
 static void
 finalize_vertex_elements(struct ilo_context *ilo)
 {
+   const struct ilo_dev *dev = ilo->dev;
+   struct ilo_state_vector *vec = &ilo->state_vector;
+   struct ilo_ve_state *ve = vec->ve;
+   const bool last_element_edge_flag = (vec->vs &&
+         ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_INPUT_EDGEFLAG));
+   const bool prepend_vertexid = (vec->vs &&
+         ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_INPUT_VERTEXID));
+   const bool prepend_instanceid = (vec->vs &&
+         ilo_shader_get_kernel_param(vec->vs,
+            ILO_KERNEL_VS_INPUT_INSTANCEID));
+   const enum gen_index_format index_format = (vec->draw->indexed) ?
+      ilo_translate_index_size(vec->ib.state.index_size) : GEN6_INDEX_DWORD;
+
+   /* check for non-orthogonal states */
+   if (ve->vf_params.cv_topology != vec->draw_info.topology ||
+       ve->vf_params.prepend_vertexid != prepend_vertexid ||
+       ve->vf_params.prepend_instanceid != prepend_instanceid ||
+       ve->vf_params.last_element_edge_flag != last_element_edge_flag ||
+       ve->vf_params.cv_index_format != index_format ||
+       ve->vf_params.cut_index_enable != vec->draw->primitive_restart ||
+       ve->vf_params.cut_index != vec->draw->restart_index) {
+      ve->vf_params.cv_topology = vec->draw_info.topology;
+      ve->vf_params.prepend_vertexid = prepend_vertexid;
+      ve->vf_params.prepend_instanceid = prepend_instanceid;
+      ve->vf_params.last_element_edge_flag = last_element_edge_flag;
+      ve->vf_params.cv_index_format = index_format;
+      ve->vf_params.cut_index_enable = vec->draw->primitive_restart;
+      ve->vf_params.cut_index = vec->draw->restart_index;
+
+      ilo_state_vf_set_params(&ve->vf, dev, &ve->vf_params);
+
+      vec->dirty |= ILO_DIRTY_VE;
+   }
+}
+
+static void
+finalize_vertex_buffers(struct ilo_context *ilo)
+{
+   const struct ilo_dev *dev = ilo->dev;
    struct ilo_state_vector *vec = &ilo->state_vector;
+   struct ilo_state_vertex_buffer_info info;
+   unsigned i;
 
-   if (!(vec->dirty & (ILO_DIRTY_VE | ILO_DIRTY_VS)))
+   if (!(vec->dirty & (ILO_DIRTY_VE | ILO_DIRTY_VB)))
       return;
 
-   vec->dirty |= ILO_DIRTY_VE;
+   memset(&info, 0, sizeof(info));
+
+   for (i = 0; i < vec->ve->vb_count; i++) {
+      const unsigned pipe_idx = vec->ve->vb_mapping[i];
+      const struct pipe_vertex_buffer *cso = &vec->vb.states[pipe_idx];
+
+      if (cso->buffer) {
+         info.buf = ilo_buffer(cso->buffer);
+         info.offset = cso->buffer_offset;
+         info.size = info.buf->bo_size;
+
+         info.stride = cso->stride;
+
+         vec->vb.vb[i].bo = info.buf->bo;
+      } else {
+         memset(&info, 0, sizeof(info));
+      }
+
+      ilo_state_vertex_buffer_set_info(&vec->vb.vb[i], dev, &info);
+   }
+}
+
+static void
+finalize_urb(struct ilo_context *ilo)
+{
+   const uint16_t attr_size = sizeof(uint32_t) * 4;
+   const struct ilo_dev *dev = ilo->dev;
+   struct ilo_state_vector *vec = &ilo->state_vector;
+   struct ilo_state_urb_info info;
+
+   if (!(vec->dirty & (ILO_DIRTY_VE | ILO_DIRTY_VS |
+                       ILO_DIRTY_GS | ILO_DIRTY_FS)))
+      return;
+
+   memset(&info, 0, sizeof(info));
+
+   info.ve_entry_size = attr_size * ilo_state_vf_get_attr_count(&vec->ve->vf);
+
+   if (vec->vs) {
+      info.vs_const_data = (bool)
+         (ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_PCB_CBUF0_SIZE) +
+          ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_PCB_UCP_SIZE));
+      info.vs_entry_size = attr_size *
+         ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_OUTPUT_COUNT);
+   }
+
+   if (vec->gs) {
+      info.gs_const_data = (bool)
+         ilo_shader_get_kernel_param(vec->gs, ILO_KERNEL_PCB_CBUF0_SIZE);
 
-   vec->ve->last_cso_edgeflag = false;
-   if (vec->ve->count && vec->vs &&
-         ilo_shader_get_kernel_param(vec->vs, ILO_KERNEL_VS_INPUT_EDGEFLAG)) {
-      vec->ve->edgeflag_cso = vec->ve->cso[vec->ve->count - 1];
-      ilo_gpe_set_ve_edgeflag(ilo->dev, &vec->ve->edgeflag_cso);
-      vec->ve->last_cso_edgeflag = true;
-   }
-
-   vec->ve->prepend_nosrc_cso = false;
-   if (vec->vs &&
-       (ilo_shader_get_kernel_param(vec->vs,
-                                    ILO_KERNEL_VS_INPUT_INSTANCEID) ||
-        ilo_shader_get_kernel_param(vec->vs,
-                                    ILO_KERNEL_VS_INPUT_VERTEXID))) {
-      ilo_gpe_init_ve_nosrc(ilo->dev,
-            GEN6_VFCOMP_STORE_VID,
-            GEN6_VFCOMP_STORE_IID,
-            GEN6_VFCOMP_NOSTORE,
-            GEN6_VFCOMP_NOSTORE,
-            &vec->ve->nosrc_cso);
-      vec->ve->prepend_nosrc_cso = true;
-   } else if (!vec->vs) {
-      /* generate VUE header */
-      ilo_gpe_init_ve_nosrc(ilo->dev,
-            GEN6_VFCOMP_STORE_0, /* Reserved */
-            GEN6_VFCOMP_STORE_0, /* Render Target Array Index */
-            GEN6_VFCOMP_STORE_0, /* Viewport Index */
-            GEN6_VFCOMP_STORE_0, /* Point Width */
-            &vec->ve->nosrc_cso);
-      vec->ve->prepend_nosrc_cso = true;
-   } else if (!vec->ve->count) {
       /*
-       * From the Sandy Bridge PRM, volume 2 part 1, page 92:
+       * From the Ivy Bridge PRM, volume 2 part 1, page 189:
+       *
+       *     "All outputs of a GS thread will be stored in the single GS
+       *      thread output URB entry."
        *
-       *    "SW must ensure that at least one vertex element is defined prior
-       *     to issuing a 3DPRIMTIVE command, or operation is UNDEFINED."
+       * TODO
        */
-      ilo_gpe_init_ve_nosrc(ilo->dev,
-            GEN6_VFCOMP_STORE_0,
-            GEN6_VFCOMP_STORE_0,
-            GEN6_VFCOMP_STORE_0,
-            GEN6_VFCOMP_STORE_1_FP,
-            &vec->ve->nosrc_cso);
-      vec->ve->prepend_nosrc_cso = true;
+      info.gs_entry_size = attr_size *
+         ilo_shader_get_kernel_param(vec->gs, ILO_KERNEL_OUTPUT_COUNT);
+   }
+
+   if (vec->fs) {
+      info.ps_const_data = (bool)
+         ilo_shader_get_kernel_param(vec->fs, ILO_KERNEL_PCB_CBUF0_SIZE);
+   }
+
+   ilo_state_urb_set_info(&vec->urb, dev, &info);
+}
+
+static void
+finalize_viewport(struct ilo_context *ilo)
+{
+   const struct ilo_dev *dev = ilo->dev;
+   struct ilo_state_vector *vec = &ilo->state_vector;
+
+   if (vec->dirty & ILO_DIRTY_VIEWPORT) {
+      ilo_state_viewport_set_params(&vec->viewport.vp,
+            dev, &vec->viewport.params, false);
+   } else if (vec->dirty & ILO_DIRTY_SCISSOR) {
+      ilo_state_viewport_set_params(&vec->viewport.vp,
+            dev, &vec->viewport.params, true);
+      vec->dirty |= ILO_DIRTY_VIEWPORT;
+   }
+}
+
+static bool
+can_enable_gb_test(const struct ilo_rasterizer_state *rasterizer,
+                   const struct ilo_viewport_state *viewport,
+                   const struct ilo_fb_state *fb)
+{
+   unsigned i;
+
+   /*
+    * There are several reasons that guard band test should be disabled
+    *
+    *  - GL wide points (to avoid partially visible objects)
+    *  - GL wide or AA lines (to avoid partially visible objects)
+    *  - missing 2D clipping
+    */
+   if (rasterizer->state.point_size_per_vertex ||
+       rasterizer->state.point_size > 1.0f ||
+       rasterizer->state.line_width > 1.0f ||
+       rasterizer->state.line_smooth)
+      return false;
+
+   for (i = 0; i < viewport->params.count; i++) {
+      const struct ilo_state_viewport_matrix_info *mat =
+         &viewport->matrices[i];
+      float min_x, max_x, min_y, max_y;
+
+      min_x = -1.0f * fabsf(mat->scale[0]) + mat->translate[0];
+      max_x =  1.0f * fabsf(mat->scale[0]) + mat->translate[0];
+      min_y = -1.0f * fabsf(mat->scale[1]) + mat->translate[1];
+      max_y =  1.0f * fabsf(mat->scale[1]) + mat->translate[1];
+
+      if (min_x > 0.0f || max_x < fb->state.width ||
+          min_y > 0.0f || max_y < fb->state.height)
+         return false;
+   }
+
+   return true;
+}
+
+static void
+finalize_rasterizer(struct ilo_context *ilo)
+{
+   const struct ilo_dev *dev = ilo->dev;
+   struct ilo_state_vector *vec = &ilo->state_vector;
+   struct ilo_rasterizer_state *rasterizer = vec->rasterizer;
+   struct ilo_state_raster_info *info = &vec->rasterizer->info;
+   const bool gb_test_enable =
+      can_enable_gb_test(rasterizer, &vec->viewport, &vec->fb);
+   const bool multisample =
+      (rasterizer->state.multisample && vec->fb.num_samples > 1);
+   const uint8_t barycentric_interps = ilo_shader_get_kernel_param(vec->fs,
+         ILO_KERNEL_FS_BARYCENTRIC_INTERPOLATIONS);
+
+   /* check for non-orthogonal states */
+   if (info->clip.viewport_count != vec->viewport.params.count ||
+       info->clip.gb_test_enable != gb_test_enable ||
+       info->setup.msaa_enable != multisample ||
+       info->setup.line_msaa_enable != multisample ||
+       info->tri.depth_offset_format != vec->fb.depth_offset_format ||
+       info->scan.sample_count != vec->fb.num_samples ||
+       info->scan.sample_mask != vec->sample_mask ||
+       info->scan.barycentric_interps != barycentric_interps ||
+       info->params.any_integer_rt != vec->fb.has_integer_rt ||
+       info->params.hiz_enable != vec->fb.has_hiz) {
+      info->clip.viewport_count = vec->viewport.params.count;
+      info->clip.gb_test_enable = gb_test_enable;
+      info->setup.msaa_enable = multisample;
+      info->setup.line_msaa_enable = multisample;
+      info->tri.depth_offset_format = vec->fb.depth_offset_format;
+      info->scan.sample_count = vec->fb.num_samples;
+      info->scan.sample_mask = vec->sample_mask;
+      info->scan.barycentric_interps = barycentric_interps;
+      info->params.any_integer_rt = vec->fb.has_integer_rt;
+      info->params.hiz_enable = vec->fb.has_hiz;
+
+      ilo_state_raster_set_info(&rasterizer->rs, dev, &rasterizer->info);
+
+      vec->dirty |= ILO_DIRTY_RASTERIZER;
+   }
+}
+
+static bool
+finalize_blend_rt(struct ilo_context *ilo)
+{
+   struct ilo_state_vector *vec = &ilo->state_vector;
+   const struct ilo_fb_state *fb = &vec->fb;
+   struct ilo_blend_state *blend = vec->blend;
+   struct ilo_state_cc_blend_info *info = &vec->blend->info.blend;
+   bool changed = false;
+   unsigned i;
+
+   if (!(vec->dirty & (ILO_DIRTY_FB | ILO_DIRTY_BLEND)))
+      return false;
+
+   /* set up one for dummy RT writes */
+   if (!fb->state.nr_cbufs) {
+      if (info->rt != &blend->dummy_rt) {
+         info->rt = &blend->dummy_rt;
+         info->rt_count = 1;
+         changed = true;
+      }
+
+      return changed;
+   }
+
+   if (info->rt != blend->effective_rt ||
+       info->rt_count != fb->state.nr_cbufs) {
+      info->rt = blend->effective_rt;
+      info->rt_count = fb->state.nr_cbufs;
+      changed = true;
+   }
+
+   for (i = 0; i < fb->state.nr_cbufs; i++) {
+      const struct ilo_fb_blend_caps *caps = &fb->blend_caps[i];
+      struct ilo_state_cc_blend_rt_info *rt = &blend->effective_rt[i];
+      /* ignore logicop when not UNORM */
+      const bool logicop_enable =
+         (blend->rt[i].logicop_enable && caps->is_unorm);
+
+      if (rt->cv_is_unorm != caps->is_unorm ||
+          rt->cv_is_integer != caps->is_integer ||
+          rt->logicop_enable != logicop_enable ||
+          rt->force_dst_alpha_one != caps->force_dst_alpha_one) {
+         rt->cv_is_unorm = caps->is_unorm;
+         rt->cv_is_integer = caps->is_integer;
+         rt->logicop_enable = logicop_enable;
+         rt->force_dst_alpha_one = caps->force_dst_alpha_one;
+
+         changed = true;
+      }
+   }
+
+   return changed;
+}
+
+static void
+finalize_blend(struct ilo_context *ilo)
+{
+   const struct ilo_dev *dev = ilo->dev;
+   struct ilo_state_vector *vec = &ilo->state_vector;
+   struct ilo_blend_state *blend = vec->blend;
+   struct ilo_state_cc_info *info = &blend->info;
+   const bool sample_count_one = (vec->fb.num_samples <= 1);
+   const bool float_source0_alpha =
+      (!vec->fb.state.nr_cbufs || !vec->fb.state.cbufs[0] ||
+       !util_format_is_pure_integer(vec->fb.state.cbufs[0]->format));
+
+   /* check for non-orthogonal states */
+   if (finalize_blend_rt(ilo) ||
+       info->alpha.cv_sample_count_one != sample_count_one ||
+       info->alpha.cv_float_source0_alpha != float_source0_alpha ||
+       info->alpha.test_enable != vec->dsa->alpha_test ||
+       info->alpha.test_func != vec->dsa->alpha_func ||
+       memcmp(&info->stencil, &vec->dsa->stencil, sizeof(info->stencil)) ||
+       memcmp(&info->depth, &vec->dsa->depth, sizeof(info->depth)) ||
+       memcmp(&info->params, &vec->cc_params, sizeof(info->params))) {
+      info->alpha.cv_sample_count_one = sample_count_one;
+      info->alpha.cv_float_source0_alpha = float_source0_alpha;
+      info->alpha.test_enable = vec->dsa->alpha_test;
+      info->alpha.test_func = vec->dsa->alpha_func;
+      info->stencil = vec->dsa->stencil;
+      info->depth = vec->dsa->depth;
+      info->params = vec->cc_params;
+
+      ilo_state_cc_set_info(&blend->cc, dev, info);
+
+      blend->alpha_may_kill = (info->alpha.alpha_to_coverage ||
+                               info->alpha.test_enable);
+
+      vec->dirty |= ILO_DIRTY_BLEND;
    }
 }
 
@@ -254,10 +793,24 @@ ilo_finalize_3d_states(struct ilo_context *ilo,
 {
    ilo->state_vector.draw = draw;
 
+   ilo->state_vector.draw_info.topology = ilo_translate_draw_mode(draw->mode);
+   ilo->state_vector.draw_info.indexed = draw->indexed;
+   ilo->state_vector.draw_info.vertex_count = draw->count;
+   ilo->state_vector.draw_info.vertex_start = draw->start;
+   ilo->state_vector.draw_info.instance_count = draw->instance_count;
+   ilo->state_vector.draw_info.instance_start = draw->start_instance;
+   ilo->state_vector.draw_info.vertex_base = draw->index_bias;
+
+   finalize_blend(ilo);
    finalize_shader_states(&ilo->state_vector);
    finalize_constant_buffers(ilo);
    finalize_index_buffer(ilo);
    finalize_vertex_elements(ilo);
+   finalize_vertex_buffers(ilo);
+
+   finalize_urb(ilo);
+   finalize_rasterizer(ilo);
+   finalize_viewport(ilo);
 
    u_upload_unmap(ilo->uploader);
 }
@@ -301,12 +854,79 @@ ilo_create_blend_state(struct pipe_context *pipe,
                        const struct pipe_blend_state *state)
 {
    const struct ilo_dev *dev = ilo_context(pipe)->dev;
+   struct ilo_state_cc_info *info;
    struct ilo_blend_state *blend;
+   int i;
 
-   blend = MALLOC_STRUCT(ilo_blend_state);
+   blend = CALLOC_STRUCT(ilo_blend_state);
    assert(blend);
 
-   ilo_gpe_init_blend(dev, state, blend);
+   info = &blend->info;
+
+   info->alpha.cv_float_source0_alpha = true;
+   info->alpha.cv_sample_count_one = true;
+   info->alpha.alpha_to_one = state->alpha_to_one;
+   info->alpha.alpha_to_coverage = state->alpha_to_coverage;
+   info->alpha.test_enable = false;
+   info->alpha.test_func = GEN6_COMPAREFUNCTION_ALWAYS;
+
+   info->stencil.cv_has_buffer = true;
+   info->depth.cv_has_buffer= true;
+
+   info->blend.rt = blend->effective_rt;
+   info->blend.rt_count = 1;
+   info->blend.dither_enable = state->dither;
+
+   for (i = 0; i < ARRAY_SIZE(blend->rt); i++) {
+      const struct pipe_rt_blend_state *rt = &state->rt[i];
+      struct ilo_state_cc_blend_rt_info *rt_info = &blend->rt[i];
+
+      rt_info->cv_has_buffer = true;
+      rt_info->cv_is_unorm = true;
+      rt_info->cv_is_integer = false;
+
+      /* logic op takes precedence over blending */
+      if (state->logicop_enable) {
+         rt_info->logicop_enable = true;
+         rt_info->logicop_func = ilo_translate_logicop(state->logicop_func);
+      } else if (rt->blend_enable) {
+         rt_info->blend_enable = true;
+
+         rt_info->rgb_src = ilo_translate_blend_factor(rt->rgb_src_factor);
+         rt_info->rgb_dst = ilo_translate_blend_factor(rt->rgb_dst_factor);
+         rt_info->rgb_func = ilo_translate_blend_func(rt->rgb_func);
+
+         rt_info->a_src = ilo_translate_blend_factor(rt->alpha_src_factor);
+         rt_info->a_dst = ilo_translate_blend_factor(rt->alpha_dst_factor);
+         rt_info->a_func = ilo_translate_blend_func(rt->alpha_func);
+      }
+
+      if (!(rt->colormask & PIPE_MASK_A))
+         rt_info->argb_write_disables |= (1 << 3);
+      if (!(rt->colormask & PIPE_MASK_R))
+         rt_info->argb_write_disables |= (1 << 2);
+      if (!(rt->colormask & PIPE_MASK_G))
+         rt_info->argb_write_disables |= (1 << 1);
+      if (!(rt->colormask & PIPE_MASK_B))
+         rt_info->argb_write_disables |= (1 << 0);
+
+      if (!state->independent_blend_enable) {
+         for (i = 1; i < ARRAY_SIZE(blend->rt); i++)
+            blend->rt[i] = *rt_info;
+         break;
+      }
+   }
+
+   memcpy(blend->effective_rt, blend->rt, sizeof(blend->rt));
+
+   blend->dummy_rt.argb_write_disables = 0xf;
+
+   if (!ilo_state_cc_init(&blend->cc, dev, &blend->info)) {
+      FREE(blend);
+      return NULL;
+   }
+
+   blend->dual_blend = util_blend_state_is_dual(state, 0);
 
    return blend;
 }
@@ -333,11 +953,105 @@ ilo_create_sampler_state(struct pipe_context *pipe,
 {
    const struct ilo_dev *dev = ilo_context(pipe)->dev;
    struct ilo_sampler_cso *sampler;
+   struct ilo_state_sampler_info info;
+   struct ilo_state_sampler_border_info border;
 
-   sampler = MALLOC_STRUCT(ilo_sampler_cso);
+   sampler = CALLOC_STRUCT(ilo_sampler_cso);
    assert(sampler);
 
-   ilo_gpe_init_sampler_cso(dev, state, sampler);
+   memset(&info, 0, sizeof(info));
+
+   info.non_normalized = !state->normalized_coords;
+   if (state->normalized_coords) {
+      info.lod_bias = state->lod_bias;
+      info.min_lod = state->min_lod;
+      info.max_lod = state->max_lod;
+
+      info.mip_filter = ilo_translate_mip_filter(state->min_mip_filter);
+   } else {
+      /* work around a bug in util_blitter */
+      info.mip_filter = GEN6_MIPFILTER_NONE;
+   }
+
+   if (state->max_anisotropy) {
+      info.min_filter = GEN6_MAPFILTER_ANISOTROPIC;
+      info.mag_filter = GEN6_MAPFILTER_ANISOTROPIC;
+   } else {
+      info.min_filter = ilo_translate_img_filter(state->min_img_filter);
+      info.mag_filter = ilo_translate_img_filter(state->mag_img_filter);
+   }
+
+   info.max_anisotropy = ilo_translate_max_anisotropy(state->max_anisotropy);
+
+   /* use LOD 0 when no mipmapping (see sampler_set_gen6_SAMPLER_STATE()) */
+   if (info.mip_filter == GEN6_MIPFILTER_NONE && info.min_lod > 0.0f) {
+      info.min_lod = 0.0f;
+      info.mag_filter = info.min_filter;
+   }
+
+   if (state->seamless_cube_map) {
+      if (state->min_img_filter == PIPE_TEX_FILTER_NEAREST ||
+          state->mag_img_filter == PIPE_TEX_FILTER_NEAREST) {
+         info.tcx_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+         info.tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+         info.tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+      } else {
+         info.tcx_ctrl = GEN6_TEXCOORDMODE_CUBE;
+         info.tcy_ctrl = GEN6_TEXCOORDMODE_CUBE;
+         info.tcz_ctrl = GEN6_TEXCOORDMODE_CUBE;
+      }
+   } else {
+      info.tcx_ctrl = ilo_translate_address_wrap(state->wrap_s);
+      info.tcy_ctrl = ilo_translate_address_wrap(state->wrap_t);
+      info.tcz_ctrl = ilo_translate_address_wrap(state->wrap_r);
+
+      if (ilo_dev_gen(dev) < ILO_GEN(8)) {
+         /*
+          * For nearest filtering, PIPE_TEX_WRAP_CLAMP means
+          * PIPE_TEX_WRAP_CLAMP_TO_EDGE;  for linear filtering,
+          * PIPE_TEX_WRAP_CLAMP means PIPE_TEX_WRAP_CLAMP_TO_BORDER while
+          * additionally clamping the texture coordinates to [0.0, 1.0].
+          *
+          * PIPE_TEX_WRAP_CLAMP is not supported natively until Gen8.  The
+          * clamping has to be taken care of in the shaders.  There are two
+          * filters here, but we let the minification filter decide.
+          */
+         const bool clamp_is_to_edge =
+            (state->min_img_filter == PIPE_TEX_FILTER_NEAREST);
+
+         if (clamp_is_to_edge) {
+            if (info.tcx_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER)
+               info.tcx_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+            if (info.tcy_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER)
+               info.tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+            if (info.tcz_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER)
+               info.tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP;
+         } else {
+            if (info.tcx_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) {
+               info.tcx_ctrl = GEN6_TEXCOORDMODE_CLAMP_BORDER;
+               sampler->saturate_s = true;
+            }
+            if (info.tcy_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) {
+               info.tcy_ctrl = GEN6_TEXCOORDMODE_CLAMP_BORDER;
+               sampler->saturate_t = true;
+            }
+            if (info.tcz_ctrl == GEN8_TEXCOORDMODE_HALF_BORDER) {
+               info.tcz_ctrl = GEN6_TEXCOORDMODE_CLAMP_BORDER;
+               sampler->saturate_r = true;
+            }
+         }
+      }
+   }
+
+   if (state->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE)
+      info.shadow_func = ilo_translate_shadow_func(state->compare_func);
+
+   ilo_state_sampler_init(&sampler->sampler, dev, &info);
+
+   memset(&border, 0, sizeof(border));
+   memcpy(border.rgba.f, state->border_color.f, sizeof(border.rgba.f));
+
+   ilo_state_sampler_border_init(&sampler->border, dev, &border);
 
    return sampler;
 }
@@ -403,12 +1117,74 @@ ilo_create_rasterizer_state(struct pipe_context *pipe,
 {
    const struct ilo_dev *dev = ilo_context(pipe)->dev;
    struct ilo_rasterizer_state *rast;
+   struct ilo_state_raster_info *info;
 
-   rast = MALLOC_STRUCT(ilo_rasterizer_state);
+   rast = CALLOC_STRUCT(ilo_rasterizer_state);
    assert(rast);
 
    rast->state = *state;
-   ilo_gpe_init_rasterizer(dev, state, rast);
+
+   info = &rast->info;
+
+   info->clip.clip_enable = true;
+   info->clip.stats_enable = true;
+   info->clip.viewport_count = 1;
+   info->clip.force_rtaindex_zero = true;
+   info->clip.user_clip_enables = state->clip_plane_enable;
+   info->clip.gb_test_enable = true;
+   info->clip.xy_test_enable = true;
+   info->clip.z_far_enable = state->depth_clip;
+   info->clip.z_near_enable = state->depth_clip;
+   info->clip.z_near_zero = state->clip_halfz;
+
+   info->setup.first_vertex_provoking = state->flatshade_first;
+   info->setup.viewport_transform = true;
+   info->setup.scissor_enable = state->scissor;
+   info->setup.msaa_enable = false;
+   info->setup.line_msaa_enable = false;
+   info->point.aa_enable = state->point_smooth;
+   info->point.programmable_width = state->point_size_per_vertex;
+   info->line.aa_enable = state->line_smooth;
+   info->line.stipple_enable = state->line_stipple_enable;
+   info->line.giq_enable = true;
+   info->line.giq_last_pixel = state->line_last_pixel;
+   info->tri.front_winding = ilo_translate_front_ccw(state->front_ccw);
+   info->tri.cull_mode = ilo_translate_cull_face(state->cull_face);
+   info->tri.fill_mode_front = ilo_translate_poly_mode(state->fill_front);
+   info->tri.fill_mode_back = ilo_translate_poly_mode(state->fill_back);
+   info->tri.depth_offset_format = GEN6_ZFORMAT_D24_UNORM_X8_UINT;
+   info->tri.depth_offset_solid = state->offset_tri;
+   info->tri.depth_offset_wireframe = state->offset_line;
+   info->tri.depth_offset_point = state->offset_point;
+   info->tri.poly_stipple_enable = state->poly_stipple_enable;
+
+   info->scan.stats_enable = true;
+   info->scan.sample_count = 1;
+   info->scan.pixloc =
+      ilo_translate_half_pixel_center(state->half_pixel_center);
+   info->scan.sample_mask = ~0u;
+   info->scan.zw_interp = GEN6_ZW_INTERP_PIXEL;
+   info->scan.barycentric_interps = GEN6_INTERP_PERSPECTIVE_PIXEL;
+   info->scan.earlyz_control = GEN7_EDSC_NORMAL;
+   info->scan.earlyz_op = ILO_STATE_RASTER_EARLYZ_NORMAL;
+   info->scan.earlyz_stencil_clear = false;
+
+   info->params.any_integer_rt = false;
+   info->params.hiz_enable = true;
+   info->params.point_width =
+      (state->point_size == 0.0f) ? 1.0f : state->point_size;
+   info->params.line_width =
+      (state->line_width == 0.0f) ? 1.0f : state->line_width;
+
+   info->params.depth_offset_scale = state->offset_scale;
+   /*
+    * Scale the constant term.  The minimum representable value used by the HW
+    * is not large enough to be the minimum resolvable difference.
+    */
+   info->params.depth_offset_const = state->offset_units * 2.0f;
+   info->params.depth_offset_clamp = state->offset_clamp;
+
+   ilo_state_raster_init(&rast->rs, dev, info);
 
    return rast;
 }
@@ -416,10 +1192,20 @@ ilo_create_rasterizer_state(struct pipe_context *pipe,
 static void
 ilo_bind_rasterizer_state(struct pipe_context *pipe, void *state)
 {
+   const struct ilo_dev *dev = ilo_context(pipe)->dev;
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
 
    vec->rasterizer = state;
 
+   if (vec->rasterizer) {
+      struct ilo_state_line_stipple_info info;
+
+      info.pattern = vec->rasterizer->state.line_stipple_pattern;
+      info.repeat_count = vec->rasterizer->state.line_stipple_factor + 1;
+
+      ilo_state_line_stipple_set_info(&vec->line_stipple, dev, &info);
+   }
+
    vec->dirty |= ILO_DIRTY_RASTERIZER;
 }
 
@@ -433,13 +1219,48 @@ static void *
 ilo_create_depth_stencil_alpha_state(struct pipe_context *pipe,
                                      const struct pipe_depth_stencil_alpha_state *state)
 {
-   const struct ilo_dev *dev = ilo_context(pipe)->dev;
    struct ilo_dsa_state *dsa;
+   int i;
 
-   dsa = MALLOC_STRUCT(ilo_dsa_state);
+   dsa = CALLOC_STRUCT(ilo_dsa_state);
    assert(dsa);
 
-   ilo_gpe_init_dsa(dev, state, dsa);
+   dsa->depth.cv_has_buffer = true;
+   dsa->depth.test_enable = state->depth.enabled;
+   dsa->depth.write_enable = state->depth.writemask;
+   dsa->depth.test_func = ilo_translate_compare_func(state->depth.func);
+
+   dsa->stencil.cv_has_buffer = true;
+   for (i = 0; i < ARRAY_SIZE(state->stencil); i++) {
+      const struct pipe_stencil_state *stencil = &state->stencil[i];
+      struct ilo_state_cc_stencil_op_info *op;
+
+      if (!stencil->enabled)
+         break;
+
+      if (i == 0) {
+         dsa->stencil.test_enable = true;
+         dsa->stencil_front.test_mask = stencil->valuemask;
+         dsa->stencil_front.write_mask = stencil->writemask;
+
+         op = &dsa->stencil.front;
+      } else {
+         dsa->stencil.twosided_enable = true;
+         dsa->stencil_back.test_mask = stencil->valuemask;
+         dsa->stencil_back.write_mask = stencil->writemask;
+
+         op = &dsa->stencil.back;
+      }
+
+      op->test_func = ilo_translate_compare_func(stencil->func);
+      op->fail_op = ilo_translate_stencil_op(stencil->fail_op);
+      op->zfail_op = ilo_translate_stencil_op(stencil->zfail_op);
+      op->zpass_op = ilo_translate_stencil_op(stencil->zpass_op);
+   }
+
+   dsa->alpha_test = state->alpha.enabled;
+   dsa->alpha_ref = state->alpha.ref_value;
+   dsa->alpha_func = ilo_translate_compare_func(state->alpha.func);
 
    return dsa;
 }
@@ -450,6 +1271,17 @@ ilo_bind_depth_stencil_alpha_state(struct pipe_context *pipe, void *state)
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
 
    vec->dsa = state;
+   if (vec->dsa) {
+      vec->cc_params.alpha_ref = vec->dsa->alpha_ref;
+      vec->cc_params.stencil_front.test_mask =
+         vec->dsa->stencil_front.test_mask;
+      vec->cc_params.stencil_front.write_mask =
+         vec->dsa->stencil_front.write_mask;
+      vec->cc_params.stencil_back.test_mask =
+         vec->dsa->stencil_back.test_mask;
+      vec->cc_params.stencil_back.write_mask =
+         vec->dsa->stencil_back.write_mask;
+   }
 
    vec->dirty |= ILO_DIRTY_DSA;
 }
@@ -575,12 +1407,60 @@ ilo_create_vertex_elements_state(struct pipe_context *pipe,
                                  const struct pipe_vertex_element *elements)
 {
    const struct ilo_dev *dev = ilo_context(pipe)->dev;
+   struct ilo_state_vf_element_info vf_elements[PIPE_MAX_ATTRIBS];
+   unsigned instance_divisors[PIPE_MAX_ATTRIBS];
+   struct ilo_state_vf_info vf_info;
    struct ilo_ve_state *ve;
+   unsigned i;
 
-   ve = MALLOC_STRUCT(ilo_ve_state);
+   ve = CALLOC_STRUCT(ilo_ve_state);
    assert(ve);
 
-   ilo_gpe_init_ve(dev, num_elements, elements, ve);
+   for (i = 0; i < num_elements; i++) {
+      const struct pipe_vertex_element *elem = &elements[i];
+      struct ilo_state_vf_element_info *attr = &vf_elements[i];
+      unsigned hw_idx;
+
+      /*
+       * map the pipe vb to the hardware vb, which has a fixed instance
+       * divisor
+       */
+      for (hw_idx = 0; hw_idx < ve->vb_count; hw_idx++) {
+         if (ve->vb_mapping[hw_idx] == elem->vertex_buffer_index &&
+             instance_divisors[hw_idx] == elem->instance_divisor)
+            break;
+      }
+
+      /* create one if there is no matching hardware vb */
+      if (hw_idx >= ve->vb_count) {
+         hw_idx = ve->vb_count++;
+
+         ve->vb_mapping[hw_idx] = elem->vertex_buffer_index;
+         instance_divisors[hw_idx] = elem->instance_divisor;
+      }
+
+      attr->buffer = hw_idx;
+      attr->vertex_offset = elem->src_offset;
+      attr->format = ilo_format_translate_vertex(dev, elem->src_format);
+      attr->format_size = util_format_get_blocksize(elem->src_format);
+      attr->component_count = util_format_get_nr_components(elem->src_format);
+      attr->is_integer = util_format_is_pure_integer(elem->src_format);
+
+      attr->instancing_enable = (elem->instance_divisor != 0);
+      attr->instancing_step_rate = elem->instance_divisor;
+   }
+
+   memset(&vf_info, 0, sizeof(vf_info));
+   vf_info.data = ve->vf_data;
+   vf_info.data_size = sizeof(ve->vf_data);
+   vf_info.elements = vf_elements;
+   vf_info.element_count = num_elements;
+   /* vf_info.params and ve->vf_params are both zeroed */
+
+   if (!ilo_state_vf_init(&ve->vf, dev, &vf_info)) {
+      FREE(ve);
+      return NULL;
+   }
 
    return ve;
 }
@@ -609,7 +1489,7 @@ ilo_set_blend_color(struct pipe_context *pipe,
 {
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
 
-   vec->blend_color = *state;
+   memcpy(vec->cc_params.blend_rgba, state->color, sizeof(state->color));
 
    vec->dirty |= ILO_DIRTY_BLEND_COLOR;
 }
@@ -626,6 +1506,9 @@ ilo_set_stencil_ref(struct pipe_context *pipe,
 
    vec->stencil_ref = *state;
 
+   vec->cc_params.stencil_front.test_ref = state->ref_value[0];
+   vec->cc_params.stencil_back.test_ref = state->ref_value[1];
+
    vec->dirty |= ILO_DIRTY_STENCIL_REF;
 }
 
@@ -675,47 +1558,47 @@ ilo_set_constant_buffer(struct pipe_context *pipe,
 
          pipe_resource_reference(&cso->resource, buf[i].buffer);
 
+         cso->info.access = ILO_STATE_SURFACE_ACCESS_DP_DATA;
+         cso->info.format = GEN6_FORMAT_R32G32B32A32_FLOAT;
+         cso->info.format_size = 16;
+         cso->info.struct_size = 16;
+         cso->info.readonly = true;
+         cso->info.size = buf[i].buffer_size;
+
          if (buf[i].buffer) {
-            const enum pipe_format elem_format =
-               PIPE_FORMAT_R32G32B32A32_FLOAT;
+            cso->info.buf = ilo_buffer(buf[i].buffer);
+            cso->info.offset = buf[i].buffer_offset;
 
-            ilo_gpe_init_view_surface_for_buffer(dev,
-                  ilo_buffer(buf[i].buffer),
-                  buf[i].buffer_offset, buf[i].buffer_size,
-                  util_format_get_blocksize(elem_format), elem_format,
-                  false, false, &cso->surface);
+            memset(&cso->surface, 0, sizeof(cso->surface));
+            ilo_state_surface_init_for_buffer(&cso->surface, dev, &cso->info);
+            cso->surface.bo = cso->info.buf->bo;
 
             cso->user_buffer = NULL;
-            cso->user_buffer_size = 0;
 
             cbuf->enabled_mask |= 1 << (index + i);
-         }
-         else if (buf[i].user_buffer) {
-            cso->surface.bo = NULL;
-
+         } else if (buf[i].user_buffer) {
+            cso->info.buf = NULL;
             /* buffer_offset does not apply for user buffer */
             cso->user_buffer = buf[i].user_buffer;
-            cso->user_buffer_size = buf[i].buffer_size;
 
             cbuf->enabled_mask |= 1 << (index + i);
-         }
-         else {
-            cso->surface.bo = NULL;
+         } else {
+            cso->info.buf = NULL;
+            cso->info.size = 0;
             cso->user_buffer = NULL;
-            cso->user_buffer_size = 0;
 
             cbuf->enabled_mask &= ~(1 << (index + i));
          }
       }
-   }
-   else {
+   } else {
       for (i = 0; i < count; i++) {
          struct ilo_cbuf_cso *cso = &cbuf->cso[index + i];
 
          pipe_resource_reference(&cso->resource, NULL);
-         cso->surface.bo = NULL;
+
+         cso->info.buf = NULL;
+         cso->info.size = 0;
          cso->user_buffer = NULL;
-         cso->user_buffer_size = 0;
 
          cbuf->enabled_mask &= ~(1 << (index + i));
       }
@@ -725,13 +1608,116 @@ ilo_set_constant_buffer(struct pipe_context *pipe,
 }
 
 static void
+fb_set_blend_caps(const struct ilo_dev *dev, /* its gen gates can_logicop */
+                  enum pipe_format format, /* PIPE_FORMAT_NONE is allowed */
+                  struct ilo_fb_blend_caps *caps) /* out: fully overwritten */
+{ /* derive which blend features a color buffer of this format allows */
+   const struct util_format_description *desc =
+      util_format_description(format);
+   const int ch = util_format_get_first_non_void_channel(format);
+
+   memset(caps, 0, sizeof(*caps)); /* every flag starts out false */
+
+   if (format == PIPE_FORMAT_NONE || desc->is_mixed)
+      return; /* leave all flags false */
+
+   caps->is_unorm = (ch >= 0 && desc->channel[ch].normalized &&
+         desc->channel[ch].type == UTIL_FORMAT_TYPE_UNSIGNED &&
+         desc->colorspace == UTIL_FORMAT_COLORSPACE_RGB);
+   caps->is_integer = util_format_is_pure_integer(format);
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 365:
+    *
+    *     "Logic Ops are only supported on *_UNORM surfaces (excluding _SRGB
+    *      variants), otherwise Logic Ops must be DISABLED."
+    *
+    * According to the classic driver, this is lifted on Gen8+.
+    */
+   caps->can_logicop = (ilo_dev_gen(dev) >= ILO_GEN(8) || caps->is_unorm);
+
+   /* no blending for pure integer formats */
+   caps->can_blend = !caps->is_integer;
+
+   /*
+    * From the Sandy Bridge PRM, volume 2 part 1, page 382:
+    *
+    *     "Alpha Test can only be enabled if Pixel Shader outputs a float
+    *      alpha value."
+    */
+   caps->can_alpha_test = !caps->is_integer;
+
+   caps->force_dst_alpha_one =
+      (ilo_format_translate_render(dev, format) !=
+       ilo_format_translate_color(dev, format)); /* render != color format */
+
+   /* sanity check: B8G8R8X8 rendered as B8G8R8A8 is the only known case */
+   if (caps->force_dst_alpha_one) {
+      enum pipe_format render_format;
+
+      switch (format) {
+      case PIPE_FORMAT_B8G8R8X8_UNORM:
+         render_format = PIPE_FORMAT_B8G8R8A8_UNORM;
+         break;
+      default:
+         render_format = PIPE_FORMAT_NONE; /* no substitute format is known */
+         break;
+      }
+
+      assert(ilo_format_translate_render(dev, format) ==
+             ilo_format_translate_color(dev, render_format));
+   }
+}
+
+static void /* copies the new framebuffer and derives per-fb facts from it */
 ilo_set_framebuffer_state(struct pipe_context *pipe,
                           const struct pipe_framebuffer_state *state)
 {
    const struct ilo_dev *dev = ilo_context(pipe)->dev;
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
+   struct ilo_fb_state *fb = &vec->fb;
+   const struct pipe_surface *first_surf = NULL; /* first cbuf, else zsbuf */
+   int i;
+
+   util_copy_framebuffer_state(&fb->state, state); /* keep our own copy */
+
+   fb->has_integer_rt = false; /* recomputed from the cbufs below */
+   for (i = 0; i < state->nr_cbufs; i++) {
+      if (state->cbufs[i]) {
+         fb_set_blend_caps(dev, state->cbufs[i]->format, &fb->blend_caps[i]);
 
-   ilo_gpe_set_fb(dev, state, &vec->fb);
+         fb->has_integer_rt |= fb->blend_caps[i].is_integer;
+
+         if (!first_surf)
+            first_surf = state->cbufs[i];
+      } else {
+         fb_set_blend_caps(dev, PIPE_FORMAT_NONE, &fb->blend_caps[i]); /* unbound slot: all caps false */
+      }
+   }
+
+   if (!first_surf && state->zsbuf)
+      first_surf = state->zsbuf; /* no color buffer: fall back to zsbuf */
+
+   fb->num_samples = (first_surf) ? first_surf->texture->nr_samples : 1;
+   if (!fb->num_samples)
+      fb->num_samples = 1; /* a sample count of 0 is stored as 1 */
+
+   if (state->zsbuf) {
+      const struct ilo_surface_cso *cso =
+         (const struct ilo_surface_cso *) state->zsbuf;
+
+      fb->has_hiz = cso->u.zs.hiz_bo; /* true when a HiZ bo is attached */
+      fb->depth_offset_format =
+         ilo_state_zs_get_depth_format(&cso->u.zs, dev);
+   } else {
+      fb->has_hiz = false;
+      fb->depth_offset_format = GEN6_ZFORMAT_D32_FLOAT; /* value used without a zsbuf */
+   }
+
+   /*
+    * The PRMs list several restrictions when the framebuffer has more than
+    * one surface.  It seems they are actually lifted on GEN6+.
+    */
 
    vec->dirty |= ILO_DIRTY_FB;
 }
@@ -740,9 +1726,15 @@ static void
 ilo_set_polygon_stipple(struct pipe_context *pipe,
                         const struct pipe_poly_stipple *state)
 {
+   const struct ilo_dev *dev = ilo_context(pipe)->dev;
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
+   struct ilo_state_poly_stipple_info info;
+   int i;
+
+   for (i = 0; i < 32; i++)
+      info.pattern[i] = state->stipple[i];
 
-   vec->poly_stipple = *state;
+   ilo_state_poly_stipple_set_info(&vec->poly_stipple, dev, &info);
 
    vec->dirty |= ILO_DIRTY_POLY_STIPPLE;
 }
@@ -753,11 +1745,26 @@ ilo_set_scissor_states(struct pipe_context *pipe,
                        unsigned num_scissors,
                        const struct pipe_scissor_state *scissors)
 {
-   const struct ilo_dev *dev = ilo_context(pipe)->dev;
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
+   unsigned i;
+
+   for (i = 0; i < num_scissors; i++) {
+      struct ilo_state_viewport_scissor_info *info =
+         &vec->viewport.scissors[start_slot + i];
 
-   ilo_gpe_set_scissor(dev, start_slot, num_scissors,
-         scissors, &vec->scissor);
+      if (scissors[i].minx < scissors[i].maxx &&
+          scissors[i].miny < scissors[i].maxy) {
+         info->min_x = scissors[i].minx;
+         info->min_y = scissors[i].miny;
+         info->max_x = scissors[i].maxx - 1;
+         info->max_y = scissors[i].maxy - 1;
+      } else {
+         info->min_x = 1;
+         info->min_y = 1;
+         info->max_x = 0;
+         info->max_y = 0;
+      }
+   }
 
    vec->dirty |= ILO_DIRTY_SCISSOR;
 }
@@ -768,28 +1775,31 @@ ilo_set_viewport_states(struct pipe_context *pipe,
                         unsigned num_viewports,
                         const struct pipe_viewport_state *viewports)
 {
-   const struct ilo_dev *dev = ilo_context(pipe)->dev;
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
 
    if (viewports) {
       unsigned i;
 
       for (i = 0; i < num_viewports; i++) {
-         ilo_gpe_set_viewport_cso(dev, &viewports[i],
-               &vec->viewport.cso[start_slot + i]);
+         struct ilo_state_viewport_matrix_info *info =
+            &vec->viewport.matrices[start_slot + i];
+
+         memcpy(info->scale, viewports[i].scale, sizeof(info->scale));
+         memcpy(info->translate, viewports[i].translate,
+               sizeof(info->translate));
       }
 
-      if (vec->viewport.count < start_slot + num_viewports)
-         vec->viewport.count = start_slot + num_viewports;
+      if (vec->viewport.params.count < start_slot + num_viewports)
+         vec->viewport.params.count = start_slot + num_viewports;
 
       /* need to save viewport 0 for util_blitter */
       if (!start_slot && num_viewports)
          vec->viewport.viewport0 = viewports[0];
    }
    else {
-      if (vec->viewport.count <= start_slot + num_viewports &&
-          vec->viewport.count > start_slot)
-         vec->viewport.count = start_slot;
+      if (vec->viewport.params.count <= start_slot + num_viewports &&
+          vec->viewport.params.count > start_slot)
+         vec->viewport.params.count = start_slot;
    }
 
    vec->dirty |= ILO_DIRTY_VIEWPORT;
@@ -905,16 +1915,11 @@ ilo_set_index_buffer(struct pipe_context *pipe,
    struct ilo_state_vector *vec = &ilo_context(pipe)->state_vector;
 
    if (state) {
-      pipe_resource_reference(&vec->ib.buffer, state->buffer);
-      vec->ib.user_buffer = state->user_buffer;
-      vec->ib.offset = state->offset;
-      vec->ib.index_size = state->index_size;
-   }
-   else {
-      pipe_resource_reference(&vec->ib.buffer, NULL);
-      vec->ib.user_buffer = NULL;
-      vec->ib.offset = 0;
-      vec->ib.index_size = 0;
+      pipe_resource_reference(&vec->ib.state.buffer, state->buffer);
+      vec->ib.state = *state;
+   } else {
+      pipe_resource_reference(&vec->ib.state.buffer, NULL);
+      memset(&vec->ib.state, 0, sizeof(vec->ib.state));
    }
 
    vec->dirty |= ILO_DIRTY_IB;
@@ -926,19 +1931,28 @@ ilo_create_stream_output_target(struct pipe_context *pipe,
                                 unsigned buffer_offset,
                                 unsigned buffer_size)
 {
-   struct pipe_stream_output_target *target;
+   const struct ilo_dev *dev = ilo_context(pipe)->dev;
+   struct ilo_stream_output_target *target;
+   struct ilo_state_sol_buffer_info info;
 
-   target = MALLOC_STRUCT(pipe_stream_output_target);
+   target = CALLOC_STRUCT(ilo_stream_output_target);
    assert(target);
 
-   pipe_reference_init(&target->reference, 1);
-   target->buffer = NULL;
-   pipe_resource_reference(&target->buffer, res);
-   target->context = pipe;
-   target->buffer_offset = buffer_offset;
-   target->buffer_size = buffer_size;
+   pipe_reference_init(&target->base.reference, 1);
+   pipe_resource_reference(&target->base.buffer, res);
+   target->base.context = pipe;
+   target->base.buffer_offset = buffer_offset;
+   target->base.buffer_size = buffer_size;
+
+   memset(&info, 0, sizeof(info));
+   info.buf = ilo_buffer(res);
+   info.offset = buffer_offset;
+   info.size = buffer_size;
 
-   return target;
+   ilo_state_sol_buffer_init(&target->sb, dev, &info);
+   target->sb.bo = info.buf->bo;
+
+   return &target->base;
 }
 
 static void
@@ -991,7 +2005,7 @@ ilo_create_sampler_view(struct pipe_context *pipe,
    const struct ilo_dev *dev = ilo_context(pipe)->dev;
    struct ilo_view_cso *view;
 
-   view = MALLOC_STRUCT(ilo_view_cso);
+   view = CALLOC_STRUCT(ilo_view_cso);
    assert(view);
 
    view->base = *templ;
@@ -1001,16 +2015,24 @@ ilo_create_sampler_view(struct pipe_context *pipe,
    view->base.context = pipe;
 
    if (res->target == PIPE_BUFFER) {
-      const unsigned elem_size = util_format_get_blocksize(templ->format);
-      const unsigned first_elem = templ->u.buf.first_element;
-      const unsigned num_elems = templ->u.buf.last_element - first_elem + 1;
-
-      ilo_gpe_init_view_surface_for_buffer(dev, ilo_buffer(res),
-            first_elem * elem_size, num_elems * elem_size,
-            elem_size, templ->format, false, false, &view->surface);
-   }
-   else {
+      struct ilo_state_surface_buffer_info info;
+
+      memset(&info, 0, sizeof(info));
+      info.buf = ilo_buffer(res);
+      info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
+      info.format = ilo_format_translate_color(dev, templ->format);
+      info.format_size = util_format_get_blocksize(templ->format);
+      info.struct_size = info.format_size;
+      info.readonly = true;
+      info.offset = templ->u.buf.first_element * info.struct_size;
+      info.size = (templ->u.buf.last_element -
+            templ->u.buf.first_element + 1) * info.struct_size;
+
+      ilo_state_surface_init_for_buffer(&view->surface, dev, &info);
+      view->surface.bo = info.buf->bo;
+   } else {
       struct ilo_texture *tex = ilo_texture(res);
+      struct ilo_state_surface_image_info info;
 
       /* warn about degraded performance because of a missing binding flag */
       if (tex->image.tiling == GEN6_TILING_NONE &&
@@ -1019,13 +2041,33 @@ ilo_create_sampler_view(struct pipe_context *pipe,
                   "not created for sampling\n");
       }
 
-      ilo_gpe_init_view_surface_for_image(dev, &tex->image,
-            tex->base.target, templ->format,
-            templ->u.tex.first_level,
-            templ->u.tex.last_level - templ->u.tex.first_level + 1,
-            templ->u.tex.first_layer,
-            templ->u.tex.last_layer - templ->u.tex.first_layer + 1,
-            false, &view->surface);
+      memset(&info, 0, sizeof(info));
+      info.img = &tex->image;
+
+      info.access = ILO_STATE_SURFACE_ACCESS_SAMPLER;
+
+      if (templ->format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT &&
+          tex->image.separate_stencil) {
+         info.format = ilo_format_translate_texture(dev,
+               PIPE_FORMAT_Z32_FLOAT);
+      } else {
+         info.format = ilo_format_translate_texture(dev, templ->format);
+      }
+
+      info.is_cube_map = (tex->image.target == PIPE_TEXTURE_CUBE ||
+                          tex->image.target == PIPE_TEXTURE_CUBE_ARRAY);
+      info.is_array = util_resource_is_array_texture(&tex->base);
+      info.readonly = true;
+
+      info.level_base = templ->u.tex.first_level;
+      info.level_count = templ->u.tex.last_level -
+         templ->u.tex.first_level + 1;
+      info.slice_base = templ->u.tex.first_layer;
+      info.slice_count = templ->u.tex.last_layer -
+         templ->u.tex.first_layer + 1;
+
+      ilo_state_surface_init_for_image(&view->surface, dev, &info);
+      view->surface.bo = info.img->bo;
    }
 
    return &view->base;
@@ -1048,7 +2090,7 @@ ilo_create_surface(struct pipe_context *pipe,
    struct ilo_texture *tex = ilo_texture(res);
    struct ilo_surface_cso *surf;
 
-   surf = MALLOC_STRUCT(ilo_surface_cso);
+   surf = CALLOC_STRUCT(ilo_surface_cso);
    assert(surf);
 
    surf->base = *templ;
@@ -1063,28 +2105,56 @@ ilo_create_surface(struct pipe_context *pipe,
    surf->is_rt = !util_format_is_depth_or_stencil(templ->format);
 
    if (surf->is_rt) {
+      struct ilo_state_surface_image_info info;
+
       /* relax this? */
       assert(tex->base.target != PIPE_BUFFER);
 
-      /*
-       * classic i965 sets render_cache_rw for constant buffers and sol
-       * surfaces but not render buffers.  Why?
-       */
-      ilo_gpe_init_view_surface_for_image(dev,
-            &tex->image, tex->base.target,
-            templ->format, templ->u.tex.level, 1,
-            templ->u.tex.first_layer,
-            templ->u.tex.last_layer - templ->u.tex.first_layer + 1,
-            true, &surf->u.rt);
+      memset(&info, 0, sizeof(info));
+      info.img = &tex->image;
+      info.access = ILO_STATE_SURFACE_ACCESS_DP_RENDER;
+      info.format = ilo_format_translate_render(dev, templ->format);
+      info.is_array = util_resource_is_array_texture(&tex->base);
+      info.level_base = templ->u.tex.level;
+      info.level_count = 1;
+      info.slice_base = templ->u.tex.first_layer;
+      info.slice_count = templ->u.tex.last_layer -
+         templ->u.tex.first_layer + 1;
+
+      ilo_state_surface_init_for_image(&surf->u.rt, dev, &info);
+      surf->u.rt.bo = info.img->bo;
    } else {
+      struct ilo_state_zs_info info;
+
       assert(res->target != PIPE_BUFFER);
 
-      ilo_gpe_init_zs_surface(dev, &tex->image,
-            (tex->separate_s8) ? &tex->separate_s8->image : NULL,
-            tex->base.target, templ->format,
-            templ->u.tex.level, templ->u.tex.first_layer,
-            templ->u.tex.last_layer - templ->u.tex.first_layer + 1,
-            &surf->u.zs);
+      memset(&info, 0, sizeof(info));
+
+      if (templ->format == PIPE_FORMAT_S8_UINT) {
+         info.s_img = &tex->image;
+      } else {
+         info.z_img = &tex->image;
+         info.s_img = (tex->separate_s8) ? &tex->separate_s8->image : NULL;
+
+         info.hiz_enable =
+            ilo_image_can_enable_aux(&tex->image, templ->u.tex.level);
+      }
+
+      info.level = templ->u.tex.level;
+      info.slice_base = templ->u.tex.first_layer;
+      info.slice_count = templ->u.tex.last_layer -
+         templ->u.tex.first_layer + 1;
+
+      ilo_state_zs_init(&surf->u.zs, dev, &info);
+
+      if (info.z_img) {
+         surf->u.zs.depth_bo = info.z_img->bo;
+         if (info.hiz_enable)
+            surf->u.zs.hiz_bo = info.z_img->aux.bo;
+      }
+
+      if (info.s_img)
+         surf->u.zs.stencil_bo = info.s_img->bo;
    }
 
    return &surf->base;
@@ -1294,10 +2364,30 @@ void
 ilo_state_vector_init(const struct ilo_dev *dev,
                       struct ilo_state_vector *vec)
 {
-   ilo_gpe_set_scissor_null(dev, &vec->scissor);
+   struct ilo_state_urb_info urb_info;
 
-   ilo_gpe_init_zs_surface(dev, NULL, NULL, PIPE_TEXTURE_2D,
-         PIPE_FORMAT_NONE, 0, 0, 1, &vec->fb.null_zs);
+   vec->sample_mask = ~0u;
+
+   ilo_state_viewport_init_data_only(&vec->viewport.vp, dev,
+         vec->viewport.vp_data, sizeof(vec->viewport.vp_data));
+   assert(vec->viewport.vp.array_size >= ILO_MAX_VIEWPORTS);
+
+   vec->viewport.params.matrices = vec->viewport.matrices;
+   vec->viewport.params.scissors = vec->viewport.scissors;
+
+   ilo_state_hs_init_disabled(&vec->disabled_hs, dev);
+   ilo_state_ds_init_disabled(&vec->disabled_ds, dev);
+   ilo_state_gs_init_disabled(&vec->disabled_gs, dev);
+
+   ilo_state_sol_buffer_init_disabled(&vec->so.dummy_sb, dev);
+
+   ilo_state_surface_init_for_null(&vec->fb.null_rt, dev);
+   ilo_state_zs_init_for_null(&vec->fb.null_zs, dev);
+
+   ilo_state_sampler_init_disabled(&vec->disabled_sampler, dev);
+
+   memset(&urb_info, 0, sizeof(urb_info));
+   ilo_state_urb_init(&vec->urb, dev, &urb_info);
 
    util_dynarray_init(&vec->global_binding.bindings);
 
@@ -1314,7 +2404,7 @@ ilo_state_vector_cleanup(struct ilo_state_vector *vec)
          pipe_resource_reference(&vec->vb.states[i].buffer, NULL);
    }
 
-   pipe_resource_reference(&vec->ib.buffer, NULL);
+   pipe_resource_reference(&vec->ib.state.buffer, NULL);
    pipe_resource_reference(&vec->ib.hw_resource, NULL);
 
    for (i = 0; i < vec->so.count; i++)
@@ -1377,7 +2467,7 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
          }
       }
 
-      if (vec->ib.buffer == res) {
+      if (vec->ib.state.buffer == res) {
          states |= ILO_DIRTY_IB;
 
          /*
@@ -1392,6 +2482,10 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
 
       for (i = 0; i < vec->so.count; i++) {
          if (vec->so.states[i]->buffer == res) {
+            struct ilo_stream_output_target *target =
+               (struct ilo_stream_output_target *) vec->so.states[i];
+
+            target->sb.bo = ilo_buffer(res)->bo;
             states |= ILO_DIRTY_SO;
             break;
          }
@@ -1456,7 +2550,8 @@ ilo_state_vector_resource_renamed(struct ilo_state_vector *vec,
          struct ilo_surface_cso *cso =
             (struct ilo_surface_cso *) vec->fb.state.zsbuf;
 
-         cso->u.rt.bo = bo;
+         cso->u.zs.depth_bo = bo;
+
          states |= ILO_DIRTY_FB;
       }
    }
diff --git a/src/gallium/drivers/ilo/ilo_state.h b/src/gallium/drivers/ilo/ilo_state.h
index fd0a315..3e6fd8a 100644
--- a/src/gallium/drivers/ilo/ilo_state.h
+++ b/src/gallium/drivers/ilo/ilo_state.h
@@ -28,13 +28,38 @@
 #ifndef ILO_STATE_H
 #define ILO_STATE_H
 
-#include "core/ilo_state_3d.h"
+#include "core/ilo_builder_3d.h" /* for gen6_3dprimitive_info */
+#include "core/ilo_state_cc.h"
+#include "core/ilo_state_compute.h"
+#include "core/ilo_state_raster.h"
+#include "core/ilo_state_sampler.h"
+#include "core/ilo_state_sbe.h"
+#include "core/ilo_state_shader.h"
+#include "core/ilo_state_sol.h"
+#include "core/ilo_state_surface.h"
+#include "core/ilo_state_urb.h"
+#include "core/ilo_state_vf.h"
+#include "core/ilo_state_viewport.h"
+#include "core/ilo_state_zs.h"
 #include "pipe/p_state.h"
 #include "util/u_dynarray.h"
 
 #include "ilo_common.h"
 
 /**
+ * \see brw_context.h
+ */
+#define ILO_MAX_DRAW_BUFFERS    8
+#define ILO_MAX_CONST_BUFFERS   (1 + 12) /* length of ilo_cbuf_state.cso[] */
+#define ILO_MAX_SAMPLER_VIEWS   16 /* length of ilo_view_state.states[] */
+#define ILO_MAX_SAMPLERS        16 /* length of ilo_sampler_state.cso[] */
+#define ILO_MAX_SO_BINDINGS     64
+#define ILO_MAX_SO_BUFFERS      4 /* length of ilo_so_state.states[] */
+#define ILO_MAX_VIEWPORTS       1 /* sizes the ilo_viewport_state arrays */
+
+#define ILO_MAX_SURFACES        256
+
+/**
  * States that we track.
  *
  * XXX Do we want to count each sampler or vertex buffer as a state?  If that
@@ -120,6 +145,172 @@ enum ilo_dirty_flags {
 };
 
 struct ilo_context;
+struct ilo_shader_state;
+
+struct ilo_ve_state { /* vertex elements CSO */
+   unsigned vb_mapping[PIPE_MAX_ATTRIBS]; /* hw vb index -> pipe vb index */
+   unsigned vb_count; /* number of hw vbs recorded in vb_mapping */
+
+   /* these are not valid until the state is finalized */
+   uint32_t vf_data[PIPE_MAX_ATTRIBS][4]; /* storage handed to vf at init */
+   struct ilo_state_vf_params_info vf_params; /* zeroed at creation */
+   struct ilo_state_vf vf; /* set up by ilo_state_vf_init() at creation */
+};
+
+struct ilo_vb_state {
+   struct pipe_vertex_buffer states[PIPE_MAX_ATTRIBS]; /* buffer refs are dropped at cleanup */
+   struct ilo_state_vertex_buffer vb[PIPE_MAX_ATTRIBS]; /* NOTE(review): presumably parallel to states[] - confirm */
+   uint32_t enabled_mask; /* NOTE(review): presumably one bit per slot - confirm */
+};
+
+struct ilo_ib_state {
+   struct pipe_index_buffer state; /* copy of the bound state; holds a ref on state.buffer */
+
+   /* these are not valid until the state is finalized */
+   struct pipe_resource *hw_resource; /* reference released in ilo_state_vector_cleanup() */
+   unsigned hw_index_size;
+   struct ilo_state_index_buffer ib;
+};
+
+struct ilo_cbuf_cso { /* one constant buffer slot */
+   struct pipe_resource *resource; /* referenced buffer, NULL when none */
+   struct ilo_state_surface_buffer_info info; /* info.buf is NULL unless a resource is bound */
+   struct ilo_state_surface surface; /* (re)initialized only when a resource is bound */
+
+   /*
+    * this CSO is not so constant because user buffer needs to be uploaded in
+    * finalize_constant_buffers()
+    */
+   const void *user_buffer; /* NULL unless a user buffer was set */
+};
+
+struct ilo_sampler_cso {
+   struct ilo_state_sampler sampler;
+   struct ilo_state_sampler_border border;
+   bool saturate_s; /* NOTE(review): presumably per-coordinate (s/t/r) saturation - confirm */
+   bool saturate_t;
+   bool saturate_r;
+};
+
+struct ilo_sampler_state {
+   const struct ilo_sampler_cso *cso[ILO_MAX_SAMPLERS]; /* read-only CSO pointers, one per slot */
+};
+
+struct ilo_cbuf_state {
+   struct ilo_cbuf_cso cso[ILO_MAX_CONST_BUFFERS]; /* indexed by constant buffer slot */
+   uint32_t enabled_mask; /* bit n set while cso[n] has a resource or a user buffer */
+};
+
+struct ilo_resource_state {
+   struct pipe_surface *states[PIPE_MAX_SHADER_RESOURCES];
+   unsigned count; /* NOTE(review): presumably the number of valid entries in states[] - confirm */
+};
+
+struct ilo_view_cso { /* sampler view CSO */
+   struct pipe_sampler_view base; /* copied from the template; &base is what callers receive */
+
+   struct ilo_state_surface surface; /* built, and surface.bo set, in ilo_create_sampler_view() */
+};
+
+struct ilo_view_state {
+   struct pipe_sampler_view *states[ILO_MAX_SAMPLER_VIEWS];
+   unsigned count; /* NOTE(review): presumably the number of valid entries in states[] - confirm */
+};
+
+struct ilo_stream_output_target {
+   struct pipe_stream_output_target base; /* keep first: pointers to base are cast back to this struct */
+
+   struct ilo_state_sol_buffer sb; /* set up at creation; sb.bo is refreshed on resource rename */
+};
+
+struct ilo_so_state {
+   struct pipe_stream_output_target *states[ILO_MAX_SO_BUFFERS];
+   unsigned count; /* entries of states[] walked by cleanup and rename handling */
+   unsigned append_bitmask;
+
+   struct ilo_state_sol_buffer dummy_sb; /* initialized as disabled in ilo_state_vector_init() */
+
+   bool enabled;
+};
+
+struct ilo_rasterizer_state {
+   struct pipe_rasterizer_state state; /* gallium state; e.g. line_stipple_factor is read from it */
+
+   /* these are invalid until finalize_rasterizer() */
+   struct ilo_state_raster_info info;
+   struct ilo_state_raster rs;
+};
+
+struct ilo_viewport_state {
+   struct ilo_state_viewport_matrix_info matrices[ILO_MAX_VIEWPORTS]; /* scale + translate per slot */
+   struct ilo_state_viewport_scissor_info scissors[ILO_MAX_VIEWPORTS]; /* inclusive bounds; min > max when empty */
+   struct ilo_state_viewport_params_info params; /* points at the two arrays above; params.count is the viewport count */
+
+   struct pipe_viewport_state viewport0; /* viewport 0 as set, saved for util_blitter */
+   struct pipe_scissor_state scissor0;
+
+   struct ilo_state_viewport vp;
+   uint32_t vp_data[20 * ILO_MAX_VIEWPORTS]; /* storage given to vp in ilo_state_vector_init() */
+};
+
+struct ilo_surface_cso {
+   struct pipe_surface base; /* copied from the template; &base is what callers receive */
+
+   bool is_rt; /* false for depth/stencil formats; selects the union member */
+   union {
+      struct ilo_state_surface rt; /* used when is_rt */
+      struct ilo_state_zs zs; /* used when !is_rt */
+   } u;
+};
+
+struct ilo_fb_state {
+   struct pipe_framebuffer_state state; /* copy made with util_copy_framebuffer_state() */
+
+   struct ilo_state_surface null_rt; /* null surface from ilo_state_vector_init() */
+   struct ilo_state_zs null_zs; /* null depth/stencil from ilo_state_vector_init() */
+
+   struct ilo_fb_blend_caps {
+      bool is_unorm; /* unsigned, normalized, RGB colorspace */
+      bool is_integer; /* pure integer format */
+      bool force_dst_alpha_one; /* render format differs from color format */
+
+      bool can_logicop; /* Gen8+ or is_unorm */
+      bool can_blend; /* !is_integer */
+      bool can_alpha_test; /* !is_integer */
+   } blend_caps[PIPE_MAX_COLOR_BUFS]; /* refreshed per cbuf slot on set_framebuffer_state */
+
+   unsigned num_samples; /* taken from the first surface; never stored as 0 */
+
+   bool has_integer_rt; /* some bound cbuf has a pure integer format */
+   bool has_hiz; /* zsbuf has a HiZ bo */
+   enum gen_depth_format depth_offset_format; /* D32_FLOAT when there is no zsbuf */
+};
+
+struct ilo_dsa_state { /* depth/stencil/alpha CSO */
+   struct ilo_state_cc_depth_info depth;
+
+   struct ilo_state_cc_stencil_info stencil;
+   struct {
+      uint8_t test_mask; /* pipe stencil valuemask */
+      uint8_t write_mask; /* pipe stencil writemask */
+   } stencil_front, stencil_back; /* copied into cc_params when the CSO is bound */
+
+   bool alpha_test; /* pipe alpha.enabled */
+   float alpha_ref; /* pipe alpha.ref_value; copied into cc_params at bind */
+   enum gen_compare_function alpha_func; /* translated from pipe alpha.func */
+};
+
+struct ilo_blend_state {
+   struct ilo_state_cc_blend_rt_info rt[PIPE_MAX_COLOR_BUFS];
+   struct ilo_state_cc_blend_rt_info dummy_rt;
+   bool dual_blend;
+
+   /* these are invalid until finalize_blend() */
+   struct ilo_state_cc_blend_rt_info effective_rt[PIPE_MAX_COLOR_BUFS]; /* NOTE(review): presumably rt[] adjusted for the bound fb - confirm */
+   struct ilo_state_cc_info info;
+   struct ilo_state_cc cc;
+   bool alpha_may_kill;
+};
 
 struct ilo_global_binding_cso {
    struct pipe_resource *resource;
@@ -147,6 +338,7 @@ struct ilo_global_binding {
 
 struct ilo_state_vector {
    const struct pipe_draw_info *draw;
+   struct gen6_3dprimitive_info draw_info;
 
    uint32_t dirty;
 
@@ -157,30 +349,41 @@ struct ilo_state_vector {
    struct ilo_shader_state *vs;
    struct ilo_shader_state *gs;
 
+   struct ilo_state_hs disabled_hs;
+   struct ilo_state_ds disabled_ds;
+   struct ilo_state_gs disabled_gs;
+
    struct ilo_so_state so;
 
    struct pipe_clip_state clip;
+
    struct ilo_viewport_state viewport;
-   struct ilo_scissor_state scissor;
 
-   const struct ilo_rasterizer_state *rasterizer;
-   struct pipe_poly_stipple poly_stipple;
+   struct ilo_rasterizer_state *rasterizer;
+
+   struct ilo_state_line_stipple line_stipple;
+   struct ilo_state_poly_stipple poly_stipple;
    unsigned sample_mask;
 
    struct ilo_shader_state *fs;
 
-   const struct ilo_dsa_state *dsa;
+   struct ilo_state_cc_params_info cc_params;
    struct pipe_stencil_ref stencil_ref;
-   const struct ilo_blend_state *blend;
-   struct pipe_blend_color blend_color;
+   const struct ilo_dsa_state *dsa;
+   struct ilo_blend_state *blend;
+
    struct ilo_fb_state fb;
 
+   struct ilo_state_urb urb;
+
    /* shader resources */
    struct ilo_sampler_state sampler[PIPE_SHADER_TYPES];
    struct ilo_view_state view[PIPE_SHADER_TYPES];
    struct ilo_cbuf_state cbuf[PIPE_SHADER_TYPES];
    struct ilo_resource_state resource;
 
+   struct ilo_state_sampler disabled_sampler;
+
    /* GPGPU */
    struct ilo_shader_state *cs;
    struct ilo_resource_state cs_resource;
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_internal.h b/src/gallium/drivers/ilo/shader/ilo_shader_internal.h
index d2dc2f5..01c8667 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_internal.h
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_internal.h
@@ -28,6 +28,9 @@
 #ifndef ILO_SHADER_INTERNAL_H
 #define ILO_SHADER_INTERNAL_H
 
+#include "core/ilo_state_sbe.h"
+#include "core/ilo_state_sol.h"
+
 #include "ilo_common.h"
 #include "ilo_state.h"
 #include "ilo_shader.h"
@@ -72,13 +75,27 @@ struct ilo_shader_variant {
    uint32_t saturate_tex_coords[3];
 };
 
+struct ilo_kernel_routing {
+   bool initialized; /* NOTE(review): presumably guards the fields below - confirm */
+
+   bool is_point;
+   bool light_twoside;
+   uint32_t sprite_coord_enable;
+   int sprite_coord_mode;
+   int src_len; /* NOTE(review): presumably valid entries in the src_* arrays - confirm */
+   int src_semantics[PIPE_MAX_SHADER_OUTPUTS];
+   int src_indices[PIPE_MAX_SHADER_OUTPUTS];
+
+   struct ilo_state_sbe sbe;
+};
+
 /**
  * A compiled shader.
  */
 struct ilo_shader {
    struct ilo_shader_variant variant;
 
-   struct ilo_shader_cso cso;
+   union ilo_shader_cso cso;
 
    struct {
       int semantic_names[PIPE_MAX_SHADER_INPUTS];
@@ -111,7 +128,9 @@ struct ilo_shader {
 
    bool stream_output;
    int svbi_post_inc;
-   struct pipe_stream_output_info so_info;
+
+   uint32_t sol_data[PIPE_MAX_SO_OUTPUTS][2];
+   struct ilo_state_sol sol;
 
    /* for VS stream output / rasterizer discard */
    int gs_offsets[3];
@@ -121,11 +140,8 @@ struct ilo_shader {
    void *kernel;
    int kernel_size;
 
-   bool routing_initialized;
-   int routing_src_semantics[PIPE_MAX_SHADER_OUTPUTS];
-   int routing_src_indices[PIPE_MAX_SHADER_OUTPUTS];
-   uint32_t routing_sprite_coord_enable;
    struct ilo_kernel_routing routing;
+   struct ilo_state_ps_params_info ps_params;
 
    /* what does the push constant buffer consist of? */
    struct {
diff --git a/src/gallium/drivers/ilo/shader/toy_tgsi.c b/src/gallium/drivers/ilo/shader/toy_tgsi.c
index 65e47bf..d38585f 100644
--- a/src/gallium/drivers/ilo/shader/toy_tgsi.c
+++ b/src/gallium/drivers/ilo/shader/toy_tgsi.c
@@ -2036,9 +2036,6 @@ parse_instruction(struct toy_tgsi *tgsi,
       if (!dst_is_scratch[i])
          continue;
 
-      if (tgsi_inst->Instruction.Saturate == TGSI_SAT_MINUS_PLUS_ONE)
-         tc_fail(tgsi->tc, "TGSI_SAT_MINUS_PLUS_ONE unhandled");
-
       tgsi->tc->templ.saturate = tgsi_inst->Instruction.Saturate;
 
       /* emit indirect store */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index b6c32ff..b25e041 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -975,10 +975,6 @@ lp_build_depth_stencil_test(struct gallivm_state *gallivm,
                                          s_bld.int_vec_type, "");
       }
 
-      /* convert scalar stencil refs into vectors */
-      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
-      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);
-
       s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
                                           stencil_refs, stencil_vals,
                                           front_facing);
diff --git a/src/gallium/drivers/llvmpipe/lp_public.h b/src/gallium/drivers/llvmpipe/lp_public.h
index ec6b660..27ab1ba 100644
--- a/src/gallium/drivers/llvmpipe/lp_public.h
+++ b/src/gallium/drivers/llvmpipe/lp_public.h
@@ -1,10 +1,18 @@
 #ifndef LP_PUBLIC_H
 #define LP_PUBLIC_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct pipe_screen;
 struct sw_winsys;
 
 struct pipe_screen *
 llvmpipe_create_screen(struct sw_winsys *winsys);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_query.c b/src/gallium/drivers/llvmpipe/lp_query.c
index 4f8bab6..fc59367 100644
--- a/src/gallium/drivers/llvmpipe/lp_query.c
+++ b/src/gallium/drivers/llvmpipe/lp_query.c
@@ -315,7 +315,7 @@ llvmpipe_check_render_cond(struct llvmpipe_context *lp)
 
    b = pipe->get_query_result(pipe, lp->render_cond_query, wait, (void*)&result);
    if (b)
-      return (!result == lp->render_cond_cond);
+      return ((!result) == lp->render_cond_cond);
    else
       return TRUE;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index f4ba596..47f1897 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -165,7 +165,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_DEPTH_CLIP_DISABLE:
       return 1;
    case PIPE_CAP_SHADER_STENCIL_EXPORT:
-      return 0;
+      return 1;
    case PIPE_CAP_TGSI_INSTANCEID:
    case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR:
    case PIPE_CAP_START_INSTANCE:
@@ -258,8 +258,9 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
       return 1;
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
-   case PIPE_CAP_SAMPLER_VIEW_TARGET:
       return 0;
+   case PIPE_CAP_SAMPLER_VIEW_TARGET:
+      return 1;
    case PIPE_CAP_FAKE_SW_MSAA:
       return 1;
    case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
@@ -290,6 +291,7 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
       return 1;
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 96cc77c..4c8167a 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -854,9 +854,10 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
                      jit_tex->img_stride[j] = lp_tex->img_stride[j];
                   }
 
-                  if (res->target == PIPE_TEXTURE_1D_ARRAY ||
-                      res->target == PIPE_TEXTURE_2D_ARRAY ||
-                      res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+                  if (view->target == PIPE_TEXTURE_1D_ARRAY ||
+                      view->target == PIPE_TEXTURE_2D_ARRAY ||
+                      view->target == PIPE_TEXTURE_CUBE ||
+                      view->target == PIPE_TEXTURE_CUBE_ARRAY) {
                      /*
                       * For array textures, we don't have first_layer, instead
                       * adjust last_layer (stored as depth) plus the mip level offsets
@@ -868,7 +869,8 @@ lp_setup_set_fragment_sampler_views(struct lp_setup_context *setup,
                         jit_tex->mip_offsets[j] += view->u.tex.first_layer *
                                                    lp_tex->img_stride[j];
                      }
-                     if (res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+                     if (view->target == PIPE_TEXTURE_CUBE ||
+                         view->target == PIPE_TEXTURE_CUBE_ARRAY) {
                         assert(jit_tex->depth % 6 == 0);
                      }
                      assert(view->u.tex.first_layer <= view->u.tex.last_layer);
@@ -1067,10 +1069,13 @@ try_update_scene_state( struct lp_setup_context *setup )
    if (setup->dirty & LP_SETUP_NEW_CONSTANTS) {
       for (i = 0; i < Elements(setup->constants); ++i) {
          struct pipe_resource *buffer = setup->constants[i].current.buffer;
-         const unsigned current_size = setup->constants[i].current.buffer_size;
+         const unsigned current_size = MIN2(setup->constants[i].current.buffer_size,
+                                            LP_MAX_TGSI_CONST_BUFFER_SIZE);
          const ubyte *current_data = NULL;
          int num_constants;
 
+         STATIC_ASSERT(DATA_BLOCK_SIZE >= LP_MAX_TGSI_CONST_BUFFER_SIZE);
+
          if (buffer) {
             /* resource buffer */
             current_data = (ubyte *) llvmpipe_resource_data(buffer);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 35fe7b2..b5ce868 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -260,7 +260,8 @@ generate_fs_loop(struct gallivm_state *gallivm,
 {
    const struct util_format_description *zs_format_desc = NULL;
    const struct tgsi_token *tokens = shader->base.tokens;
-   LLVMTypeRef vec_type;
+   struct lp_type int_type = lp_int_type(type);
+   LLVMTypeRef vec_type, int_vec_type;
    LLVMValueRef mask_ptr, mask_val;
    LLVMValueRef consts_ptr, num_consts_ptr;
    LLVMValueRef z;
@@ -295,7 +296,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
       zs_format_desc = util_format_description(key->zsbuf_format);
       assert(zs_format_desc);
 
-      if (!shader->info.base.writes_z) {
+      if (!shader->info.base.writes_z && !shader->info.base.writes_stencil) {
          if (key->alpha.enabled ||
              key->blend.alpha_to_coverage ||
              shader->info.base.uses_kill) {
@@ -329,11 +330,14 @@ generate_fs_loop(struct gallivm_state *gallivm,
       depth_mode = 0;
    }
 
+   vec_type = lp_build_vec_type(gallivm, type);
+   int_vec_type = lp_build_vec_type(gallivm, int_type);
 
    stencil_refs[0] = lp_jit_context_stencil_ref_front_value(gallivm, context_ptr);
    stencil_refs[1] = lp_jit_context_stencil_ref_back_value(gallivm, context_ptr);
-
-   vec_type = lp_build_vec_type(gallivm, type);
+   /* convert scalar stencil refs into vectors */
+   stencil_refs[0] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[0]);
+   stencil_refs[1] = lp_build_broadcast(gallivm, int_vec_type, stencil_refs[1]);
 
    consts_ptr = lp_jit_context_constants(gallivm, context_ptr);
    num_consts_ptr = lp_jit_context_num_constants(gallivm, context_ptr);
@@ -462,7 +466,9 @@ generate_fs_loop(struct gallivm_state *gallivm,
       int pos0 = find_output_by_semantic(&shader->info.base,
                                          TGSI_SEMANTIC_POSITION,
                                          0);
-
+      int s_out = find_output_by_semantic(&shader->info.base,
+                                          TGSI_SEMANTIC_STENCIL,
+                                          0);
       if (pos0 != -1 && outputs[pos0][2]) {
          z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
 
@@ -512,6 +518,15 @@ generate_fs_loop(struct gallivm_state *gallivm,
          }
       }
 
+      if (s_out != -1 && outputs[s_out][1]) {
+         /* there's only one value, and spec says to discard additional bits */
+         LLVMValueRef s_max_mask = lp_build_const_int_vec(gallivm, int_type, 255);
+         stencil_refs[0] = LLVMBuildLoad(builder, outputs[s_out][1], "output.s");
+         stencil_refs[0] = LLVMBuildBitCast(builder, stencil_refs[0], int_vec_type, "");
+         stencil_refs[0] = LLVMBuildAnd(builder, stencil_refs[0], s_max_mask, "");
+         stencil_refs[1] = stencil_refs[0];
+      }
+
       lp_build_depth_stencil_load_swizzled(gallivm, type,
                                            zs_format_desc, key->resource_1d,
                                            depth_ptr, depth_stride,
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 21da629..b205f02 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -170,6 +170,36 @@ llvmpipe_create_sampler_view(struct pipe_context *pipe,
       view->texture = NULL;
       pipe_resource_reference(&view->texture, texture);
       view->context = pipe;
+
+#ifdef DEBUG
+      /*
+       * This check is possibly too lenient, but its primary purpose is
+       * to catch state trackers which forget to initialize the view target,
+       * so it only catches clearly impossible view targets.
+       */
+      if (view->target != texture->target) {
+         if (view->target == PIPE_TEXTURE_1D)
+            assert(texture->target == PIPE_TEXTURE_1D_ARRAY);
+         else if (view->target == PIPE_TEXTURE_1D_ARRAY)
+            assert(texture->target == PIPE_TEXTURE_1D);
+         else if (view->target == PIPE_TEXTURE_2D)
+            assert(texture->target == PIPE_TEXTURE_2D_ARRAY ||
+                   texture->target == PIPE_TEXTURE_CUBE ||
+                   texture->target == PIPE_TEXTURE_CUBE_ARRAY);
+         else if (view->target == PIPE_TEXTURE_2D_ARRAY)
+            assert(texture->target == PIPE_TEXTURE_2D ||
+                   texture->target == PIPE_TEXTURE_CUBE ||
+                   texture->target == PIPE_TEXTURE_CUBE_ARRAY);
+         else if (view->target == PIPE_TEXTURE_CUBE)
+            assert(texture->target == PIPE_TEXTURE_CUBE_ARRAY ||
+                   texture->target == PIPE_TEXTURE_2D_ARRAY);
+         else if (view->target == PIPE_TEXTURE_CUBE_ARRAY)
+            assert(texture->target == PIPE_TEXTURE_CUBE ||
+                   texture->target == PIPE_TEXTURE_2D_ARRAY);
+         else
+            assert(0);
+      }
+#endif
    }
 
    return view;
@@ -245,15 +275,17 @@ prepare_shader_sampling(
                   row_stride[j] = lp_tex->row_stride[j];
                   img_stride[j] = lp_tex->img_stride[j];
                }
-               if (res->target == PIPE_TEXTURE_1D_ARRAY ||
-                   res->target == PIPE_TEXTURE_2D_ARRAY ||
-                   res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+               if (view->target == PIPE_TEXTURE_1D_ARRAY ||
+                   view->target == PIPE_TEXTURE_2D_ARRAY ||
+                   view->target == PIPE_TEXTURE_CUBE ||
+                   view->target == PIPE_TEXTURE_CUBE_ARRAY) {
                   num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1;
                   for (j = first_level; j <= last_level; j++) {
                      mip_offsets[j] += view->u.tex.first_layer *
                                        lp_tex->img_stride[j];
                   }
-                  if (res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+                  if (view->target == PIPE_TEXTURE_CUBE ||
+                      view->target == PIPE_TEXTURE_CUBE_ARRAY) {
                      assert(num_layers % 6 == 0);
                   }
                   assert(view->u.tex.first_layer <= view->u.tex.last_layer);
diff --git a/src/gallium/drivers/llvmpipe/lp_surface.c b/src/gallium/drivers/llvmpipe/lp_surface.c
index 08f968f..96f8ed8 100644
--- a/src/gallium/drivers/llvmpipe/lp_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_surface.c
@@ -42,13 +42,6 @@ lp_resource_copy(struct pipe_context *pipe,
                  struct pipe_resource *src, unsigned src_level,
                  const struct pipe_box *src_box)
 {
-   struct llvmpipe_resource *src_tex = llvmpipe_resource(src);
-   struct llvmpipe_resource *dst_tex = llvmpipe_resource(dst);
-   const enum pipe_format format = src_tex->base.format;
-   unsigned width = src_box->width;
-   unsigned height = src_box->height;
-   unsigned depth = src_box->depth;
-
    llvmpipe_flush_resource(pipe,
                            dst, dst_level,
                            FALSE, /* read_only */
@@ -63,58 +56,8 @@ lp_resource_copy(struct pipe_context *pipe,
                            FALSE, /* do_not_block */
                            "blit src");
 
-   /* Fallback for buffers. */
-   if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
-      util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz,
-                                src, src_level, src_box);
-      return;
-   }
-
-   /*
-   printf("surface copy from %u lvl %u to %u lvl %u: %u,%u,%u to %u,%u,%u %u x %u x %u\n",
-          src_tex->id, src_level, dst_tex->id, dst_level,
-          src_box->x, src_box->y, src_box->z, dstx, dsty, dstz,
-          src_box->width, src_box->height, src_box->depth);
-   */
-
-   /* make sure display target resources (which cannot have levels/layers) are mapped */
-   if (src_tex->dt)
-      (void) llvmpipe_resource_map(src, src_level, 0, LP_TEX_USAGE_READ);
-   if (dst_tex->dt)
-      /*
-       * Could set this to WRITE_ALL if complete dst is covered but it gets
-       * ignored anyway.
-       */
-      (void) llvmpipe_resource_map(dst, dst_level, 0, LP_TEX_USAGE_READ_WRITE);
-
-
-   /* copy */
-   {
-      const ubyte *src_linear_ptr
-         = llvmpipe_get_texture_image_address(src_tex, src_box->z,
-                                              src_level);
-      ubyte *dst_linear_ptr
-         = llvmpipe_get_texture_image_address(dst_tex, dstz,
-                                              dst_level);
-
-      if (dst_linear_ptr && src_linear_ptr) {
-         util_copy_box(dst_linear_ptr, format,
-                       llvmpipe_resource_stride(&dst_tex->base, dst_level),
-                       dst_tex->img_stride[dst_level],
-                       dstx, dsty, 0,
-                       width, height, depth,
-                       src_linear_ptr,
-                       llvmpipe_resource_stride(&src_tex->base, src_level),
-                       src_tex->img_stride[src_level],
-                       src_box->x, src_box->y, 0);
-      }
-   }
-
-   if (src_tex->dt)
-      llvmpipe_resource_unmap(src, 0, 0);
-   if (dst_tex->dt)
-      llvmpipe_resource_unmap(dst, 0, 0);
-
+   util_resource_copy_region(pipe, dst, dst_level, dstx, dsty, dstz,
+                             src, src_level, src_box);
 }
 
 
@@ -139,11 +82,6 @@ static void lp_blit(struct pipe_context *pipe,
       return; /* done */
    }
 
-   if (info.mask & PIPE_MASK_S) {
-      debug_printf("llvmpipe: cannot blit stencil, skipping\n");
-      info.mask &= ~PIPE_MASK_S;
-   }
-
    if (!util_blitter_is_blit_supported(lp->blitter, &info)) {
       debug_printf("llvmpipe: blit unsupported %s -> %s\n",
                    util_format_short_name(info.src.resource->format),
diff --git a/src/gallium/drivers/nouveau/Android.mk b/src/gallium/drivers/nouveau/Android.mk
index 420c8e5..daf3abd 100644
--- a/src/gallium/drivers/nouveau/Android.mk
+++ b/src/gallium/drivers/nouveau/Android.mk
@@ -39,6 +39,11 @@ LOCAL_SRC_FILES := \
 LOCAL_SHARED_LIBRARIES := libdrm libdrm_nouveau
 LOCAL_MODULE := libmesa_pipe_nouveau
 
+# Lollipop builds use libc++ headers instead of STLport; append so any include paths already set for this module are kept.
+ifeq ($(MESA_LOLLIPOP_BUILD),true)
+LOCAL_C_INCLUDES += external/libcxx/include
+else
 include external/stlport/libstlport.mk
+endif
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/drivers/nouveau/Makefile.am b/src/gallium/drivers/nouveau/Makefile.am
index 0aefc03..d05f0a1 100644
--- a/src/gallium/drivers/nouveau/Makefile.am
+++ b/src/gallium/drivers/nouveau/Makefile.am
@@ -48,7 +48,7 @@ nouveau_compiler_SOURCES = \
 
 nouveau_compiler_LDADD = \
 	libnouveau.la \
-	../../auxiliary/libgallium.la \
+	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(GALLIUM_COMMON_LIB_DEPS)
 
diff --git a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
index be17871..b9c05a0 100644
--- a/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
+++ b/src/gallium/drivers/nouveau/codegen/lib/gk110.asm
@@ -11,7 +11,7 @@
 // SIZE:    22 / 14 * 8 bytes
 //
 gk110_div_u32:
-   sched 0x28282804280428
+   sched 0x28 0x04 0x28 0x04 0x28 0x28 0x28
    bfind u32 $r2 $r1
    xor b32 $r2 $r2 0x1f
    mov b32 $r3 0x1
@@ -19,7 +19,7 @@ gk110_div_u32:
    cvt u32 $r1 neg u32 $r1
    mul $r3 u32 $r1 u32 $r2
    add $r2 (mul high u32 $r2 u32 $r3) $r2
-   sched 0x28282828282828
+   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
    mul $r3 u32 $r1 u32 $r2
    add $r2 (mul high u32 $r2 u32 $r3) $r2
    mul $r3 u32 $r1 u32 $r2
@@ -27,7 +27,7 @@ gk110_div_u32:
    mul $r3 u32 $r1 u32 $r2
    add $r2 (mul high u32 $r2 u32 $r3) $r2
    mul $r3 u32 $r1 u32 $r2
-   sched 0x042c2828042804
+   sched 0x04 0x28 0x04 0x28 0x28 0x2c 0x04
    add $r2 (mul high u32 $r2 u32 $r3) $r2
    mov b32 $r3 $r0
    mul high $r0 u32 $r0 u32 $r2
@@ -35,7 +35,7 @@ gk110_div_u32:
    add $r1 (mul u32 $r1 u32 $r0) $r3
    set $p0 0x1 ge u32 $r1 $r2
    $p0 sub b32 $r1 $r1 $r2
-   sched 0x20282e20042c28
+   sched 0x28 0x2c 0x04 0x20 0x2e 0x28 0x20
    $p0 add b32 $r0 $r0 0x1
    $p0 set $p0 0x1 ge u32 $r1 $r2
    $p0 sub b32 $r1 $r1 $r2
@@ -51,7 +51,7 @@ gk110_div_u32:
 gk110_div_s32:
    set $p2 0x1 lt s32 $r0 0x0
    set $p3 0x1 lt s32 $r1 0x0 xor $p2
-   sched 0x28042804282820
+   sched 0x20 0x28 0x28 0x04 0x28 0x04 0x28
    cvt s32 $r0 abs s32 $r0
    cvt s32 $r1 abs s32 $r1
    bfind u32 $r2 $r1
@@ -59,7 +59,7 @@ gk110_div_s32:
    mov b32 $r3 0x1
    shl b32 $r2 $r3 clamp $r2
    cvt u32 $r1 neg u32 $r1
-   sched 0x28282828282828
+   sched 0x28 0x28 0x28 0x28 0x28 0x28 0x28
    mul $r3 u32 $r1 u32 $r2
    add $r2 (mul high u32 $r2 u32 $r3) $r2
    mul $r3 u32 $r1 u32 $r2
@@ -67,7 +67,7 @@ gk110_div_s32:
    mul $r3 u32 $r1 u32 $r2
    add $r2 (mul high u32 $r2 u32 $r3) $r2
    mul $r3 u32 $r1 u32 $r2
-   sched 0x28280428042828
+   sched 0x28 0x28 0x04 0x28 0x04 0x28 0x28
    add $r2 (mul high u32 $r2 u32 $r3) $r2
    mul $r3 u32 $r1 u32 $r2
    add $r2 (mul high u32 $r2 u32 $r3) $r2
@@ -75,7 +75,7 @@ gk110_div_s32:
    mul high $r0 u32 $r0 u32 $r2
    cvt u32 $r2 neg u32 $r1
    add $r1 (mul u32 $r1 u32 $r0) $r3
-   sched 0x2028042c28042c
+   sched 0x2c 0x04 0x28 0x2c 0x04 0x28 0x20
    set $p0 0x1 ge u32 $r1 $r2
    $p0 sub b32 $r1 $r1 $r2
    $p0 add b32 $r0 $r0 0x1
@@ -83,7 +83,7 @@ gk110_div_s32:
    $p0 sub b32 $r1 $r1 $r2
    $p0 add b32 $r0 $r0 0x1
    $p3 cvt s32 $r0 neg s32 $r0
-   sched 0x2c200428042e04
+   sched 0x04 0x2e 0x04 0x28 0x04 0x20 0x2c
    $p2 cvt s32 $r1 neg s32 $r1
    ret
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 6bb9620..ab8bf2e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -967,8 +967,8 @@ CodeEmitterGK110::emitSET(const CmpInstruction *i)
       code[0] = (code[0] & ~0xfc) | ((code[0] << 3) & 0xe0);
       if (i->defExists(1))
          defId(i->def(1), 2);
-   else
-      code[0] |= 0x1c;
+      else
+         code[0] |= 0x1c;
    } else {
       switch (i->sType) {
       case TYPE_F32: op2 = 0x000; op1 = 0x800; break;
@@ -990,8 +990,12 @@ CodeEmitterGK110::emitSET(const CmpInstruction *i)
       }
       FTZ_(3a);
 
-      if (i->dType == TYPE_F32)
-         code[1] |= 1 << 23;
+      if (i->dType == TYPE_F32) {
+         if (isFloatType(i->sType))
+            code[1] |= 1 << 23;
+         else
+            code[1] |= 1 << 15;
+      }
    }
    if (i->sType == TYPE_S32)
       code[1] |= 1 << 19;
@@ -1316,6 +1320,8 @@ CodeEmitterGK110::emitFlow(const Instruction *i)
    } else
    if (mask & 2) {
       int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+      if (writeIssueDelays && !(f->target.bb->binPos & 0x3f))
+         pcRel += 8;
       // currently we don't want absolute branches
       assert(!f->absolute);
       code[0] |= (pcRel & 0x1ff) << 23;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index 22db368..399a6f1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -509,10 +509,13 @@ CodeEmitterGM107::emitBRA()
    emitCond5(0x00, CC_TR);
 
    if (!insn->srcExists(0) || insn->src(0).getFile() != FILE_MEMORY_CONST) {
+      int32_t pos = insn->target.bb->binPos;
+      if (writeIssueDelays && !(pos & 0x1f))
+         pos += 8;
       if (!insn->absolute)
-         emitField(0x14, 24, insn->target.bb->binPos - (codeSize + 8));
+         emitField(0x14, 24, pos - (codeSize + 8));
       else
-         emitField(0x14, 32, insn->target.bb->binPos);
+         emitField(0x14, 32, pos);
    } else {
       emitCBUF (0x24, gpr, 20, 16, 0, insn->src(0));
       emitField(0x05, 1, 1);
@@ -1827,6 +1830,7 @@ CodeEmitterGM107::emitISET()
    emitCond3(0x31, insn->setCond);
    emitField(0x30, 1, isSignedType(insn->sType));
    emitCC   (0x2f);
+   emitField(0x2c, 1, insn->dType == TYPE_F32);
    emitX    (0x2b);
    emitGPR  (0x08, insn->src(0));
    emitGPR  (0x00, insn->def(0));
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index d9aed34..472e3a8 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1078,8 +1078,14 @@ CodeEmitterNVC0::emitSET(const CmpInstruction *i)
    if (!isFloatType(i->sType))
       lo = 0x3;
 
-   if (isFloatType(i->dType) || isSignedIntType(i->sType))
+   if (isSignedIntType(i->sType))
       lo |= 0x20;
+   if (isFloatType(i->dType)) {
+      if (isFloatType(i->sType))
+         lo |= 0x20;
+      else
+         lo |= 0x80;
+   }
 
    switch (i->op) {
    case OP_SET_AND: hi = 0x10000000; break;
@@ -1406,6 +1412,8 @@ CodeEmitterNVC0::emitFlow(const Instruction *i)
    } else
    if (mask & 2) {
       int32_t pcRel = f->target.bb->binPos - (codeSize + 8);
+      if (writeIssueDelays && !(f->target.bb->binPos & 0x3f))
+         pcRel += 8;
       // currently we don't want absolute branches
       assert(!f->absolute);
       code[0] |= (pcRel & 0x3f) << 26;
@@ -2712,7 +2720,6 @@ private:
 
    RegScores *score; // for current BB
    std::vector<RegScores> scoreBoards;
-   int cycle;
    int prevData;
    operation prevOp;
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 254629f..ecd115f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -1316,7 +1316,7 @@ private:
    };
 
 private:
-   const struct tgsi::Source *code;
+   const tgsi::Source *code;
    const struct nv50_ir_prog_info *info;
 
    struct {
@@ -1356,18 +1356,20 @@ Converter::srcToSym(tgsi::Instruction::SrcRegister src, int c)
 {
    const int swz = src.getSwizzle(c);
 
+   /* TODO: Use Array ID when it's available for the index */
    return makeSym(src.getFile(),
                   src.is2D() ? src.getIndex(1) : 0,
-                  src.isIndirect(0) ? -1 : src.getIndex(0), swz,
+                  src.getIndex(0), swz,
                   src.getIndex(0) * 16 + swz * 4);
 }
 
 Symbol *
 Converter::dstToSym(tgsi::Instruction::DstRegister dst, int c)
 {
+   /* TODO: Use Array ID when it's available for the index */
    return makeSym(dst.getFile(),
                   dst.is2D() ? dst.getIndex(1) : 0,
-                  dst.isIndirect(0) ? -1 : dst.getIndex(0), c,
+                  dst.getIndex(0), c,
                   dst.getIndex(0) * 16 + c * 4);
 }
 
@@ -1604,19 +1606,8 @@ Converter::storeDst(int d, int c, Value *val)
 {
    const tgsi::Instruction::DstRegister dst = tgsi.getDst(d);
 
-   switch (tgsi.getSaturate()) {
-   case TGSI_SAT_NONE:
-      break;
-   case TGSI_SAT_ZERO_ONE:
+   if (tgsi.getSaturate()) {
       mkOp1(OP_SAT, dstTy, val, val);
-      break;
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      mkOp2(OP_MAX, dstTy, val, val, mkImm(-1.0f));
-      mkOp2(OP_MIN, dstTy, val, val, mkImm(+1.0f));
-      break;
-   default:
-      assert(!"invalid saturation mode");
-      break;
    }
 
    Value *ptr = NULL;
@@ -1955,13 +1946,13 @@ isResourceSpecial(const int r)
 }
 
 static inline bool
-isResourceRaw(const struct tgsi::Source *code, const int r)
+isResourceRaw(const tgsi::Source *code, const int r)
 {
    return isResourceSpecial(r) || code->resources[r].raw;
 }
 
 static inline nv50_ir::TexTarget
-getResourceTarget(const struct tgsi::Source *code, int r)
+getResourceTarget(const tgsi::Source *code, int r)
 {
    if (isResourceSpecial(r))
       return nv50_ir::TEX_TARGET_BUFFER;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
index 64989ac..596ac95 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_gm107.cpp
@@ -240,6 +240,7 @@ GM107LoweringPass::visit(Instruction *i)
             Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                     i->getIndirect(0, 0), bld.mkImm(4));
             i->setIndirect(0, 0, ptr);
+            i->op = OP_VFETCH;
          } else {
             i->op = OP_VFETCH;
             assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
index 1ad0860..2c7f7e3 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp
@@ -887,7 +887,7 @@ NV50LoweringPreSSA::handleTXL(TexInstruction *i)
       }
    }
    bld.setPosition(joinBB, false);
-   bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
+   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
    return true;
 }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index b61f3c4..7a5d1ce 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -100,8 +100,7 @@ void
 NVC0LegalizeSSA::handleFTZ(Instruction *i)
 {
    // Only want to flush float inputs
-   if (i->sType != TYPE_F32)
-      return;
+   assert(i->sType == TYPE_F32);
 
    // If we're already flushing denorms (and NaN's) to zero, no need for this.
    if (i->dnz)
@@ -129,7 +128,7 @@ NVC0LegalizeSSA::visit(BasicBlock *bb)
    Instruction *next;
    for (Instruction *i = bb->getEntry(); i; i = next) {
       next = i->next;
-      if (i->dType == TYPE_F32) {
+      if (i->sType == TYPE_F32) {
          if (prog->getType() != Program::TYPE_COMPUTE)
             handleFTZ(i);
          continue;
@@ -169,7 +168,7 @@ NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
 
 void
 NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
-                              Instruction *usei, const Instruction *insn)
+                              Instruction *usei, const Instruction *texi)
 {
    bool add = true;
    for (std::list<TexUse>::iterator it = uses.begin();
@@ -184,7 +183,7 @@ NVC0LegalizePostRA::addTexUse(std::list<TexUse> &uses,
          ++it;
    }
    if (add)
-      uses.push_back(TexUse(usei, insn));
+      uses.push_back(TexUse(usei, texi));
 }
 
 void
@@ -196,7 +195,8 @@ NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
    while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
       insn = insn->getSrc(0)->getUniqueInsn();
 
-   if (!insn->bb->reachableBy(texi->bb, term))
+   // NOTE: the tex itself is, of course, not an overwriting definition
+   if (insn == texi || !insn->bb->reachableBy(texi->bb, term))
       return;
 
    switch (insn->op) {
@@ -244,7 +244,12 @@ NVC0LegalizePostRA::findFirstUses(
          visited.insert(usei);
 
          if (usei->op == OP_PHI || usei->op == OP_UNION) {
-            // need a barrier before WAW cases
+            // need a barrier before WAW cases, like:
+            //   %r0 = tex
+            //   if ...
+            //     texbar <- is required or tex might replace x again
+            //     %r1 = x <- overwriting def
+            //   %r2 = phi %r0, %r1
             for (int s = 0; usei->srcExists(s); ++s) {
                Instruction *defi = usei->getSrc(s)->getUniqueInsn();
                if (defi && &usei->src(s) != *u)
@@ -263,7 +268,7 @@ NVC0LegalizePostRA::findFirstUses(
              usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
             findFirstUses(texi, usei, uses, visited);
          } else {
-            addTexUse(uses, usei, insn);
+            addTexUse(uses, usei, texi);
          }
       }
    }
@@ -1751,6 +1756,7 @@ NVC0LoweringPass::visit(Instruction *i)
             Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
                                     i->getIndirect(0, 0), bld.mkImm(4));
             i->setIndirect(0, 0, ptr);
+            i->op = OP_VFETCH;
          } else {
             i->op = OP_VFETCH;
             assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
index 14446b6..ae739ee 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp
@@ -236,6 +236,9 @@ LoadPropagation::visit(BasicBlock *bb)
       if (i->op == OP_CALL) // calls have args as sources, they must be in regs
          continue;
 
+      if (i->op == OP_PFETCH) // pfetch expects arg1 to be a reg
+         continue;
+
       if (i->srcExists(1))
          checkSwapSrc01(i);
 
@@ -278,7 +281,6 @@ private:
 
    void tryCollapseChainedMULs(Instruction *, const int s, ImmediateValue&);
 
-   // TGSI 'true' is converted to -1 by F2I(NEG(SET)), track back to SET
    CmpInstruction *findOriginForTestWithZero(Value *);
 
    unsigned int foldCount;
@@ -337,25 +339,33 @@ ConstantFolding::findOriginForTestWithZero(Value *value)
       return NULL;
    Instruction *insn = value->getInsn();
 
-   while (insn && insn->op != OP_SET) {
-      Instruction *next = NULL;
-      switch (insn->op) {
-      case OP_NEG:
-      case OP_ABS:
-      case OP_CVT:
-         next = insn->getSrc(0)->getInsn();
-         if (insn->sType != next->dType)
+   if (insn->asCmp() && insn->op != OP_SLCT)
+      return insn->asCmp();
+
+   /* Sometimes mov's will sneak in as a result of other folding. This gets
+    * cleaned up later.
+    */
+   if (insn->op == OP_MOV)
+      return findOriginForTestWithZero(insn->getSrc(0));
+
+   /* Deal with AND 1.0 here since nv50 can't fold into boolean float */
+   if (insn->op == OP_AND) {
+      int s = 0;
+      ImmediateValue imm;
+      if (!insn->src(s).getImmediate(imm)) {
+         s = 1;
+         if (!insn->src(s).getImmediate(imm))
             return NULL;
-         break;
-      case OP_MOV:
-         next = insn->getSrc(0)->getInsn();
-         break;
-      default:
-         return NULL;
       }
-      insn = next;
+      if (imm.reg.data.f32 != 1.0f)
+         return NULL;
+      /* TODO: Come up with a way to handle the condition being inverted */
+      if (insn->src(!s).mod != Modifier(0))
+         return NULL;
+      return findOriginForTestWithZero(insn->getSrc(!s));
    }
-   return insn ? insn->asCmp() : NULL;
+
+   return NULL;
 }
 
 void
@@ -574,6 +584,11 @@ ConstantFolding::expr(Instruction *i,
    case OP_POPCNT:
       res.data.u32 = util_bitcount(a->data.u32 & b->data.u32);
       break;
+   case OP_PFETCH:
+      // The two arguments to pfetch are logically added together. Normally
+      // the second argument will not be constant, but that can happen.
+      res.data.u32 = a->data.u32 + b->data.u32;
+      break;
    default:
       return;
    }
@@ -588,7 +603,9 @@ ConstantFolding::expr(Instruction *i,
 
    i->getSrc(0)->reg.data = res.data;
 
-   if (i->op == OP_MAD || i->op == OP_FMA) {
+   switch (i->op) {
+   case OP_MAD:
+   case OP_FMA: {
       i->op = OP_ADD;
 
       i->setSrc(1, i->getSrc(0));
@@ -603,8 +620,14 @@ ConstantFolding::expr(Instruction *i,
          bld.setPosition(i, false);
          i->setSrc(1, bld.loadImm(NULL, res.data.u32));
       }
-   } else {
+      break;
+   }
+   case OP_PFETCH:
+      // Leave PFETCH alone... we just folded its 2 args into 1.
+      break;
+   default:
       i->op = i->saturate ? OP_SAT : OP_MOV; /* SAT handled by unary() */
+      break;
    }
    i->subOp = 0;
 }
@@ -946,33 +969,82 @@ ConstantFolding::opnd(Instruction *i, ImmediateValue &imm0, int s)
 
    case OP_SET: // TODO: SET_AND,OR,XOR
    {
+      /* This optimizes the case where the output of a set is being compared
+       * to zero. Since the set can only produce 0/-1 (int) or 0/1 (float), we
+       * can be a lot cleverer in our comparison.
+       */
       CmpInstruction *si = findOriginForTestWithZero(i->getSrc(t));
       CondCode cc, ccZ;
-      if (i->src(t).mod != Modifier(0))
-         return;
-      if (imm0.reg.data.u32 != 0 || !si || si->op != OP_SET)
+      if (imm0.reg.data.u32 != 0 || !si)
          return;
       cc = si->setCond;
       ccZ = (CondCode)((unsigned int)i->asCmp()->setCond & ~CC_U);
+      // We do everything assuming var (cmp) 0, reverse the condition if 0 is
+      // first.
       if (s == 0)
          ccZ = reverseCondCode(ccZ);
+      // If there is a negative modifier, we need to undo that, by flipping
+      // the comparison to zero.
+      if (i->src(t).mod.neg())
+         ccZ = reverseCondCode(ccZ);
+      // If this is a signed comparison, we expect the input to be a regular
+      // boolean, i.e. 0/-1. However the rest of the logic assumes that true
+      // is positive, so just flip the sign.
+      if (i->sType == TYPE_S32) {
+         assert(!isFloatType(si->dType));
+         ccZ = reverseCondCode(ccZ);
+      }
       switch (ccZ) {
-      case CC_LT: cc = CC_FL; break;
-      case CC_GE: cc = CC_TR; break;
-      case CC_EQ: cc = inverseCondCode(cc); break;
-      case CC_LE: cc = inverseCondCode(cc); break;
-      case CC_GT: break;
-      case CC_NE: break;
+      case CC_LT: cc = CC_FL; break; // bool < 0 -- this is never true
+      case CC_GE: cc = CC_TR; break; // bool >= 0 -- this is always true
+      case CC_EQ: cc = inverseCondCode(cc); break; // bool == 0 -- !bool
+      case CC_LE: cc = inverseCondCode(cc); break; // bool <= 0 -- !bool
+      case CC_GT: break; // bool > 0 -- bool
+      case CC_NE: break; // bool != 0 -- bool
       default:
          return;
       }
+
+      // Update the condition of this SET to be identical to the origin set,
+      // but with the updated condition code. The original SET should get
+      // DCE'd, ideally.
+      i->op = si->op;
       i->asCmp()->setCond = cc;
       i->setSrc(0, si->src(0));
       i->setSrc(1, si->src(1));
+      if (si->srcExists(2))
+         i->setSrc(2, si->src(2));
       i->sType = si->sType;
    }
       break;
 
+   case OP_AND:
+   {
+      CmpInstruction *cmp = i->getSrc(t)->getInsn()->asCmp();
+      if (!cmp || cmp->op == OP_SLCT || cmp->getDef(0)->refCount() > 1)
+         return;
+      if (!prog->getTarget()->isOpSupported(cmp->op, TYPE_F32))
+         return;
+      if (imm0.reg.data.f32 != 1.0)
+         return;
+      if (i->getSrc(t)->getInsn()->dType != TYPE_U32)
+         return;
+
+      i->getSrc(t)->getInsn()->dType = TYPE_F32;
+      if (i->src(t).mod != Modifier(0)) {
+         assert(i->src(t).mod == Modifier(NV50_IR_MOD_NOT));
+         i->src(t).mod = Modifier(0);
+         cmp->setCond = inverseCondCode(cmp->setCond);
+      }
+      i->op = OP_MOV;
+      i->setSrc(s, NULL);
+      if (t) {
+         i->setSrc(0, i->getSrc(t));
+         i->setSrc(t, NULL);
+      }
+   }
+      break;
+
    case OP_SHL:
    {
       if (s != 1 || i->src(0).mod != Modifier(0))
@@ -2216,7 +2288,7 @@ FlatteningPass::visit(BasicBlock *bb)
              insn->op != OP_LINTERP && // probably just nve4
              insn->op != OP_PINTERP && // probably just nve4
              ((insn->op != OP_LOAD && insn->op != OP_STORE) ||
-              typeSizeof(insn->dType) <= 4) &&
+              (typeSizeof(insn->dType) <= 4 && !insn->src(0).isIndirect(0))) &&
              !insn->isNop()) {
             insn->join = 1;
             bb->remove(bb->getExit());
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
index 178a167..ca545a6 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_target_nv50.cpp
@@ -84,7 +84,7 @@ static const struct opProperties _initProps[] =
    //           neg  abs  not  sat  c[]  s[], a[], imm
    { OP_ADD,    0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
    { OP_SUB,    0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
-   { OP_MUL,    0x3, 0x0, 0x0, 0x8, 0x2, 0x1, 0x1, 0x2 },
+   { OP_MUL,    0x3, 0x0, 0x0, 0x0, 0x2, 0x1, 0x1, 0x2 },
    { OP_MAX,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
    { OP_MIN,    0x3, 0x3, 0x0, 0x0, 0x2, 0x1, 0x1, 0x0 },
    { OP_MAD,    0x7, 0x0, 0x0, 0x8, 0x6, 0x1, 0x1, 0x0 }, // special constraint
@@ -188,6 +188,9 @@ void TargetNV50::initOpInfo()
       if (prop->mSat & 8)
          opInfo[prop->op].dstMods = NV50_IR_MOD_SAT;
    }
+
+   if (chipset >= 0xa0)
+      opInfo[OP_MUL].dstMods = NV50_IR_MOD_SAT;
 }
 
 unsigned int
@@ -413,6 +416,8 @@ TargetNV50::isOpSupported(operation op, DataType ty) const
       return false;
    case OP_SAD:
       return ty == TYPE_S32;
+   case OP_SET:
+      return !isFloatType(ty);
    default:
       return true;
    }
diff --git a/src/gallium/drivers/nouveau/nouveau_buffer.c b/src/gallium/drivers/nouveau/nouveau_buffer.c
index 32fa65c..09cdbb5 100644
--- a/src/gallium/drivers/nouveau/nouveau_buffer.c
+++ b/src/gallium/drivers/nouveau/nouveau_buffer.c
@@ -658,13 +658,13 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
       switch (buffer->base.usage) {
       case PIPE_USAGE_DEFAULT:
       case PIPE_USAGE_IMMUTABLE:
-         buffer->domain = NOUVEAU_BO_VRAM;
+         buffer->domain = NV_VRAM_DOMAIN(screen);
          break;
       case PIPE_USAGE_DYNAMIC:
          /* For most apps, we'd have to do staging transfers to avoid sync
           * with this usage, and GART -> GART copies would be suboptimal.
           */
-         buffer->domain = NOUVEAU_BO_VRAM;
+         buffer->domain = NV_VRAM_DOMAIN(screen);
          break;
       case PIPE_USAGE_STAGING:
       case PIPE_USAGE_STREAM:
@@ -676,7 +676,7 @@ nouveau_buffer_create(struct pipe_screen *pscreen,
       }
    } else {
       if (buffer->base.bind & screen->vidmem_bindings)
-         buffer->domain = NOUVEAU_BO_VRAM;
+         buffer->domain = NV_VRAM_DOMAIN(screen);
       else
       if (buffer->base.bind & screen->sysmem_bindings)
          buffer->domain = NOUVEAU_BO_GART;
diff --git a/src/gallium/drivers/nouveau/nouveau_heap.h b/src/gallium/drivers/nouveau/nouveau_heap.h
index d0b2284..a3d64a6 100644
--- a/src/gallium/drivers/nouveau/nouveau_heap.h
+++ b/src/gallium/drivers/nouveau/nouveau_heap.h
@@ -23,6 +23,26 @@
 #ifndef __NOUVEAU_HEAP_H__
 #define __NOUVEAU_HEAP_H__
 
+/* This data structure represents a memory allocation heap. Fundamentally, this
+ * is a doubly-linked list with a few properties, and a usage convention.
+ *
+ * On initial allocation, there is a single node with the full size that's
+ * marked as not in-use. As allocations are made, blocks are taken off the end
+ * of that first node, and inserted right after it. If the first node doesn't
+ * have enough free space, we look for free space down in the rest of the
+ * list. This can happen if an allocation is made and then freed.
+ *
+ * The first node will remain with in_use == 0 even if the whole heap is
+ * exhausted. Another invariant is that there will never be two sequential
+ * in_use == 0 nodes. If a node is freed and it has one (or both) adjacent
+ * free nodes, they are merged into one, and the relevant heap entries are
+ * freed.
+ *
+ * The pattern to free the whole heap is to start with the first node and then
+ * just free the "next" node, until there is no next node. This should assure
+ * that at the end the first (and only) node is not in use and contains the
+ * full size of the heap.
+ */
 struct nouveau_heap {
 	struct nouveau_heap *prev;
 	struct nouveau_heap *next;
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.c b/src/gallium/drivers/nouveau/nouveau_screen.c
index b4f1413..c6e5074 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.c
+++ b/src/gallium/drivers/nouveau/nouveau_screen.c
@@ -164,6 +164,16 @@ nouveau_screen_init(struct nouveau_screen *screen, struct nouveau_device *dev)
 		size = sizeof(nvc0_data);
 	}
 
+	/*
+	 * Set default VRAM domain if not overridden
+	 */
+	if (!screen->vram_domain) {
+		if (dev->vram_size > 0)
+			screen->vram_domain = NOUVEAU_BO_VRAM;
+		else
+			screen->vram_domain = NOUVEAU_BO_GART;
+	}
+
 	ret = nouveau_object_new(&dev->object, 0, NOUVEAU_FIFO_CHANNEL_CLASS,
 				 data, size, &screen->channel);
 	if (ret)
diff --git a/src/gallium/drivers/nouveau/nouveau_screen.h b/src/gallium/drivers/nouveau/nouveau_screen.h
index cf06f7e..30041b2 100644
--- a/src/gallium/drivers/nouveau/nouveau_screen.h
+++ b/src/gallium/drivers/nouveau/nouveau_screen.h
@@ -51,6 +51,8 @@ struct nouveau_screen {
 
 	boolean hint_buf_keep_sysmem_copy;
 
+	unsigned vram_domain;
+
 	struct {
 		unsigned profiles_checked;
 		unsigned profiles_present;
@@ -94,6 +96,8 @@ struct nouveau_screen {
 #endif
 };
 
+#define NV_VRAM_DOMAIN(screen) ((screen)->vram_domain)
+
 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
 # define NOUVEAU_DRV_STAT(s, n, v) do {         \
       (s)->stats.named.n += (v);               \
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_clear.c b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
index 1ab8929..83fd1fa 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_clear.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_clear.c
@@ -58,7 +58,7 @@ nv30_clear(struct pipe_context *pipe, unsigned buffers,
    struct pipe_framebuffer_state *fb = &nv30->framebuffer;
    uint32_t colr = 0, zeta = 0, mode = 0;
 
-   if (!nv30_state_validate(nv30, TRUE))
+   if (!nv30_state_validate(nv30, NV30_NEW_FRAMEBUFFER | NV30_NEW_SCISSOR, TRUE))
       return;
 
    if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) {
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_context.h b/src/gallium/drivers/nouveau/nv30/nv30_context.h
index 7b32aae..592cdbe 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_context.h
+++ b/src/gallium/drivers/nouveau/nv30/nv30_context.h
@@ -204,7 +204,7 @@ void
 nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info);
 
 boolean
-nv30_state_validate(struct nv30_context *nv30, boolean hwtnl);
+nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl);
 
 void
 nv30_state_release(struct nv30_context *nv30);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index 3575c3d..c1665b7 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -71,12 +71,12 @@ nv30_render_allocate_vertices(struct vbuf_render *render,
    struct nv30_render *r = nv30_render(render);
    struct nv30_context *nv30 = r->nv30;
 
-   r->length = vertex_size * nr_vertices;
+   r->length = (uint32_t)vertex_size * (uint32_t)nr_vertices;
 
    if (r->offset + r->length >= render->max_vertex_buffer_bytes) {
       pipe_resource_reference(&r->buffer, NULL);
       r->buffer = pipe_buffer_create(&nv30->screen->base.base,
-                                     PIPE_BIND_VERTEX_BUFFER, 0,
+                                     PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM,
                                      render->max_vertex_buffer_bytes);
       if (!r->buffer)
          return FALSE;
@@ -91,10 +91,14 @@ static void *
 nv30_render_map_vertices(struct vbuf_render *render)
 {
    struct nv30_render *r = nv30_render(render);
-   char *map = pipe_buffer_map(&r->nv30->base.pipe, r->buffer,
-                               PIPE_TRANSFER_WRITE |
-                               PIPE_TRANSFER_UNSYNCHRONIZED, &r->transfer);
-   return map + r->offset;
+   char *map = pipe_buffer_map_range(
+         &r->nv30->base.pipe, r->buffer,
+         r->offset, r->length,
+         PIPE_TRANSFER_WRITE |
+         PIPE_TRANSFER_DISCARD_RANGE,
+         &r->transfer);
+   assert(map);
+   return map;
 }
 
 static void
@@ -103,6 +107,7 @@ nv30_render_unmap_vertices(struct vbuf_render *render,
 {
    struct nv30_render *r = nv30_render(render);
    pipe_buffer_unmap(&r->nv30->base.pipe, r->transfer);
+   r->transfer = NULL;
 }
 
 static void
@@ -126,10 +131,10 @@ nv30_render_draw_elements(struct vbuf_render *render,
    for (i = 0; i < r->vertex_info.num_attribs; i++) {
       PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP,
                        nv04_resource(r->buffer), r->offset + r->vtxptr[i],
-                       NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, 0);
+                       NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
    }
 
-   if (!nv30_state_validate(nv30, FALSE))
+   if (!nv30_state_validate(nv30, ~0, FALSE))
       return;
 
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
@@ -171,10 +176,10 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr)
    for (i = 0; i < r->vertex_info.num_attribs; i++) {
       PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP,
                        nv04_resource(r->buffer), r->offset + r->vtxptr[i],
-                       NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, 0);
+                       NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
    }
 
-   if (!nv30_state_validate(nv30, FALSE))
+   if (!nv30_state_validate(nv30, ~0, FALSE))
       return;
 
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
@@ -213,22 +218,24 @@ static const struct {
    [TGSI_SEMANTIC_BCOLOR  ] = { EMIT_4F, INTERP_LINEAR     , 1, 3, 0x00000004 },
    [TGSI_SEMANTIC_FOG     ] = { EMIT_4F, INTERP_PERSPECTIVE, 5, 5, 0x00000010 },
    [TGSI_SEMANTIC_PSIZE   ] = { EMIT_1F_PSIZE, INTERP_POS  , 6, 6, 0x00000020 },
-   [TGSI_SEMANTIC_GENERIC ] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 }
+   [TGSI_SEMANTIC_TEXCOORD] = { EMIT_4F, INTERP_PERSPECTIVE, 8, 7, 0x00004000 },
 };
 
 static boolean
 vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
 {
-   struct pipe_screen *pscreen = &r->nv30->screen->base.base;
+   struct nv30_screen *screen = r->nv30->screen;
    struct nv30_fragprog *fp = r->nv30->fragprog.program;
    struct vertex_info *vinfo = &r->vertex_info;
    enum pipe_format format;
    uint emit = EMIT_OMIT;
    uint result = *idx;
 
-   if (sem == TGSI_SEMANTIC_GENERIC && result >= 8) {
-      for (result = 0; result < 8; result++) {
-         if (fp->texcoord[result] == *idx) {
+   if (sem == TGSI_SEMANTIC_GENERIC) {
+      uint num_texcoords = (screen->eng3d->oclass < NV40_3D_CLASS) ? 8 : 10;
+      for (result = 0; result < num_texcoords; result++) {
+         if (fp->texcoord[result] == *idx + 8) {
+            sem = TGSI_SEMANTIC_TEXCOORD;
             emit = vroute[sem].emit;
             break;
          }
@@ -243,11 +250,11 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
    draw_emit_vertex_attr(vinfo, emit, vroute[sem].interp, attrib);
    format = draw_translate_vinfo_format(emit);
 
-   r->vtxfmt[attrib] = nv30_vtxfmt(pscreen, format)->hw;
-   r->vtxptr[attrib] = vinfo->size | NV30_3D_VTXBUF_DMA1;
+   r->vtxfmt[attrib] = nv30_vtxfmt(&screen->base.base, format)->hw;
+   r->vtxptr[attrib] = vinfo->size;
    vinfo->size += draw_translate_vinfo_size(emit);
 
-   if (nv30_screen(pscreen)->eng3d->oclass < NV40_3D_CLASS) {
+   if (screen->eng3d->oclass < NV40_3D_CLASS) {
       r->vtxprog[attrib][0] = 0x001f38d8;
       r->vtxprog[attrib][1] = 0x0080001b | (attrib << 9);
       r->vtxprog[attrib][2] = 0x0836106c;
@@ -259,7 +266,12 @@ vroute_add(struct nv30_render *r, uint attrib, uint sem, uint *idx)
       r->vtxprog[attrib][3] = 0x6041ff80 | (result + vroute[sem].vp40) << 2;
    }
 
-   *idx = vroute[sem].ow40 << result;
+   if (result < 8)
+      *idx = vroute[sem].ow40 << result;
+   else {
+      assert(sem == TGSI_SEMANTIC_TEXCOORD);
+      *idx = 0x00001000 << (result - 8);
+   }
    return TRUE;
 }
 
@@ -313,7 +325,7 @@ nv30_render_validate(struct nv30_context *nv30)
 
    while (pntc && attrib < 16) {
       uint index = ffs(pntc) - 1; pntc &= ~(1 << index);
-      if (vroute_add(r, attrib, TGSI_SEMANTIC_GENERIC, &index)) {
+      if (vroute_add(r, attrib, TGSI_SEMANTIC_TEXCOORD, &index)) {
          vp_attribs |= (1 << attrib++);
          vp_results |= index;
       }
@@ -398,17 +410,17 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       if (nv30->vertprog.constbuf) {
          void *map = nv04_resource(nv30->vertprog.constbuf)->data;
          draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0,
-                                         map, nv30->vertprog.constbuf_nr);
+                                         map, nv30->vertprog.constbuf_nr * 16);
+      } else {
+         draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0, NULL, 0);
       }
    }
 
    for (i = 0; i < nv30->num_vtxbufs; i++) {
       const void *map = nv30->vtxbuf[i].user_buffer;
       if (!map) {
-         if (!nv30->vtxbuf[i].buffer) {
-            continue;
-         }
-         map = pipe_buffer_map(pipe, nv30->vtxbuf[i].buffer,
+         if (nv30->vtxbuf[i].buffer)
+            map = pipe_buffer_map(pipe, nv30->vtxbuf[i].buffer,
                                   PIPE_TRANSFER_UNSYNCHRONIZED |
                                   PIPE_TRANSFER_READ, &transfer[i]);
       }
@@ -418,9 +430,9 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (info->indexed) {
       const void *map = nv30->idxbuf.user_buffer;
       if (!map)
-         pipe_buffer_map(pipe, nv30->idxbuf.buffer,
-                                  PIPE_TRANSFER_UNSYNCHRONIZED |
-                                  PIPE_TRANSFER_READ, &transferi);
+         map = pipe_buffer_map(pipe, nv30->idxbuf.buffer,
+                               PIPE_TRANSFER_UNSYNCHRONIZED |
+                               PIPE_TRANSFER_READ, &transferi);
       draw_set_indexes(draw,
                        (ubyte *) map + nv30->idxbuf.offset,
                        nv30->idxbuf.index_size, ~0);
@@ -444,6 +456,12 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 static void
 nv30_render_destroy(struct vbuf_render *render)
 {
+   struct nv30_render *r = nv30_render(render);
+
+   if (r->transfer)
+      pipe_buffer_unmap(&r->nv30->base.pipe, r->transfer);
+   pipe_resource_reference(&r->buffer, NULL);
+   nouveau_heap_free(&r->vertprog);
    FREE(render);
 }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
index a05bfe1..7f22786 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
@@ -23,6 +23,7 @@
  *
  */
 
+#include "draw/draw_context.h"
 #include "tgsi/tgsi_parse.h"
 
 #include "nv_object.xml.h"
@@ -147,8 +148,12 @@ nv30_fp_state_delete(struct pipe_context *pipe, void *hwcso)
 
    pipe_resource_reference(&fp->buffer, NULL);
 
+   if (fp->draw)
+      draw_delete_fragment_shader(nv30_context(pipe)->draw, fp->draw);
+
    FREE((void *)fp->pipe.tokens);
    FREE(fp->insn);
+   FREE(fp->consts);
    FREE(fp);
 }
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
index eeb7148..2e38a19 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c
@@ -161,6 +161,7 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_POLYGON_OFFSET_CLAMP:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -251,6 +252,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+      case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
       default:
          debug_printf("unknown vertex shader param %d\n", param);
@@ -291,6 +293,7 @@ nv30_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+      case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
       default:
          debug_printf("unknown fragment shader param %d\n", param);
@@ -523,7 +526,7 @@ nv30_screen_create(struct nouveau_device *dev)
 
    ret = nouveau_bo_wrap(screen->base.device, fifo->notify, &screen->notify);
    if (ret == 0)
-      nouveau_bo_map(screen->notify, 0, screen->base.client);
+      ret = nouveau_bo_map(screen->notify, 0, screen->base.client);
    if (ret)
       FAIL_SCREEN_INIT("error mapping notifier memory: %d\n", ret);
 
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
index 0f9d19d..a954dcc 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state_validate.c
@@ -272,15 +272,13 @@ nv30_validate_clip(struct nv30_context *nv30)
    uint32_t clpd_enable = 0;
 
    for (i = 0; i < 6; i++) {
-      if (nv30->rast->pipe.clip_plane_enable & (1 << i)) {
-         if (nv30->dirty & NV30_NEW_CLIP) {
-            BEGIN_NV04(push, NV30_3D(VP_UPLOAD_CONST_ID), 5);
-            PUSH_DATA (push, i);
-            PUSH_DATAp(push, nv30->clip.ucp[i], 4);
-         }
-
-         clpd_enable |= 1 << (1 + 4*i);
+      if (nv30->dirty & NV30_NEW_CLIP) {
+         BEGIN_NV04(push, NV30_3D(VP_UPLOAD_CONST_ID), 5);
+         PUSH_DATA (push, i);
+         PUSH_DATAp(push, nv30->clip.ucp[i], 4);
       }
+      if (nv30->rast->pipe.clip_plane_enable & (1 << i))
+         clpd_enable |= 2 << (4*i);
    }
 
    BEGIN_NV04(push, NV30_3D(VP_CLIP_PLANES_ENABLE), 1);
@@ -389,7 +387,7 @@ static struct state_validate hwtnl_validate_list[] = {
     { nv30_validate_stipple,       NV30_NEW_STIPPLE },
     { nv30_validate_scissor,       NV30_NEW_SCISSOR | NV30_NEW_RASTERIZER },
     { nv30_validate_viewport,      NV30_NEW_VIEWPORT },
-    { nv30_validate_clip,          NV30_NEW_CLIP },
+    { nv30_validate_clip,          NV30_NEW_CLIP | NV30_NEW_RASTERIZER },
     { nv30_fragprog_validate,      NV30_NEW_FRAGPROG | NV30_NEW_FRAGCONST },
     { nv30_vertprog_validate,      NV30_NEW_VERTPROG | NV30_NEW_VERTCONST |
                                    NV30_NEW_FRAGPROG | NV30_NEW_RASTERIZER },
@@ -456,7 +454,7 @@ nv30_state_context_switch(struct nv30_context *nv30)
 }
 
 boolean
-nv30_state_validate(struct nv30_context *nv30, boolean hwtnl)
+nv30_state_validate(struct nv30_context *nv30, uint32_t mask, boolean hwtnl)
 {
    struct nouveau_screen *screen = &nv30->screen->base;
    struct nouveau_pushbuf *push = nv30->base.pushbuf;
@@ -481,14 +479,16 @@ nv30_state_validate(struct nv30_context *nv30, boolean hwtnl)
    else
       validate = swtnl_validate_list;
 
-   if (nv30->dirty) {
+   mask &= nv30->dirty;
+
+   if (mask) {
       while (validate->func) {
-         if (nv30->dirty & validate->mask)
+         if (mask & validate->mask)
             validate->func(nv30);
          validate++;
       }
 
-      nv30->dirty = 0;
+      nv30->dirty &= ~mask;
    }
 
    nouveau_pushbuf_bufctx(push, bctx);
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
index 67ab829..d4e384b 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vbo.c
@@ -564,7 +564,7 @@ nv30_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (nv30->vbo_user && !(nv30->dirty & (NV30_NEW_VERTEX | NV30_NEW_ARRAYS)))
       nv30_update_user_vbufs(nv30);
 
-   nv30_state_validate(nv30, TRUE);
+   nv30_state_validate(nv30, ~0, TRUE);
    if (nv30->draw_flags) {
       nv30_render_vbo(pipe, info);
       return;
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
index 3c1b7e7..4d4145d 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_vertprog.c
@@ -23,6 +23,7 @@
  *
  */
 
+#include "draw/draw_context.h"
 #include "util/u_dynarray.h"
 #include "tgsi/tgsi_parse.h"
 
@@ -237,6 +238,10 @@ nv30_vp_state_delete(struct pipe_context *pipe, void *hwcso)
 
    if (vp->translated)
       nv30_vertprog_destroy(vp);
+
+   if (vp->draw)
+      draw_delete_vertex_shader(nv30_context(pipe)->draw, vp->draw);
+
    FREE((void *)vp->pipe.tokens);
    FREE(vp);
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
index bbdca81..9ef1696 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_fragprog.c
@@ -327,6 +327,8 @@ nv40_fp_rep(struct nvfx_fpc *fpc, unsigned count, unsigned target)
         //util_dynarray_append(&fpc->loop_stack, unsigned, target);
 }
 
+#if 0
+/* documentation only */
 /* warning: this only works forward, and probably only if not inside any IF */
 static void
 nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
@@ -352,6 +354,7 @@ nv40_fp_bra(struct nvfx_fpc *fpc, unsigned target)
         reloc.location = fpc->inst_offset + 3;
         util_dynarray_append(&fpc->label_relocs, struct nvfx_relocation, reloc);
 }
+#endif
 
 static void
 nv40_fp_brk(struct nvfx_fpc *fpc)
@@ -528,7 +531,7 @@ nvfx_fragprog_parse_instruction(struct nvfx_fpc *fpc,
 
    dst  = tgsi_dst(fpc, &finst->Dst[0]);
    mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
-   sat  = (finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE);
+   sat  = finst->Instruction.Saturate;
 
    switch (finst->Instruction.Opcode) {
    case TGSI_OPCODE_ABS:
@@ -1201,17 +1204,3 @@ out_err:
    tgsi_dump(fp->pipe.tokens, 0);
    goto out;
 }
-
-static inline void
-nvfx_fp_memcpy(void* dst, const void* src, size_t len)
-{
-#ifndef PIPE_ARCH_BIG_ENDIAN
-   memcpy(dst, src, len);
-#else
-   size_t i;
-   for(i = 0; i < len; i += 4) {
-      uint32_t v = *(uint32_t*)((char*)src + i);
-      *(uint32_t*)((char*)dst + i) = (v >> 16) | (v << 16);
-   }
-#endif
-}
diff --git a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
index 29d506b..1ce0589 100644
--- a/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nvfx_vertprog.c
@@ -539,7 +539,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
 
    final_dst = dst  = tgsi_dst(vpc, &finst->Dst[0]);
    mask = tgsi_mask(finst->Dst[0].Register.WriteMask);
-   if(finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE) {
+   if(finst->Instruction.Saturate) {
       assert(finst->Instruction.Opcode != TGSI_OPCODE_ARL);
       if (vpc->is_nv4x)
          sat = TRUE;
@@ -796,7 +796,7 @@ nvfx_vertprog_parse_instruction(struct nvfx_vpc *vpc,
       return FALSE;
    }
 
-   if(finst->Instruction.Saturate == TGSI_SAT_ZERO_ONE && !vpc->is_nv4x) {
+   if(finst->Instruction.Saturate && !vpc->is_nv4x) {
       if (!vpc->r_0_1.type)
          vpc->r_0_1 = constant(vpc, -1, 0, 1, 0, 0);
       nvfx_vp_emit(vpc, arith(0, VEC, MAX, dst, mask, nvfx_src(dst), swz(nvfx_src(vpc->r_0_1), X, X, X, X), none));
@@ -872,9 +872,8 @@ nvfx_vertprog_parse_decl_output(struct nvfx_vpc *vpc,
       }
       break;
    case TGSI_SEMANTIC_EDGEFLAG:
-      /* not really an error just a fallback */
-      NOUVEAU_ERR("cannot handle edgeflag output\n");
-      return FALSE;
+      vpc->r_result[idx] = nvfx_reg(NVFXSR_NONE, 0);
+      return TRUE;
    default:
       NOUVEAU_ERR("bad output semantic\n");
       return FALSE;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 2cfd5db..5b5d391 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -138,8 +138,11 @@ nv50_destroy(struct pipe_context *pipe)
 {
    struct nv50_context *nv50 = nv50_context(pipe);
 
-   if (nv50_context_screen(nv50)->cur_ctx == nv50)
-      nv50_context_screen(nv50)->cur_ctx = NULL;
+   if (nv50->screen->cur_ctx == nv50) {
+      nv50->screen->cur_ctx = NULL;
+      /* Save off the state in case another context gets created */
+      nv50->screen->save_state = nv50->state;
+   }
    nouveau_pushbuf_bufctx(nv50->base.pushbuf, NULL);
    nouveau_pushbuf_kick(nv50->base.pushbuf, nv50->base.pushbuf->channel);
 
@@ -290,6 +293,10 @@ nv50_create(struct pipe_screen *pscreen, void *priv)
    pipe->get_sample_position = nv50_context_get_sample_position;
 
    if (!screen->cur_ctx) {
+      /* Restore the last context's state here, normally handled during
+       * context switch
+       */
+      nv50->state = screen->save_state;
       screen->cur_ctx = nv50;
       nouveau_pushbuf_bufctx(screen->base.pushbuf, nv50->bufctx);
    }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index 45eb554..1f123ef 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -104,28 +104,7 @@ struct nv50_context {
    uint32_t dirty;
    boolean cb_dirty;
 
-   struct {
-      uint32_t instance_elts; /* bitmask of per-instance elements */
-      uint32_t instance_base;
-      uint32_t interpolant_ctrl;
-      uint32_t semantic_color;
-      uint32_t semantic_psize;
-      int32_t index_bias;
-      boolean uniform_buffer_bound[3];
-      boolean prim_restart;
-      boolean point_sprite;
-      boolean rt_serialize;
-      boolean flushed;
-      boolean rasterizer_discard;
-      uint8_t tls_required;
-      boolean new_tls_space;
-      uint8_t num_vtxbufs;
-      uint8_t num_vtxelts;
-      uint8_t num_textures[3];
-      uint8_t num_samplers[3];
-      uint8_t prim_size;
-      uint16_t scissor;
-   } state;
+   struct nv50_graph_state state;
 
    struct nv50_blend_stateobj *blend;
    struct nv50_rasterizer_stateobj *rast;
@@ -191,12 +170,6 @@ nv50_context(struct pipe_context *pipe)
    return (struct nv50_context *)pipe;
 }
 
-static INLINE struct nv50_screen *
-nv50_context_screen(struct nv50_context *nv50)
-{
-   return nv50_screen(&nv50->base.screen->base);
-}
-
 /* return index used in nv50_context arrays for a specific shader type */
 static INLINE unsigned
 nv50_context_shader_stage(unsigned pipe)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
index 744a3a5..f15d8f3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_miptree.c
@@ -377,7 +377,7 @@ nv50_miptree_create(struct pipe_screen *pscreen,
    if (!bo_config.nv50.memtype && (pt->bind & PIPE_BIND_SHARED))
       mt->base.domain = NOUVEAU_BO_GART;
    else
-      mt->base.domain = NOUVEAU_BO_VRAM;
+      mt->base.domain = NV_VRAM_DOMAIN(nouveau_screen(pscreen));
 
    bo_flags = mt->base.domain | NOUVEAU_BO_NOSNOOP;
    if (mt->base.base.bind & (PIPE_BIND_CURSOR | PIPE_BIND_DISPLAY_TARGET))
@@ -419,7 +419,7 @@ nv50_miptree_from_handle(struct pipe_screen *pscreen,
       FREE(mt);
       return NULL;
    }
-   mt->base.domain = NOUVEAU_BO_VRAM;
+   mt->base.domain = mt->base.bo->flags & NOUVEAU_BO_APER;
    mt->base.address = mt->base.bo->offset;
 
    mt->base.base = *templ;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_query.c b/src/gallium/drivers/nouveau/nv50/nv50_query.c
index 6690aa2..81f7474 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_query.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_query.c
@@ -27,6 +27,11 @@
 #include "nv50/nv50_context.h"
 #include "nv_object.xml.h"
 
+#define NV50_QUERY_STATE_READY   0 /* set by nv50_query_update()/nv50_query_result(); also by nv50_query_end() for TIMESTAMP_DISJOINT */
+#define NV50_QUERY_STATE_ACTIVE  1 /* set by nv50_query_begin() */
+#define NV50_QUERY_STATE_ENDED   2 /* set at the start of nv50_query_end() */
+#define NV50_QUERY_STATE_FLUSHED 3 /* set by nv50_query_result() when it kicks the pushbuf for a non-waiting caller */
+
 /* XXX: Nested queries, and simultaneous queries on multiple gallium contexts
  * (since we use only a single GPU channel per screen) will not work properly.
  *
@@ -42,10 +47,10 @@ struct nv50_query {
    struct nouveau_bo *bo;
    uint32_t base;
    uint32_t offset; /* base + i * 32 */
-   boolean ready;
-   boolean flushed;
+   uint8_t state;
    boolean is64bit;
    struct nouveau_mm_allocation *mm;
+   struct nouveau_fence *fence;
 };
 
 #define NV50_QUERY_ALLOC_SPACE 256
@@ -65,7 +70,7 @@ nv50_query_allocate(struct nv50_context *nv50, struct nv50_query *q, int size)
    if (q->bo) {
       nouveau_bo_ref(NULL, &q->bo);
       if (q->mm) {
-         if (q->ready)
+         if (q->state == NV50_QUERY_STATE_READY)
             nouveau_mm_free(q->mm);
          else
             nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work,
@@ -92,6 +97,7 @@ static void
 nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
 {
    nv50_query_allocate(nv50_context(pipe), nv50_query(pq), 0);
+   nouveau_fence_ref(NULL, &nv50_query(pq)->fence);
    FREE(nv50_query(pq));
 }
 
@@ -112,7 +118,8 @@ nv50_query_create(struct pipe_context *pipe, unsigned type, unsigned index)
 
    q->is64bit = (type == PIPE_QUERY_PRIMITIVES_GENERATED ||
                  type == PIPE_QUERY_PRIMITIVES_EMITTED ||
-                 type == PIPE_QUERY_SO_STATISTICS);
+                 type == PIPE_QUERY_SO_STATISTICS ||
+                 type == PIPE_QUERY_PIPELINE_STATISTICS);
    q->type = type;
 
    if (q->type == PIPE_QUERY_OCCLUSION_COUNTER) {
@@ -200,7 +207,7 @@ nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
    default:
       break;
    }
-   q->ready = FALSE;
+   q->state = NV50_QUERY_STATE_ACTIVE;
    return true;
 }
 
@@ -211,6 +218,8 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
    struct nouveau_pushbuf *push = nv50->base.pushbuf;
    struct nv50_query *q = nv50_query(pq);
 
+   q->state = NV50_QUERY_STATE_ENDED;
+
    switch (q->type) {
    case PIPE_QUERY_OCCLUSION_COUNTER:
       nv50_query_get(push, q, 0, 0x0100f002);
@@ -253,19 +262,27 @@ nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
       break;
    case PIPE_QUERY_TIMESTAMP_DISJOINT:
       /* This query is not issued on GPU because disjoint is forced to FALSE */
-      q->ready = TRUE;
+      q->state = NV50_QUERY_STATE_READY;
       break;
    default:
       assert(0);
       break;
    }
-   q->ready = q->flushed = FALSE;
+
+   if (q->is64bit)
+      nouveau_fence_ref(nv50->screen->base.fence.current, &q->fence);
 }
 
-static INLINE boolean
-nv50_query_ready(struct nv50_query *q)
+static INLINE void
+nv50_query_update(struct nv50_query *q) /* moves q->state to READY if the check below passes, else leaves it unchanged */
 {
-   return q->ready || (!q->is64bit && (q->data[0] == q->sequence));
+   if (q->is64bit) { /* 64-bit queries: rely on the fence referenced in nv50_query_end() */
+      if (nouveau_fence_signalled(q->fence))
+         q->state = NV50_QUERY_STATE_READY;
+   } else { /* other queries: compare the first data word against q->sequence */
+      if (q->data[0] == q->sequence)
+         q->state = NV50_QUERY_STATE_READY;
+   }
 }
 
 static boolean
@@ -280,13 +297,14 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
    uint64_t *data64 = (uint64_t *)q->data;
    int i;
 
-   if (!q->ready) /* update ? */
-      q->ready = nv50_query_ready(q);
-   if (!q->ready) {
+   if (q->state != NV50_QUERY_STATE_READY)
+      nv50_query_update(q);
+
+   if (q->state != NV50_QUERY_STATE_READY) {
       if (!wait) {
          /* for broken apps that spin on GL_QUERY_RESULT_AVAILABLE */
-         if (!q->flushed) {
-            q->flushed = TRUE;
+         if (q->state != NV50_QUERY_STATE_FLUSHED) {
+            q->state = NV50_QUERY_STATE_FLUSHED;
             PUSH_KICK(nv50->base.pushbuf);
          }
          return FALSE;
@@ -294,7 +312,7 @@ nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
       if (nouveau_bo_wait(q->bo, NOUVEAU_BO_RD, nv50->screen->base.client))
          return FALSE;
    }
-   q->ready = TRUE;
+   q->state = NV50_QUERY_STATE_READY;
 
    switch (q->type) {
    case PIPE_QUERY_GPU_FINISHED:
@@ -434,6 +452,7 @@ nv50_query_pushbuf_submit(struct nouveau_pushbuf *push,
    /* XXX: does this exist ? */
 #define NV50_IB_ENTRY_1_NO_PREFETCH (0 << (31 - 8))
 
+   PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
    nouveau_pushbuf_space(push, 0, 0, 1);
    nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
                         NV50_IB_ENTRY_1_NO_PREFETCH);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 829dfbc..6583a35 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -209,6 +209,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -290,6 +291,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 0;
    default:
       NOUVEAU_ERR("unknown PIPE_SHADER_CAP %d\n", param);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
index f8ce365..881051b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -25,10 +25,34 @@ struct nv50_context;
 
 struct nv50_blitter;
 
+struct nv50_graph_state { /* type of nv50_context.state; the screen keeps a copy in save_state */
+   uint32_t instance_elts; /* bitmask of per-instance elements */
+   uint32_t instance_base;
+   uint32_t interpolant_ctrl;
+   uint32_t semantic_color;
+   uint32_t semantic_psize;
+   int32_t index_bias;
+   boolean uniform_buffer_bound[3];
+   boolean prim_restart;
+   boolean point_sprite;
+   boolean rt_serialize;
+   boolean flushed;
+   boolean rasterizer_discard;
+   uint8_t tls_required;
+   boolean new_tls_space;
+   uint8_t num_vtxbufs;
+   uint8_t num_vtxelts;
+   uint8_t num_textures[3];
+   uint8_t num_samplers[3];
+   uint8_t prim_size;
+   uint16_t scissor;
+};
+
 struct nv50_screen {
    struct nouveau_screen base;
 
    struct nv50_context *cur_ctx;
+   struct nv50_graph_state save_state;
 
    struct nouveau_bo *code;
    struct nouveau_bo *uniforms;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index 2907504..d4d41af 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -811,12 +811,12 @@ nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
    nv50->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
    if (nv50->constbuf[s][i].user) {
       nv50->constbuf[s][i].u.data = cb->user_buffer;
-      nv50->constbuf[s][i].size = cb->buffer_size;
+      nv50->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
    } else
    if (res) {
       nv50->constbuf[s][i].offset = cb->buffer_offset;
-      nv50->constbuf[s][i].size = align(cb->buffer_size, 0x100);
+      nv50->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nv50->constbuf_valid[s] |= 1 << i;
    } else {
       nv50->constbuf_valid[s] &= ~(1 << i);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 85e19b4..116bf4b 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -394,6 +394,8 @@ nv50_switch_pipe_context(struct nv50_context *ctx_to)
 
    if (ctx_from)
       ctx_to->state = ctx_from->state;
+   else
+      ctx_to->state = ctx_to->screen->save_state;
 
    ctx_to->dirty = ~0;
    ctx_to->viewports_dirty = ~0;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
index c1590ee..1fd33b8 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_vbo.c
@@ -628,6 +628,7 @@ nv50_draw_elements(struct nv50_context *nv50, boolean shorten,
          BEGIN_NV04(push, NV50_3D(VERTEX_BEGIN_GL), 1);
          PUSH_DATA (push, prim);
 
+         PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
          nouveau_pushbuf_space(push, 8, 0, 1);
 
          switch (index_size) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
index ad287a2..56fc83d 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c
@@ -57,7 +57,7 @@ nvc0_screen_compute_setup(struct nvc0_screen *screen,
       return ret;
    }
 
-   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, 1 << 12, NULL,
+   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL,
                         &screen->parm);
    if (ret)
       return ret;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 7662fb5..a35c3f6 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -139,8 +139,12 @@ nvc0_destroy(struct pipe_context *pipe)
 {
    struct nvc0_context *nvc0 = nvc0_context(pipe);
 
-   if (nvc0->screen->cur_ctx == nvc0)
+   if (nvc0->screen->cur_ctx == nvc0) {
       nvc0->screen->cur_ctx = NULL;
+      nvc0->screen->save_state = nvc0->state;
+      nvc0->screen->save_state.tfb = NULL;
+   }
+
    /* Unset bufctx, we don't want to revalidate any resources after the flush.
     * Other contexts will always set their bufctx again on action calls.
     */
@@ -303,6 +307,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
    pipe->get_sample_position = nvc0_context_get_sample_position;
 
    if (!screen->cur_ctx) {
+      nvc0->state = screen->save_state;
       screen->cur_ctx = nvc0;
       nouveau_pushbuf_bufctx(screen->base.pushbuf, nvc0->bufctx);
    }
@@ -324,7 +329,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
 
    /* add permanently resident buffers to bufctxts */
 
-   flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RD;
+   flags = NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RD;
 
    BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->text);
    BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->uniform_bo);
@@ -335,7 +340,7 @@ nvc0_create(struct pipe_screen *pscreen, void *priv)
       BCTX_REFN_bo(nvc0->bufctx_cp, CP_SCREEN, flags, screen->parm);
    }
 
-   flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
+   flags = NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_RDWR;
 
    if (screen->poly_cache)
       BCTX_REFN_bo(nvc0->bufctx_3d, SCREEN, flags, screen->poly_cache);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index ef251f3..a8d7593 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -113,29 +113,7 @@ struct nvc0_context {
    uint32_t dirty;
    uint32_t dirty_cp; /* dirty flags for compute state */
 
-   struct {
-      boolean flushed;
-      boolean rasterizer_discard;
-      boolean early_z_forced;
-      boolean prim_restart;
-      uint32_t instance_elts; /* bitmask of per-instance elements */
-      uint32_t instance_base;
-      uint32_t constant_vbos;
-      uint32_t constant_elts;
-      int32_t index_bias;
-      uint16_t scissor;
-      uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
-      uint8_t num_vtxbufs;
-      uint8_t num_vtxelts;
-      uint8_t num_textures[6];
-      uint8_t num_samplers[6];
-      uint8_t tls_required; /* bitmask of shader types using l[] */
-      uint8_t c14_bound; /* whether immediate array constbuf is bound */
-      uint8_t clip_enable;
-      uint32_t clip_mode;
-      uint32_t uniform_buffer_bound[5];
-      struct nvc0_transform_feedback_state *tfb;
-   } state;
+   struct nvc0_graph_state state;
 
    struct nvc0_blend_stateobj *blend;
    struct nvc0_rasterizer_stateobj *rast;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
index fc75fc6..3875bbf 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_miptree.c
@@ -302,7 +302,7 @@ nvc0_miptree_create(struct pipe_screen *pscreen,
    if (!bo_config.nvc0.memtype && (pt->usage == PIPE_USAGE_STAGING || pt->bind & PIPE_BIND_SHARED))
       mt->base.domain = NOUVEAU_BO_GART;
    else
-      mt->base.domain = NOUVEAU_BO_VRAM;
+      mt->base.domain = NV_VRAM_DOMAIN(nouveau_screen(pscreen));
 
    bo_flags = mt->base.domain | NOUVEAU_BO_NOSNOOP;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index c156e91..e1f5a8c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -392,7 +392,7 @@ nvc0_gp_gen_header(struct nvc0_program *gp, struct nv50_ir_prog_info *info)
       break;
    }
 
-   gp->hdr[4] = info->prop.gp.maxVertices & 0x1ff;
+   gp->hdr[4] = MIN2(info->prop.gp.maxVertices, 1024);
 
    return nvc0_vtgp_gen_header(gp, info);
 }
@@ -683,11 +683,12 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
    ret = nouveau_heap_alloc(screen->text_heap, size, prog, &prog->mem);
    if (ret) {
       struct nouveau_heap *heap = screen->text_heap;
-      struct nouveau_heap *iter;
-      for (iter = heap; iter && iter->next != heap; iter = iter->next) {
-         struct nvc0_program *evict = iter->priv;
-         if (evict)
-            nouveau_heap_free(&evict->mem);
+      /* Note that the code library, which is allocated before anything else,
+       * does not have a priv pointer. We can stop once we hit it.
+       */
+      while (heap->next && heap->next->priv) {
+         struct nvc0_program *evict = heap->next->priv;
+         nouveau_heap_free(&evict->mem);
       }
       debug_printf("WARNING: out of code space, evicting all shaders.\n");
       ret = nouveau_heap_alloc(heap, size, prog, &prog->mem);
@@ -734,12 +735,12 @@ nvc0_program_upload_code(struct nvc0_context *nvc0, struct nvc0_program *prog)
 
    if (!is_cp)
       nvc0->base.push_data(&nvc0->base, screen->text, prog->code_base,
-                           NOUVEAU_BO_VRAM, NVC0_SHADER_HEADER_SIZE, prog->hdr);
+                           NV_VRAM_DOMAIN(&screen->base), NVC0_SHADER_HEADER_SIZE, prog->hdr);
    nvc0->base.push_data(&nvc0->base, screen->text, code_pos,
-                        NOUVEAU_BO_VRAM, prog->code_size, prog->code);
+                        NV_VRAM_DOMAIN(&screen->base), prog->code_size, prog->code);
    if (prog->immd_size)
       nvc0->base.push_data(&nvc0->base,
-                           screen->text, prog->immd_base, NOUVEAU_BO_VRAM,
+                           screen->text, prog->immd_base, NV_VRAM_DOMAIN(&screen->base),
                            prog->immd_size, prog->immd_data);
 
    BEGIN_NVC0(nvc0->base.pushbuf, NVC0_3D(MEM_BARRIER), 1);
@@ -770,7 +771,7 @@ nvc0_program_library_upload(struct nvc0_context *nvc0)
       return;
 
    nvc0->base.push_data(&nvc0->base,
-                        screen->text, screen->lib_code->start, NOUVEAU_BO_VRAM,
+                        screen->text, screen->lib_code->start, NV_VRAM_DOMAIN(&screen->base),
                         size, code);
    /* no need for a memory barrier, will be emitted with first program */
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
index 52032eb..aea6cbd 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c
@@ -617,6 +617,7 @@ nvc0_query_pushbuf_submit(struct nouveau_pushbuf *push,
 
 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
 
+   PUSH_REFN(push, q->bo, NOUVEAU_BO_RD | NOUVEAU_BO_GART);
    nouveau_pushbuf_space(push, 0, 0, 1);
    nouveau_pushbuf_data(push, q->bo, q->offset + result_offset, 4 |
                         NVC0_IB_ENTRY_1_NO_PREFETCH);
@@ -1407,11 +1408,14 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
    count += NVC0_QUERY_DRV_STAT_COUNT;
 
    if (screen->base.device->drm_version >= 0x01000101) {
-      if (screen->base.class_3d >= NVE4_3D_CLASS) {
-         count += NVE4_PM_QUERY_COUNT;
-      } else
       if (screen->compute) {
-         count += NVC0_PM_QUERY_COUNT; /* NVC0_COMPUTE is not always enabled */
+         if (screen->base.class_3d == NVE4_3D_CLASS) {
+            count += NVE4_PM_QUERY_COUNT;
+         } else
+         if (screen->base.class_3d < NVE4_3D_CLASS) {
+            /* NVC0_COMPUTE is not always enabled */
+            count += NVC0_PM_QUERY_COUNT;
+         }
       }
    }
 
@@ -1437,19 +1441,21 @@ nvc0_screen_get_driver_query_info(struct pipe_screen *pscreen,
    } else
 #endif
    if (id < count) {
-      if (screen->base.class_3d >= NVE4_3D_CLASS) {
-         info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
-         info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
-         info->max_value.u64 =
-            (id < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
-         info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
-         return 1;
-      } else
       if (screen->compute) {
-         info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
-         info->query_type = NVC0_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
-         info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
-         return 1;
+         if (screen->base.class_3d == NVE4_3D_CLASS) {
+            info->name = nve4_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
+            info->query_type = NVE4_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
+            info->max_value.u64 =
+               (id < NVE4_PM_QUERY_METRIC_MP_OCCUPANCY) ? 0 : 100;
+            info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
+            return 1;
+         } else
+         if (screen->base.class_3d < NVE4_3D_CLASS) {
+            info->name = nvc0_pm_query_names[id - NVC0_QUERY_DRV_STAT_COUNT];
+            info->query_type = NVC0_PM_QUERY(id - NVC0_QUERY_DRV_STAT_COUNT);
+            info->group_id = NVC0_QUERY_MP_COUNTER_GROUP;
+            return 1;
+         }
       }
    }
    /* user asked for info about non-existing query */
@@ -1469,10 +1475,13 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
 #endif
 
    if (screen->base.device->drm_version >= 0x01000101) {
-      if (screen->base.class_3d >= NVE4_3D_CLASS) {
-         count++;
-      } else if (screen->compute) {
-         count++; /* NVC0_COMPUTE is not always enabled */
+      if (screen->compute) {
+         if (screen->base.class_3d == NVE4_3D_CLASS) {
+            count++;
+         } else
+         if (screen->base.class_3d < NVE4_3D_CLASS) {
+            count++; /* NVC0_COMPUTE is not always enabled */
+         }
       }
    }
 
@@ -1480,25 +1489,28 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen,
       return count;
 
    if (id == NVC0_QUERY_MP_COUNTER_GROUP) {
-      info->name = "MP counters";
-      info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
-
-      if (screen->base.class_3d >= NVE4_3D_CLASS) {
-         info->num_queries = NVE4_PM_QUERY_COUNT;
-
-          /* On NVE4+, each multiprocessor have 8 hardware counters separated
-           * in two distinct domains, but we allow only one active query
-           * simultaneously because some of them use more than one hardware
-           * counter and this will result in an undefined behaviour. */
-          info->max_active_queries = 1; /* TODO: handle multiple hw counters */
-          return 1;
-      } else if (screen->compute) {
-         info->num_queries = NVC0_PM_QUERY_COUNT;
-
-         /* On NVC0:NVE4, each multiprocessor have 8 hardware counters
-          * in a single domain. */
-         info->max_active_queries = 8;
-         return 1;
+      if (screen->compute) {
+         info->name = "MP counters";
+         info->type = PIPE_DRIVER_QUERY_GROUP_TYPE_GPU;
+
+         if (screen->base.class_3d == NVE4_3D_CLASS) {
+            info->num_queries = NVE4_PM_QUERY_COUNT;
+
+             /* On NVE4+, each multiprocessor has 8 hardware counters split
+              * across two distinct domains, but we allow only one active query
+              * at a time because some queries use more than one hardware
+              * counter, which would result in undefined behaviour. */
+             info->max_active_queries = 1; /* TODO: handle multiple hw counters */
+             return 1;
+         } else
+         if (screen->base.class_3d < NVE4_3D_CLASS) {
+            info->num_queries = NVC0_PM_QUERY_COUNT;
+
+            /* On NVC0:NVE4, each multiprocessor has 8 hardware counters
+             * in a single domain. */
+            info->max_active_queries = 8;
+            return 1;
+         }
       }
    }
 #ifdef NOUVEAU_ENABLE_DRIVER_STATISTICS
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 748c9e7..56c230e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -193,6 +193,7 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
       return 0;
 
    case PIPE_CAP_VENDOR_ID:
@@ -296,6 +297,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
       return 1;
    case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+   case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
       return 0;
    case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
       return 16; /* would be 32 in linked (OpenGL-style) mode */
@@ -581,7 +583,7 @@ nvc0_screen_resize_tls_area(struct nvc0_screen *screen,
 
    size = align(size, 1 << 17);
 
-   ret = nouveau_bo_new(screen->base.device, NOUVEAU_BO_VRAM, 1 << 17, size,
+   ret = nouveau_bo_new(screen->base.device, NV_VRAM_DOMAIN(&screen->base), 1 << 17, size,
                         NULL, &bo);
    if (ret) {
       NOUVEAU_ERR("failed to allocate TLS area, size: 0x%"PRIx64"\n", size);
@@ -644,6 +646,11 @@ nvc0_screen_create(struct nouveau_device *dev)
    screen->base.sysmem_bindings |=
       PIPE_BIND_VERTEX_BUFFER | PIPE_BIND_INDEX_BUFFER;
 
+   if (screen->base.vram_domain & NOUVEAU_BO_GART) {
+      screen->base.sysmem_bindings |= screen->base.vidmem_bindings;
+      screen->base.vidmem_bindings = 0;
+   }
+
    pscreen->destroy = nvc0_screen_destroy;
    pscreen->context_create = nvc0_create;
    pscreen->is_format_supported = nvc0_screen_is_format_supported;
@@ -822,7 +829,7 @@ nvc0_screen_create(struct nouveau_device *dev)
 
    nvc0_magic_3d_init(push, screen->eng3d->oclass);
 
-   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 20, NULL,
+   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL,
                         &screen->text);
    if (ret)
       goto fail;
@@ -832,12 +839,12 @@ nvc0_screen_create(struct nouveau_device *dev)
     */
    nouveau_heap_init(&screen->text_heap, 0, (1 << 20) - 0x100);
 
-   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 12, 6 << 16, NULL,
+   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 12, 6 << 16, NULL,
                         &screen->uniform_bo);
    if (ret)
       goto fail;
 
-   PUSH_REFN (push, screen->uniform_bo, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+   PUSH_REFN (push, screen->uniform_bo, NV_VRAM_DOMAIN(&screen->base) | NOUVEAU_BO_WR);
 
    for (i = 0; i < 5; ++i) {
       /* TIC and TSC entries for each unit (nve4+ only) */
@@ -908,7 +915,7 @@ nvc0_screen_create(struct nouveau_device *dev)
    PUSH_DATA (push, 0);
 
    if (screen->eng3d->oclass < GM107_3D_CLASS) {
-      ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 20, NULL,
+      ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 20, NULL,
                            &screen->poly_cache);
       if (ret)
          goto fail;
@@ -919,7 +926,7 @@ nvc0_screen_create(struct nouveau_device *dev)
       PUSH_DATA (push, 3);
    }
 
-   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 1 << 17, 1 << 17, NULL,
+   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 1 << 17, 1 << 17, NULL,
                         &screen->txc);
    if (ret)
       goto fail;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 1a7d502..ef2bd43 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -27,10 +27,35 @@ struct nvc0_context;
 
 struct nvc0_blitter;
 
+struct nvc0_graph_state { /* type of nvc0_context.state; the screen keeps a copy in save_state */
+   boolean flushed;
+   boolean rasterizer_discard;
+   boolean early_z_forced;
+   boolean prim_restart;
+   uint32_t instance_elts; /* bitmask of per-instance elements */
+   uint32_t instance_base;
+   uint32_t constant_vbos;
+   uint32_t constant_elts;
+   int32_t index_bias;
+   uint16_t scissor;
+   uint8_t vbo_mode; /* 0 = normal, 1 = translate, 3 = translate, forced */
+   uint8_t num_vtxbufs;
+   uint8_t num_vtxelts;
+   uint8_t num_textures[6];
+   uint8_t num_samplers[6];
+   uint8_t tls_required; /* bitmask of shader types using l[] */
+   uint8_t c14_bound; /* whether immediate array constbuf is bound */
+   uint8_t clip_enable;
+   uint32_t clip_mode;
+   uint32_t uniform_buffer_bound[5];
+   struct nvc0_transform_feedback_state *tfb; /* set to NULL in the save_state copy made by nvc0_destroy() */
+};
+
 struct nvc0_screen {
    struct nouveau_screen base;
 
    struct nvc0_context *cur_ctx;
+   struct nvc0_graph_state save_state;
 
    int num_occlusion_queries_active;
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
index 516b33b..e084278 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c
@@ -34,7 +34,7 @@ nvc0_program_update_context_state(struct nvc0_context *nvc0,
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
 
    if (prog && prog->need_tls) {
-      const uint32_t flags = NOUVEAU_BO_VRAM | NOUVEAU_BO_RDWR;
+      const uint32_t flags = NV_VRAM_DOMAIN(&nvc0->screen->base) | NOUVEAU_BO_RDWR;
       if (!nvc0->state.tls_required)
          BCTX_REFN_bo(nvc0->bufctx_3d, TLS, flags, nvc0->screen->tls);
       nvc0->state.tls_required |= 1 << stage;
@@ -262,11 +262,13 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
       if (tfb)
          targ->stride = tfb->stride[b];
 
+      buf = nv04_resource(targ->pipe.buffer);
+
+      BCTX_REFN(nvc0->bufctx_3d, TFB, buf, WR);
+
       if (!(nvc0->tfbbuf_dirty & (1 << b)))
          continue;
 
-      buf = nv04_resource(targ->pipe.buffer);
-
       if (!targ->clean)
          nvc0_query_fifo_wait(push, targ->pq);
       BEGIN_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 5);
@@ -280,7 +282,6 @@ nvc0_tfb_validate(struct nvc0_context *nvc0)
          PUSH_DATA(push, 0); /* TFB_BUFFER_OFFSET */
          targ->clean = FALSE;
       }
-      BCTX_REFN(nvc0->bufctx_3d, TFB, buf, WR);
    }
    for (; b < 4; ++b)
       IMMED_NVC0(push, NVC0_3D(TFB_BUFFER_ENABLE(b)), 0);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
index dca06f4..6b7a211 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c
@@ -413,24 +413,6 @@ nvc0_zsa_state_delete(struct pipe_context *pipe, void *hwcso)
 #define NV50_TSC_WRAP_CASE(n) \
     case PIPE_TEX_WRAP_##n: return NV50_TSC_WRAP_##n
 
-static INLINE unsigned
-nv50_tsc_wrap_mode(unsigned wrap)
-{
-   switch (wrap) {
-   NV50_TSC_WRAP_CASE(REPEAT);
-   NV50_TSC_WRAP_CASE(MIRROR_REPEAT);
-   NV50_TSC_WRAP_CASE(CLAMP_TO_EDGE);
-   NV50_TSC_WRAP_CASE(CLAMP_TO_BORDER);
-   NV50_TSC_WRAP_CASE(CLAMP);
-   NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_EDGE);
-   NV50_TSC_WRAP_CASE(MIRROR_CLAMP_TO_BORDER);
-   NV50_TSC_WRAP_CASE(MIRROR_CLAMP);
-   default:
-       NOUVEAU_ERR("unknown wrap mode: %d\n", wrap);
-       return NV50_TSC_WRAP_REPEAT;
-   }
-}
-
 static void
 nvc0_sampler_state_delete(struct pipe_context *pipe, void *hwcso)
 {
@@ -811,12 +793,12 @@ nvc0_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
    nvc0->constbuf[s][i].user = (cb && cb->user_buffer) ? TRUE : FALSE;
    if (nvc0->constbuf[s][i].user) {
       nvc0->constbuf[s][i].u.data = cb->user_buffer;
-      nvc0->constbuf[s][i].size = cb->buffer_size;
+      nvc0->constbuf[s][i].size = MIN2(cb->buffer_size, 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
    } else
    if (cb) {
       nvc0->constbuf[s][i].offset = cb->buffer_offset;
-      nvc0->constbuf[s][i].size = align(cb->buffer_size, 0x100);
+      nvc0->constbuf[s][i].size = MIN2(align(cb->buffer_size, 0x100), 0x10000);
       nvc0->constbuf_valid[s] |= 1 << i;
    }
    else {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index 6051f12..c52399a 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -439,7 +439,7 @@ nvc0_constbufs_validate(struct nvc0_context *nvc0)
                BEGIN_NVC0(push, NVC0_3D(CB_BIND(s)), 1);
                PUSH_DATA (push, (0 << 4) | 1);
             }
-            nvc0_cb_push(&nvc0->base, bo, NOUVEAU_BO_VRAM,
+            nvc0_cb_push(&nvc0->base, bo, NV_VRAM_DOMAIN(&nvc0->screen->base),
                          base, nvc0->state.uniform_buffer_bound[s],
                          0, (size + 3) / 4,
                          nvc0->constbuf[s][0].u.data);
@@ -543,6 +543,8 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to)
 
    if (ctx_from)
       ctx_to->state = ctx_from->state;
+   else
+      ctx_to->state = ctx_to->screen->save_state;
 
    ctx_to->dirty = ~0;
    ctx_to->viewports_dirty = ~0;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index 4404d8c..a820de7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1152,6 +1152,12 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
                       NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 |
                       NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST);
    }
+   if (nvc0->state.instance_elts) {
+      nvc0->state.instance_elts = 0;
+      BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2);
+      PUSH_DATA (push, n);
+      PUSH_DATA (push, 0);
+   }
    nvc0->state.num_vtxelts = 2;
 
    for (i = 0; i < info->dst.box.depth; ++i, z += dz) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 457f27c..ddc0409 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -396,7 +396,7 @@ nvc0_validate_tsc(struct nvc0_context *nvc0, int s)
          tsc->id = nvc0_screen_tsc_alloc(nvc0->screen, tsc);
 
          nvc0_m2mf_push_linear(&nvc0->base, nvc0->screen->txc,
-                               65536 + tsc->id * 32, NOUVEAU_BO_VRAM,
+                               65536 + tsc->id * 32, NV_VRAM_DOMAIN(&nvc0->screen->base),
                                32, tsc->tsc);
          need_flush = TRUE;
       }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index 657b8c0..8cf2584 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -829,6 +829,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
    }
    PUSH_DATA(push, nvc0_prim_gl(info->mode));
 #define NVC0_IB_ENTRY_1_NO_PREFETCH (1 << (31 - 8))
+   PUSH_REFN(push, buf->bo, NOUVEAU_BO_RD | buf->domain);
    nouveau_pushbuf_space(push, 0, 0, 1);
    nouveau_pushbuf_data(push,
                         buf->bo, offset, NVC0_IB_ENTRY_1_NO_PREFETCH | size);
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index f243316..fce02a7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -63,7 +63,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
       return ret;
    }
 
-   ret = nouveau_bo_new(dev, NOUVEAU_BO_VRAM, 0, NVE4_CP_PARAM_SIZE, NULL,
+   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL,
                         &screen->parm);
    if (ret)
       return ret;
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index a7b59d8..a7bca91 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -190,6 +190,7 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
         case PIPE_CAP_POLYGON_OFFSET_CLAMP:
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+        case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
             return 0;
 
         /* SWTCL-only features. */
@@ -273,6 +274,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
             return (is_r500 ? 256 : 32) * sizeof(float[4]);
         case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
+        case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
             return 1;
         case PIPE_SHADER_CAP_MAX_TEMPS:
             return is_r500 ? 128 : is_r400 ? 64 : 32;
@@ -332,6 +334,7 @@ static int r300_get_shader_param(struct pipe_screen *pscreen, unsigned shader, e
         case PIPE_SHADER_CAP_MAX_PREDS:
             return 0; /* unused */
         case PIPE_SHADER_CAP_INDIRECT_CONST_ADDR:
+        case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
             return 1;
         case PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS:
         case PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS:
diff --git a/src/gallium/drivers/r300/r300_tgsi_to_rc.c b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
index 69afb4c..23ed2cf 100644
--- a/src/gallium/drivers/r300/r300_tgsi_to_rc.c
+++ b/src/gallium/drivers/r300/r300_tgsi_to_rc.c
@@ -133,13 +133,7 @@ static unsigned translate_opcode(unsigned opcode)
 
 static unsigned translate_saturate(unsigned saturate)
 {
-    switch(saturate) {
-        default:
-            fprintf(stderr, "Unknown saturate mode: %i\n", saturate);
-            /* fall-through */
-        case TGSI_SAT_NONE: return RC_SATURATE_NONE;
-        case TGSI_SAT_ZERO_ONE: return RC_SATURATE_ZERO_ONE;
-    }
+    return saturate ? RC_SATURATE_ZERO_ONE : RC_SATURATE_NONE;
 }
 
 static unsigned translate_register_file(unsigned file)
diff --git a/src/gallium/drivers/r600/Android.mk b/src/gallium/drivers/r600/Android.mk
index e935759..bfe3987 100644
--- a/src/gallium/drivers/r600/Android.mk
+++ b/src/gallium/drivers/r600/Android.mk
@@ -33,6 +33,11 @@ LOCAL_SRC_FILES := $(C_SOURCES) $(CXX_SOURCES)
 LOCAL_SHARED_LIBRARIES := libdrm libdrm_radeon
 LOCAL_MODULE := libmesa_pipe_r600
 
+# Lollipop builds take the C++ headers from libc++; other builds use stlport.
+ifeq ($(MESA_LOLLIPOP_BUILD),true)
+LOCAL_C_INCLUDES += external/libcxx/include
+else
 include external/stlport/libstlport.mk
+endif
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 21e5d42..e122b60 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -332,6 +332,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 		return 0;
 
 	/* Stream output. */
@@ -475,6 +476,7 @@ static int r600_get_shader_param(struct pipe_screen* pscreen, unsigned shader, e
 	case PIPE_SHADER_CAP_SUBROUTINES:
 		return 0;
 	case PIPE_SHADER_CAP_INTEGERS:
+	case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
 		return 1;
 	case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
 	case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 87b6e6e..af7622e 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -617,98 +617,100 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 
 	switch (d->Declaration.File) {
 	case TGSI_FILE_INPUT:
-		i = ctx->shader->ninput;
-                assert(i < Elements(ctx->shader->input));
-		ctx->shader->ninput += count;
-		ctx->shader->input[i].name = d->Semantic.Name;
-		ctx->shader->input[i].sid = d->Semantic.Index;
-		ctx->shader->input[i].interpolate = d->Interp.Interpolate;
-		ctx->shader->input[i].interpolate_location = d->Interp.Location;
-		ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First;
-		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-			ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
-			switch (ctx->shader->input[i].name) {
-			case TGSI_SEMANTIC_FACE:
-				if (ctx->face_gpr != -1)
-					ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
-				else
-					ctx->face_gpr = ctx->shader->input[i].gpr;
-				break;
-			case TGSI_SEMANTIC_COLOR:
-				ctx->colors_used++;
-				break;
-			case TGSI_SEMANTIC_POSITION:
-				ctx->fragcoord_input = i;
-				break;
-			case TGSI_SEMANTIC_PRIMID:
-				/* set this for now */
-				ctx->shader->gs_prim_id_input = true;
-				ctx->shader->ps_prim_id_input = i;
-				break;
-			}
-			if (ctx->bc->chip_class >= EVERGREEN) {
-				if ((r = evergreen_interp_input(ctx, i)))
-					return r;
+		for (j = 0; j < count; j++) {
+			i = ctx->shader->ninput + j;
+			assert(i < Elements(ctx->shader->input));
+			ctx->shader->input[i].name = d->Semantic.Name;
+			ctx->shader->input[i].sid = d->Semantic.Index + j;
+			ctx->shader->input[i].interpolate = d->Interp.Interpolate;
+			ctx->shader->input[i].interpolate_location = d->Interp.Location;
+			ctx->shader->input[i].gpr = ctx->file_offset[TGSI_FILE_INPUT] + d->Range.First + j;
+			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+				ctx->shader->input[i].spi_sid = r600_spi_sid(&ctx->shader->input[i]);
+				switch (ctx->shader->input[i].name) {
+				case TGSI_SEMANTIC_FACE:
+					if (ctx->face_gpr != -1)
+						ctx->shader->input[i].gpr = ctx->face_gpr; /* already allocated by allocate_system_value_inputs */
+					else
+						ctx->face_gpr = ctx->shader->input[i].gpr;
+					break;
+				case TGSI_SEMANTIC_COLOR:
+					ctx->colors_used++;
+					break;
+				case TGSI_SEMANTIC_POSITION:
+					ctx->fragcoord_input = i;
+					break;
+				case TGSI_SEMANTIC_PRIMID:
+					/* set this for now */
+					ctx->shader->gs_prim_id_input = true;
+					ctx->shader->ps_prim_id_input = i;
+					break;
+				}
+				if (ctx->bc->chip_class >= EVERGREEN) {
+					if ((r = evergreen_interp_input(ctx, i)))
+						return r;
+				}
+			} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+				/* FIXME probably skip inputs if they aren't passed in the ring */
+				ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
+				ctx->next_ring_offset += 16;
+				if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
+					ctx->shader->gs_prim_id_input = true;
 			}
-		} else if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
-			/* FIXME probably skip inputs if they aren't passed in the ring */
-			ctx->shader->input[i].ring_offset = ctx->next_ring_offset;
-			ctx->next_ring_offset += 16;
-			if (ctx->shader->input[i].name == TGSI_SEMANTIC_PRIMID)
-				ctx->shader->gs_prim_id_input = true;
-		}
-		for (j = 1; j < count; ++j) {
-			ctx->shader->input[i + j] = ctx->shader->input[i];
-			ctx->shader->input[i + j].gpr += j;
 		}
+		ctx->shader->ninput += count;
 		break;
 	case TGSI_FILE_OUTPUT:
-		i = ctx->shader->noutput++;
-                assert(i < Elements(ctx->shader->output));
-		ctx->shader->output[i].name = d->Semantic.Name;
-		ctx->shader->output[i].sid = d->Semantic.Index;
-		ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First;
-		ctx->shader->output[i].interpolate = d->Interp.Interpolate;
-		ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
-		if (ctx->type == TGSI_PROCESSOR_VERTEX ||
-				ctx->type == TGSI_PROCESSOR_GEOMETRY) {
-			ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
-			switch (d->Semantic.Name) {
-			case TGSI_SEMANTIC_CLIPDIST:
-				ctx->shader->clip_dist_write |= d->Declaration.UsageMask << (d->Semantic.Index << 2);
-				break;
-			case TGSI_SEMANTIC_PSIZE:
-				ctx->shader->vs_out_misc_write = 1;
-				ctx->shader->vs_out_point_size = 1;
-				break;
-			case TGSI_SEMANTIC_EDGEFLAG:
-				ctx->shader->vs_out_misc_write = 1;
-				ctx->shader->vs_out_edgeflag = 1;
-				ctx->edgeflag_output = i;
-				break;
-			case TGSI_SEMANTIC_VIEWPORT_INDEX:
-				ctx->shader->vs_out_misc_write = 1;
-				ctx->shader->vs_out_viewport = 1;
-				break;
-			case TGSI_SEMANTIC_LAYER:
-				ctx->shader->vs_out_misc_write = 1;
-				ctx->shader->vs_out_layer = 1;
-				break;
-			case TGSI_SEMANTIC_CLIPVERTEX:
-				ctx->clip_vertex_write = TRUE;
-				ctx->cv_output = i;
-				break;
-			}
-			if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
-				ctx->gs_out_ring_offset += 16;
-			}
-		} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
-			switch (d->Semantic.Name) {
-			case TGSI_SEMANTIC_COLOR:
-				ctx->shader->nr_ps_max_color_exports++;
-				break;
+		for (j = 0; j < count; j++) {
+			i = ctx->shader->noutput + j;
+			assert(i < Elements(ctx->shader->output));
+			ctx->shader->output[i].name = d->Semantic.Name;
+			ctx->shader->output[i].sid = d->Semantic.Index + j;
+			ctx->shader->output[i].gpr = ctx->file_offset[TGSI_FILE_OUTPUT] + d->Range.First + j;
+			ctx->shader->output[i].interpolate = d->Interp.Interpolate;
+			ctx->shader->output[i].write_mask = d->Declaration.UsageMask;
+			if (ctx->type == TGSI_PROCESSOR_VERTEX ||
+			    ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+				ctx->shader->output[i].spi_sid = r600_spi_sid(&ctx->shader->output[i]);
+				switch (d->Semantic.Name) {
+				case TGSI_SEMANTIC_CLIPDIST:
+					ctx->shader->clip_dist_write |= d->Declaration.UsageMask <<
+									((d->Semantic.Index + j) << 2);
+					break;
+				case TGSI_SEMANTIC_PSIZE:
+					ctx->shader->vs_out_misc_write = 1;
+					ctx->shader->vs_out_point_size = 1;
+					break;
+				case TGSI_SEMANTIC_EDGEFLAG:
+					ctx->shader->vs_out_misc_write = 1;
+					ctx->shader->vs_out_edgeflag = 1;
+					ctx->edgeflag_output = i;
+					break;
+				case TGSI_SEMANTIC_VIEWPORT_INDEX:
+					ctx->shader->vs_out_misc_write = 1;
+					ctx->shader->vs_out_viewport = 1;
+					break;
+				case TGSI_SEMANTIC_LAYER:
+					ctx->shader->vs_out_misc_write = 1;
+					ctx->shader->vs_out_layer = 1;
+					break;
+				case TGSI_SEMANTIC_CLIPVERTEX:
+					ctx->clip_vertex_write = TRUE;
+					ctx->cv_output = i;
+					break;
+				}
+				if (ctx->type == TGSI_PROCESSOR_GEOMETRY) {
+					ctx->gs_out_ring_offset += 16;
+				}
+			} else if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
+				switch (d->Semantic.Name) {
+				case TGSI_SEMANTIC_COLOR:
+					ctx->shader->nr_ps_max_color_exports++;
+					break;
+				}
 			}
 		}
+		ctx->shader->noutput += count;
 		break;
 	case TGSI_FILE_TEMPORARY:
 		if (ctx->info.indirect_files & (1 << TGSI_FILE_TEMPORARY)) {
@@ -723,6 +725,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 
 	case TGSI_FILE_CONSTANT:
 	case TGSI_FILE_SAMPLER:
+	case TGSI_FILE_SAMPLER_VIEW:
 	case TGSI_FILE_ADDRESS:
 		break;
 
@@ -1337,7 +1340,7 @@ static int emit_streamout(struct r600_shader_ctx *ctx, struct pipe_stream_output
 	int i, j, r;
 
 	/* Sanity checking. */
-	if (so->num_outputs > PIPE_MAX_SHADER_OUTPUTS) {
+	if (so->num_outputs > PIPE_MAX_SO_OUTPUTS) {
 		R600_ERR("Too many stream outputs: %d\n", so->num_outputs);
 		r = -EINVAL;
 		goto out_err;
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index c50c705..13dc9ee 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -95,22 +95,23 @@ static void r600_texture_barrier(struct pipe_context *ctx)
 static unsigned r600_conv_pipe_prim(unsigned prim)
 {
 	static const unsigned prim_conv[] = {
-		V_008958_DI_PT_POINTLIST,
-		V_008958_DI_PT_LINELIST,
-		V_008958_DI_PT_LINELOOP,
-		V_008958_DI_PT_LINESTRIP,
-		V_008958_DI_PT_TRILIST,
-		V_008958_DI_PT_TRISTRIP,
-		V_008958_DI_PT_TRIFAN,
-		V_008958_DI_PT_QUADLIST,
-		V_008958_DI_PT_QUADSTRIP,
-		V_008958_DI_PT_POLYGON,
-		V_008958_DI_PT_LINELIST_ADJ,
-		V_008958_DI_PT_LINESTRIP_ADJ,
-		V_008958_DI_PT_TRILIST_ADJ,
-		V_008958_DI_PT_TRISTRIP_ADJ,
-		V_008958_DI_PT_RECTLIST
+		[PIPE_PRIM_POINTS]			= V_008958_DI_PT_POINTLIST,
+		[PIPE_PRIM_LINES]			= V_008958_DI_PT_LINELIST,
+		[PIPE_PRIM_LINE_LOOP]			= V_008958_DI_PT_LINELOOP,
+		[PIPE_PRIM_LINE_STRIP]			= V_008958_DI_PT_LINESTRIP,
+		[PIPE_PRIM_TRIANGLES]			= V_008958_DI_PT_TRILIST,
+		[PIPE_PRIM_TRIANGLE_STRIP]		= V_008958_DI_PT_TRISTRIP,
+		[PIPE_PRIM_TRIANGLE_FAN]		= V_008958_DI_PT_TRIFAN,
+		[PIPE_PRIM_QUADS]			= V_008958_DI_PT_QUADLIST,
+		[PIPE_PRIM_QUAD_STRIP]			= V_008958_DI_PT_QUADSTRIP,
+		[PIPE_PRIM_POLYGON]			= V_008958_DI_PT_POLYGON,
+		[PIPE_PRIM_LINES_ADJACENCY]		= V_008958_DI_PT_LINELIST_ADJ,
+		[PIPE_PRIM_LINE_STRIP_ADJACENCY]	= V_008958_DI_PT_LINESTRIP_ADJ,
+		[PIPE_PRIM_TRIANGLES_ADJACENCY]		= V_008958_DI_PT_TRILIST_ADJ,
+		[PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY]	= V_008958_DI_PT_TRISTRIP_ADJ,
+		[R600_PRIM_RECTANGLE_LIST]		= V_008958_DI_PT_RECTLIST
 	};
+	assert(prim < Elements(prim_conv));
 	return prim_conv[prim];
 }
 
diff --git a/src/gallium/drivers/radeon/Android.mk b/src/gallium/drivers/radeon/Android.mk
index d615792..6997a6d 100644
--- a/src/gallium/drivers/radeon/Android.mk
+++ b/src/gallium/drivers/radeon/Android.mk
@@ -30,6 +30,10 @@ include $(CLEAR_VARS)
 
 LOCAL_SRC_FILES := $(C_SOURCES)
 
+ifeq ($(MESA_ENABLE_LLVM),true)
+LOCAL_SRC_FILES += $(LLVM_C_FILES)
+endif
+
 LOCAL_SHARED_LIBRARIES := libdrm libdrm_radeon
 LOCAL_MODULE := libmesa_pipe_radeon
 
diff --git a/src/gallium/drivers/radeon/Makefile.sources b/src/gallium/drivers/radeon/Makefile.sources
index c655fe5..f63790c 100644
--- a/src/gallium/drivers/radeon/Makefile.sources
+++ b/src/gallium/drivers/radeon/Makefile.sources
@@ -12,6 +12,7 @@ C_SOURCES := \
 	radeon_uvd.c \
 	radeon_uvd.h \
 	radeon_vce_40_2_2.c \
+	radeon_vce_50.c \
 	radeon_vce.c \
 	radeon_vce.h \
 	radeon_video.c \
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index 42e681d..3def444 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -107,11 +107,10 @@ void r600_draw_rectangle(struct blitter_context *blitter,
 
 void r600_need_dma_space(struct r600_common_context *ctx, unsigned num_dw)
 {
-	/* The number of dwords we already used in the DMA so far. */
-	num_dw += ctx->rings.dma.cs->cdw;
 	/* Flush if there's not enough space. */
-	if (num_dw > RADEON_MAX_CMDBUF_DWORDS) {
+	if ((num_dw + ctx->rings.dma.cs->cdw) > RADEON_MAX_CMDBUF_DWORDS) {
 		ctx->rings.dma.flush(ctx, RADEON_FLUSH_ASYNC, NULL);
+		assert((num_dw + ctx->rings.dma.cs->cdw) <= RADEON_MAX_CMDBUF_DWORDS);
 	}
 }
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm.h b/src/gallium/drivers/radeon/radeon_llvm.h
index 8612ef8..6a9557b 100644
--- a/src/gallium/drivers/radeon/radeon_llvm.h
+++ b/src/gallium/drivers/radeon/radeon_llvm.h
@@ -33,7 +33,6 @@
 
 #define RADEON_LLVM_MAX_INPUTS 32 * 4
 #define RADEON_LLVM_MAX_OUTPUTS 32 * 4
-#define RADEON_LLVM_MAX_ARRAYS 16
 
 #define RADEON_LLVM_INITIAL_CF_DEPTH 4
 
@@ -130,8 +129,7 @@ struct radeon_llvm_context {
 	unsigned loop_depth;
 	unsigned loop_depth_max;
 
-	struct tgsi_declaration_range arrays[RADEON_LLVM_MAX_ARRAYS];
-	unsigned num_arrays;
+	struct tgsi_declaration_range *arrays;
 
 	LLVMValueRef main_fn;
 
diff --git a/src/gallium/drivers/radeon/radeon_llvm_emit.c b/src/gallium/drivers/radeon/radeon_llvm_emit.c
index 624077c..25580b6 100644
--- a/src/gallium/drivers/radeon/radeon_llvm_emit.c
+++ b/src/gallium/drivers/radeon/radeon_llvm_emit.c
@@ -86,10 +86,18 @@ static void init_r600_target()
 {
 	static unsigned initialized = 0;
 	if (!initialized) {
+#if HAVE_LLVM < 0x0307
 		LLVMInitializeR600TargetInfo();
 		LLVMInitializeR600Target();
 		LLVMInitializeR600TargetMC();
 		LLVMInitializeR600AsmPrinter();
+#else
+		LLVMInitializeAMDGPUTargetInfo();
+		LLVMInitializeAMDGPUTarget();
+		LLVMInitializeAMDGPUTargetMC();
+		LLVMInitializeAMDGPUAsmPrinter();
+
+#endif
 		initialized = 1;
 	}
 }
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 20e506b..c8c980d 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -85,8 +85,9 @@ get_array_range(struct lp_build_tgsi_context *bld_base,
 		unsigned File, const struct tgsi_ind_register *reg)
 {
 	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
+
 	if (File != TGSI_FILE_TEMPORARY || reg->ArrayID == 0 ||
-            reg->ArrayID > RADEON_LLVM_MAX_ARRAYS) {
+	    reg->ArrayID > bld_base->info->array_max[TGSI_FILE_TEMPORARY]) {
 		struct tgsi_declaration_range range;
 		range.First = 0;
 		range.Last = bld_base->info->file_max[File];
@@ -252,8 +253,14 @@ static void emit_declaration(
 	}
 
 	case TGSI_FILE_TEMPORARY:
-		if (decl->Declaration.Array && decl->Array.ArrayID <= RADEON_LLVM_MAX_ARRAYS)
+		if (decl->Declaration.Array) {
+			if (!ctx->arrays) {
+				int size = bld_base->info->array_max[TGSI_FILE_TEMPORARY];
+				ctx->arrays = MALLOC(sizeof(ctx->arrays[0]) * size);
+			}
+
 			ctx->arrays[decl->Array.ArrayID - 1] = decl->Range;
+		}
 		if (uses_temp_indirect_addressing(bld_base)) {
 			lp_emit_declaration_soa(bld_base, decl);
 			break;
@@ -314,6 +321,21 @@ static void emit_declaration(
 	}
 }
 
+static LLVMValueRef radeon_llvm_saturate(struct lp_build_tgsi_context *bld_base,
+                                         LLVMValueRef value)
+{
+	/* Emit CLAMP(value, base.zero, base.one) through the generic emitter. */
+	struct lp_build_emit_data clamp_args;
+
+	memset(&clamp_args, 0, sizeof(clamp_args));
+	clamp_args.arg_count = 3;
+	clamp_args.args[0] = value;
+	clamp_args.args[1] = bld_base->base.zero;
+	clamp_args.args[2] = bld_base->base.one;
+
+	return lp_build_emit_llvm(bld_base, TGSI_OPCODE_CLAMP, &clamp_args);
+}
+
 static void
 emit_store(
 	struct lp_build_tgsi_context * bld_base,
@@ -324,7 +346,6 @@ emit_store(
 	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
 	struct lp_build_tgsi_soa_context *bld = lp_soa_context(bld_base);
 	struct gallivm_state *gallivm = bld->bld_base.base.gallivm;
-	struct lp_build_context base = bld->bld_base.base;
 	const struct tgsi_full_dst_register *reg = &inst->Dst[0];
 	LLVMBuilderRef builder = bld->bld_base.base.gallivm->builder;
 	LLVMValueRef temp_ptr;
@@ -350,28 +371,8 @@ emit_store(
 	TGSI_FOR_EACH_DST0_ENABLED_CHANNEL( inst, chan_index ) {
 		LLVMValueRef value = dst[chan_index];
 
-		if (inst->Instruction.Saturate != TGSI_SAT_NONE) {
-			struct lp_build_emit_data clamp_emit_data;
-
-			memset(&clamp_emit_data, 0, sizeof(clamp_emit_data));
-			clamp_emit_data.arg_count = 3;
-			clamp_emit_data.args[0] = value;
-			clamp_emit_data.args[2] = base.one;
-
-			switch(inst->Instruction.Saturate) {
-			case TGSI_SAT_ZERO_ONE:
-				clamp_emit_data.args[1] = base.zero;
-				break;
-			case TGSI_SAT_MINUS_PLUS_ONE:
-				clamp_emit_data.args[1] = LLVMConstReal(
-						base.elem_type, -1.0f);
-				break;
-			default:
-				assert(0);
-			}
-			value = lp_build_emit_llvm(bld_base, TGSI_OPCODE_CLAMP,
-						&clamp_emit_data);
-		}
+		if (inst->Instruction.Saturate)
+			value = radeon_llvm_saturate(bld_base, value);
 
 		if (reg->Register.File == TGSI_FILE_ADDRESS) {
 			temp_ptr = bld->addr[reg->Register.Index][chan_index];
@@ -1438,8 +1439,6 @@ void radeon_llvm_context_init(struct radeon_llvm_context * ctx)
 	/* Allocate outputs */
 	ctx->soa.outputs = ctx->outputs;
 
-	ctx->num_arrays = 0;
-
 	/* XXX: Is there a better way to initialize all this ? */
 
 	lp_set_default_actions(bld_base);
@@ -1628,8 +1627,11 @@ void radeon_llvm_dispose(struct radeon_llvm_context * ctx)
 {
 	LLVMDisposeModule(ctx->soa.bld_base.base.gallivm->module);
 	LLVMContextDispose(ctx->soa.bld_base.base.gallivm->context);
+	FREE(ctx->arrays);
+	ctx->arrays = NULL;
 	FREE(ctx->temps);
 	ctx->temps = NULL;
+	ctx->temps_count = 0;
 	FREE(ctx->loop);
 	ctx->loop = NULL;
 	ctx->loop_depth_max = 0;
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index e220f40..a656737 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -44,6 +44,10 @@
 #include "radeon_video.h"
 #include "radeon_vce.h"
 
+#define FW_40_2_2 ((40 << 24) | (2 << 16) | (2 << 8))
+#define FW_50_0_1 ((50 << 24) | (0 << 16) | (1 << 8))
+#define FW_50_1_2 ((50 << 24) | (1 << 16) | (2 << 8))
+
 /**
  * flush commands to the hardware
  */
@@ -183,6 +187,44 @@ static unsigned get_cpb_num(struct rvce_encoder *enc)
 }
 
 /**
+ * Get the slot for the currently encoded frame (entry at cpb_slots.prev)
+ */
+struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc)
+{
+	return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.prev, list);
+}
+
+/**
+ * Get the slot for L0 (entry at cpb_slots.next)
+ */
+struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc)
+{
+	return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next, list);
+}
+
+/**
+ * Get the slot for L1 (entry at cpb_slots.next->next)
+ */
+struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc)
+{
+	return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next->next, list);
+}
+
+/**
+ * Compute the luma and chroma offsets of a slot's frame inside the CPB
+ */
+void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
+		       unsigned *luma_offset, unsigned *chroma_offset)
+{
+	const unsigned stride = align(enc->luma->level[0].pitch_bytes, 128);
+	const unsigned rows = align(enc->luma->npix_y, 16);
+	const unsigned luma_bytes = stride * rows;
+	/* one slot spans stride * (rows + rows / 2) bytes; chroma follows luma */
+	*luma_offset = slot->index * (stride * (rows + rows / 2));
+	*chroma_offset = *luma_offset + luma_bytes;
+}
+
+/**
  * destroy this video encoder
  */
 static void rvce_destroy(struct pipe_video_codec *encoder)
@@ -406,7 +448,19 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 
 	reset_cpb(enc);
 
-	radeon_vce_40_2_2_init(enc);
+	switch (rscreen->info.vce_fw_version) {
+	case FW_40_2_2:
+		radeon_vce_40_2_2_init(enc);
+		break;
+
+	case FW_50_0_1:
+	case FW_50_1_2:
+		radeon_vce_50_init(enc);
+		break;
+
+	default:
+		goto error;
+	}
 
 	return &enc->base;
 
@@ -426,5 +480,7 @@ error:
  */
 bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
 {
-	return rscreen->info.vce_fw_version == ((40 << 24) | (2 << 16) | (2 << 8));
+	return rscreen->info.vce_fw_version == FW_40_2_2 ||
+		rscreen->info.vce_fw_version == FW_50_0_1 ||
+		rscreen->info.vce_fw_version == FW_50_1_2;
 }
diff --git a/src/gallium/drivers/radeon/radeon_vce.h b/src/gallium/drivers/radeon/radeon_vce.h
index 1cf0180..8319ef4 100644
--- a/src/gallium/drivers/radeon/radeon_vce.h
+++ b/src/gallium/drivers/radeon/radeon_vce.h
@@ -104,6 +104,13 @@ struct rvce_encoder {
 	bool use_vui;
 };
 
+/* CPB handling functions */
+struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc);
+struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc);
+struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc);
+void rvce_frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
+		       unsigned *luma_offset, unsigned *chroma_offset);
+
 struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 					     const struct pipe_video_codec *templat,
 					     struct radeon_winsys* ws,
@@ -114,4 +121,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen);
 /* init vce fw 40.2.2 specific callbacks */
 void radeon_vce_40_2_2_init(struct rvce_encoder *enc);
 
+/* init vce fw 50 specific callbacks */
+void radeon_vce_50_init(struct rvce_encoder *enc);
+
 #endif
diff --git a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
index 0902957..51b17b5 100644
--- a/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
+++ b/src/gallium/drivers/radeon/radeon_vce_40_2_2.c
@@ -46,32 +46,6 @@
 
 static const unsigned profiles[7] = { 66, 77, 88, 100, 110, 122, 244 };
 
-static struct rvce_cpb_slot *current_slot(struct rvce_encoder *enc)
-{
-	return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.prev, list);
-}
-
-static struct rvce_cpb_slot *l0_slot(struct rvce_encoder *enc)
-{
-	return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next, list);
-}
-
-static struct rvce_cpb_slot *l1_slot(struct rvce_encoder *enc)
-{
-	return LIST_ENTRY(struct rvce_cpb_slot, enc->cpb_slots.next->next, list);
-}
-
-static void frame_offset(struct rvce_encoder *enc, struct rvce_cpb_slot *slot,
-			 unsigned *luma_offset, unsigned *chroma_offset)
-{
-	unsigned pitch = align(enc->luma->level[0].pitch_bytes, 128);
-	unsigned vpitch = align(enc->luma->npix_y, 16);
-	unsigned fsize = pitch * (vpitch + vpitch / 2);
-
-	*luma_offset = slot->index * fsize;
-	*chroma_offset = *luma_offset + pitch * vpitch;
-}
-
 static void session(struct rvce_encoder *enc)
 {
 	RVCE_BEGIN(0x00000001); // session cmd
@@ -369,7 +343,7 @@ static void encode(struct rvce_encoder *enc)
 	if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P ||
 	   enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
 		struct rvce_cpb_slot *l0 = l0_slot(enc);
-		frame_offset(enc, l0, &luma_offset, &chroma_offset);
+		rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset);
 		RVCE_CS(l0->picture_type); // encPicType
 		RVCE_CS(l0->frame_num); // frameNumber
 		RVCE_CS(l0->pic_order_cnt); // pictureOrderCount
@@ -395,7 +369,7 @@ static void encode(struct rvce_encoder *enc)
 	RVCE_CS(0x00000000); // pictureStructure
 	if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
 		struct rvce_cpb_slot *l1 = l1_slot(enc);
-		frame_offset(enc, l1, &luma_offset, &chroma_offset);
+		rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset);
 		RVCE_CS(l1->picture_type); // encPicType
 		RVCE_CS(l1->frame_num); // frameNumber
 		RVCE_CS(l1->pic_order_cnt); // pictureOrderCount
@@ -409,7 +383,7 @@ static void encode(struct rvce_encoder *enc)
 		RVCE_CS(0xffffffff); // chromaOffset
 	}
 
-	frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset);
+	rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset);
 	RVCE_CS(luma_offset); // encReconstructedLumaOffset
 	RVCE_CS(chroma_offset); // encReconstructedChromaOffset
 	RVCE_CS(0x00000000); // encColocBufferOffset
diff --git a/src/gallium/drivers/radeon/radeon_vce_50.c b/src/gallium/drivers/radeon/radeon_vce_50.c
new file mode 100644
index 0000000..84a2bfb
--- /dev/null
+++ b/src/gallium/drivers/radeon/radeon_vce_50.c
@@ -0,0 +1,228 @@
+/**************************************************************************
+ *
+ * Copyright 2013 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Authors:
+ *      Christian König <christian.koenig@amd.com>
+ *
+ */
+
+#include <stdio.h>
+
+#include "pipe/p_video_codec.h"
+
+#include "util/u_video.h"
+#include "util/u_memory.h"
+
+#include "vl/vl_video_buffer.h"
+
+#include "r600_pipe_common.h"
+#include "radeon_video.h"
+#include "radeon_vce.h"
+
+static void task_info(struct rvce_encoder *enc, uint32_t taskOperation)
+{
+	RVCE_BEGIN(0x00000002); // task info
+	RVCE_CS(0xffffffff); // offsetOfNextTaskInfo
+	RVCE_CS(taskOperation); // taskOperation
+	RVCE_CS(0x00000000); // referencePictureDependency
+	RVCE_CS(0x00000000); // collocateFlagDependency
+	RVCE_CS(0x00000000); // feedbackIndex
+	RVCE_CS(0x00000000); // videoBitstreamRingIndex
+	RVCE_END();
+}
+
+static void rate_control(struct rvce_encoder *enc)
+{
+	RVCE_BEGIN(0x04000005); // rate control
+	RVCE_CS(enc->pic.rate_ctrl.rate_ctrl_method); // encRateControlMethod
+	RVCE_CS(enc->pic.rate_ctrl.target_bitrate); // encRateControlTargetBitRate
+	RVCE_CS(enc->pic.rate_ctrl.peak_bitrate); // encRateControlPeakBitRate
+	RVCE_CS(enc->pic.rate_ctrl.frame_rate_num); // encRateControlFrameRateNum
+	RVCE_CS(0x00000000); // encGOPSize
+	RVCE_CS(enc->pic.quant_i_frames); // encQP_I
+	RVCE_CS(enc->pic.quant_p_frames); // encQP_P
+	RVCE_CS(enc->pic.quant_b_frames); // encQP_B
+	RVCE_CS(enc->pic.rate_ctrl.vbv_buffer_size); // encVBVBufferSize
+	RVCE_CS(enc->pic.rate_ctrl.frame_rate_den); // encRateControlFrameRateDen
+	RVCE_CS(0x00000000); // encVBVBufferLevel
+	RVCE_CS(0x00000000); // encMaxAUSize
+	RVCE_CS(0x00000000); // encQPInitialMode
+	RVCE_CS(enc->pic.rate_ctrl.target_bits_picture); // encTargetBitsPerPicture
+	RVCE_CS(enc->pic.rate_ctrl.peak_bits_picture_integer); // encPeakBitsPerPictureInteger
+	RVCE_CS(enc->pic.rate_ctrl.peak_bits_picture_fraction); // encPeakBitsPerPictureFractional
+	RVCE_CS(0x00000000); // encMinQP
+	RVCE_CS(0x00000033); // encMaxQP
+	RVCE_CS(0x00000000); // encSkipFrameEnable
+	RVCE_CS(0x00000000); // encFillerDataEnable
+	RVCE_CS(0x00000000); // encEnforceHRD
+	RVCE_CS(0x00000000); // encBPicsDeltaQP
+	RVCE_CS(0x00000000); // encReferenceBPicsDeltaQP
+	RVCE_CS(0x00000000); // encRateControlReInitDisable
+	RVCE_CS(0x00000000); // encLCVBRInitQPFlag
+	RVCE_CS(0x00000000); // encLCVBRSATDBasedNonlinearBitBudgetFlag
+	RVCE_END();
+}
+
+static void encode(struct rvce_encoder *enc)
+{
+	int i;
+	unsigned luma_offset, chroma_offset;
+
+	task_info(enc, 0x00000003);
+
+	RVCE_BEGIN(0x05000001); // context buffer
+	RVCE_READWRITE(enc->cpb.res->cs_buf, enc->cpb.res->domains); // encodeContextAddressHi
+	RVCE_CS(0x00000000); // encodeContextAddressLo
+	RVCE_END();
+
+	RVCE_BEGIN(0x05000004); // video bitstream buffer
+	RVCE_WRITE(enc->bs_handle, RADEON_DOMAIN_GTT); // videoBitstreamRingAddressHi
+	RVCE_CS(0x00000000); // videoBitstreamRingAddressLo
+	RVCE_CS(enc->bs_size); // videoBitstreamRingSize
+	RVCE_END();
+
+	RVCE_BEGIN(0x03000001); // encode
+	RVCE_CS(enc->pic.frame_num ? 0x0 : 0x11); // insertHeaders
+	RVCE_CS(0x00000000); // pictureStructure
+	RVCE_CS(enc->bs_size); // allowedMaxBitstreamSize
+	RVCE_CS(0x00000000); // forceRefreshMap
+	RVCE_CS(0x00000000); // insertAUD
+	RVCE_CS(0x00000000); // endOfSequence
+	RVCE_CS(0x00000000); // endOfStream
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureLumaAddressHi
+	RVCE_CS(enc->luma->level[0].offset); // inputPictureLumaAddressLo
+	RVCE_READ(enc->handle, RADEON_DOMAIN_VRAM); // inputPictureChromaAddressHi
+	RVCE_CS(enc->chroma->level[0].offset); // inputPictureChromaAddressLo
+	RVCE_CS(align(enc->luma->npix_y, 16)); // encInputFrameYPitch
+	RVCE_CS(enc->luma->level[0].pitch_bytes); // encInputPicLumaPitch
+	RVCE_CS(enc->chroma->level[0].pitch_bytes); // encInputPicChromaPitch
+	RVCE_CS(0x00010000); // encInputPic(Addr|Array)Mode,encDisable(TwoPipeMode|MBOffloading)
+	RVCE_CS(0x00000000); // encInputPicTileConfig
+	RVCE_CS(enc->pic.picture_type); // encPicType
+	RVCE_CS(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_IDR); // encIdrFlag
+	RVCE_CS(0x00000000); // encIdrPicId
+	RVCE_CS(0x00000000); // encMGSKeyPic
+	RVCE_CS(!enc->pic.not_referenced); // encReferenceFlag
+	RVCE_CS(0x00000000); // encTemporalLayerIndex
+	RVCE_CS(0x00000000); // num_ref_idx_active_override_flag
+	RVCE_CS(0x00000000); // num_ref_idx_l0_active_minus1
+	RVCE_CS(0x00000000); // num_ref_idx_l1_active_minus1
+
+	i = enc->pic.frame_num - enc->pic.ref_idx_l0;
+	if (i > 1 && enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P) {
+		RVCE_CS(0x00000001); // encRefListModificationOp
+		RVCE_CS(i - 1);      // encRefListModificationNum
+	} else {
+		RVCE_CS(0x00000000); // encRefListModificationOp
+		RVCE_CS(0x00000000); // encRefListModificationNum
+	}
+
+	for (i = 0; i < 3; ++i) {
+		RVCE_CS(0x00000000); // encRefListModificationOp
+		RVCE_CS(0x00000000); // encRefListModificationNum
+	}
+	for (i = 0; i < 4; ++i) {
+		RVCE_CS(0x00000000); // encDecodedPictureMarkingOp
+		RVCE_CS(0x00000000); // encDecodedPictureMarkingNum
+		RVCE_CS(0x00000000); // encDecodedPictureMarkingIdx
+		RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingOp
+		RVCE_CS(0x00000000); // encDecodedRefBasePictureMarkingNum
+	}
+
+	// encReferencePictureL0[0]
+	RVCE_CS(0x00000000); // pictureStructure
+	if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_P ||
+	   enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
+		struct rvce_cpb_slot *l0 = l0_slot(enc);
+		rvce_frame_offset(enc, l0, &luma_offset, &chroma_offset);
+		RVCE_CS(l0->picture_type); // encPicType
+		RVCE_CS(l0->frame_num); // frameNumber
+		RVCE_CS(l0->pic_order_cnt); // pictureOrderCount
+		RVCE_CS(luma_offset); // lumaOffset
+		RVCE_CS(chroma_offset); // chromaOffset
+	} else {
+		RVCE_CS(0x00000000); // encPicType
+		RVCE_CS(0x00000000); // frameNumber
+		RVCE_CS(0x00000000); // pictureOrderCount
+		RVCE_CS(0xffffffff); // lumaOffset
+		RVCE_CS(0xffffffff); // chromaOffset
+	}
+
+	// encReferencePictureL0[1]
+	RVCE_CS(0x00000000); // pictureStructure
+	RVCE_CS(0x00000000); // encPicType
+	RVCE_CS(0x00000000); // frameNumber
+	RVCE_CS(0x00000000); // pictureOrderCount
+	RVCE_CS(0xffffffff); // lumaOffset
+	RVCE_CS(0xffffffff); // chromaOffset
+
+	// encReferencePictureL1[0]
+	RVCE_CS(0x00000000); // pictureStructure
+	if(enc->pic.picture_type == PIPE_H264_ENC_PICTURE_TYPE_B) {
+		struct rvce_cpb_slot *l1 = l1_slot(enc);
+		rvce_frame_offset(enc, l1, &luma_offset, &chroma_offset);
+		RVCE_CS(l1->picture_type); // encPicType
+		RVCE_CS(l1->frame_num); // frameNumber
+		RVCE_CS(l1->pic_order_cnt); // pictureOrderCount
+		RVCE_CS(luma_offset); // lumaOffset
+		RVCE_CS(chroma_offset); // chromaOffset
+	} else {
+		RVCE_CS(0x00000000); // encPicType
+		RVCE_CS(0x00000000); // frameNumber
+		RVCE_CS(0x00000000); // pictureOrderCount
+		RVCE_CS(0xffffffff); // lumaOffset
+		RVCE_CS(0xffffffff); // chromaOffset
+	}
+
+	rvce_frame_offset(enc, current_slot(enc), &luma_offset, &chroma_offset);
+	RVCE_CS(luma_offset); // encReconstructedLumaOffset
+	RVCE_CS(chroma_offset); // encReconstructedChromaOffset
+	RVCE_CS(0x00000000); // encColocBufferOffset
+	RVCE_CS(0x00000000); // encReconstructedRefBasePictureLumaOffset
+	RVCE_CS(0x00000000); // encReconstructedRefBasePictureChromaOffset
+	RVCE_CS(0x00000000); // encReferenceRefBasePictureLumaOffset
+	RVCE_CS(0x00000000); // encReferenceRefBasePictureChromaOffset
+	RVCE_CS(0x00000000); // pictureCount
+	RVCE_CS(enc->pic.frame_num); // frameNumber
+	RVCE_CS(enc->pic.pic_order_cnt); // pictureOrderCount
+	RVCE_CS(0x00000000); // numIPicRemainInRCGOP
+	RVCE_CS(0x00000000); // numPPicRemainInRCGOP
+	RVCE_CS(0x00000000); // numBPicRemainInRCGOP
+	RVCE_CS(0x00000000); // numIRPicRemainInRCGOP
+	RVCE_CS(0x00000000); // enableIntraRefresh
+	RVCE_END();
+}
+
+void radeon_vce_50_init(struct rvce_encoder *enc)
+{
+	radeon_vce_40_2_2_init(enc);
+
+	/* only the two callbacks below are replaced by the versions in this file */
+	enc->rate_control = rate_control;
+	enc->encode = encode;
+}
diff --git a/src/gallium/drivers/radeonsi/Makefile.sources b/src/gallium/drivers/radeonsi/Makefile.sources
index 774dc22..2876c0a 100644
--- a/src/gallium/drivers/radeonsi/Makefile.sources
+++ b/src/gallium/drivers/radeonsi/Makefile.sources
@@ -1,4 +1,5 @@
 C_SOURCES := \
+	cik_sdma.c \
 	si_blit.c \
 	si_commands.c \
 	si_compute.c \
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
new file mode 100644
index 0000000..86111cb
--- /dev/null
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright 2010 Jerome Glisse <glisse@freedesktop.org>
+ * Copyright 2014,2015 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *      Jerome Glisse
+ */
+
+#include "sid.h"
+#include "si_pipe.h"
+#include "../radeon/r600_cs.h"
+
+#include "util/u_format.h"
+
+static uint32_t cik_micro_tile_mode(struct si_screen *sscreen, unsigned tile_mode)
+{
+	if (sscreen->b.info.si_tile_mode_array_valid) {
+		uint32_t gb_tile_mode = sscreen->b.info.si_tile_mode_array[tile_mode];
+
+		return G_009910_MICRO_TILE_MODE_NEW(gb_tile_mode);
+	}
+
+	/* No valid tile mode array is available from the kernel; guess thin micro tiling. */
+	return V_009910_ADDR_SURF_THIN_MICRO_TILING;
+}
+
+static void cik_sdma_do_copy_buffer(struct si_context *ctx,
+				    struct pipe_resource *dst,
+				    struct pipe_resource *src,
+				    uint64_t dst_offset,
+				    uint64_t src_offset,
+				    uint64_t size)
+{
+	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	unsigned i, ncopy, csize;
+	struct r600_resource *rdst = (struct r600_resource*)dst;
+	struct r600_resource *rsrc = (struct r600_resource*)src;
+
+	dst_offset += r600_resource(dst)->gpu_address; /* offsets become absolute GPU addresses */
+	src_offset += r600_resource(src)->gpu_address;
+
+	ncopy = (size + CIK_SDMA_COPY_MAX_SIZE - 1) / CIK_SDMA_COPY_MAX_SIZE; /* number of packets, rounded up */
+	r600_need_dma_space(&ctx->b, ncopy * 7); /* the loop below emits 7 dwords per packet */
+
+	r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rsrc, RADEON_USAGE_READ,
+			      RADEON_PRIO_MIN);
+	r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, rdst, RADEON_USAGE_WRITE,
+			      RADEON_PRIO_MIN);
+
+	for (i = 0; i < ncopy; i++) {
+		csize = size < CIK_SDMA_COPY_MAX_SIZE ? size : CIK_SDMA_COPY_MAX_SIZE; /* clamp this chunk to the per-packet maximum */
+		cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+						     CIK_SDMA_COPY_SUB_OPCODE_LINEAR,
+						     0);
+		cs->buf[cs->cdw++] = csize;
+		cs->buf[cs->cdw++] = 0; /* src/dst endian swap */
+		cs->buf[cs->cdw++] = src_offset;
+		cs->buf[cs->cdw++] = src_offset >> 32;
+		cs->buf[cs->cdw++] = dst_offset;
+		cs->buf[cs->cdw++] = dst_offset >> 32;
+		dst_offset += csize; /* advance both addresses past the chunk just queued */
+		src_offset += csize;
+		size -= csize;
+	}
+}
+
+static void cik_sdma_copy_buffer(struct si_context *ctx,
+				 struct pipe_resource *dst,
+				 struct pipe_resource *src,
+				 uint64_t dst_offset,
+				 uint64_t src_offset,
+				 uint64_t size)
+{
+	struct r600_resource *rdst = (struct r600_resource*)dst;
+
+	/* Mark the buffer range of destination as valid (initialized),
+	 * so that transfer_map knows it should wait for the GPU when mapping
+	 * that range. */
+	util_range_add(&rdst->valid_buffer_range, dst_offset,
+		       dst_offset + size);
+
+	cik_sdma_do_copy_buffer(ctx, dst, src, dst_offset, src_offset, size);
+}
+
+static void cik_sdma_copy_tile(struct si_context *ctx,
+			       struct pipe_resource *dst,
+			       unsigned dst_level,
+			       struct pipe_resource *src,
+			       unsigned src_level,
+			       unsigned y,
+			       unsigned copy_height,
+			       unsigned y_align,
+			       unsigned pitch,
+			       unsigned bpe)
+{
+	struct radeon_winsys_cs *cs = ctx->b.rings.dma.cs;
+	struct si_screen *sscreen = ctx->screen;
+	struct r600_texture *rsrc = (struct r600_texture*)src;
+	struct r600_texture *rdst = (struct r600_texture*)dst;
+	struct r600_texture *rlinear, *rtiled;
+	unsigned linear_lvl, tiled_lvl;
+	unsigned array_mode, lbpe, pitch_tile_max, slice_tile_max, size;
+	unsigned ncopy, height, cheight, detile, i, src_mode, dst_mode;
+	unsigned sub_op, bank_h, bank_w, mt_aspect, nbanks, tile_split, mt;
+	uint64_t base, addr;
+	unsigned pipe_config, tile_mode_index;
+
+	dst_mode = rdst->surface.level[dst_level].mode;
+	src_mode = rsrc->surface.level[src_level].mode;
+	/* treat LINEAR_ALIGNED as LINEAR so the asserts below compare against a single linear mode */
+	src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+	dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+	assert(dst_mode != src_mode);
+	assert(src_mode == RADEON_SURF_MODE_LINEAR || dst_mode == RADEON_SURF_MODE_LINEAR);
+
+	sub_op = CIK_SDMA_COPY_SUB_OPCODE_TILED;
+	lbpe = util_logbase2(bpe);
+	pitch_tile_max = ((pitch / bpe) / 8) - 1; /* (pitch / bpe) in units of 8, minus one */
+
+	detile = dst_mode == RADEON_SURF_MODE_LINEAR; /* 1 when the destination is the linear side */
+	rlinear = detile ? rdst : rsrc;
+	rtiled = detile ? rsrc : rdst;
+	linear_lvl = detile ? dst_level : src_level;
+	tiled_lvl = detile ? src_level : dst_level;
+
+	assert(!util_format_is_depth_and_stencil(rtiled->resource.b.b.format));
+
+	array_mode = si_array_mode(rtiled->surface.level[tiled_lvl].mode);
+	slice_tile_max = (rtiled->surface.level[tiled_lvl].nblk_x *
+			  rtiled->surface.level[tiled_lvl].nblk_y) / (8*8) - 1;
+	height = rlinear->surface.level[linear_lvl].nblk_y;
+	base = rtiled->surface.level[tiled_lvl].offset;
+	addr = rlinear->surface.level[linear_lvl].offset;
+	bank_h = cik_bank_wh(rtiled->surface.bankh);
+	bank_w = cik_bank_wh(rtiled->surface.bankw);
+	mt_aspect = cik_macro_tile_aspect(rtiled->surface.mtilea);
+	tile_split = cik_tile_split(rtiled->surface.tile_split);
+	tile_mode_index = si_tile_mode_index(rtiled, tiled_lvl, false);
+	nbanks = si_num_banks(sscreen, rtiled);
+	base += rtiled->resource.gpu_address; /* level offsets become absolute GPU addresses */
+	addr += rlinear->resource.gpu_address;
+
+	pipe_config = cik_db_pipe_config(sscreen, tile_mode_index);
+	mt = cik_micro_tile_mode(sscreen, tile_mode_index);
+
+	size = (copy_height * pitch) / 4;
+	cheight = copy_height;
+	if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) {
+		cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch;
+		cheight &= ~(y_align - 1); /* round down to a multiple of y_align (assumes y_align is a power of two) */
+	}
+	ncopy = (copy_height + cheight - 1) / cheight; /* number of packets, rounded up */
+	r600_need_dma_space(&ctx->b, ncopy * 12); /* the loop below emits 12 dwords per packet */
+
+	r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rsrc->resource,
+			      RADEON_USAGE_READ, RADEON_PRIO_MIN);
+	r600_context_bo_reloc(&ctx->b, &ctx->b.rings.dma, &rdst->resource,
+			      RADEON_USAGE_WRITE, RADEON_PRIO_MIN);
+
+	copy_height = size * 4 / pitch;
+	for (i = 0; i < ncopy; i++) {
+		cheight = copy_height;
+		if (((cheight * pitch) / 4) > CIK_SDMA_COPY_MAX_SIZE) {
+			cheight = (CIK_SDMA_COPY_MAX_SIZE * 4) / pitch;
+			cheight &= ~(y_align - 1);
+		}
+		size = (cheight * pitch) / 4;
+
+		cs->buf[cs->cdw++] = CIK_SDMA_PACKET(CIK_SDMA_OPCODE_COPY,
+						     sub_op, detile << 15);
+		cs->buf[cs->cdw++] = base;
+		cs->buf[cs->cdw++] = base >> 32;
+		cs->buf[cs->cdw++] = ((height - 1) << 16) | pitch_tile_max;
+		cs->buf[cs->cdw++] = slice_tile_max;
+		cs->buf[cs->cdw++] = (pipe_config << 26) | (mt_aspect << 24) |
+			(nbanks << 21) | (bank_h << 18) | (bank_w << 15) |
+			(tile_split << 11) | (mt << 8) | (array_mode << 3) |
+			lbpe;
+		cs->buf[cs->cdw++] = y << 16; /* | x */
+		cs->buf[cs->cdw++] = 0; /* z */;
+		cs->buf[cs->cdw++] = addr & 0xfffffffc;
+		cs->buf[cs->cdw++] = addr >> 32;
+		cs->buf[cs->cdw++] = (pitch / bpe) - 1;
+		cs->buf[cs->cdw++] = size;
+
+		copy_height -= cheight;
+		y += cheight; /* NOTE(review): only y advances per chunk; addr and base are never modified in this loop - confirm intended */
+	}
+}
+
+void cik_sdma_copy(struct pipe_context *ctx,
+		   struct pipe_resource *dst,
+		   unsigned dst_level,
+		   unsigned dstx, unsigned dsty, unsigned dstz,
+		   struct pipe_resource *src,
+		   unsigned src_level,
+		   const struct pipe_box *src_box)
+{
+	struct si_context *sctx = (struct si_context *)ctx;
+	struct r600_texture *rsrc = (struct r600_texture*)src;
+	struct r600_texture *rdst = (struct r600_texture*)dst;
+	unsigned dst_pitch, src_pitch, bpe, dst_mode, src_mode;
+	unsigned src_w, dst_w;
+	unsigned src_x, src_y;
+	unsigned copy_height, y_align;
+	unsigned dst_x = dstx, dst_y = dsty, dst_z = dstz;
+
+	if (sctx->b.rings.dma.cs == NULL) {
+		goto fallback;
+	}
+
+	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
+		cik_sdma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width);
+		return;
+	}
+
+	/* Before re-enabling this, please make sure you can hit all newly
+	 * enabled paths in your testing, preferably with both piglit (in
+	 * particular the streaming-texture-leak test) and real world apps
+	 * (e.g. the UE4 Elemental demo).
+	 */
+	goto fallback;
+
+	if (src->format != dst->format ||
+	    rdst->surface.nsamples > 1 || rsrc->surface.nsamples > 1 ||
+	    rdst->dirty_level_mask & (1 << dst_level)) {
+		goto fallback;
+	}
+
+	if (rsrc->dirty_level_mask & (1 << src_level)) {
+		if (rsrc->htile_buffer)
+			goto fallback;
+
+		ctx->flush_resource(ctx, src);
+	}
+
+	src_x = util_format_get_nblocksx(src->format, src_box->x);
+	dst_x = util_format_get_nblocksx(src->format, dst_x);
+	src_y = util_format_get_nblocksy(src->format, src_box->y);
+	dst_y = util_format_get_nblocksy(src->format, dst_y);
+
+	dst_pitch = rdst->surface.level[dst_level].pitch_bytes;
+	src_pitch = rsrc->surface.level[src_level].pitch_bytes;
+	src_w = rsrc->surface.level[src_level].npix_x;
+	dst_w = rdst->surface.level[dst_level].npix_x;
+
+	if (src_pitch != dst_pitch || src_box->x || dst_x || src_w != dst_w ||
+	    src_box->width != src_w ||
+	    rsrc->surface.level[src_level].nblk_y !=
+	    rdst->surface.level[dst_level].nblk_y) {
+		/* FIXME CIK can do partial blit */
+		goto fallback;
+	}
+
+	bpe = rdst->surface.bpe;
+	copy_height = src_box->height / rsrc->surface.blk_h;
+	dst_mode = rdst->surface.level[dst_level].mode;
+	src_mode = rsrc->surface.level[src_level].mode;
+	/* downcast linear aligned to linear to simplify test */
+	src_mode = src_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : src_mode;
+	dst_mode = dst_mode == RADEON_SURF_MODE_LINEAR_ALIGNED ? RADEON_SURF_MODE_LINEAR : dst_mode;
+
+	/* Dimensions must be aligned to (macro)tiles */
+	switch (src_mode == RADEON_SURF_MODE_LINEAR ? dst_mode : src_mode) {
+	case RADEON_SURF_MODE_1D:
+		if ((src_x % 8) || (src_y % 8) || (dst_x % 8) || (dst_y % 8) ||
+		    (copy_height % 8))
+			goto fallback;
+		y_align = 8;
+		break;
+	case RADEON_SURF_MODE_2D: {
+		unsigned mtilew, mtileh, num_banks;
+
+			switch (si_num_banks(sctx->screen, rsrc)) {
+			case V_02803C_ADDR_SURF_2_BANK:
+			default:
+				num_banks = 2;
+				break;
+			case V_02803C_ADDR_SURF_4_BANK:
+				num_banks = 4;
+				break;
+			case V_02803C_ADDR_SURF_8_BANK:
+				num_banks = 8;
+				break;
+			case V_02803C_ADDR_SURF_16_BANK:
+				num_banks = 16;
+				break;
+			}
+
+			mtilew = (8 * rsrc->surface.bankw *
+				  sctx->screen->b.tiling_info.num_channels) *
+				rsrc->surface.mtilea;
+			assert(!(mtilew & (mtilew - 1)));
+			mtileh = (8 * rsrc->surface.bankh * num_banks) /
+				rsrc->surface.mtilea;
+			assert(!(mtileh & (mtileh - 1)));
+
+			if ((src_x & (mtilew - 1)) || (src_y & (mtileh - 1)) ||
+			    (dst_x & (mtilew - 1)) || (dst_y & (mtileh - 1)) ||
+			    (copy_height & (mtileh - 1)))
+				goto fallback;
+
+			y_align = mtileh;
+			break;
+	}
+	default:
+		y_align = 1;
+	}
+
+	if (src_mode == dst_mode) {
+		uint64_t dst_offset, src_offset;
+		unsigned src_h, dst_h;
+
+		src_h = rsrc->surface.level[src_level].npix_y;
+		dst_h = rdst->surface.level[dst_level].npix_y;
+
+		if (src_box->depth > 1 &&
+		    (src_y || dst_y || src_h != dst_h || src_box->height != src_h))
+			goto fallback;
+
+		/* A simple linear DMA copy is sufficient here.
+		 * NOTE: the code below assumes dst_pitch == src_pitch.
+		 */
+		src_offset= rsrc->surface.level[src_level].offset;
+		src_offset += rsrc->surface.level[src_level].slice_size * src_box->z;
+		src_offset += src_y * src_pitch + src_x * bpe;
+		dst_offset = rdst->surface.level[dst_level].offset;
+		dst_offset += rdst->surface.level[dst_level].slice_size * dst_z;
+		dst_offset += dst_y * dst_pitch + dst_x * bpe;
+		cik_sdma_do_copy_buffer(sctx, dst, src, dst_offset, src_offset,
+					src_box->depth *
+					rsrc->surface.level[src_level].slice_size);
+	} else {
+		if (dst_y != src_y || src_box->depth > 1 || src_box->z || dst_z)
+			goto fallback;
+
+		cik_sdma_copy_tile(sctx, dst, dst_level, src, src_level,
+				   src_y, copy_height, y_align, dst_pitch, bpe);
+	}
+	return;
+
+fallback:
+	si_resource_copy_region(ctx, dst, dst_level, dstx, dsty, dstz,
+				src, src_level, src_box);
+}
diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c
index db523ee..7a0076e 100644
--- a/src/gallium/drivers/radeonsi/si_dma.c
+++ b/src/gallium/drivers/radeonsi/si_dma.c
@@ -30,21 +30,6 @@
 
 #include "util/u_format.h"
 
-static unsigned si_array_mode(unsigned mode)
-{
-	switch (mode) {
-	case RADEON_SURF_MODE_LINEAR_ALIGNED:
-		return V_009910_ARRAY_LINEAR_ALIGNED;
-	case RADEON_SURF_MODE_1D:
-		return V_009910_ARRAY_1D_TILED_THIN1;
-	case RADEON_SURF_MODE_2D:
-		return V_009910_ARRAY_2D_TILED_THIN1;
-	default:
-	case RADEON_SURF_MODE_LINEAR:
-		return V_009910_ARRAY_LINEAR_GENERAL;
-	}
-}
-
 static uint32_t si_micro_tile_mode(struct si_screen *sscreen, unsigned tile_mode)
 {
 	if (sscreen->b.info.si_tile_mode_array_valid) {
@@ -240,11 +225,6 @@ void si_dma_copy(struct pipe_context *ctx,
 		goto fallback;
 	}
 
-	/* TODO: Implement DMA copy for CIK */
-	if (sctx->b.chip_class >= CIK) {
-		goto fallback;
-	}
-
 	if (dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) {
 		si_dma_copy_buffer(sctx, dst, src, dst_x, src_box->x, src_box->width);
 		return;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index e68c30e..53ae71a 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -251,6 +251,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_POLYGON_OFFSET_CLAMP:
 	case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
 	case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
+	case PIPE_CAP_TGSI_TEXCOORD:
 		return 1;
 
 	case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
@@ -286,13 +287,13 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_FRAGMENT_COLOR_CLAMPED:
 	case PIPE_CAP_VERTEX_COLOR_CLAMPED:
 	case PIPE_CAP_USER_VERTEX_BUFFERS:
-	case PIPE_CAP_TGSI_TEXCOORD:
 	case PIPE_CAP_FAKE_SW_MSAA:
 	case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
 	case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
 	case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
 	case PIPE_CAP_SAMPLER_VIEW_TARGET:
 	case PIPE_CAP_VERTEXID_NOBASE:
+	case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
 		return 0;
 
 	case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
@@ -451,6 +452,7 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
 	case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
 		return 0;
 	case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+	case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
 		return 1;
 	}
 	return 0;
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index f98c7a8..2d67342 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -237,6 +237,15 @@ struct si_context {
 	unsigned		spi_tmpring_size;
 };
 
+/* cik_sdma.c */
+void cik_sdma_copy(struct pipe_context *ctx,
+		   struct pipe_resource *dst,
+		   unsigned dst_level,
+		   unsigned dstx, unsigned dsty, unsigned dstz,
+		   struct pipe_resource *src,
+		   unsigned src_level,
+		   const struct pipe_box *src_box);
+
 /* si_blit.c */
 void si_init_blit_functions(struct si_context *sctx);
 void si_flush_depth_textures(struct si_context *sctx,
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 89f02ab..47e5f96 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -128,21 +128,10 @@ unsigned si_shader_io_get_unique_index(unsigned semantic_name, unsigned index)
 	case TGSI_SEMANTIC_CLIPDIST:
 		assert(index <= 1);
 		return 2 + index;
-	case TGSI_SEMANTIC_CLIPVERTEX:
-		return 4;
-	case TGSI_SEMANTIC_COLOR:
-		assert(index <= 1);
-		return 5 + index;
-	case TGSI_SEMANTIC_BCOLOR:
-		assert(index <= 1);
-		return 7 + index;
-	case TGSI_SEMANTIC_FOG:
-		return 9;
-	case TGSI_SEMANTIC_EDGEFLAG:
-		return 10;
 	case TGSI_SEMANTIC_GENERIC:
-		assert(index <= 63-11);
-		return 11 + index;
+		assert(index <= 63-4);
+		return 4 + index;
+
 	default:
 		assert(0);
 		return 63;
@@ -1183,6 +1172,7 @@ handle_semantic:
 			continue;
 		case TGSI_SEMANTIC_PRIMID:
 		case TGSI_SEMANTIC_FOG:
+		case TGSI_SEMANTIC_TEXCOORD:
 		case TGSI_SEMANTIC_GENERIC:
 			target = V_008DFC_SQ_EXP_PARAM + param_count;
 			shader->vs_output_param_offset[i] = param_count;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 7f0fdd5..6c18836 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -44,6 +44,21 @@ static void si_init_atom(struct r600_atom *atom, struct r600_atom **list_elem,
 	*list_elem = atom;
 }
 
+unsigned si_array_mode(unsigned mode)
+{
+	switch (mode) {
+	case RADEON_SURF_MODE_LINEAR_ALIGNED:
+		return V_009910_ARRAY_LINEAR_ALIGNED;
+	case RADEON_SURF_MODE_1D:
+		return V_009910_ARRAY_1D_TILED_THIN1;
+	case RADEON_SURF_MODE_2D:
+		return V_009910_ARRAY_2D_TILED_THIN1;
+	default:
+	case RADEON_SURF_MODE_LINEAR:
+		return V_009910_ARRAY_LINEAR_GENERAL;
+	}
+}
+
 uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex)
 {
 	if (sscreen->b.chip_class == CIK &&
@@ -636,18 +651,14 @@ static void *si_create_rs_state(struct pipe_context *ctx,
 	rs->offset_units = state->offset_units;
 	rs->offset_scale = state->offset_scale * 12.0f;
 
-	tmp = S_0286D4_FLAT_SHADE_ENA(1);
-	if (state->sprite_coord_enable) {
-		tmp |= S_0286D4_PNT_SPRITE_ENA(1) |
-			S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
-			S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
-			S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
-			S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1);
-		if (state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT) {
-			tmp |= S_0286D4_PNT_SPRITE_TOP_1(1);
-		}
-	}
-	si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0, tmp);
+	si_pm4_set_reg(pm4, R_0286D4_SPI_INTERP_CONTROL_0,
+		S_0286D4_FLAT_SHADE_ENA(1) |
+		S_0286D4_PNT_SPRITE_ENA(1) |
+		S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
+		S_0286D4_PNT_SPRITE_OVRD_Y(V_0286D4_SPI_PNT_SPRITE_SEL_T) |
+		S_0286D4_PNT_SPRITE_OVRD_Z(V_0286D4_SPI_PNT_SPRITE_SEL_0) |
+		S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
+		S_0286D4_PNT_SPRITE_TOP_1(state->sprite_coord_mode != PIPE_SPRITE_COORD_UPPER_LEFT));
 
 	/* point size 12.4 fixed point */
 	tmp = (unsigned)(state->point_size * 8.0);
@@ -2910,11 +2921,16 @@ void si_init_state_functions(struct si_context *sctx)
 	sctx->b.b.set_polygon_stipple = si_set_polygon_stipple;
 	sctx->b.b.set_min_samples = si_set_min_samples;
 
-	sctx->b.dma_copy = si_dma_copy;
 	sctx->b.set_occlusion_query_state = si_set_occlusion_query_state;
 	sctx->b.need_gfx_cs_space = si_need_gfx_cs_space;
 
 	sctx->b.b.draw_vbo = si_draw_vbo;
+
+	if (sctx->b.chip_class >= CIK) {
+		sctx->b.dma_copy = cik_sdma_copy;
+	} else {
+		sctx->b.dma_copy = si_dma_copy;
+	}
 }
 
 static void
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 2f8a943..5e68b16 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -261,6 +261,7 @@ unsigned cik_bank_wh(unsigned bankwh);
 unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode);
 unsigned cik_macro_tile_aspect(unsigned macro_tile_aspect);
 unsigned cik_tile_split(unsigned tile_split);
+unsigned si_array_mode(unsigned mode);
 uint32_t si_num_banks(struct si_screen *sscreen, struct r600_texture *tex);
 unsigned si_tile_mode_index(struct r600_texture *rtex, unsigned level, bool stencil);
 
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 1bbc6b3..208c852 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -182,8 +182,13 @@ static void si_shader_vs(struct si_shader *shader)
 	for (nparams = 0, i = 0 ; i < info->num_outputs; i++) {
 		switch (info->output_semantic_name[i]) {
 		case TGSI_SEMANTIC_CLIPVERTEX:
+		case TGSI_SEMANTIC_CLIPDIST:
+		case TGSI_SEMANTIC_CULLDIST:
 		case TGSI_SEMANTIC_POSITION:
 		case TGSI_SEMANTIC_PSIZE:
+		case TGSI_SEMANTIC_EDGEFLAG:
+		case TGSI_SEMANTIC_VIEWPORT_INDEX:
+		case TGSI_SEMANTIC_LAYER:
 			break;
 		default:
 			nparams++;
@@ -351,21 +356,25 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx,
 					  union si_shader_key *key)
 {
 	struct si_context *sctx = (struct si_context *)ctx;
-	memset(key, 0, sizeof(*key));
+	unsigned i;
 
-	if (sel->type == PIPE_SHADER_VERTEX) {
-		unsigned i;
-		if (!sctx->vertex_elements)
-			return;
+	memset(key, 0, sizeof(*key));
 
-		for (i = 0; i < sctx->vertex_elements->count; ++i)
-			key->vs.instance_divisors[i] = sctx->vertex_elements->elements[i].instance_divisor;
+	switch (sel->type) {
+	case PIPE_SHADER_VERTEX:
+		if (sctx->vertex_elements)
+			for (i = 0; i < sctx->vertex_elements->count; ++i)
+				key->vs.instance_divisors[i] =
+					sctx->vertex_elements->elements[i].instance_divisor;
 
 		if (sctx->gs_shader) {
 			key->vs.as_es = 1;
 			key->vs.gs_used_inputs = sctx->gs_shader->gs_used_inputs;
 		}
-	} else if (sel->type == PIPE_SHADER_FRAGMENT) {
+		break;
+	case PIPE_SHADER_GEOMETRY:
+		break;
+	case PIPE_SHADER_FRAGMENT: {
 		struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
 
 		if (sel->info.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS])
@@ -393,11 +402,14 @@ static INLINE void si_shader_selector_key(struct pipe_context *ctx,
 		}
 
 		key->ps.alpha_func = PIPE_FUNC_ALWAYS;
-
 		/* Alpha-test should be disabled if colorbuffer 0 is integer. */
 		if (sctx->queued.named.dsa &&
 		    !sctx->framebuffer.cb0_is_integer)
 			key->ps.alpha_func = sctx->queued.named.dsa->alpha_func;
+		break;
+	}
+	default:
+		assert(0);
 	}
 }
 
@@ -580,15 +592,22 @@ static void si_delete_shader_selector(struct pipe_context *ctx,
 
 	while (p) {
 		c = p->next_variant;
-		if (sel->type == PIPE_SHADER_GEOMETRY) {
+		switch (sel->type) {
+		case PIPE_SHADER_VERTEX:
+			if (p->key.vs.as_es)
+				si_pm4_delete_state(sctx, es, p->pm4);
+			else
+				si_pm4_delete_state(sctx, vs, p->pm4);
+			break;
+		case PIPE_SHADER_GEOMETRY:
 			si_pm4_delete_state(sctx, gs, p->pm4);
 			si_pm4_delete_state(sctx, vs, p->gs_copy_shader->pm4);
-		} else if (sel->type == PIPE_SHADER_FRAGMENT)
+			break;
+		case PIPE_SHADER_FRAGMENT:
 			si_pm4_delete_state(sctx, ps, p->pm4);
-		else if (p->key.vs.as_es)
-			si_pm4_delete_state(sctx, es, p->pm4);
-		else
-			si_pm4_delete_state(sctx, vs, p->pm4);
+			break;
+		}
+
 		si_shader_destroy(ctx, p);
 		free(p);
 		p = c;
@@ -661,8 +680,9 @@ bcolor:
 		    (interpolate == TGSI_INTERPOLATE_COLOR && sctx->flatshade))
 			tmp |= S_028644_FLAT_SHADE(1);
 
-		if (name == TGSI_SEMANTIC_GENERIC &&
-		    sctx->sprite_coord_enable & (1 << index)) {
+		if (name == TGSI_SEMANTIC_PCOORD ||
+		    (name == TGSI_SEMANTIC_TEXCOORD &&
+		     sctx->sprite_coord_enable & (1 << index))) {
 			tmp |= S_028644_PT_SPRITE_TEX(1);
 		}
 
@@ -835,8 +855,15 @@ static void si_update_spi_tmpring_size(struct si_context *sctx)
 			si_pm4_bind_state(sctx, ps, sctx->ps_shader->current->pm4);
 		if (si_update_scratch_buffer(sctx, sctx->gs_shader))
 			si_pm4_bind_state(sctx, gs, sctx->gs_shader->current->pm4);
-		if (si_update_scratch_buffer(sctx, sctx->vs_shader))
-			si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
+
+		/* VS can be bound as ES or VS. */
+		if (sctx->gs_shader) {
+			if (si_update_scratch_buffer(sctx, sctx->vs_shader))
+				si_pm4_bind_state(sctx, es, sctx->vs_shader->current->pm4);
+		} else {
+			if (si_update_scratch_buffer(sctx, sctx->vs_shader))
+				si_pm4_bind_state(sctx, vs, sctx->vs_shader->current->pm4);
+		}
 	}
 
 	/* The LLVM shader backend should be reporting aligned scratch_sizes. */
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index afe011b..35d5ee2 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -4516,6 +4516,13 @@
 #define     V_009910_ADDR_SURF_8_BANK                               0x02
 #define     V_009910_ADDR_SURF_16_BANK                              0x03
 /* CIK */
+#define   S_009910_MICRO_TILE_MODE_NEW(x)                             (((x) & 0x07) << 22)
+#define   G_009910_MICRO_TILE_MODE_NEW(x)                             (((x) >> 22) & 0x07)
+#define   C_009910_MICRO_TILE_MODE_NEW(x)                             0xFE3FFFFF
+#define     V_009910_ADDR_SURF_DISPLAY_MICRO_TILING                 0x00
+#define     V_009910_ADDR_SURF_THIN_MICRO_TILING                    0x01
+#define     V_009910_ADDR_SURF_DEPTH_MICRO_TILING                   0x02
+#define     V_009910_ADDR_SURF_ROTATED_MICRO_TILING                 0x03
 #define R_00B01C_SPI_SHADER_PGM_RSRC3_PS                                0x00B01C
 #define   S_00B01C_CU_EN(x)                                           (((x) & 0xFFFF) << 0)
 #define   G_00B01C_CU_EN(x)                                           (((x) >> 0) & 0xFFFF)
@@ -8696,5 +8703,29 @@
 #define    SI_DMA_PACKET_CONSTANT_FILL             0xd
 #define    SI_DMA_PACKET_NOP                       0xf
 
+/* CIK async DMA packets */
+#define CIK_SDMA_PACKET(op, sub_op, n)   ((((n) & 0xFFFF) << 16) |	\
+					 (((sub_op) & 0xFF) << 8) |	\
+					 (((op) & 0xFF) << 0))
+/* CIK async DMA packet types */
+#define    CIK_SDMA_OPCODE_NOP                     0x0
+#define    CIK_SDMA_OPCODE_COPY                    0x1
+#define        CIK_SDMA_COPY_SUB_OPCODE_LINEAR            0x0
+#define        CIK_SDMA_COPY_SUB_OPCODE_TILED             0x1
+#define        CIK_SDMA_COPY_SUB_OPCODE_SOA               0x3
+#define        CIK_SDMA_COPY_SUB_OPCODE_LINEAR_SUB_WINDOW 0x4
+#define        CIK_SDMA_COPY_SUB_OPCODE_TILED_SUB_WINDOW  0x5
+#define        CIK_SDMA_COPY_SUB_OPCODE_T2T_SUB_WINDOW    0x6
+#define    CIK_SDMA_OPCODE_WRITE                   0x2
+#define        SDMA_WRITE_SUB_OPCODE_LINEAR               0x0
+#define        SDMA_WRTIE_SUB_OPCODE_TILED                0x1
+#define    CIK_SDMA_OPCODE_INDIRECT_BUFFER         0x4
+#define    CIK_SDMA_PACKET_FENCE                   0x5
+#define    CIK_SDMA_PACKET_TRAP                    0x6
+#define    CIK_SDMA_PACKET_SEMAPHORE               0x7
+#define    CIK_SDMA_PACKET_CONSTANT_FILL           0xb
+#define    CIK_SDMA_PACKET_SRBM_WRITE              0xe
+#define    CIK_SDMA_COPY_MAX_SIZE                  0x1fffff
+
 #endif /* _SID_H */
 
diff --git a/src/gallium/drivers/rbug/rbug_public.h b/src/gallium/drivers/rbug/rbug_public.h
index b66740b..83f9c94 100644
--- a/src/gallium/drivers/rbug/rbug_public.h
+++ b/src/gallium/drivers/rbug/rbug_public.h
@@ -28,6 +28,10 @@
 #ifndef RBUG_PUBLIC_H
 #define RBUG_PUBLIC_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct pipe_screen;
 struct pipe_context;
 
@@ -37,4 +41,8 @@ rbug_screen_create(struct pipe_screen *screen);
 boolean
 rbug_enabled(void);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* RBUG_PUBLIC_H */
diff --git a/src/gallium/drivers/softpipe/sp_public.h b/src/gallium/drivers/softpipe/sp_public.h
index 62d0903..88a9b5e 100644
--- a/src/gallium/drivers/softpipe/sp_public.h
+++ b/src/gallium/drivers/softpipe/sp_public.h
@@ -1,10 +1,18 @@
 #ifndef SP_PUBLIC_H
 #define SP_PUBLIC_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct pipe_screen;
 struct sw_winsys;
 
 struct pipe_screen *
 softpipe_create_screen(struct sw_winsys *winsys);
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/gallium/drivers/softpipe/sp_query.c b/src/gallium/drivers/softpipe/sp_query.c
index e773870..76105b4 100644
--- a/src/gallium/drivers/softpipe/sp_query.c
+++ b/src/gallium/drivers/softpipe/sp_query.c
@@ -277,7 +277,7 @@ softpipe_check_render_cond(struct softpipe_context *sp)
    b = pipe->get_query_result(pipe, sp->render_cond_query, wait,
                               (void*)&result);
    if (b)
-      return (!result == sp->render_cond_cond);
+      return (!result) == sp->render_cond_cond;
    else
       return TRUE;
 }
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index d289e28..a688d31 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -191,7 +191,9 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_ENDIANNESS:
       return PIPE_ENDIAN_NATIVE;
    case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS:
+      return 4;
    case PIPE_CAP_TEXTURE_GATHER_SM5:
+      return 1;
    case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT:
    case PIPE_CAP_TEXTURE_QUERY_LOD:
    case PIPE_CAP_SAMPLE_SHADING:
@@ -200,13 +202,15 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
       return 1;
    case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE:
-   case PIPE_CAP_SAMPLER_VIEW_TARGET:
       return 0;
+   case PIPE_CAP_SAMPLER_VIEW_TARGET:
+      return 1;
    case PIPE_CAP_FAKE_SW_MSAA:
       return 1;
    case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET:
+      return -32;
    case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET:
-      return 0;
+      return 31;
    case PIPE_CAP_DRAW_INDIRECT:
       return 1;
 
@@ -237,6 +241,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
       return 0;
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
       return 0;
    }
    /* should only get here on unhandled cases */
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index e56fb5b..d7a3360 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -202,7 +202,7 @@ prepare_shader_sampling(
             struct pipe_resource *res = view->texture;
             int j;
 
-            if (res->target != PIPE_BUFFER) {
+            if (view->target != PIPE_BUFFER) {
                first_level = view->u.tex.first_level;
                last_level = view->u.tex.last_level;
                assert(first_level <= last_level);
@@ -214,15 +214,17 @@ prepare_shader_sampling(
                   row_stride[j] = sp_tex->stride[j];
                   img_stride[j] = sp_tex->img_stride[j];
                }
-               if (res->target == PIPE_TEXTURE_1D_ARRAY ||
-                   res->target == PIPE_TEXTURE_2D_ARRAY ||
-                   res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+               if (view->target == PIPE_TEXTURE_1D_ARRAY ||
+                   view->target == PIPE_TEXTURE_2D_ARRAY ||
+                   view->target == PIPE_TEXTURE_CUBE ||
+                   view->target == PIPE_TEXTURE_CUBE_ARRAY) {
                   num_layers = view->u.tex.last_layer - view->u.tex.first_layer + 1;
                   for (j = first_level; j <= last_level; j++) {
                      mip_offsets[j] += view->u.tex.first_layer *
                                        sp_tex->img_stride[j];
                   }
-                  if (res->target == PIPE_TEXTURE_CUBE_ARRAY) {
+                  if (view->target == PIPE_TEXTURE_CUBE ||
+                      view->target == PIPE_TEXTURE_CUBE_ARRAY) {
                      assert(num_layers % 6 == 0);
                   }
                   assert(view->u.tex.first_layer <= view->u.tex.last_layer);
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index 68dcf57..1010b63 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -131,68 +131,80 @@ repeat(int coord, unsigned size)
  * \param icoord  returns the integer texcoords
  */
 static void
-wrap_nearest_repeat(float s, unsigned size, int *icoord)
+wrap_nearest_repeat(float s, unsigned size, int offset, int *icoord)
 {
    /* s limited to [0,1) */
    /* i limited to [0,size-1] */
    int i = util_ifloor(s * size);
-   *icoord = repeat(i, size);
+   *icoord = repeat(i + offset, size);
 }
 
 
 static void
-wrap_nearest_clamp(float s, unsigned size, int *icoord)
+wrap_nearest_clamp(float s, unsigned size, int offset, int *icoord)
 {
    /* s limited to [0,1] */
    /* i limited to [0,size-1] */
+   s *= size;
+   s += offset;
    if (s <= 0.0F)
       *icoord = 0;
-   else if (s >= 1.0F)
+   else if (s >= size)
       *icoord = size - 1;
    else
-      *icoord = util_ifloor(s * size);
+      *icoord = util_ifloor(s);
 }
 
 
 static void
-wrap_nearest_clamp_to_edge(float s, unsigned size, int *icoord)
+wrap_nearest_clamp_to_edge(float s, unsigned size, int offset, int *icoord)
 {
    /* s limited to [min,max] */
    /* i limited to [0, size-1] */
-   const float min = 1.0F / (2.0F * size);
-   const float max = 1.0F - min;
+   const float min = 0.5F;
+   const float max = (float)size - 0.5F;
+
+   s *= size;
+   s += offset;
+
    if (s < min)
       *icoord = 0;
    else if (s > max)
       *icoord = size - 1;
    else
-      *icoord = util_ifloor(s * size);
+      *icoord = util_ifloor(s);
 }
 
 
 static void
-wrap_nearest_clamp_to_border(float s, unsigned size, int *icoord)
+wrap_nearest_clamp_to_border(float s, unsigned size, int offset, int *icoord)
 {
    /* s limited to [min,max] */
    /* i limited to [-1, size] */
-   const float min = -1.0F / (2.0F * size);
-   const float max = 1.0F - min;
+   const float min = -0.5F;
+   const float max = size + 0.5F;
+
+   s *= size;
+   s += offset;
    if (s <= min)
       *icoord = -1;
    else if (s >= max)
       *icoord = size;
    else
-      *icoord = util_ifloor(s * size);
+      *icoord = util_ifloor(s);
 }
 
-
 static void
-wrap_nearest_mirror_repeat(float s, unsigned size, int *icoord)
+wrap_nearest_mirror_repeat(float s, unsigned size, int offset, int *icoord)
 {
    const float min = 1.0F / (2.0F * size);
    const float max = 1.0F - min;
-   const int flr = util_ifloor(s);
-   float u = frac(s);
+   int flr;
+   float u;
+
+   s += (float)offset / size;
+   flr = util_ifloor(s);
+   u = frac(s);
    if (flr & 1)
       u = 1.0F - u;
    if (u < min)
@@ -205,51 +217,52 @@ wrap_nearest_mirror_repeat(float s, unsigned size, int *icoord)
 
 
 static void
-wrap_nearest_mirror_clamp(float s, unsigned size, int *icoord)
+wrap_nearest_mirror_clamp(float s, unsigned size, int offset, int *icoord)
 {
    /* s limited to [0,1] */
    /* i limited to [0,size-1] */
-   const float u = fabsf(s);
+   const float u = fabsf(s * size + offset);
    if (u <= 0.0F)
       *icoord = 0;
-   else if (u >= 1.0F)
+   else if (u >= size)
       *icoord = size - 1;
    else
-      *icoord = util_ifloor(u * size);
+      *icoord = util_ifloor(u);
 }
 
 
 static void
-wrap_nearest_mirror_clamp_to_edge(float s, unsigned size, int *icoord)
+wrap_nearest_mirror_clamp_to_edge(float s, unsigned size, int offset, int *icoord)
 {
    /* s limited to [min,max] */
    /* i limited to [0, size-1] */
-   const float min = 1.0F / (2.0F * size);
-   const float max = 1.0F - min;
-   const float u = fabsf(s);
+   const float min = 0.5F;
+   const float max = (float)size - 0.5F;
+   const float u = fabsf(s * size + offset);
+
    if (u < min)
       *icoord = 0;
    else if (u > max)
       *icoord = size - 1;
    else
-      *icoord = util_ifloor(u * size);
+      *icoord = util_ifloor(u);
 }
 
 
 static void
-wrap_nearest_mirror_clamp_to_border(float s, unsigned size, int *icoord)
+wrap_nearest_mirror_clamp_to_border(float s, unsigned size, int offset, int *icoord)
 {
-   /* s limited to [min,max] */
-   /* i limited to [0, size-1] */
-   const float min = -1.0F / (2.0F * size);
-   const float max = 1.0F - min;
-   const float u = fabsf(s);
+   /* u limited to [-0.5, size+0.5] */
+   const float min = -0.5F;
+   const float max = (float)size + 0.5F;
+   const float u = fabsf(s * size + offset);
+
    if (u < min)
       *icoord = -1;
    else if (u > max)
       *icoord = size;
    else
-      *icoord = util_ifloor(u * size);
+      *icoord = util_ifloor(u);
 }
 
 
@@ -264,22 +277,23 @@ wrap_nearest_mirror_clamp_to_border(float s, unsigned size, int *icoord)
  * \param icoord  returns the computed integer texture coord
  */
 static void
-wrap_linear_repeat(float s, unsigned size,
+wrap_linear_repeat(float s, unsigned size, int offset,
                    int *icoord0, int *icoord1, float *w)
 {
    float u = s * size - 0.5F;
-   *icoord0 = repeat(util_ifloor(u), size);
+   *icoord0 = repeat(util_ifloor(u) + offset, size);
    *icoord1 = repeat(*icoord0 + 1, size);
    *w = frac(u);
 }
 
 
 static void
-wrap_linear_clamp(float s, unsigned size,
+wrap_linear_clamp(float s, unsigned size, int offset,
                   int *icoord0, int *icoord1, float *w)
 {
-   float u = CLAMP(s, 0.0F, 1.0F);
-   u = u * size - 0.5f;
+   float u = CLAMP(s * size + offset, 0.0F, (float)size);
+
+   u = u - 0.5f;
    *icoord0 = util_ifloor(u);
    *icoord1 = *icoord0 + 1;
    *w = frac(u);
@@ -287,11 +301,11 @@ wrap_linear_clamp(float s, unsigned size,
 
 
 static void
-wrap_linear_clamp_to_edge(float s, unsigned size,
+wrap_linear_clamp_to_edge(float s, unsigned size, int offset,
                           int *icoord0, int *icoord1, float *w)
 {
-   float u = CLAMP(s, 0.0F, 1.0F);
-   u = u * size - 0.5f;
+   float u = CLAMP(s * size + offset, 0.0F, (float)size);
+   u = u - 0.5f;
    *icoord0 = util_ifloor(u);
    *icoord1 = *icoord0 + 1;
    if (*icoord0 < 0)
@@ -303,13 +317,13 @@ wrap_linear_clamp_to_edge(float s, unsigned size,
 
 
 static void
-wrap_linear_clamp_to_border(float s, unsigned size,
+wrap_linear_clamp_to_border(float s, unsigned size, int offset,
                             int *icoord0, int *icoord1, float *w)
 {
-   const float min = -1.0F / (2.0F * size);
-   const float max = 1.0F - min;
-   float u = CLAMP(s, min, max);
-   u = u * size - 0.5f;
+   const float min = -0.5F;
+   const float max = (float)size + 0.5F;
+   float u = CLAMP(s * size + offset, min, max);
+   u = u - 0.5f;
    *icoord0 = util_ifloor(u);
    *icoord1 = *icoord0 + 1;
    *w = frac(u);
@@ -317,11 +331,15 @@ wrap_linear_clamp_to_border(float s, unsigned size,
 
 
 static void
-wrap_linear_mirror_repeat(float s, unsigned size,
+wrap_linear_mirror_repeat(float s, unsigned size, int offset,
                           int *icoord0, int *icoord1, float *w)
 {
-   const int flr = util_ifloor(s);
-   float u = frac(s);
+   int flr;
+   float u;
+
+   s += (float)offset / size;
+   flr = util_ifloor(s);
+   u = frac(s);
    if (flr & 1)
       u = 1.0F - u;
    u = u * size - 0.5F;
@@ -336,14 +354,12 @@ wrap_linear_mirror_repeat(float s, unsigned size,
 
 
 static void
-wrap_linear_mirror_clamp(float s, unsigned size,
+wrap_linear_mirror_clamp(float s, unsigned size, int offset,
                          int *icoord0, int *icoord1, float *w)
 {
-   float u = fabsf(s);
-   if (u >= 1.0F)
+   float u = fabsf(s * size + offset);
+   if (u >= size)
       u = (float) size;
-   else
-      u *= size;
    u -= 0.5F;
    *icoord0 = util_ifloor(u);
    *icoord1 = *icoord0 + 1;
@@ -352,14 +368,12 @@ wrap_linear_mirror_clamp(float s, unsigned size,
 
 
 static void
-wrap_linear_mirror_clamp_to_edge(float s, unsigned size,
+wrap_linear_mirror_clamp_to_edge(float s, unsigned size, int offset,
                                  int *icoord0, int *icoord1, float *w)
 {
-   float u = fabsf(s);
-   if (u >= 1.0F)
+   float u = fabsf(s * size + offset);
+   if (u >= size)
       u = (float) size;
-   else
-      u *= size;
    u -= 0.5F;
    *icoord0 = util_ifloor(u);
    *icoord1 = *icoord0 + 1;
@@ -372,18 +386,16 @@ wrap_linear_mirror_clamp_to_edge(float s, unsigned size,
 
 
 static void
-wrap_linear_mirror_clamp_to_border(float s, unsigned size,
+wrap_linear_mirror_clamp_to_border(float s, unsigned size, int offset,
                                    int *icoord0, int *icoord1, float *w)
 {
-   const float min = -1.0F / (2.0F * size);
-   const float max = 1.0F - min;
-   float u = fabsf(s);
+   const float min = -0.5F;
+   const float max = size + 0.5F;
+   float u = fabsf(s * size + offset);
    if (u <= min)
-      u = min * size;
+      u = min;
    else if (u >= max)
-      u = max * size;
-   else
-      u *= size;
+      u = max;
    u -= 0.5F;
    *icoord0 = util_ifloor(u);
    *icoord1 = *icoord0 + 1;
@@ -395,10 +407,10 @@ wrap_linear_mirror_clamp_to_border(float s, unsigned size,
  * PIPE_TEX_WRAP_CLAMP for nearest sampling, unnormalized coords.
  */
 static void
-wrap_nearest_unorm_clamp(float s, unsigned size, int *icoord)
+wrap_nearest_unorm_clamp(float s, unsigned size, int offset, int *icoord)
 {
    int i = util_ifloor(s);
-   *icoord = CLAMP(i, 0, (int) size-1);
+   *icoord = CLAMP(i + offset, 0, (int) size-1);
 }
 
 
@@ -406,9 +418,9 @@ wrap_nearest_unorm_clamp(float s, unsigned size, int *icoord)
  * PIPE_TEX_WRAP_CLAMP_TO_BORDER for nearest sampling, unnormalized coords.
  */
 static void
-wrap_nearest_unorm_clamp_to_border(float s, unsigned size, int *icoord)
+wrap_nearest_unorm_clamp_to_border(float s, unsigned size, int offset, int *icoord)
 {
-   *icoord = util_ifloor( CLAMP(s, -0.5F, (float) size + 0.5F) );
+   *icoord = util_ifloor( CLAMP(s + offset, -0.5F, (float) size + 0.5F) );
 }
 
 
@@ -416,9 +428,9 @@ wrap_nearest_unorm_clamp_to_border(float s, unsigned size, int *icoord)
  * PIPE_TEX_WRAP_CLAMP_TO_EDGE for nearest sampling, unnormalized coords.
  */
 static void
-wrap_nearest_unorm_clamp_to_edge(float s, unsigned size, int *icoord)
+wrap_nearest_unorm_clamp_to_edge(float s, unsigned size, int offset, int *icoord)
 {
-   *icoord = util_ifloor( CLAMP(s, 0.5F, (float) size - 0.5F) );
+   *icoord = util_ifloor( CLAMP(s + offset, 0.5F, (float) size - 0.5F) );
 }
 
 
@@ -426,11 +438,11 @@ wrap_nearest_unorm_clamp_to_edge(float s, unsigned size, int *icoord)
  * PIPE_TEX_WRAP_CLAMP for linear sampling, unnormalized coords.
  */
 static void
-wrap_linear_unorm_clamp(float s, unsigned size,
+wrap_linear_unorm_clamp(float s, unsigned size, int offset,
                         int *icoord0, int *icoord1, float *w)
 {
    /* Not exactly what the spec says, but it matches NVIDIA output */
-   float u = CLAMP(s - 0.5F, 0.0f, (float) size - 1.0f);
+   float u = CLAMP(s + offset - 0.5F, 0.0f, (float) size - 1.0f);
    *icoord0 = util_ifloor(u);
    *icoord1 = *icoord0 + 1;
    *w = frac(u);
@@ -441,10 +453,10 @@ wrap_linear_unorm_clamp(float s, unsigned size,
  * PIPE_TEX_WRAP_CLAMP_TO_BORDER for linear sampling, unnormalized coords.
  */
 static void
-wrap_linear_unorm_clamp_to_border(float s, unsigned size,
+wrap_linear_unorm_clamp_to_border(float s, unsigned size, int offset,
                                   int *icoord0, int *icoord1, float *w)
 {
-   float u = CLAMP(s, -0.5F, (float) size + 0.5F);
+   float u = CLAMP(s + offset, -0.5F, (float) size + 0.5F);
    u -= 0.5F;
    *icoord0 = util_ifloor(u);
    *icoord1 = *icoord0 + 1;
@@ -458,10 +470,10 @@ wrap_linear_unorm_clamp_to_border(float s, unsigned size,
  * PIPE_TEX_WRAP_CLAMP_TO_EDGE for linear sampling, unnormalized coords.
  */
 static void
-wrap_linear_unorm_clamp_to_edge(float s, unsigned size,
+wrap_linear_unorm_clamp_to_edge(float s, unsigned size, int offset,
                                 int *icoord0, int *icoord1, float *w)
 {
-   float u = CLAMP(s, +0.5F, (float) size - 0.5F);
+   float u = CLAMP(s + offset, +0.5F, (float) size - 0.5F);
    u -= 0.5F;
    *icoord0 = util_ifloor(u);
    *icoord1 = *icoord0 + 1;
@@ -474,11 +486,11 @@ wrap_linear_unorm_clamp_to_edge(float s, unsigned size,
 /**
  * Do coordinate to array index conversion.  For array textures.
  */
-static INLINE void
-wrap_array_layer(float coord, unsigned size, int *layer)
+static INLINE int
+coord_to_layer(float coord, unsigned first_layer, unsigned last_layer)
 {
    int c = util_ifloor(coord + 0.5F);
-   *layer = CLAMP(c, 0, (int) size - 1);
+   return CLAMP(c, (int)first_layer, (int)last_layer);
 }
 
 
@@ -757,61 +769,6 @@ get_next_ycoord(unsigned face, unsigned fall_off_index, int max, int xc, int yc)
 }
 
 
-static INLINE const float *
-get_texel_cube_seamless(const struct sp_sampler_view *sp_sview,
-                        union tex_tile_address addr, int x, int y,
-                        float *corner)
-{
-   const struct pipe_resource *texture = sp_sview->base.texture;
-   unsigned level = addr.bits.level;
-   unsigned face = addr.bits.face;
-   int new_x, new_y, max_x;
-
-   max_x = (int) u_minify(texture->width0, level);
-
-   assert(texture->width0 == texture->height0);
-   new_x = x;
-   new_y = y;
-
-   /* change the face */
-   if (x < 0) {
-      /*
-       * Cheat with corners. They are difficult and I believe because we don't get
-       * per-pixel faces we can actually have multiple corner texels per pixel,
-       * which screws things up majorly in any case (as the per spec behavior is
-       * to average the 3 remaining texels, which we might not have).
-       * Hence just make sure that the 2nd coord is clamped, will simply pick the
-       * sample which would have fallen off the x coord, but not y coord.
-       * So the filter weight of the samples will be wrong, but at least this
-       * ensures that only valid texels near the corner are used.
-       */
-      if (y < 0 || y >= max_x) {
-         y = CLAMP(y, 0, max_x - 1);
-      }
-      new_x = get_next_xcoord(face, 0, max_x -1, x, y);
-      new_y = get_next_ycoord(face, 0, max_x -1, x, y);
-      face = get_next_face(face, 0);
-   } else if (x >= max_x) {
-      if (y < 0 || y >= max_x) {
-         y = CLAMP(y, 0, max_x - 1);
-      }
-      new_x = get_next_xcoord(face, 1, max_x -1, x, y);
-      new_y = get_next_ycoord(face, 1, max_x -1, x, y);
-      face = get_next_face(face, 1);
-   } else if (y < 0) {
-      new_x = get_next_xcoord(face, 2, max_x -1, x, y);
-      new_y = get_next_ycoord(face, 2, max_x -1, x, y);
-      face = get_next_face(face, 2);
-   } else if (y >= max_x) {
-      new_x = get_next_xcoord(face, 3, max_x -1, x, y);
-      new_y = get_next_ycoord(face, 3, max_x -1, x, y);
-      face = get_next_face(face, 3);
-   }
-
-   addr.bits.face = face;
-   return get_texel_2d_no_border( sp_sview, addr, new_x, new_y );
-}
-
 /* Gather a quad of adjacent texels within a tile:
  */
 static INLINE void
@@ -948,6 +905,60 @@ get_texel_2d_array(const struct sp_sampler_view *sp_sview,
 }
 
 
+static INLINE const float *
+get_texel_cube_seamless(const struct sp_sampler_view *sp_sview,
+                        union tex_tile_address addr, int x, int y,
+                        float *corner, int layer, unsigned face)
+{
+   const struct pipe_resource *texture = sp_sview->base.texture;
+   unsigned level = addr.bits.level;
+   int new_x, new_y, max_x;
+
+   max_x = (int) u_minify(texture->width0, level);
+
+   assert(texture->width0 == texture->height0);
+   new_x = x;
+   new_y = y;
+
+   /* change the face */
+   if (x < 0) {
+      /*
+       * Cheat with corners. They are difficult and I believe because we don't get
+       * per-pixel faces we can actually have multiple corner texels per pixel,
+       * which screws things up majorly in any case (as the per spec behavior is
+       * to average the 3 remaining texels, which we might not have).
+       * Hence just make sure that the 2nd coord is clamped, will simply pick the
+       * sample which would have fallen off the x coord, but not y coord.
+       * So the filter weight of the samples will be wrong, but at least this
+       * ensures that only valid texels near the corner are used.
+       */
+      if (y < 0 || y >= max_x) {
+         y = CLAMP(y, 0, max_x - 1);
+      }
+      new_x = get_next_xcoord(face, 0, max_x -1, x, y);
+      new_y = get_next_ycoord(face, 0, max_x -1, x, y);
+      face = get_next_face(face, 0);
+   } else if (x >= max_x) {
+      if (y < 0 || y >= max_x) {
+         y = CLAMP(y, 0, max_x - 1);
+      }
+      new_x = get_next_xcoord(face, 1, max_x -1, x, y);
+      new_y = get_next_ycoord(face, 1, max_x -1, x, y);
+      face = get_next_face(face, 1);
+   } else if (y < 0) {
+      new_x = get_next_xcoord(face, 2, max_x -1, x, y);
+      new_y = get_next_ycoord(face, 2, max_x -1, x, y);
+      face = get_next_face(face, 2);
+   } else if (y >= max_x) {
+      new_x = get_next_xcoord(face, 3, max_x -1, x, y);
+      new_y = get_next_ycoord(face, 3, max_x -1, x, y);
+      face = get_next_face(face, 3);
+   }
+
+   return get_texel_3d_no_border(sp_sview, addr, new_x, new_y, layer + face);
+}
+
+
 /* Get texel pointer for cube array texture */
 static INLINE const float *
 get_texel_cube_array(const struct sp_sampler_view *sp_sview,
@@ -1008,22 +1019,18 @@ print_sample_4(const char *function, float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZ
 static INLINE void
 img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
                                 struct sp_sampler *sp_samp,
-                                float s,
-                                float t,
-                                float p,
-                                unsigned level,
-                                unsigned face_id,
+                                const struct img_filter_args *args,
                                 float *rgba)
 {
-   unsigned xpot = pot_level_size(sp_sview->xpot, level);
-   unsigned ypot = pot_level_size(sp_sview->ypot, level);
+   unsigned xpot = pot_level_size(sp_sview->xpot, args->level);
+   unsigned ypot = pot_level_size(sp_sview->ypot, args->level);
    int xmax = (xpot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, xpot) - 1; */
    int ymax = (ypot - 1) & (TEX_TILE_SIZE - 1); /* MIN2(TEX_TILE_SIZE, ypot) - 1; */
    union tex_tile_address addr;
    int c;
 
-   float u = s * xpot - 0.5F;
-   float v = t * ypot - 0.5F;
+   float u = (args->s * xpot - 0.5F) + args->offset[0];
+   float v = (args->t * ypot - 0.5F) + args->offset[1];
 
    int uflr = util_ifloor(u);
    int vflr = util_ifloor(v);
@@ -1037,7 +1044,7 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
    const float *tx[4];
       
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
    /* Can we fetch all four at once:
     */
@@ -1066,21 +1073,17 @@ img_filter_2d_linear_repeat_POT(struct sp_sampler_view *sp_sview,
 static INLINE void
 img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
                                  struct sp_sampler *sp_samp,
-                                 float s,
-                                 float t,
-                                 float p,
-                                 unsigned level,
-                                 unsigned face_id,
+                                 const struct img_filter_args *args,
                                  float rgba[TGSI_QUAD_SIZE])
 {
-   unsigned xpot = pot_level_size(sp_sview->xpot, level);
-   unsigned ypot = pot_level_size(sp_sview->ypot, level);
+   unsigned xpot = pot_level_size(sp_sview->xpot, args->level);
+   unsigned ypot = pot_level_size(sp_sview->ypot, args->level);
    const float *out;
    union tex_tile_address addr;
    int c;
 
-   float u = s * xpot;
-   float v = t * ypot;
+   float u = args->s * xpot + args->offset[0];
+   float v = args->t * ypot + args->offset[1];
 
    int uflr = util_ifloor(u);
    int vflr = util_ifloor(v);
@@ -1089,7 +1092,7 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
    int y0 = vflr & (ypot - 1);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
    out = get_texel_2d_no_border(sp_sview, addr, x0, y0);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1104,26 +1107,22 @@ img_filter_2d_nearest_repeat_POT(struct sp_sampler_view *sp_sview,
 static INLINE void
 img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview,
                                 struct sp_sampler *sp_samp,
-                                float s,
-                                float t,
-                                float p,
-                                unsigned level,
-                                unsigned face_id,
+                                const struct img_filter_args *args,
                                 float rgba[TGSI_QUAD_SIZE])
 {
-   unsigned xpot = pot_level_size(sp_sview->xpot, level);
-   unsigned ypot = pot_level_size(sp_sview->ypot, level);
+   unsigned xpot = pot_level_size(sp_sview->xpot, args->level);
+   unsigned ypot = pot_level_size(sp_sview->ypot, args->level);
    union tex_tile_address addr;
    int c;
 
-   float u = s * xpot;
-   float v = t * ypot;
+   float u = args->s * xpot + args->offset[0];
+   float v = args->t * ypot + args->offset[1];
 
    int x0, y0;
    const float *out;
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
    x0 = util_ifloor(u);
    if (x0 < 0) 
@@ -1150,11 +1149,7 @@ img_filter_2d_nearest_clamp_POT(struct sp_sampler_view *sp_sview,
 static void
 img_filter_1d_nearest(struct sp_sampler_view *sp_sview,
                       struct sp_sampler *sp_samp,
-                      float s,
-                      float t,
-                      float p,
-                      unsigned level,
-                      unsigned face_id,
+                      const struct img_filter_args *args,
                       float rgba[TGSI_QUAD_SIZE])
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1164,14 +1159,14 @@ img_filter_1d_nearest(struct sp_sampler_view *sp_sview,
    const float *out;
    int c;
 
-   width = u_minify(texture->width0, level);
+   width = u_minify(texture->width0, args->level);
 
    assert(width > 0);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
-   sp_samp->nearest_texcoord_s(s, width, &x);
+   sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
 
    out = get_texel_2d(sp_sview, sp_samp, addr, x, 0);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1186,11 +1181,7 @@ img_filter_1d_nearest(struct sp_sampler_view *sp_sview,
 static void
 img_filter_1d_array_nearest(struct sp_sampler_view *sp_sview,
                             struct sp_sampler *sp_samp,
-                            float s,
-                            float t,
-                            float p,
-                            unsigned level,
-                            unsigned face_id,
+                            const struct img_filter_args *args,
                             float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1200,15 +1191,16 @@ img_filter_1d_array_nearest(struct sp_sampler_view *sp_sview,
    const float *out;
    int c;
 
-   width = u_minify(texture->width0, level);
+   width = u_minify(texture->width0, args->level);
 
    assert(width > 0);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
-   sp_samp->nearest_texcoord_s(s, width, &x);
-   wrap_array_layer(t, texture->array_size, &layer);
+   sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+   layer = coord_to_layer(args->t, sp_sview->base.u.tex.first_layer,
+                          sp_sview->base.u.tex.last_layer);
 
    out = get_texel_1d_array(sp_sview, sp_samp, addr, x, layer);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1223,11 +1215,7 @@ img_filter_1d_array_nearest(struct sp_sampler_view *sp_sview,
 static void
 img_filter_2d_nearest(struct sp_sampler_view *sp_sview,
                       struct sp_sampler *sp_samp,
-                      float s,
-                      float t,
-                      float p,
-                      unsigned level,
-                      unsigned face_id,
+                      const struct img_filter_args *args,
                       float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1237,17 +1225,17 @@ img_filter_2d_nearest(struct sp_sampler_view *sp_sview,
    const float *out;
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
 
    assert(width > 0);
    assert(height > 0);
  
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
-   sp_samp->nearest_texcoord_s(s, width, &x);
-   sp_samp->nearest_texcoord_t(t, height, &y);
+   sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+   sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
 
    out = get_texel_2d(sp_sview, sp_samp, addr, x, y);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1262,11 +1250,7 @@ img_filter_2d_nearest(struct sp_sampler_view *sp_sview,
 static void
 img_filter_2d_array_nearest(struct sp_sampler_view *sp_sview,
                             struct sp_sampler *sp_samp,
-                            float s,
-                            float t,
-                            float p,
-                            unsigned level,
-                            unsigned face_id,
+                            const struct img_filter_args *args,
                             float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1276,18 +1260,19 @@ img_filter_2d_array_nearest(struct sp_sampler_view *sp_sview,
    const float *out;
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
 
    assert(width > 0);
    assert(height > 0);
  
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
-   sp_samp->nearest_texcoord_s(s, width, &x);
-   sp_samp->nearest_texcoord_t(t, height, &y);
-   wrap_array_layer(p, texture->array_size, &layer);
+   sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+   sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
+   layer = coord_to_layer(args->p, sp_sview->base.u.tex.first_layer,
+                          sp_sview->base.u.tex.last_layer);
 
    out = get_texel_2d_array(sp_sview, sp_samp, addr, x, y, layer);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1299,54 +1284,43 @@ img_filter_2d_array_nearest(struct sp_sampler_view *sp_sview,
 }
 
 
-static INLINE union tex_tile_address
-face(union tex_tile_address addr, unsigned face )
-{
-   addr.bits.face = face;
-   return addr;
-}
-
-
 static void
 img_filter_cube_nearest(struct sp_sampler_view *sp_sview,
                         struct sp_sampler *sp_samp,
-                        float s,
-                        float t,
-                        float p,
-                        unsigned level,
-                        unsigned face_id,
+                        const struct img_filter_args *args,
                         float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
    int width, height;
-   int x, y;
+   int x, y, layerface;
    union tex_tile_address addr;
    const float *out;
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
 
    assert(width > 0);
    assert(height > 0);
  
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
    /*
     * If NEAREST filtering is done within a miplevel, always apply wrap
     * mode CLAMP_TO_EDGE.
     */
    if (sp_samp->base.seamless_cube_map) {
-      wrap_nearest_clamp_to_edge(s, width, &x);
-      wrap_nearest_clamp_to_edge(t, height, &y);
+      wrap_nearest_clamp_to_edge(args->s, width, args->offset[0], &x);
+      wrap_nearest_clamp_to_edge(args->t, height, args->offset[1], &y);
    } else {
       /* Would probably make sense to ignore mode and just do edge clamp */
-      sp_samp->nearest_texcoord_s(s, width, &x);
-      sp_samp->nearest_texcoord_t(t, height, &y);
+      sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+      sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
    }
 
-   out = get_texel_2d(sp_sview, sp_samp, face(addr, face_id), x, y);
+   layerface = args->face_id + sp_sview->base.u.tex.first_layer;
+   out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
       rgba[TGSI_NUM_CHANNELS*c] = out[c];
 
@@ -1358,34 +1332,32 @@ img_filter_cube_nearest(struct sp_sampler_view *sp_sview,
 static void
 img_filter_cube_array_nearest(struct sp_sampler_view *sp_sview,
                               struct sp_sampler *sp_samp,
-                              float s,
-                              float t,
-                              float p,
-                              unsigned level,
-                              unsigned face_id,
+                              const struct img_filter_args *args,
                               float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
    int width, height;
-   int x, y, layer;
+   int x, y, layerface;
    union tex_tile_address addr;
    const float *out;
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
 
    assert(width > 0);
    assert(height > 0);
  
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
-   sp_samp->nearest_texcoord_s(s, width, &x);
-   sp_samp->nearest_texcoord_t(t, height, &y);
-   wrap_array_layer(p, texture->array_size, &layer);
+   sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x);
+   sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
+   layerface = coord_to_layer(6 * args->p + sp_sview->base.u.tex.first_layer,
+                              sp_sview->base.u.tex.first_layer,
+                              sp_sview->base.u.tex.last_layer - 5) + args->face_id;
 
-   out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layer * 6 + face_id);
+   out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
       rgba[TGSI_NUM_CHANNELS*c] = out[c];
 
@@ -1397,11 +1369,7 @@ img_filter_cube_array_nearest(struct sp_sampler_view *sp_sview,
 static void
 img_filter_3d_nearest(struct sp_sampler_view *sp_sview,
                       struct sp_sampler *sp_samp,
-                      float s,
-                      float t,
-                      float p,
-                      unsigned level,
-                      unsigned face_id,
+                      const struct img_filter_args *args,
                       float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1411,20 +1379,20 @@ img_filter_3d_nearest(struct sp_sampler_view *sp_sview,
    const float *out;
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
-   depth = u_minify(texture->depth0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
+   depth = u_minify(texture->depth0, args->level);
 
    assert(width > 0);
    assert(height > 0);
    assert(depth > 0);
 
-   sp_samp->nearest_texcoord_s(s, width,  &x);
-   sp_samp->nearest_texcoord_t(t, height, &y);
-   sp_samp->nearest_texcoord_p(p, depth,  &z);
+   sp_samp->nearest_texcoord_s(args->s, width,  args->offset[0], &x);
+   sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y);
+   sp_samp->nearest_texcoord_p(args->p, depth,  args->offset[2], &z);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
    out = get_texel_3d(sp_sview, sp_samp, addr, x, y, z);
    for (c = 0; c < TGSI_QUAD_SIZE; c++)
@@ -1435,11 +1403,7 @@ img_filter_3d_nearest(struct sp_sampler_view *sp_sview,
 static void
 img_filter_1d_linear(struct sp_sampler_view *sp_sview,
                      struct sp_sampler *sp_samp,
-                     float s,
-                     float t,
-                     float p,
-                     unsigned level,
-                     unsigned face_id,
+                     const struct img_filter_args *args,
                      float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1450,14 +1414,14 @@ img_filter_1d_linear(struct sp_sampler_view *sp_sview,
    const float *tx0, *tx1;
    int c;
 
-   width = u_minify(texture->width0, level);
+   width = u_minify(texture->width0, args->level);
 
    assert(width > 0);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
-   sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw);
+   sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
 
    tx0 = get_texel_2d(sp_sview, sp_samp, addr, x0, 0);
    tx1 = get_texel_2d(sp_sview, sp_samp, addr, x1, 0);
@@ -1471,11 +1435,7 @@ img_filter_1d_linear(struct sp_sampler_view *sp_sview,
 static void
 img_filter_1d_array_linear(struct sp_sampler_view *sp_sview,
                            struct sp_sampler *sp_samp,
-                           float s,
-                           float t,
-                           float p,
-                           unsigned level,
-                           unsigned face_id,
+                           const struct img_filter_args *args,
                            float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1486,15 +1446,16 @@ img_filter_1d_array_linear(struct sp_sampler_view *sp_sview,
    const float *tx0, *tx1;
    int c;
 
-   width = u_minify(texture->width0, level);
+   width = u_minify(texture->width0, args->level);
 
    assert(width > 0);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
-   sp_samp->linear_texcoord_s(s, width, &x0, &x1, &xw);
-   wrap_array_layer(t, texture->array_size, &layer);
+   sp_samp->linear_texcoord_s(args->s, width, args->offset[0], &x0, &x1, &xw);
+   layer = coord_to_layer(args->t, sp_sview->base.u.tex.first_layer,
+                          sp_sview->base.u.tex.last_layer);
 
    tx0 = get_texel_1d_array(sp_sview, sp_samp, addr, x0, layer);
    tx1 = get_texel_1d_array(sp_sview, sp_samp, addr, x1, layer);
@@ -1504,15 +1465,77 @@ img_filter_1d_array_linear(struct sp_sampler_view *sp_sview,
       rgba[TGSI_NUM_CHANNELS*c] = lerp(xw, tx0[c], tx1[c]);
 }
 
+/*
+ * Return one gathered value: chan_in (0..3) is remapped to an entry of
+ * tx[], and the view swizzle selected by comp_sel (0..3) picks the channel
+ * (ZERO/ONE swizzles yield 0.0/1.0).  Bad indices assert and return 0.0.
+ */
+static float
+get_gather_value(const struct sp_sampler_view *sp_sview,
+                 int chan_in, int comp_sel,
+                 const float *tx[4])
+{
+   int chan;
+   unsigned swizzle;
+
+   /*
+    * softpipe fetches the four texels in a different
+    * order from the one TGSI expects, so remap each
+    * output slot to the matching entry of tx[].
+    */
+   switch (chan_in) {
+   case 0:
+      chan = 2;
+      break;
+   case 1:
+      chan = 3;
+      break;
+   case 2:
+      chan = 1;
+      break;
+   case 3:
+      chan = 0;
+      break;
+   default:
+      assert(0);
+      return 0.0;
+   }
+
+   /* look up the sampler view's swizzle for the selected component */
+   switch (comp_sel) {
+   case 0:
+      swizzle = sp_sview->base.swizzle_r;
+      break;
+   case 1:
+      swizzle = sp_sview->base.swizzle_g;
+      break;
+   case 2:
+      swizzle = sp_sview->base.swizzle_b;
+      break;
+   case 3:
+      swizzle = sp_sview->base.swizzle_a;
+      break;
+   default:
+      assert(0);
+      return 0.0;
+   }
+
+   /* constant swizzles return 0/1; otherwise read that texel's channel */
+   switch (swizzle) {
+   case PIPE_SWIZZLE_ZERO:
+      return 0.0;
+   case PIPE_SWIZZLE_ONE:
+      return 1.0;
+   default:
+      return tx[chan][swizzle];
+   }
+}
+
 
 static void
 img_filter_2d_linear(struct sp_sampler_view *sp_sview,
                      struct sp_sampler *sp_samp,
-                     float s,
-                     float t,
-                     float p,
-                     unsigned level,
-                     unsigned face_id,
+                     const struct img_filter_args *args,
                      float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1520,42 +1543,45 @@ img_filter_2d_linear(struct sp_sampler_view *sp_sview,
    int x0, y0, x1, y1;
    float xw, yw; /* weights */
    union tex_tile_address addr;
-   const float *tx0, *tx1, *tx2, *tx3;
+   const float *tx[4];
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
 
    assert(width > 0);
    assert(height > 0);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
-   sp_samp->linear_texcoord_s(s, width,  &x0, &x1, &xw);
-   sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
+   sp_samp->linear_texcoord_s(args->s, width,  args->offset[0], &x0, &x1, &xw);
+   sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
 
-   tx0 = get_texel_2d(sp_sview, sp_samp, addr, x0, y0);
-   tx1 = get_texel_2d(sp_sview, sp_samp, addr, x1, y0);
-   tx2 = get_texel_2d(sp_sview, sp_samp, addr, x0, y1);
-   tx3 = get_texel_2d(sp_sview, sp_samp, addr, x1, y1);
+   tx[0] = get_texel_2d(sp_sview, sp_samp, addr, x0, y0);
+   tx[1] = get_texel_2d(sp_sview, sp_samp, addr, x1, y0);
+   tx[2] = get_texel_2d(sp_sview, sp_samp, addr, x0, y1);
+   tx[3] = get_texel_2d(sp_sview, sp_samp, addr, x1, y1);
 
-   /* interpolate R, G, B, A */
-   for (c = 0; c < TGSI_QUAD_SIZE; c++)
-      rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
-                                          tx0[c], tx1[c],
-                                          tx2[c], tx3[c]);
+   if (args->gather_only) {
+      for (c = 0; c < TGSI_QUAD_SIZE; c++)
+         rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c,
+                                                      args->gather_comp,
+                                                      tx);
+   } else {
+      /* interpolate R, G, B, A */
+      for (c = 0; c < TGSI_QUAD_SIZE; c++)
+         rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
+                                             tx[0][c], tx[1][c],
+                                             tx[2][c], tx[3][c]);
+   }
 }
 
 
 static void
 img_filter_2d_array_linear(struct sp_sampler_view *sp_sview,
                            struct sp_sampler *sp_samp,
-                           float s,
-                           float t,
-                           float p,
-                           unsigned level,
-                           unsigned face_id,
+                           const struct img_filter_args *args,
                            float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1563,63 +1589,67 @@ img_filter_2d_array_linear(struct sp_sampler_view *sp_sview,
    int x0, y0, x1, y1, layer;
    float xw, yw; /* weights */
    union tex_tile_address addr;
-   const float *tx0, *tx1, *tx2, *tx3;
+   const float *tx[4];
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
 
    assert(width > 0);
    assert(height > 0);
 
    addr.value = 0;
-   addr.bits.level = level;
-
-   sp_samp->linear_texcoord_s(s, width,  &x0, &x1, &xw);
-   sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
-   wrap_array_layer(p, texture->array_size, &layer);
-
-   tx0 = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y0, layer);
-   tx1 = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y0, layer);
-   tx2 = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y1, layer);
-   tx3 = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y1, layer);
-
-   /* interpolate R, G, B, A */
-   for (c = 0; c < TGSI_QUAD_SIZE; c++)
-      rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
-                                          tx0[c], tx1[c],
-                                          tx2[c], tx3[c]);
+   addr.bits.level = args->level;
+
+   sp_samp->linear_texcoord_s(args->s, width,  args->offset[0], &x0, &x1, &xw);
+   sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
+   layer = coord_to_layer(args->p, sp_sview->base.u.tex.first_layer,
+                          sp_sview->base.u.tex.last_layer);
+
+   tx[0] = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y0, layer);
+   tx[1] = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y0, layer);
+   tx[2] = get_texel_2d_array(sp_sview, sp_samp, addr, x0, y1, layer);
+   tx[3] = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y1, layer);
+
+   if (args->gather_only) {
+      for (c = 0; c < TGSI_QUAD_SIZE; c++)
+         rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c,
+                                                      args->gather_comp,
+                                                      tx);
+   } else {
+      /* interpolate R, G, B, A */
+      for (c = 0; c < TGSI_QUAD_SIZE; c++)
+         rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
+                                             tx[0][c], tx[1][c],
+                                             tx[2][c], tx[3][c]);
+   }
 }
 
 
 static void
 img_filter_cube_linear(struct sp_sampler_view *sp_sview,
                        struct sp_sampler *sp_samp,
-                       float s,
-                       float t,
-                       float p,
-                       unsigned level,
-                       unsigned face_id,
+                       const struct img_filter_args *args,
                        float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
    int width, height;
-   int x0, y0, x1, y1;
+   int x0, y0, x1, y1, layer;
    float xw, yw; /* weights */
-   union tex_tile_address addr, addrj;
-   const float *tx0, *tx1, *tx2, *tx3;
+   union tex_tile_address addr;
+   const float *tx[4];
    float corner0[TGSI_QUAD_SIZE], corner1[TGSI_QUAD_SIZE],
          corner2[TGSI_QUAD_SIZE], corner3[TGSI_QUAD_SIZE];
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
 
    assert(width > 0);
    assert(height > 0);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
    /*
     * For seamless if LINEAR filtering is done within a miplevel,
@@ -1627,43 +1657,47 @@ img_filter_cube_linear(struct sp_sampler_view *sp_sview,
     */
    if (sp_samp->base.seamless_cube_map) {
       /* Note this is a bit overkill, actual clamping is not required */
-      wrap_linear_clamp_to_border(s, width, &x0, &x1, &xw);
-      wrap_linear_clamp_to_border(t, height, &y0, &y1, &yw);
+      wrap_linear_clamp_to_border(args->s, width, args->offset[0], &x0, &x1, &xw);
+      wrap_linear_clamp_to_border(args->t, height, args->offset[1], &y0, &y1, &yw);
    } else {
       /* Would probably make sense to ignore mode and just do edge clamp */
-      sp_samp->linear_texcoord_s(s, width,  &x0, &x1, &xw);
-      sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
+      sp_samp->linear_texcoord_s(args->s, width,  args->offset[0], &x0, &x1, &xw);
+      sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
    }
 
-   addrj = face(addr, face_id);
+   layer = sp_sview->base.u.tex.first_layer;
 
    if (sp_samp->base.seamless_cube_map) {
-      tx0 = get_texel_cube_seamless(sp_sview, addrj, x0, y0, corner0);
-      tx1 = get_texel_cube_seamless(sp_sview, addrj, x1, y0, corner1);
-      tx2 = get_texel_cube_seamless(sp_sview, addrj, x0, y1, corner2);
-      tx3 = get_texel_cube_seamless(sp_sview, addrj, x1, y1, corner3);
+      tx[0] = get_texel_cube_seamless(sp_sview, addr, x0, y0, corner0, layer, args->face_id);
+      tx[1] = get_texel_cube_seamless(sp_sview, addr, x1, y0, corner1, layer, args->face_id);
+      tx[2] = get_texel_cube_seamless(sp_sview, addr, x0, y1, corner2, layer, args->face_id);
+      tx[3] = get_texel_cube_seamless(sp_sview, addr, x1, y1, corner3, layer, args->face_id);
    } else {
-      tx0 = get_texel_2d(sp_sview, sp_samp, addrj, x0, y0);
-      tx1 = get_texel_2d(sp_sview, sp_samp, addrj, x1, y0);
-      tx2 = get_texel_2d(sp_sview, sp_samp, addrj, x0, y1);
-      tx3 = get_texel_2d(sp_sview, sp_samp, addrj, x1, y1);
+      tx[0] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y0, layer + args->face_id);
+      tx[1] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y0, layer + args->face_id);
+      tx[2] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y1, layer + args->face_id);
+      tx[3] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y1, layer + args->face_id);
+   }
+
+   if (args->gather_only) {
+      for (c = 0; c < TGSI_QUAD_SIZE; c++)
+         rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c,
+                                                      args->gather_comp,
+                                                      tx);
+   } else {
+      /* interpolate R, G, B, A */
+      for (c = 0; c < TGSI_QUAD_SIZE; c++)
+         rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
+                                             tx[0][c], tx[1][c],
+                                             tx[2][c], tx[3][c]);
    }
-   /* interpolate R, G, B, A */
-   for (c = 0; c < TGSI_QUAD_SIZE; c++)
-      rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
-                                          tx0[c], tx1[c],
-                                          tx2[c], tx3[c]);
 }
 
 
 static void
 img_filter_cube_array_linear(struct sp_sampler_view *sp_sview,
                              struct sp_sampler *sp_samp,
-                             float s,
-                             float t,
-                             float p,
-                             unsigned level,
-                             unsigned face_id,
+                             const struct img_filter_args *args,
                              float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1671,42 +1705,68 @@ img_filter_cube_array_linear(struct sp_sampler_view *sp_sview,
    int x0, y0, x1, y1, layer;
    float xw, yw; /* weights */
    union tex_tile_address addr;
-   const float *tx0, *tx1, *tx2, *tx3;
+   const float *tx[4];
+   float corner0[TGSI_QUAD_SIZE], corner1[TGSI_QUAD_SIZE],
+         corner2[TGSI_QUAD_SIZE], corner3[TGSI_QUAD_SIZE];
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
 
    assert(width > 0);
    assert(height > 0);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
-   sp_samp->linear_texcoord_s(s, width,  &x0, &x1, &xw);
-   sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
-   wrap_array_layer(p, texture->array_size, &layer);
+   /*
+    * For seamless if LINEAR filtering is done within a miplevel,
+    * always apply wrap mode CLAMP_TO_BORDER.
+    */
+   if (sp_samp->base.seamless_cube_map) {
+      /* Note this is a bit overkill, actual clamping is not required */
+      wrap_linear_clamp_to_border(args->s, width, args->offset[0], &x0, &x1, &xw);
+      wrap_linear_clamp_to_border(args->t, height, args->offset[1], &y0, &y1, &yw);
+   } else {
+      /* Would probably make sense to ignore mode and just do edge clamp */
+      sp_samp->linear_texcoord_s(args->s, width,  args->offset[0], &x0, &x1, &xw);
+      sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
+   }
 
-   tx0 = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y0, layer * 6 + face_id);
-   tx1 = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y0, layer * 6 + face_id);
-   tx2 = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y1, layer * 6 + face_id);
-   tx3 = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y1, layer * 6 + face_id);
+   layer = coord_to_layer(6 * args->p + sp_sview->base.u.tex.first_layer,
+                          sp_sview->base.u.tex.first_layer,
+                          sp_sview->base.u.tex.last_layer - 5);
 
-   /* interpolate R, G, B, A */
-   for (c = 0; c < TGSI_QUAD_SIZE; c++)
-      rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
-                                          tx0[c], tx1[c],
-                                          tx2[c], tx3[c]);
+   if (sp_samp->base.seamless_cube_map) {
+      tx[0] = get_texel_cube_seamless(sp_sview, addr, x0, y0, corner0, layer, args->face_id);
+      tx[1] = get_texel_cube_seamless(sp_sview, addr, x1, y0, corner1, layer, args->face_id);
+      tx[2] = get_texel_cube_seamless(sp_sview, addr, x0, y1, corner2, layer, args->face_id);
+      tx[3] = get_texel_cube_seamless(sp_sview, addr, x1, y1, corner3, layer, args->face_id);
+   } else {
+      tx[0] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y0, layer + args->face_id);
+      tx[1] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y0, layer + args->face_id);
+      tx[2] = get_texel_cube_array(sp_sview, sp_samp, addr, x0, y1, layer + args->face_id);
+      tx[3] = get_texel_cube_array(sp_sview, sp_samp, addr, x1, y1, layer + args->face_id);
+   }
+
+   if (args->gather_only) {
+      for (c = 0; c < TGSI_QUAD_SIZE; c++)
+         rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c,
+                                                      args->gather_comp,
+                                                      tx);
+   } else {
+      /* interpolate R, G, B, A */
+      for (c = 0; c < TGSI_QUAD_SIZE; c++)
+         rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw,
+                                             tx[0][c], tx[1][c],
+                                             tx[2][c], tx[3][c]);
+   }
 }
 
 static void
 img_filter_3d_linear(struct sp_sampler_view *sp_sview,
                      struct sp_sampler *sp_samp,
-                     float s,
-                     float t,
-                     float p,
-                     unsigned level,
-                     unsigned face_id,
+                     const struct img_filter_args *args,
                      float *rgba)
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -1717,21 +1777,20 @@ img_filter_3d_linear(struct sp_sampler_view *sp_sview,
    const float *tx00, *tx01, *tx02, *tx03, *tx10, *tx11, *tx12, *tx13;
    int c;
 
-   width = u_minify(texture->width0, level);
-   height = u_minify(texture->height0, level);
-   depth = u_minify(texture->depth0, level);
+   width = u_minify(texture->width0, args->level);
+   height = u_minify(texture->height0, args->level);
+   depth = u_minify(texture->depth0, args->level);
 
    addr.value = 0;
-   addr.bits.level = level;
+   addr.bits.level = args->level;
 
    assert(width > 0);
    assert(height > 0);
    assert(depth > 0);
 
-   sp_samp->linear_texcoord_s(s, width,  &x0, &x1, &xw);
-   sp_samp->linear_texcoord_t(t, height, &y0, &y1, &yw);
-   sp_samp->linear_texcoord_p(p, depth,  &z0, &z1, &zw);
-
+   sp_samp->linear_texcoord_s(args->s, width,  args->offset[0], &x0, &x1, &xw);
+   sp_samp->linear_texcoord_t(args->t, height, args->offset[1], &y0, &y1, &yw);
+   sp_samp->linear_texcoord_p(args->p, depth,  args->offset[2], &z0, &z1, &zw);
 
    tx00 = get_texel_3d(sp_sview, sp_samp, addr, x0, y0, z0);
    tx01 = get_texel_3d(sp_sview, sp_samp, addr, x1, y0, z0);
@@ -1837,6 +1896,7 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview,
       }
       break;
    case tgsi_sampler_lod_zero:
+   case tgsi_sampler_gather:
       /* this is all static state in the sampler really need clamp here? */
       lod[0] = lod[1] = lod[2] = lod[3] = CLAMP(lod_bias, min_lod, max_lod);
       break;
@@ -1846,6 +1906,29 @@ compute_lambda_lod(struct sp_sampler_view *sp_sview,
    }
 }
 
+/*
+ * Return the gather component selector (0..3) for a quad.
+ *
+ * The selector is an unsigned integer whose bits are stored in the first
+ * float slot of lod_in; only its low two bits are used.
+ */
+static INLINE unsigned
+get_gather_component(const float lod_in[TGSI_QUAD_SIZE])
+{
+   /*
+    * Reinterpret the bits through a union; dereferencing a cast pointer
+    * would drop the const qualifier and read a float object through an
+    * unsigned lvalue.  Assumes float and unsigned have the same size, as
+    * a pointer cast would.
+    */
+   union {
+      float f;
+      unsigned u;
+   } bits;
+
+   bits.f = lod_in[0];
+   return bits.u & 0x3;
+}
 
 static void
 mip_filter_linear(struct sp_sampler_view *sp_sview,
@@ -1857,36 +1923,45 @@ mip_filter_linear(struct sp_sampler_view *sp_sview,
                   const float p[TGSI_QUAD_SIZE],
                   const float c0[TGSI_QUAD_SIZE],
                   const float lod_in[TGSI_QUAD_SIZE],
-                  enum tgsi_sampler_control control,
+                  const struct filter_args *filt_args,
                   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    const struct pipe_sampler_view *psview = &sp_sview->base;
    int j;
    float lod[TGSI_QUAD_SIZE];
+   struct img_filter_args args;
 
-   compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod);
+   compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod);
+
+   args.offset = filt_args->offset;
+   args.gather_only = filt_args->control == tgsi_sampler_gather;
+   args.gather_comp = get_gather_component(lod_in);
 
    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
       int level0 = psview->u.tex.first_level + (int)lod[j];
 
-      if (lod[j] < 0.0)
-         mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
-                    psview->u.tex.first_level,
-                    sp_sview->faces[j], &rgba[0][j]);
-
-      else if (level0 >= (int) psview->u.tex.last_level)
-         min_filter(sp_sview, sp_samp, s[j], t[j], p[j], psview->u.tex.last_level,
-                    sp_sview->faces[j], &rgba[0][j]);
+      args.s = s[j];
+      args.t = t[j];
+      args.p = p[j];
+      args.face_id = sp_sview->faces[j];
 
+      if (lod[j] < 0.0) {
+         args.level = psview->u.tex.first_level;
+         mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+      }
+      else if (level0 >= (int) psview->u.tex.last_level) {
+         args.level = psview->u.tex.last_level;
+         min_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+      }
       else {
          float levelBlend = frac(lod[j]);
          float rgbax[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
          int c;
 
-         min_filter(sp_sview, sp_samp, s[j], t[j], p[j], level0,
-                    sp_sview->faces[j], &rgbax[0][0]);
-         min_filter(sp_sview, sp_samp, s[j], t[j], p[j], level0+1,
-                    sp_sview->faces[j], &rgbax[0][1]);
+         args.level = level0;
+         min_filter(sp_sview, sp_samp, &args, &rgbax[0][0]);
+         args.level = level0+1;
+         min_filter(sp_sview, sp_samp, &args, &rgbax[0][1]);
 
          for (c = 0; c < 4; c++) {
             rgba[c][j] = lerp(levelBlend, rgbax[c][0], rgbax[c][1]);
@@ -1915,25 +1990,33 @@ mip_filter_nearest(struct sp_sampler_view *sp_sview,
                    const float p[TGSI_QUAD_SIZE],
                    const float c0[TGSI_QUAD_SIZE],
                    const float lod_in[TGSI_QUAD_SIZE],
-                   enum tgsi_sampler_control control,
+                   const struct filter_args *filt_args,
                    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    const struct pipe_sampler_view *psview = &sp_sview->base;
    float lod[TGSI_QUAD_SIZE];
    int j;
+   struct img_filter_args args;
+
+   args.offset = filt_args->offset;
+   args.gather_only = filt_args->control == tgsi_sampler_gather;
+   args.gather_comp = get_gather_component(lod_in);
 
-   compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod);
+   compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod);
 
    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-      if (lod[j] < 0.0)
-         mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
-                    psview->u.tex.first_level,
-                    sp_sview->faces[j], &rgba[0][j]);
-      else {
+      args.s = s[j];
+      args.t = t[j];
+      args.p = p[j];
+      args.face_id = sp_sview->faces[j];
+
+      if (lod[j] < 0.0) {
+         args.level = psview->u.tex.first_level;
+         mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+      } else {
          int level = psview->u.tex.first_level + (int)(lod[j] + 0.5F);
-         level = MIN2(level, (int)psview->u.tex.last_level);
-         min_filter(sp_sview, sp_samp, s[j], t[j], p[j],
-                    level, sp_sview->faces[j], &rgba[0][j]);
+         args.level = MIN2(level, (int)psview->u.tex.last_level);
+         min_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
       }
    }
 
@@ -1953,24 +2036,29 @@ mip_filter_none(struct sp_sampler_view *sp_sview,
                 const float p[TGSI_QUAD_SIZE],
                 const float c0[TGSI_QUAD_SIZE],
                 const float lod_in[TGSI_QUAD_SIZE],
-                enum tgsi_sampler_control control,
+                const struct filter_args *filt_args,
                 float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    float lod[TGSI_QUAD_SIZE];
    int j;
+   struct img_filter_args args;
+
+   args.level = sp_sview->base.u.tex.first_level;
+   args.offset = filt_args->offset;
+   args.gather_only = filt_args->control == tgsi_sampler_gather;
 
-   compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod);
+   compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod);
 
    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-      if (lod[j] < 0.0) { 
-         mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
-                    sp_sview->base.u.tex.first_level,
-                    sp_sview->faces[j], &rgba[0][j]);
+      args.s = s[j];
+      args.t = t[j];
+      args.p = p[j];
+      args.face_id = sp_sview->faces[j];
+      if (lod[j] < 0.0) {
+         mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
       }
       else {
-         min_filter(sp_sview, sp_samp, s[j], t[j], p[j],
-                    sp_sview->base.u.tex.first_level,
-                    sp_sview->faces[j], &rgba[0][j]);
+         min_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
       }
    }
 }
@@ -1986,15 +2074,21 @@ mip_filter_none_no_filter_select(struct sp_sampler_view *sp_sview,
                                  const float p[TGSI_QUAD_SIZE],
                                  const float c0[TGSI_QUAD_SIZE],
                                  const float lod_in[TGSI_QUAD_SIZE],
-                                 enum tgsi_sampler_control control,
+                                 const struct filter_args *filt_args,
                                  float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    int j;
-
-   for (j = 0; j < TGSI_QUAD_SIZE; j++)
-      mag_filter(sp_sview, sp_samp, s[j], t[j], p[j],
-                 sp_sview->base.u.tex.first_level,
-                 sp_sview->faces[j], &rgba[0][j]);
+   struct img_filter_args args;
+   args.level = sp_sview->base.u.tex.first_level;
+   args.offset = filt_args->offset;
+   args.gather_only = filt_args->control == tgsi_sampler_gather;
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      args.s = s[j];
+      args.t = t[j];
+      args.p = p[j];
+      args.face_id = sp_sview->faces[j];
+      mag_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+   }
 }
 
 
@@ -2050,7 +2144,7 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
    float scaling = 1.0f / (1 << level0);
    int width = u_minify(texture->width0, level0);
    int height = u_minify(texture->height0, level0);
-
+   struct img_filter_args args;
    float ux = dudx * scaling;
    float vx = dvdx * scaling;
    float uy = dudy * scaling;
@@ -2100,7 +2194,8 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
     * full, then the pixel values are read from the image.
     */
    ddq = 2 * A;
-   
+
+   args.level = level;
    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
       /* Heckbert MS thesis, p. 59; scan over the bounding box of the ellipse
        * and incrementally update the value of Ax^2+Bxy*Cy^2; when this
@@ -2117,6 +2212,8 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
       float num[4] = {0.0F, 0.0F, 0.0F, 0.0F};
       buffer_next = 0;
       den = 0;
+      args.face_id = sp_sview->faces[j];
+
       U = u0 - tex_u;
       for (v = v0; v <= v1; ++v) {
          float V = v - tex_v;
@@ -2148,8 +2245,10 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
                    * accelerated img_filter_2d_nearest_XXX functions.
                    */
                   for (jj = 0; jj < buffer_next; jj++) {
-                     min_filter(sp_sview, sp_samp, s_buffer[jj], t_buffer[jj], p[jj],
-                                level, sp_sview->faces[j], &rgba_temp[0][jj]);
+                     args.s = s_buffer[jj];
+                     args.t = t_buffer[jj];
+                     args.p = p[jj];
+                     min_filter(sp_sview, sp_samp, &args, &rgba_temp[0][jj]);
                      num[0] += weight_buffer[jj] * rgba_temp[0][jj];
                      num[1] += weight_buffer[jj] * rgba_temp[1][jj];
                      num[2] += weight_buffer[jj] * rgba_temp[2][jj];
@@ -2176,8 +2275,10 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
           * accelerated img_filter_2d_nearest_XXX functions.
           */
          for (jj = 0; jj < buffer_next; jj++) {
-            min_filter(sp_sview, sp_samp, s_buffer[jj], t_buffer[jj], p[jj],
-                       level, sp_sview->faces[j], &rgba_temp[0][jj]);
+            args.s = s_buffer[jj];
+            args.t = t_buffer[jj];
+            args.p = p[jj];
+            min_filter(sp_sview, sp_samp, &args, &rgba_temp[0][jj]);
             num[0] += weight_buffer[jj] * rgba_temp[0][jj];
             num[1] += weight_buffer[jj] * rgba_temp[1][jj];
             num[2] += weight_buffer[jj] * rgba_temp[2][jj];
@@ -2196,8 +2297,10 @@ img_filter_2d_ewa(struct sp_sampler_view *sp_sview,
          rgba[2]=0;
          rgba[3]=0;*/
          /* not enough pixels in resampling, resort to direct interpolation */
-         min_filter(sp_sview, sp_samp, s[j], t[j], p[j], level,
-                    sp_sview->faces[j], &rgba_temp[0][j]);
+         args.s = s[j];
+         args.t = t[j];
+         args.p = p[j];
+         min_filter(sp_sview, sp_samp, &args, &rgba_temp[0][j]);
          den = 1;
          num[0] = rgba_temp[0][j];
          num[1] = rgba_temp[1][j];
@@ -2226,7 +2329,7 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview,
                         const float p[TGSI_QUAD_SIZE],
                         const float c0[TGSI_QUAD_SIZE],
                         const float lod_in[TGSI_QUAD_SIZE],
-                        enum tgsi_sampler_control control,
+                        const struct filter_args *filt_args,
                         float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    const struct pipe_resource *texture = sp_sview->base.texture;
@@ -2241,11 +2344,12 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview,
    float dudy = (s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]) * s_to_u;
    float dvdx = (t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]) * t_to_v;
    float dvdy = (t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]) * t_to_v;
-   
-   if (control == tgsi_sampler_lod_bias ||
-       control == tgsi_sampler_lod_none ||
+   struct img_filter_args args;
+
+   if (filt_args->control == tgsi_sampler_lod_bias ||
+       filt_args->control == tgsi_sampler_lod_none ||
        /* XXX FIXME */
-       control == tgsi_sampler_derivs_explicit) {
+       filt_args->control == tgsi_sampler_derivs_explicit) {
       /* note: instead of working with Px and Py, we will use the 
        * squared length instead, to avoid sqrt.
        */
@@ -2282,12 +2386,12 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview,
        * this since 0.5*log(x) = log(sqrt(x))
        */
       lambda = 0.5F * util_fast_log2(Pmin2) + sp_samp->base.lod_bias;
-      compute_lod(&sp_samp->base, control, lambda, lod_in, lod);
+      compute_lod(&sp_samp->base, filt_args->control, lambda, lod_in, lod);
    }
    else {
-      assert(control == tgsi_sampler_lod_explicit ||
-             control == tgsi_sampler_lod_zero);
-      compute_lod(&sp_samp->base, control, sp_samp->base.lod_bias, lod_in, lod);
+      assert(filt_args->control == tgsi_sampler_lod_explicit ||
+             filt_args->control == tgsi_sampler_lod_zero);
+      compute_lod(&sp_samp->base, filt_args->control, sp_samp->base.lod_bias, lod_in, lod);
    }
    
    /* XXX: Take into account all lod values.
@@ -2300,9 +2404,20 @@ mip_filter_linear_aniso(struct sp_sampler_view *sp_sview,
     */
    if (level0 >= (int) psview->u.tex.last_level) {
       int j;
-      for (j = 0; j < TGSI_QUAD_SIZE; j++)
-         min_filter(sp_sview, sp_samp, s[j], t[j], p[j], psview->u.tex.last_level,
-                    sp_sview->faces[j], &rgba[0][j]);
+      /* Image filters index args->offset (e.g. img_filter_3d_linear), and
+       * the other mip filters always forward the caller's offsets and
+       * gather flag.  Fill both in here as well so min_filter never sees
+       * uninitialized fields.
+       */
+      args.offset = filt_args->offset;
+      args.gather_only = filt_args->control == tgsi_sampler_gather;
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+         args.s = s[j];
+         args.t = t[j];
+         args.p = p[j];
+         args.level = psview->u.tex.last_level;
+         args.face_id = sp_sview->faces[j];
+         min_filter(sp_sview, sp_samp, &args, &rgba[0][j]);
+      }
    }
    else {
       /* don't bother interpolating between multiple LODs; it doesn't
@@ -2334,29 +2443,33 @@ mip_filter_linear_2d_linear_repeat_POT(
    const float p[TGSI_QUAD_SIZE],
    const float c0[TGSI_QUAD_SIZE],
    const float lod_in[TGSI_QUAD_SIZE],
-   enum tgsi_sampler_control control,
+   const struct filter_args *filt_args,
    float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    const struct pipe_sampler_view *psview = &sp_sview->base;
    int j;
    float lod[TGSI_QUAD_SIZE];
 
-   compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, control, lod);
+   compute_lambda_lod(sp_sview, sp_samp, s, t, p, lod_in, filt_args->control, lod);
 
    for (j = 0; j < TGSI_QUAD_SIZE; j++) {
       int level0 = psview->u.tex.first_level + (int)lod[j];
-
+      struct img_filter_args args;
       /* Catches both negative and large values of level0:
        */
+      args.s = s[j];
+      args.t = t[j];
+      args.p = p[j];
+      args.face_id = sp_sview->faces[j];
+      args.offset = filt_args->offset;
+      args.gather_only = filt_args->control == tgsi_sampler_gather;
       if ((unsigned)level0 >= psview->u.tex.last_level) {
          if (level0 < 0)
-            img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j],
-                                            psview->u.tex.first_level,
-                                            sp_sview->faces[j], &rgba[0][j]);
+            args.level = psview->u.tex.first_level;
          else
-            img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j],
-                                            psview->u.tex.last_level,
-                                            sp_sview->faces[j], &rgba[0][j]);
+            args.level = psview->u.tex.last_level;
+         img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, &args,
+                                         &rgba[0][j]);
 
       }
       else {
@@ -2364,10 +2477,10 @@ mip_filter_linear_2d_linear_repeat_POT(
          float rgbax[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
          int c;
 
-         img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j], level0,
-                                         sp_sview->faces[j], &rgbax[0][0]);
-         img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, s[j], t[j], p[j], level0+1,
-                                         sp_sview->faces[j], &rgbax[0][1]);
+         args.level = level0;
+         img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, &args, &rgbax[0][0]);
+         args.level = level0+1;
+         img_filter_2d_linear_repeat_POT(sp_sview, sp_samp, &args, &rgbax[0][1]);
 
          for (c = 0; c < TGSI_NUM_CHANNELS; c++)
             rgba[c][j] = lerp(levelBlend, rgbax[c][0], rgbax[c][1]);
@@ -2395,11 +2508,12 @@ sample_compare(struct sp_sampler_view *sp_sview,
                float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    const struct pipe_sampler_state *sampler = &sp_samp->base;
-   int j;
-   int k[4];
+   int j, v;
+   int k[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
    float pc[4];
    const struct util_format_description *format_desc;
    unsigned chan_type;
+   bool is_gather = (control == tgsi_sampler_gather);
 
    /**
     * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
@@ -2408,13 +2522,13 @@ sample_compare(struct sp_sampler_view *sp_sview,
     * RGBA channels.  We look at the red channel here.
     */
 
-   if (sp_sview->base.texture->target == PIPE_TEXTURE_2D_ARRAY ||
-       sp_sview->base.texture->target == PIPE_TEXTURE_CUBE) {
+   if (sp_sview->base.target == PIPE_TEXTURE_2D_ARRAY ||
+       sp_sview->base.target == PIPE_TEXTURE_CUBE) {
       pc[0] = c0[0];
       pc[1] = c0[1];
       pc[2] = c0[2];
       pc[3] = c0[3];
-   } else if (sp_sview->base.texture->target == PIPE_TEXTURE_CUBE_ARRAY) {
+   } else if (sp_sview->base.target == PIPE_TEXTURE_CUBE_ARRAY) {
       pc[0] = c1[0];
       pc[1] = c1[1];
       pc[2] = c1[2];
@@ -2443,65 +2557,74 @@ sample_compare(struct sp_sampler_view *sp_sview,
       pc[3] = CLAMP(pc[3], 0.0F, 1.0F);
    }
 
-   /* compare four texcoords vs. four texture samples */
-   switch (sampler->compare_func) {
-   case PIPE_FUNC_LESS:
-      k[0] = pc[0] < rgba[0][0];
-      k[1] = pc[1] < rgba[0][1];
-      k[2] = pc[2] < rgba[0][2];
-      k[3] = pc[3] < rgba[0][3];
-      break;
-   case PIPE_FUNC_LEQUAL:
-      k[0] = pc[0] <= rgba[0][0];
-      k[1] = pc[1] <= rgba[0][1];
-      k[2] = pc[2] <= rgba[0][2];
-      k[3] = pc[3] <= rgba[0][3];
-      break;
-   case PIPE_FUNC_GREATER:
-      k[0] = pc[0] > rgba[0][0];
-      k[1] = pc[1] > rgba[0][1];
-      k[2] = pc[2] > rgba[0][2];
-      k[3] = pc[3] > rgba[0][3];
-      break;
-   case PIPE_FUNC_GEQUAL:
-      k[0] = pc[0] >= rgba[0][0];
-      k[1] = pc[1] >= rgba[0][1];
-      k[2] = pc[2] >= rgba[0][2];
-      k[3] = pc[3] >= rgba[0][3];
-      break;
-   case PIPE_FUNC_EQUAL:
-      k[0] = pc[0] == rgba[0][0];
-      k[1] = pc[1] == rgba[0][1];
-      k[2] = pc[2] == rgba[0][2];
-      k[3] = pc[3] == rgba[0][3];
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      k[0] = pc[0] != rgba[0][0];
-      k[1] = pc[1] != rgba[0][1];
-      k[2] = pc[2] != rgba[0][2];
-      k[3] = pc[3] != rgba[0][3];
-      break;
-   case PIPE_FUNC_ALWAYS:
-      k[0] = k[1] = k[2] = k[3] = 1;
-      break;
-   case PIPE_FUNC_NEVER:
-      k[0] = k[1] = k[2] = k[3] = 0;
-      break;
-   default:
-      k[0] = k[1] = k[2] = k[3] = 0;
-      assert(0);
-      break;
+   for (v = 0; v < (is_gather ? TGSI_NUM_CHANNELS : 1); v++) {
+      /* compare four texcoords vs. four texture samples */
+      switch (sampler->compare_func) {
+      case PIPE_FUNC_LESS:
+         k[v][0] = pc[0] < rgba[v][0];
+         k[v][1] = pc[1] < rgba[v][1];
+         k[v][2] = pc[2] < rgba[v][2];
+         k[v][3] = pc[3] < rgba[v][3];
+         break;
+      case PIPE_FUNC_LEQUAL:
+         k[v][0] = pc[0] <= rgba[v][0];
+         k[v][1] = pc[1] <= rgba[v][1];
+         k[v][2] = pc[2] <= rgba[v][2];
+         k[v][3] = pc[3] <= rgba[v][3];
+         break;
+      case PIPE_FUNC_GREATER:
+         k[v][0] = pc[0] > rgba[v][0];
+         k[v][1] = pc[1] > rgba[v][1];
+         k[v][2] = pc[2] > rgba[v][2];
+         k[v][3] = pc[3] > rgba[v][3];
+         break;
+      case PIPE_FUNC_GEQUAL:
+         k[v][0] = pc[0] >= rgba[v][0];
+         k[v][1] = pc[1] >= rgba[v][1];
+         k[v][2] = pc[2] >= rgba[v][2];
+         k[v][3] = pc[3] >= rgba[v][3];
+         break;
+      case PIPE_FUNC_EQUAL:
+         k[v][0] = pc[0] == rgba[v][0];
+         k[v][1] = pc[1] == rgba[v][1];
+         k[v][2] = pc[2] == rgba[v][2];
+         k[v][3] = pc[3] == rgba[v][3];
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         k[v][0] = pc[0] != rgba[v][0];
+         k[v][1] = pc[1] != rgba[v][1];
+         k[v][2] = pc[2] != rgba[v][2];
+         k[v][3] = pc[3] != rgba[v][3];
+         break;
+      case PIPE_FUNC_ALWAYS:
+         k[v][0] = k[v][1] = k[v][2] = k[v][3] = 1;
+         break;
+      case PIPE_FUNC_NEVER:
+         k[v][0] = k[v][1] = k[v][2] = k[v][3] = 0;
+         break;
+      default:
+         k[v][0] = k[v][1] = k[v][2] = k[v][3] = 0;
+         assert(0);
+         break;
+      }
    }
 
-   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
-      rgba[0][j] = k[j];
-      rgba[1][j] = k[j];
-      rgba[2][j] = k[j];
-      rgba[3][j] = 1.0F;
+   if (is_gather) {
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+         for (v = 0; v < TGSI_NUM_CHANNELS; v++) {
+            rgba[v][j] = k[v][j];
+         }
+      }
+   } else {
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+         rgba[0][j] = k[0][j];
+         rgba[1][j] = k[0][j];
+         rgba[2][j] = k[0][j];
+         rgba[3][j] = 1.0F;
+      }
    }
 }
 
-
 static void
 do_swizzling(const struct pipe_sampler_view *sview,
              float in[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
@@ -2679,9 +2802,9 @@ any_swizzle(const struct pipe_sampler_view *view)
 static img_filter_func
 get_img_filter(const struct sp_sampler_view *sp_sview,
                const struct pipe_sampler_state *sampler,
-               unsigned filter)
+               unsigned filter, bool gather)
 {
-   switch (sp_sview->base.texture->target) {
+   switch (sp_sview->base.target) {
    case PIPE_BUFFER:
    case PIPE_TEXTURE_1D:
       if (filter == PIPE_TEX_FILTER_NEAREST) 
@@ -2699,7 +2822,7 @@ get_img_filter(const struct sp_sampler_view *sp_sview,
    case PIPE_TEXTURE_RECT:
       /* Try for fast path:
        */
-      if (sp_sview->pot2d &&
+      if (!gather && sp_sview->pot2d &&
           sampler->wrap_s == sampler->wrap_t &&
           sampler->normalized_coords) 
       {
@@ -2769,35 +2892,38 @@ sample_mip(struct sp_sampler_view *sp_sview,
            const float p[TGSI_QUAD_SIZE],
            const float c0[TGSI_QUAD_SIZE],
            const float lod[TGSI_QUAD_SIZE],
-           enum tgsi_sampler_control control,
+           const struct filter_args *filt_args,
            float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    mip_filter_func mip_filter;
    img_filter_func min_img_filter = NULL;
    img_filter_func mag_img_filter = NULL;
 
-   if (sp_sview->pot2d & sp_samp->min_mag_equal_repeat_linear) {
+   if (filt_args->control == tgsi_sampler_gather) {
+      mip_filter = mip_filter_nearest;
+      min_img_filter = get_img_filter(sp_sview, &sp_samp->base, PIPE_TEX_FILTER_LINEAR, true);
+   } else if (sp_sview->pot2d & sp_samp->min_mag_equal_repeat_linear) {
       mip_filter = mip_filter_linear_2d_linear_repeat_POT;
    }
    else {
       mip_filter = sp_samp->mip_filter;
-      min_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->min_img_filter);
+      min_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->min_img_filter, false);
       if (sp_samp->min_mag_equal) {
          mag_img_filter = min_img_filter;
       }
       else {
-         mag_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->base.mag_img_filter);
+         mag_img_filter = get_img_filter(sp_sview, &sp_samp->base, sp_samp->base.mag_img_filter, false);
       }
    }
 
    mip_filter(sp_sview, sp_samp, min_img_filter, mag_img_filter,
-              s, t, p, c0, lod, control, rgba);
+              s, t, p, c0, lod, filt_args, rgba);
 
    if (sp_samp->base.compare_mode != PIPE_TEX_COMPARE_NONE) {
-      sample_compare(sp_sview, sp_samp, s, t, p, c0, lod, control, rgba);
+      sample_compare(sp_sview, sp_samp, s, t, p, c0, lod, filt_args->control, rgba);
    }
 
-   if (sp_sview->need_swizzle) {
+   if (sp_sview->need_swizzle && filt_args->control != tgsi_sampler_gather) {
       float rgba_temp[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
       memcpy(rgba_temp, rgba, sizeof(rgba_temp));
       do_swizzling(&sp_sview->base, rgba_temp, rgba);
@@ -2818,7 +2944,7 @@ sample_cube(struct sp_sampler_view *sp_sview,
             const float p[TGSI_QUAD_SIZE],
             const float c0[TGSI_QUAD_SIZE],
             const float c1[TGSI_QUAD_SIZE],
-            enum tgsi_sampler_control control,
+            const struct filter_args *filt_args,
             float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    unsigned j;
@@ -2896,7 +3022,7 @@ sample_cube(struct sp_sampler_view *sp_sview,
       }
    }
 
-   sample_mip(sp_sview, sp_samp, ssss, tttt, pppp, c0, c1, control, rgba);
+   sample_mip(sp_sview, sp_samp, ssss, tttt, pppp, c0, c1, filt_args, rgba);
 }
 
 
@@ -2907,7 +3033,7 @@ sp_get_dims(struct sp_sampler_view *sp_sview, int level,
    const struct pipe_sampler_view *view = &sp_sview->base;
    const struct pipe_resource *texture = view->texture;
 
-   if (texture->target == PIPE_BUFFER) {
+   if (view->target == PIPE_BUFFER) {
       dims[0] = (view->u.buf.last_element - view->u.buf.first_element) + 1;
       /* the other values are undefined, but let's avoid potential valgrind
        * warnings.
@@ -2924,7 +3050,7 @@ sp_get_dims(struct sp_sampler_view *sp_sview, int level,
    dims[3] = view->u.tex.last_level - view->u.tex.first_level + 1;
    dims[0] = u_minify(texture->width0, level);
 
-   switch(texture->target) {
+   switch (view->target) {
    case PIPE_TEXTURE_1D_ARRAY:
       dims[1] = view->u.tex.last_layer - view->u.tex.first_layer + 1;
       /* fallthrough */
@@ -2975,13 +3101,16 @@ sp_get_texels(struct sp_sampler_view *sp_sview,
 
    addr.value = 0;
    /* TODO write a better test for LOD */
-   addr.bits.level = lod[0];
+   addr.bits.level = sp_sview->base.target == PIPE_BUFFER ? 0 :
+                        CLAMP(lod[0] + sp_sview->base.u.tex.first_level, 
+                              sp_sview->base.u.tex.first_level,
+                              sp_sview->base.u.tex.last_level);
 
    width = u_minify(texture->width0, addr.bits.level);
    height = u_minify(texture->height0, addr.bits.level);
    depth = u_minify(texture->depth0, addr.bits.level);
 
-   switch(texture->target) {
+   switch (sp_sview->base.target) {
    case PIPE_BUFFER:
    case PIPE_TEXTURE_1D:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
@@ -2995,7 +3124,8 @@ sp_get_texels(struct sp_sampler_view *sp_sview,
    case PIPE_TEXTURE_1D_ARRAY:
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
          int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
-         int y = CLAMP(v_j[j], sp_sview->base.u.tex.first_layer, sp_sview->base.u.tex.last_layer);
+         int y = CLAMP(v_j[j], sp_sview->base.u.tex.first_layer,
+                       sp_sview->base.u.tex.last_layer);
          tx = get_texel_2d_no_border(sp_sview, addr, x, y);
          for (c = 0; c < 4; c++) {
             rgba[c][j] = tx[c];
@@ -3017,7 +3147,8 @@ sp_get_texels(struct sp_sampler_view *sp_sview,
       for (j = 0; j < TGSI_QUAD_SIZE; j++) {
          int x = CLAMP(v_i[j] + offset[0], 0, width - 1);
          int y = CLAMP(v_j[j] + offset[1], 0, height - 1);
-         int layer = CLAMP(v_k[j], sp_sview->base.u.tex.first_layer, sp_sview->base.u.tex.last_layer);
+         int layer = CLAMP(v_k[j], sp_sview->base.u.tex.first_layer,
+                           sp_sview->base.u.tex.last_layer);
          tx = get_texel_3d_no_border(sp_sview, addr, x, y, layer);
          for (c = 0; c < 4; c++) {
             rgba[c][j] = tx[c];
@@ -3140,7 +3271,7 @@ softpipe_get_lambda_func(const struct pipe_sampler_view *view, unsigned shader)
    if (shader != PIPE_SHADER_FRAGMENT)
       return compute_lambda_vert;
 
-   switch (view->texture->target) {
+   switch (view->target) {
    case PIPE_BUFFER:
    case PIPE_TEXTURE_1D:
    case PIPE_TEXTURE_1D_ARRAY:
@@ -3176,19 +3307,49 @@ softpipe_create_sampler_view(struct pipe_context *pipe,
       pipe_resource_reference(&view->texture, resource);
       view->context = pipe;
 
+#ifdef DEBUG
+     /*
+      * This is possibly too lenient, but the primary reason is just
+      * to catch state trackers which forget to initialize this, so
+      * it only catches clearly impossible view targets.
+      */
+      if (view->target != resource->target) {
+         if (view->target == PIPE_TEXTURE_1D)
+            assert(resource->target == PIPE_TEXTURE_1D_ARRAY);
+         else if (view->target == PIPE_TEXTURE_1D_ARRAY)
+            assert(resource->target == PIPE_TEXTURE_1D);
+         else if (view->target == PIPE_TEXTURE_2D)
+            assert(resource->target == PIPE_TEXTURE_2D_ARRAY ||
+                   resource->target == PIPE_TEXTURE_CUBE ||
+                   resource->target == PIPE_TEXTURE_CUBE_ARRAY);
+         else if (view->target == PIPE_TEXTURE_2D_ARRAY)
+            assert(resource->target == PIPE_TEXTURE_2D ||
+                   resource->target == PIPE_TEXTURE_CUBE ||
+                   resource->target == PIPE_TEXTURE_CUBE_ARRAY);
+         else if (view->target == PIPE_TEXTURE_CUBE)
+            assert(resource->target == PIPE_TEXTURE_CUBE_ARRAY ||
+                   resource->target == PIPE_TEXTURE_2D_ARRAY);
+         else if (view->target == PIPE_TEXTURE_CUBE_ARRAY)
+            assert(resource->target == PIPE_TEXTURE_CUBE ||
+                   resource->target == PIPE_TEXTURE_2D_ARRAY);
+         else
+            assert(0);
+      }
+#endif
+
       if (any_swizzle(view)) {
          sview->need_swizzle = TRUE;
       }
 
-      if (resource->target == PIPE_TEXTURE_CUBE ||
-          resource->target == PIPE_TEXTURE_CUBE_ARRAY)
+      if (view->target == PIPE_TEXTURE_CUBE ||
+          view->target == PIPE_TEXTURE_CUBE_ARRAY)
          sview->get_samples = sample_cube;
       else {
          sview->get_samples = sample_mip;
       }
       sview->pot2d = spr->pot &&
-                     (resource->target == PIPE_TEXTURE_2D ||
-                      resource->target == PIPE_TEXTURE_RECT);
+                     (view->target == PIPE_TEXTURE_2D ||
+                      view->target == PIPE_TEXTURE_RECT);
 
       sview->xpot = util_logbase2( resource->width0 );
       sview->ypot = util_logbase2( resource->height0 );
@@ -3230,7 +3391,7 @@ sp_tgsi_get_samples(struct tgsi_sampler *tgsi_sampler,
                     float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
 {
    struct sp_tgsi_sampler *sp_samp = (struct sp_tgsi_sampler *)tgsi_sampler;
-
+   struct filter_args filt_args;
    assert(sview_index < PIPE_MAX_SHADER_SAMPLER_VIEWS);
    assert(sampler_index < PIPE_MAX_SAMPLERS);
    assert(sp_samp->sp_sampler[sampler_index]);
@@ -3244,9 +3405,12 @@ sp_tgsi_get_samples(struct tgsi_sampler *tgsi_sampler,
       }
       return;
    }
+
+   filt_args.control = control;
+   filt_args.offset = offset;
    sp_samp->sp_sview[sview_index].get_samples(&sp_samp->sp_sview[sview_index],
                                               sp_samp->sp_sampler[sampler_index],
-                                              s, t, p, c0, lod, control, rgba);
+                                              s, t, p, c0, lod, &filt_args, rgba);
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index 00a97c5..7d1aafc 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -38,10 +38,12 @@ struct sp_sampler;
 
 typedef void (*wrap_nearest_func)(float s,
                                   unsigned size,
+                                  int offset,
                                   int *icoord);
 
 typedef void (*wrap_linear_func)(float s, 
                                  unsigned size,
+                                 int offset,
                                  int *icoord0,
                                  int *icoord1,
                                  float *w);
@@ -51,15 +53,27 @@ typedef float (*compute_lambda_func)(const struct sp_sampler_view *sp_sview,
                                      const float t[TGSI_QUAD_SIZE],
                                      const float p[TGSI_QUAD_SIZE]);
 
+struct img_filter_args {   /* per-sample arguments handed to an img_filter_func */
+   float s;                /* texture coordinates of the single sample */
+   float t;
+   float p;
+   unsigned level;         /* mipmap level the texels are fetched from */
+   unsigned face_id;       /* filled from sp_sview->faces[] by the mip filters */
+   const int8_t *offset;   /* texel offsets, indexed [0..2] for s/t/p */
+   bool gather_only;       /* set when control == tgsi_sampler_gather */
+   int gather_comp;        /* gather component, 0..3 (see get_gather_component) */
+};
+
 typedef void (*img_filter_func)(struct sp_sampler_view *sp_sview,
                                 struct sp_sampler *sp_samp,
-                                float s,
-                                float t,
-                                float p,
-                                unsigned level,
-                                unsigned face_id,
+                                const struct img_filter_args *args,
                                 float *rgba);
 
+struct filter_args {   /* arguments shared by the mip_filter_func / filter_func entry points */
+   enum tgsi_sampler_control control;   /* sampler control mode, e.g. tgsi_sampler_gather */
+   const int8_t *offset;                /* texel offsets, forwarded to img_filter_args.offset */
+};
+
 typedef void (*mip_filter_func)(struct sp_sampler_view *sp_sview,
                                 struct sp_sampler *sp_samp,
                                 img_filter_func min_filter,
@@ -69,7 +83,7 @@ typedef void (*mip_filter_func)(struct sp_sampler_view *sp_sview,
                                 const float p[TGSI_QUAD_SIZE],
                                 const float c0[TGSI_QUAD_SIZE],
                                 const float lod[TGSI_QUAD_SIZE],
-                                enum tgsi_sampler_control control,
+                                const struct filter_args *args,
                                 float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
 
 
@@ -80,7 +94,7 @@ typedef void (*filter_func)(struct sp_sampler_view *sp_sview,
                             const float p[TGSI_QUAD_SIZE],
                             const float c0[TGSI_QUAD_SIZE],
                             const float lod[TGSI_QUAD_SIZE],
-                            enum tgsi_sampler_control control,
+                            const struct filter_args *args,
                             float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
 
 
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
index ab8ba60..4a421a8 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -151,7 +151,7 @@ sp_tex_tile_cache_set_sampler_view(struct softpipe_tex_tile_cache *tc,
          tc->entries[i].addr.bits.invalid = 1;
       }
 
-      tc->tex_face = -1; /* any invalid value here */
+      tc->tex_z = -1; /* any invalid value here */
    }
 }
 
@@ -172,7 +172,7 @@ sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
       for (pos = 0; pos < Elements(tc->entries); pos++) {
          tc->entries[pos].addr.bits.invalid = 1;
       }
-      tc->tex_face = -1;
+      tc->tex_z = -1;
    }
 
 }
@@ -190,8 +190,7 @@ tex_cache_pos( union tex_tile_address addr )
 {
    uint entry = (addr.bits.x + 
                  addr.bits.y * 9 + 
-                 addr.bits.z * 3 + 
-                 addr.bits.face + 
+                 addr.bits.z +
                  addr.bits.level * 7);
 
    return entry % NUM_TEX_TILE_ENTRIES;
@@ -226,7 +225,6 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc,
 
       /* check if we need to get a new transfer */
       if (!tc->tex_trans ||
-          tc->tex_face != addr.bits.face ||
           tc->tex_level != addr.bits.level ||
           tc->tex_z != addr.bits.z) {
          /* get new transfer (view into texture) */
@@ -245,7 +243,7 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc,
          }
          else {
             height = u_minify(tc->texture->height0, addr.bits.level);
-            layer = addr.bits.face + addr.bits.z;
+            layer = addr.bits.z;
          }
 
          tc->tex_trans_map =
@@ -255,7 +253,6 @@ sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc,
                               PIPE_TRANSFER_READ | PIPE_TRANSFER_UNSYNCHRONIZED,
                               0, 0, width, height, &tc->tex_trans);
 
-         tc->tex_face = addr.bits.face;
          tc->tex_level = addr.bits.level;
          tc->tex_z = addr.bits.z;
       }
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
index 4eb4246..2233eff 100644
--- a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -55,7 +55,6 @@ union tex_tile_address {
       unsigned x:TEX_ADDR_BITS;  /* 16K / TILE_SIZE */
       unsigned y:TEX_ADDR_BITS;  /* 16K / TILE_SIZE */
       unsigned z:TEX_Z_BITS;     /* 16K -- z not tiled */
-      unsigned face:3;
       unsigned level:4;
       unsigned invalid:1;
    } bits;
@@ -94,7 +93,7 @@ struct softpipe_tex_tile_cache
 
    struct pipe_transfer *tex_trans;
    void *tex_trans_map;
-   int tex_face, tex_level, tex_z;
+   int tex_level, tex_z;
 
    unsigned swizzle_r;
    unsigned swizzle_g;
@@ -141,7 +140,6 @@ tex_tile_address( unsigned x,
    addr.bits.x = x / TEX_TILE_SIZE;
    addr.bits.y = y / TEX_TILE_SIZE;
    addr.bits.z = z;
-   addr.bits.face = face;
    addr.bits.level = level;
 
    return addr;
diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c
index b75f038..56e4867 100644
--- a/src/gallium/drivers/svga/svga_screen.c
+++ b/src/gallium/drivers/svga/svga_screen.c
@@ -308,6 +308,7 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param)
       return 1;
    case PIPE_CAP_UMA:
    case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+   case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
       return 0;
    }
 
@@ -376,6 +377,7 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en
       case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+      case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
       }
       /* If we get here, we failed to handle a cap above */
@@ -433,6 +435,7 @@ static int svga_get_shader_param(struct pipe_screen *screen, unsigned shader, en
       case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
       case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+      case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
          return 0;
       }
       /* If we get here, we failed to handle a cap above */
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 7a12b52..bac9560 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -1900,7 +1900,7 @@ emit_tex(struct svga_shader_emitter *emit,
                       emit->key.fkey.tex[unit].swizzle_b != PIPE_SWIZZLE_BLUE ||
                       emit->key.fkey.tex[unit].swizzle_a != PIPE_SWIZZLE_ALPHA);
 
-   boolean saturate = insn->Instruction.Saturate != TGSI_SAT_NONE;
+   boolean saturate = insn->Instruction.Saturate;
 
    /* If doing compare processing or tex swizzle or saturation, we need to put
     * the fetched color into a temporary so it can be used as a source later on.
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index 0b56517..0013c96 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -553,6 +553,8 @@ trace_context_delete_depth_stencil_alpha_state(struct pipe_context *_pipe,
 TRACE_SHADER_STATE(fs)
 TRACE_SHADER_STATE(vs)
 TRACE_SHADER_STATE(gs)
+TRACE_SHADER_STATE(tcs)
+TRACE_SHADER_STATE(tes)
 
 #undef TRACE_SHADER_STATE
 
@@ -1508,6 +1510,23 @@ static void trace_context_memory_barrier(struct pipe_context *_context,
 }
 
 
+/* Trace wrapper: dumps the set_tess_state call, then forwards it unchanged. */
+static void trace_context_set_tess_state(struct pipe_context *_context,
+                                         const float default_outer_level[4],
+                                         const float default_inner_level[2])
+{
+   struct pipe_context *context = trace_context(_context)->pipe;
+
+   trace_dump_call_begin("pipe_context", "set_tess_state");
+   trace_dump_arg(ptr, context);
+   trace_dump_arg_array(float, default_outer_level, 4);
+   trace_dump_arg_array(float, default_inner_level, 2);
+   trace_dump_call_end();
+
+   context->set_tess_state(context, default_outer_level, default_inner_level);
+}
+
+
 static const struct debug_named_value rbug_blocker_flags[] = {
    {"before", 1, NULL},
    {"after", 2, NULL},
@@ -1566,6 +1585,12 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(create_gs_state);
    TR_CTX_INIT(bind_gs_state);
    TR_CTX_INIT(delete_gs_state);
+   TR_CTX_INIT(create_tcs_state);
+   TR_CTX_INIT(bind_tcs_state);
+   TR_CTX_INIT(delete_tcs_state);
+   TR_CTX_INIT(create_tes_state);
+   TR_CTX_INIT(bind_tes_state);
+   TR_CTX_INIT(delete_tes_state);
    TR_CTX_INIT(create_vertex_elements_state);
    TR_CTX_INIT(bind_vertex_elements_state);
    TR_CTX_INIT(delete_vertex_elements_state);
@@ -1597,6 +1622,7 @@ trace_context_create(struct trace_screen *tr_scr,
    TR_CTX_INIT(flush);
    TR_CTX_INIT(texture_barrier);
    TR_CTX_INIT(memory_barrier);
+   TR_CTX_INIT(set_tess_state);
 
    TR_CTX_INIT(transfer_map);
    TR_CTX_INIT(transfer_unmap);
diff --git a/src/gallium/drivers/trace/tr_dump_state.c b/src/gallium/drivers/trace/tr_dump_state.c
index 7127338..9bf4a72 100644
--- a/src/gallium/drivers/trace/tr_dump_state.c
+++ b/src/gallium/drivers/trace/tr_dump_state.c
@@ -709,6 +709,8 @@ void trace_dump_draw_info(const struct pipe_draw_info *state)
    trace_dump_member(uint, state, start_instance);
    trace_dump_member(uint, state, instance_count);
 
+   trace_dump_member(uint, state, vertices_per_patch);
+
    trace_dump_member(int,  state, index_bias);
    trace_dump_member(uint, state, min_index);
    trace_dump_member(uint, state, max_index);
diff --git a/src/gallium/drivers/trace/tr_public.h b/src/gallium/drivers/trace/tr_public.h
index aee4937..b03133f 100644
--- a/src/gallium/drivers/trace/tr_public.h
+++ b/src/gallium/drivers/trace/tr_public.h
@@ -28,6 +28,8 @@
 #ifndef TR_PUBLIC_H
 #define TR_PUBLIC_H
 
+#include "pipe/p_compiler.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
diff --git a/src/gallium/drivers/vc4/kernel/Makefile.am b/src/gallium/drivers/vc4/Android.mk
index 1ae5f1c..f42a152 100644
--- a/src/gallium/drivers/vc4/kernel/Makefile.am
+++ b/src/gallium/drivers/vc4/Android.mk
@@ -1,4 +1,4 @@
-# Copyright © 2014 Broadcom
+# Copyright (C) 2014 Emil Velikov <emil.l.velikov@gmail.com>
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -7,34 +7,31 @@
 # and/or sell copies of the Software, and to permit persons to whom the
 # Software is furnished to do so, subject to the following conditions:
 #
-# The above copyright notice and this permission notice (including the next
-# paragraph) shall be included in all copies or substantial portions of the
-# Software.
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
 #
 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-# IN THE SOFTWARE.
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
 
-include Makefile.sources
-include $(top_srcdir)/src/gallium/Automake.inc
+LOCAL_PATH := $(call my-dir)
 
-if USE_VC4_SIMULATOR
-SIM_CFLAGS = -DUSE_VC4_SIMULATOR=1
-endif
+# get C_SOURCES
+include $(LOCAL_PATH)/Makefile.sources
 
-AM_CFLAGS = \
-	$(LIBDRM_CFLAGS) \
-	$(GALLIUM_DRIVER_CFLAGS) \
-	$(SIM_CFLAGS) \
-	-I$(top_srcdir)/src/mesa/ \
-	-I$(srcdir)/../ \
-	$()
+include $(CLEAR_VARS)
 
-noinst_LTLIBRARIES = libvc4_kernel.la
+LOCAL_SRC_FILES := \
+	$(C_SOURCES)
 
-libvc4_kernel_la_SOURCES = $(C_SOURCES)
-libvc4_kernel_la_LDFLAGS = $(SIM_LDFLAGS)
+LOCAL_SHARED_LIBRARIES := libdrm
+# We need libmesa_glsl to get NIR's generated include directories.
+LOCAL_STATIC_LIBRARIES := libmesa_glsl
+LOCAL_MODULE := libmesa_pipe_vc4
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/drivers/vc4/Makefile.am b/src/gallium/drivers/vc4/Makefile.am
index 3fc591f..3f62ce2 100644
--- a/src/gallium/drivers/vc4/Makefile.am
+++ b/src/gallium/drivers/vc4/Makefile.am
@@ -19,7 +19,7 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-SUBDIRS = kernel
+AUTOMAKE_OPTIONS = subdir-objects
 
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
@@ -39,5 +39,5 @@ AM_CFLAGS = \
 noinst_LTLIBRARIES = libvc4.la
 
 libvc4_la_SOURCES = $(C_SOURCES)
-libvc4_la_LIBADD = $(SIM_LIB) kernel/libvc4_kernel.la
+libvc4_la_LIBADD = $(SIM_LIB)
 libvc4_la_LDFLAGS = $(SIM_LDFLAGS)
diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources
index 49474df..1eb029e 100644
--- a/src/gallium/drivers/vc4/Makefile.sources
+++ b/src/gallium/drivers/vc4/Makefile.sources
@@ -1,4 +1,10 @@
 C_SOURCES := \
+	kernel/vc4_drv.h \
+	kernel/vc4_gem.c \
+	kernel/vc4_packet.h \
+	kernel/vc4_render_cl.c \
+	kernel/vc4_validate.c \
+	kernel/vc4_validate_shaders.c \
 	vc4_blit.c \
 	vc4_bufmgr.c \
 	vc4_bufmgr.h \
@@ -20,7 +26,6 @@ C_SOURCES := \
 	vc4_opt_dead_code.c \
 	vc4_opt_small_immediates.c \
 	vc4_opt_vpm_writes.c \
-	vc4_packet.h \
 	vc4_program.c \
 	vc4_qir.c \
 	vc4_qir_lower_uniforms.c \
diff --git a/src/gallium/drivers/vc4/kernel/Makefile.sources b/src/gallium/drivers/vc4/kernel/Makefile.sources
deleted file mode 100644
index 7d17a89..0000000
--- a/src/gallium/drivers/vc4/kernel/Makefile.sources
+++ /dev/null
@@ -1,6 +0,0 @@
-C_SOURCES := \
-	vc4_drv.h \
-	vc4_gem.c \
-	vc4_validate.c \
-	vc4_validate_shaders.c \
-	$()
diff --git a/src/gallium/drivers/vc4/kernel/vc4_drv.h b/src/gallium/drivers/vc4/kernel/vc4_drv.h
index 325f944..1fd8aa9 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_drv.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_drv.h
@@ -28,8 +28,6 @@
 
 enum vc4_bo_mode {
 	VC4_MODE_UNDECIDED,
-	VC4_MODE_TILE_ALLOC,
-	VC4_MODE_TSDA,
 	VC4_MODE_RENDER,
 	VC4_MODE_SHADER,
 };
@@ -52,6 +50,11 @@ struct vc4_exec_info {
 	struct vc4_bo_exec_state *bo;
 	uint32_t bo_count;
 
+	/* List of other BOs used in the job that need to be released
+	 * once the job is complete.
+	 */
+	struct list_head unref_list;
+
 	/* Current unvalidated indices into @bo loaded by the non-hardware
 	 * VC4_PACKET_GEM_HANDLES.
 	 */
@@ -83,14 +86,11 @@ struct vc4_exec_info {
 	uint32_t shader_state_count;
 
 	bool found_tile_binning_mode_config_packet;
-	bool found_tile_rendering_mode_config_packet;
 	bool found_start_tile_binning_packet;
 	bool found_increment_semaphore_packet;
-	bool found_wait_on_semaphore_packet;
 	uint8_t bin_tiles_x, bin_tiles_y;
-	uint32_t fb_width, fb_height;
-	uint32_t tile_alloc_init_block_size;
-	struct drm_gem_cma_object *tile_alloc_bo;
+	struct drm_gem_cma_object *tile_bo;
+	uint32_t tile_alloc_offset;
 
 	/**
 	 * Computed addresses pointing into exec_bo where we start the
@@ -157,13 +157,10 @@ struct vc4_validated_shader_info
 
 /* vc4_validate.c */
 int
-vc4_validate_cl(struct drm_device *dev,
-                void *validated,
-                void *unvalidated,
-                uint32_t len,
-                bool is_bin,
-                bool has_bin,
-                struct vc4_exec_info *exec);
+vc4_validate_bin_cl(struct drm_device *dev,
+		    void *validated,
+		    void *unvalidated,
+		    struct vc4_exec_info *exec);
 
 int
 vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec);
@@ -171,4 +168,16 @@ vc4_validate_shader_recs(struct drm_device *dev, struct vc4_exec_info *exec);
 struct vc4_validated_shader_info *
 vc4_validate_shader(struct drm_gem_cma_object *shader_obj);
 
+bool vc4_use_bo(struct vc4_exec_info *exec,
+		uint32_t hindex,
+		enum vc4_bo_mode mode,
+		struct drm_gem_cma_object **obj);
+
+int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec);
+
+bool vc4_check_tex_size(struct vc4_exec_info *exec,
+			struct drm_gem_cma_object *fbo,
+			uint32_t offset, uint8_t tiling_format,
+			uint32_t width, uint32_t height, uint8_t cpp);
+
 #endif /* VC4_DRV_H */
diff --git a/src/gallium/drivers/vc4/kernel/vc4_gem.c b/src/gallium/drivers/vc4/kernel/vc4_gem.c
index ac29ab3..e4b7fea 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_gem.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_gem.c
@@ -25,24 +25,26 @@
 
 #include "vc4_drv.h"
 
-int
-vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
+/* Copies in the user's binning command list and generates the validated bin
+ * CL, along with associated data (shader records, uniforms). */
+static int
+vc4_get_bcl(struct drm_device *dev, struct vc4_exec_info *exec)
 {
 	struct drm_vc4_submit_cl *args = exec->args;
 	void *temp = NULL;
-	void *bin, *render;
+	void *bin;
 	int ret = 0;
 	uint32_t bin_offset = 0;
-	uint32_t render_offset = bin_offset + args->bin_cl_size;
-	uint32_t shader_rec_offset = roundup(render_offset +
-					     args->render_cl_size, 16);
+	uint32_t shader_rec_offset = roundup(bin_offset + args->bin_cl_size,
+					     16);
 	uint32_t uniforms_offset = shader_rec_offset + args->shader_rec_size;
 	uint32_t exec_size = uniforms_offset + args->uniforms_size;
 	uint32_t temp_size = exec_size + (sizeof(struct vc4_shader_state) *
 					  args->shader_rec_count);
 
-	if (shader_rec_offset < render_offset ||
-	    uniforms_offset < shader_rec_offset ||
+	/* Reject sizes whose 32-bit offset arithmetic (incl. roundup) wrapped. */
+	if (shader_rec_offset < args->bin_cl_size ||
+	    uniforms_offset < shader_rec_offset ||
 	    exec_size < uniforms_offset ||
 	    args->shader_rec_count >= (UINT_MAX /
 					  sizeof(struct vc4_shader_state)) ||
@@ -66,7 +68,6 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
 		goto fail;
 	}
 	bin = temp + bin_offset;
-	render = temp + render_offset;
 	exec->shader_rec_u = temp + shader_rec_offset;
 	exec->uniforms_u = temp + uniforms_offset;
 	exec->shader_state = temp + exec_size;
@@ -80,14 +81,6 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
 		goto fail;
 	}
 
-	ret = copy_from_user(render,
-			     (void __user *)(uintptr_t)args->render_cl,
-			     args->render_cl_size);
-	if (ret) {
-		DRM_ERROR("Failed to copy in render cl\n");
-		goto fail;
-	}
-
 	ret = copy_from_user(exec->shader_rec_u,
 			     (void __user *)(uintptr_t)args->shader_rec,
 			     args->shader_rec_size);
@@ -114,8 +107,10 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
 	}
 #endif
 
+	list_addtail(&to_vc4_bo(&exec->exec_bo->base)->unref_head,
+		     &exec->unref_list);
+
 	exec->ct0ca = exec->exec_bo->paddr + bin_offset;
-	exec->ct1ca = exec->exec_bo->paddr + render_offset;
 
 	exec->shader_rec_v = exec->exec_bo->vaddr + shader_rec_offset;
 	exec->shader_rec_p = exec->exec_bo->paddr + shader_rec_offset;
@@ -125,23 +120,10 @@ vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
 	exec->uniforms_p = exec->exec_bo->paddr + uniforms_offset;
 	exec->uniforms_size = args->uniforms_size;
 
-	ret = vc4_validate_cl(dev,
-			      exec->exec_bo->vaddr + bin_offset,
-			      bin,
-			      args->bin_cl_size,
-			      true,
-			      args->bin_cl_size != 0,
-			      exec);
-	if (ret)
-		goto fail;
-
-	ret = vc4_validate_cl(dev,
-			      exec->exec_bo->vaddr + render_offset,
-			      render,
-			      args->render_cl_size,
-			      false,
-			      args->bin_cl_size != 0,
-			      exec);
+	ret = vc4_validate_bin_cl(dev,
+				  exec->exec_bo->vaddr + bin_offset,
+				  bin,
+				  exec);
 	if (ret)
 		goto fail;
 
@@ -152,4 +134,25 @@ fail:
 	return ret;
 }
 
+/*
+ * Validates a submitted job: builds the validated binner command list when
+ * the submission carries one, then generates the render command list.
+ * Returns the first non-zero result of either step, otherwise 0.
+ */
+int
+vc4_cl_validate(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	if (exec->args->bin_cl_size != 0) {
+		int ret = vc4_get_bcl(dev, exec);
+
+		if (ret)
+			return ret;
+	} else {
+		/* No bin CL was submitted, so zero ct0ca and ct0ea. */
+		exec->ct0ca = exec->ct0ea = 0;
+	}
+
+	return vc4_get_rcl(dev, exec);
+}
+
 #endif /* USE_VC4_SIMULATOR */
diff --git a/src/gallium/drivers/vc4/vc4_packet.h b/src/gallium/drivers/vc4/kernel/vc4_packet.h
index 181f2e0..88cfc0f 100644
--- a/src/gallium/drivers/vc4/vc4_packet.h
+++ b/src/gallium/drivers/vc4/kernel/vc4_packet.h
@@ -81,6 +81,38 @@ enum vc4_packet {
         VC4_PACKET_GEM_HANDLES = 254,
 } __attribute__ ((__packed__));
 
+#define VC4_PACKET_HALT_SIZE						1
+#define VC4_PACKET_NOP_SIZE						1
+#define VC4_PACKET_FLUSH_SIZE						1
+#define VC4_PACKET_FLUSH_ALL_SIZE					1
+#define VC4_PACKET_START_TILE_BINNING_SIZE				1
+#define VC4_PACKET_INCREMENT_SEMAPHORE_SIZE				1
+#define VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE				1
+#define VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE				5
+#define VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE				1
+#define VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF_SIZE			1
+#define VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE			7
+#define VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE			7
+#define VC4_PACKET_GL_INDEXED_PRIMITIVE_SIZE				14
+#define VC4_PACKET_GL_ARRAY_PRIMITIVE_SIZE				10
+#define VC4_PACKET_PRIMITIVE_LIST_FORMAT_SIZE				2
+#define VC4_PACKET_GL_SHADER_STATE_SIZE					5
+#define VC4_PACKET_NV_SHADER_STATE_SIZE					5
+#define VC4_PACKET_CONFIGURATION_BITS_SIZE				4
+#define VC4_PACKET_FLAT_SHADE_FLAGS_SIZE				5
+#define VC4_PACKET_POINT_SIZE_SIZE					5
+#define VC4_PACKET_LINE_WIDTH_SIZE					5
+#define VC4_PACKET_RHT_X_BOUNDARY_SIZE					3
+#define VC4_PACKET_DEPTH_OFFSET_SIZE					5
+#define VC4_PACKET_CLIP_WINDOW_SIZE					9
+#define VC4_PACKET_VIEWPORT_OFFSET_SIZE					5
+#define VC4_PACKET_CLIPPER_XY_SCALING_SIZE				9
+#define VC4_PACKET_CLIPPER_Z_SCALING_SIZE				9
+#define VC4_PACKET_TILE_BINNING_MODE_CONFIG_SIZE			16
+#define VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE			11
+#define VC4_PACKET_CLEAR_COLORS_SIZE					14
+#define VC4_PACKET_TILE_COORDINATES_SIZE				3
+#define VC4_PACKET_GEM_HANDLES_SIZE					9
 
 #define VC4_MASK(high, low) (((1 << ((high) - (low) + 1)) - 1) << (low))
 /* Using the GNU statement expression extension */
@@ -117,18 +149,19 @@ enum vc4_packet {
 
 /** @{
  *
- * byte 1 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
+ * bytes 0-1 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
  * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL
  */
-#define VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR (1 << 7)
-#define VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR     (1 << 6)
-#define VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR  (1 << 5)
-#define VC4_STORE_TILE_BUFFER_DISABLE_SWAP         (1 << 4)
-
-#define VC4_LOADSTORE_TILE_BUFFER_RGBA8888         (0 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER    (1 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_BGR565           (2 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_MASK             (3 << 0)
+#define VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR (1 << 15)
+#define VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR     (1 << 14)
+#define VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR  (1 << 13)
+#define VC4_STORE_TILE_BUFFER_DISABLE_SWAP         (1 << 12)
+
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK      VC4_MASK(9, 8)
+#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT     8
+#define VC4_LOADSTORE_TILE_BUFFER_RGBA8888         0
+#define VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER    1
+#define VC4_LOADSTORE_TILE_BUFFER_BGR565           2
 /** @} */
 
 /** @{
@@ -136,21 +169,24 @@ enum vc4_packet {
  * byte 0 of VC4_PACKET_STORE_TILE_BUFFER_GENERAL and
  * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL
  */
+#define VC4_STORE_TILE_BUFFER_MODE_MASK            VC4_MASK(7, 6)
+#define VC4_STORE_TILE_BUFFER_MODE_SHIFT           6
 #define VC4_STORE_TILE_BUFFER_MODE_SAMPLE0         (0 << 6)
 #define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X4     (1 << 6)
 #define VC4_STORE_TILE_BUFFER_MODE_DECIMATE_X16    (2 << 6)
 
 /** The values of the field are VC4_TILING_FORMAT_* */
-#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK      (3 << 4)
-#define VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT     4
-
-
-#define VC4_LOADSTORE_TILE_BUFFER_NONE             (0 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_COLOR            (1 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_ZS               (2 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_Z                (3 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_VG_MASK          (4 << 0)
-#define VC4_LOADSTORE_TILE_BUFFER_FULL             (5 << 0)
+#define VC4_LOADSTORE_TILE_BUFFER_TILING_MASK      VC4_MASK(5, 4)
+#define VC4_LOADSTORE_TILE_BUFFER_TILING_SHIFT     4
+
+#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK      VC4_MASK(2, 0)
+#define VC4_LOADSTORE_TILE_BUFFER_BUFFER_SHIFT     0
+#define VC4_LOADSTORE_TILE_BUFFER_NONE             0
+#define VC4_LOADSTORE_TILE_BUFFER_COLOR            1
+#define VC4_LOADSTORE_TILE_BUFFER_ZS               2
+#define VC4_LOADSTORE_TILE_BUFFER_Z                3
+#define VC4_LOADSTORE_TILE_BUFFER_VG_MASK          4
+#define VC4_LOADSTORE_TILE_BUFFER_FULL             5
 /** @} */
 
 #define VC4_INDEX_BUFFER_U8                        (0 << 4)
@@ -196,15 +232,19 @@ enum vc4_packet {
 /** @{ bits in the last u8 of VC4_PACKET_TILE_BINNING_MODE_CONFIG */
 #define VC4_BIN_CONFIG_DB_NON_MS                   (1 << 7)
 
-#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32         (0 << 5)
-#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_64         (1 << 5)
-#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128        (2 << 5)
-#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_256        (3 << 5)
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK       VC4_MASK(6, 5)
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_SHIFT      5
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32         0
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_64         1
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128        2
+#define VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_256        3
 
-#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32    (0 << 3)
-#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_64    (1 << 3)
-#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_128   (2 << 3)
-#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_256   (3 << 3)
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK  VC4_MASK(4, 3)
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_SHIFT 3
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32    0
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_64    1
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_128   2
+#define VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_256   3
 
 #define VC4_BIN_CONFIG_AUTO_INIT_TSDA              (1 << 2)
 #define VC4_BIN_CONFIG_TILE_BUFFER_64BIT           (1 << 1)
@@ -219,17 +259,18 @@ enum vc4_packet {
 #define VC4_RENDER_CONFIG_ENABLE_VG_MASK           (1 << 8)
 
 /** The values of the field are VC4_TILING_FORMAT_* */
-#define VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK       (3 << 6)
+#define VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK       VC4_MASK(7, 6)
 #define VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT      6
 
 #define VC4_RENDER_CONFIG_DECIMATE_MODE_1X         (0 << 4)
 #define VC4_RENDER_CONFIG_DECIMATE_MODE_4X         (1 << 4)
 #define VC4_RENDER_CONFIG_DECIMATE_MODE_16X        (2 << 4)
 
-#define VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED   (0 << 2)
-#define VC4_RENDER_CONFIG_FORMAT_RGBA8888          (1 << 2)
-#define VC4_RENDER_CONFIG_FORMAT_BGR565            (2 << 2)
-#define VC4_RENDER_CONFIG_FORMAT_MASK              (3 << 2)
+#define VC4_RENDER_CONFIG_FORMAT_MASK              VC4_MASK(3, 2)
+#define VC4_RENDER_CONFIG_FORMAT_SHIFT             2
+#define VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED   0
+#define VC4_RENDER_CONFIG_FORMAT_RGBA8888          1
+#define VC4_RENDER_CONFIG_FORMAT_BGR565            2
 
 #define VC4_RENDER_CONFIG_TILE_BUFFER_64BIT        (1 << 1)
 #define VC4_RENDER_CONFIG_MS_MODE_4X               (1 << 0)
diff --git a/src/gallium/drivers/vc4/kernel/vc4_render_cl.c b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
new file mode 100644
index 0000000..e2d907a
--- /dev/null
+++ b/src/gallium/drivers/vc4/kernel/vc4_render_cl.c
@@ -0,0 +1,447 @@
+/*
+ * Copyright © 2014-2015 Broadcom
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * DOC: Render command list generation
+ *
+ * In the VC4 driver, render command list generation is performed by the
+ * kernel instead of userspace.  We do this because validating a
+ * user-submitted command list is hard to get right and has high CPU overhead,
+ * while the number of valid configurations for render command lists is
+ * actually fairly low.
+ */
+
+#include "vc4_drv.h"
+#include "vc4_packet.h"
+
+struct vc4_rcl_setup { /* scratch state used while validating surfaces and writing the RCL */
+	struct drm_gem_cma_object *color_read; /* stays NULL when the surface's hindex is ~0 */
+	struct drm_gem_cma_object *color_ms_write; /* stays NULL when the surface's hindex is ~0 */
+	struct drm_gem_cma_object *zs_read; /* stays NULL when the surface's hindex is ~0 */
+	struct drm_gem_cma_object *zs_write; /* stays NULL when the surface's hindex is ~0 */
+
+	struct drm_gem_cma_object *rcl; /* BO the render command list is written into */
+	u32 next_offset; /* byte offset into rcl->vaddr where the next rcl_u*() write lands */
+};
+
+static inline void rcl_u8(struct vc4_rcl_setup *setup, u8 val)
+{
+	*(u8 *)(setup->rcl->vaddr + setup->next_offset) = val; /* unchecked store: the caller must have sized the BO */
+	setup->next_offset += 1; /* advance the write cursor past the byte just written */
+}
+
+static inline void rcl_u16(struct vc4_rcl_setup *setup, u16 val)
+{
+	*(u16 *)(setup->rcl->vaddr + setup->next_offset) = val; /* unchecked, possibly unaligned store in CPU byte order */
+	setup->next_offset += 2; /* advance the write cursor past the two bytes just written */
+}
+
+static inline void rcl_u32(struct vc4_rcl_setup *setup, u32 val)
+{
+	*(u32 *)(setup->rcl->vaddr + setup->next_offset) = val; /* unchecked, possibly unaligned store in CPU byte order */
+	setup->next_offset += 4; /* advance the write cursor past the four bytes just written */
+}
+
+
+/*
+ * Emits a no-op STORE_TILE_BUFFER_GENERAL: buffer field NONE, every clear
+ * disabled, address 0.
+ *
+ * If we emit a PACKET_TILE_COORDINATES, it must be followed by a store of
+ * some sort before another load is triggered; emit_tile() relies on this.
+ */
+static void vc4_store_before_load(struct vc4_rcl_setup *setup)
+{
+	rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+	rcl_u16(setup,
+		VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_NONE,
+			      VC4_LOADSTORE_TILE_BUFFER_BUFFER) |
+		VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR |
+		VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR |
+		VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR);
+	rcl_u32(setup, 0); /* no address, since we're in None mode */
+}
+
+/*
+ * Emits a PACKET_TILE_COORDINATES for tile (x, y), unconditionally.
+ *
+ * The tile coordinates packet triggers a pending load if there is one, is
+ * used for clipping during rendering, and determines where loads/stores
+ * happen relative to their base address.
+ */
+static void vc4_tile_coordinates(struct vc4_rcl_setup *setup,
+				 uint32_t x, uint32_t y)
+{
+	rcl_u8(setup, VC4_PACKET_TILE_COORDINATES);
+	rcl_u8(setup, x); /* truncated to 8 bits by rcl_u8() */
+	rcl_u8(setup, y); /* truncated to 8 bits by rcl_u8() */
+}
+
+static void emit_tile(struct vc4_exec_info *exec,
+		      struct vc4_rcl_setup *setup,
+		      uint8_t x, uint8_t y, bool first, bool last)
+{
+	bool has_bin = exec->args->bin_cl_size != 0; /* userspace submitted a bin CL with this job */
+
+	/* Emit the optional loads for this tile.  Note that a load
+	 * doesn't actually occur until the tile coords packet is
+	 * processed, and only one load may be outstanding at a time.
+	 */
+	if (setup->color_read) {
+		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+		rcl_u16(setup, exec->args->color_read.bits);
+		rcl_u32(setup,
+			setup->color_read->paddr +
+			exec->args->color_read.offset);
+	}
+
+	if (setup->zs_read) {
+		if (setup->color_read) {
+			/* Exec previous load, then the required dummy store. */
+			vc4_tile_coordinates(setup, x, y);
+			vc4_store_before_load(setup);
+		}
+
+		rcl_u8(setup, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
+		rcl_u16(setup, exec->args->zs_read.bits);
+		rcl_u32(setup,
+			setup->zs_read->paddr + exec->args->zs_read.offset);
+	}
+
+	/* Clipping depends on tile coordinates having been
+	 * emitted, so we always need one here.
+	 */
+	vc4_tile_coordinates(setup, x, y);
+
+	/* Wait for the binner before jumping to the first
+	 * tile's lists.  Only emitted once, on the first tile.
+	 */
+	if (first && has_bin)
+		rcl_u8(setup, VC4_PACKET_WAIT_ON_SEMAPHORE);
+
+	if (has_bin) {
+		rcl_u8(setup, VC4_PACKET_BRANCH_TO_SUB_LIST);
+		rcl_u32(setup, (exec->tile_bo->paddr +
+				exec->tile_alloc_offset +
+				(y * exec->bin_tiles_x + x) * 32)); /* 32 bytes per tile, row-major by bin_tiles_x */
+	}
+
+	if (setup->zs_write) {
+		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+		rcl_u16(setup, exec->args->zs_write.bits |
+			(setup->color_ms_write ?
+			 VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR : 0)); /* a color store still follows below */
+		rcl_u32(setup,
+			(setup->zs_write->paddr + exec->args->zs_write.offset) |
+			((last && !setup->color_ms_write) ?
+			 VC4_LOADSTORE_TILE_BUFFER_EOF : 0)); /* EOF only when this is the frame's final store */
+	}
+
+	if (setup->color_ms_write) {
+		if (setup->zs_write) {
+			/* Reset after previous store */
+			vc4_tile_coordinates(setup, x, y);
+		}
+
+		if (last)
+			rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF);
+		else
+			rcl_u8(setup, VC4_PACKET_STORE_MS_TILE_BUFFER);
+	}
+}
+
+static int vc4_create_rcl_bo(struct drm_device *dev, struct vc4_exec_info *exec,
+			     struct vc4_rcl_setup *setup)
+{
+	bool has_bin = exec->args->bin_cl_size != 0;
+	uint8_t min_x_tile = exec->args->min_x_tile;
+	uint8_t min_y_tile = exec->args->min_y_tile;
+	uint8_t max_x_tile = exec->args->max_x_tile;
+	uint8_t max_y_tile = exec->args->max_y_tile;
+	uint32_t xtiles = max_x_tile - min_x_tile + 1; /* may be 256, which would wrap to 0 in a u8 */
+	uint32_t ytiles = max_y_tile - min_y_tile + 1; /* may be 256, which would wrap to 0 in a u8 */
+	uint32_t x, y; /* wider than the bounds so "<= max" still terminates when max is 255 */
+	uint32_t size, loop_body_size;
+
+	size = VC4_PACKET_TILE_RENDERING_MODE_CONFIG_SIZE; /* one-time prologue bytes */
+	loop_body_size = VC4_PACKET_TILE_COORDINATES_SIZE; /* bytes emit_tile() writes per tile */
+
+	if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+		size += VC4_PACKET_CLEAR_COLORS_SIZE +
+			VC4_PACKET_TILE_COORDINATES_SIZE +
+			VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+	}
+
+	if (setup->color_read) {
+		loop_body_size += (VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE);
+	}
+	if (setup->zs_read) {
+		if (setup->color_read) {
+			loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+			loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE;
+		}
+		loop_body_size += VC4_PACKET_LOAD_TILE_BUFFER_GENERAL_SIZE;
+	}
+
+	if (has_bin) {
+		size += VC4_PACKET_WAIT_ON_SEMAPHORE_SIZE; /* emitted once, on the first tile */
+		loop_body_size += VC4_PACKET_BRANCH_TO_SUB_LIST_SIZE;
+	}
+
+	if (setup->zs_write)
+		loop_body_size += VC4_PACKET_STORE_TILE_BUFFER_GENERAL_SIZE; /* emit_tile() emits a store here */
+	if (setup->color_ms_write) {
+		if (setup->zs_write)
+			loop_body_size += VC4_PACKET_TILE_COORDINATES_SIZE;
+		loop_body_size += VC4_PACKET_STORE_MS_TILE_BUFFER_SIZE;
+	}
+	size += xtiles * ytiles * loop_body_size;
+
+	setup->rcl = drm_gem_cma_create(dev, size);
+	if (!setup->rcl)
+		return -ENOMEM;
+	list_addtail(&to_vc4_bo(&setup->rcl->base)->unref_head,
+		     &exec->unref_list);
+
+	rcl_u8(setup, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
+	rcl_u32(setup,
+		(setup->color_ms_write ?
+		 (setup->color_ms_write->paddr +
+		  exec->args->color_ms_write.offset) :
+		 0));
+	rcl_u16(setup, exec->args->width);
+	rcl_u16(setup, exec->args->height);
+	rcl_u16(setup, exec->args->color_ms_write.bits);
+
+	/* The tile buffer gets cleared when the previous tile is stored.  If
+	 * the clear values changed between frames, then the tile buffer has
+	 * stale clear values in it, so we have to do a store in None mode (no
+	 * writes) so that we trigger the tile buffer clear.
+	 */
+	if (exec->args->flags & VC4_SUBMIT_CL_USE_CLEAR_COLOR) {
+		rcl_u8(setup, VC4_PACKET_CLEAR_COLORS);
+		rcl_u32(setup, exec->args->clear_color[0]);
+		rcl_u32(setup, exec->args->clear_color[1]);
+		rcl_u32(setup, exec->args->clear_z);
+		rcl_u8(setup, exec->args->clear_s);
+
+		vc4_tile_coordinates(setup, 0, 0);
+
+		rcl_u8(setup, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
+		rcl_u16(setup, VC4_LOADSTORE_TILE_BUFFER_NONE);
+		rcl_u32(setup, 0); /* no address, since we're in None mode */
+	}
+
+	for (y = min_y_tile; y <= max_y_tile; y++) {
+		for (x = min_x_tile; x <= max_x_tile; x++) {
+			bool first = (x == min_x_tile && y == min_y_tile);
+			bool last = (x == max_x_tile && y == max_y_tile);
+			emit_tile(exec, setup, x, y, first, last);
+		}
+	}
+
+	BUG_ON(setup->next_offset != size); /* bytes written must equal the size computed above */
+	exec->ct1ca = setup->rcl->paddr; /* start address of the render CL */
+	exec->ct1ea = setup->rcl->paddr + setup->next_offset; /* end address of the render CL */
+
+	return 0;
+}
+
+static int vc4_rcl_surface_setup(struct vc4_exec_info *exec,
+				 struct drm_gem_cma_object **obj,
+				 struct drm_vc4_submit_rcl_surface *surf)
+{
+	uint8_t tiling = VC4_GET_FIELD(surf->bits,
+				       VC4_LOADSTORE_TILE_BUFFER_TILING);
+	uint8_t buffer = VC4_GET_FIELD(surf->bits,
+				       VC4_LOADSTORE_TILE_BUFFER_BUFFER);
+	uint8_t format = VC4_GET_FIELD(surf->bits,
+				       VC4_LOADSTORE_TILE_BUFFER_FORMAT);
+	int cpp; /* per-pixel size handed to vc4_check_tex_size() */
+
+	if (surf->pad != 0) { /* padding must be zero even for an unused surface */
+		DRM_ERROR("Padding unset\n");
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0) /* no surface: succeed with *obj untouched; bits are not checked */
+		return 0;
+
+	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) /* fills *obj on success */
+		return -EINVAL;
+
+	if (surf->bits & ~(VC4_LOADSTORE_TILE_BUFFER_TILING_MASK |
+			   VC4_LOADSTORE_TILE_BUFFER_BUFFER_MASK |
+			   VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK)) { /* only these three fields may be set */
+		DRM_ERROR("Unknown bits in load/store: 0x%04x\n",
+			  surf->bits);
+		return -EINVAL;
+	}
+
+	if (tiling > VC4_TILING_FORMAT_LT) {
+		DRM_ERROR("Bad tiling format\n");
+		return -EINVAL;
+	}
+
+	if (buffer == VC4_LOADSTORE_TILE_BUFFER_ZS) {
+		if (format != 0) {
+			DRM_ERROR("No color format should be set for ZS\n");
+			return -EINVAL;
+		}
+		cpp = 4;
+	} else if (buffer == VC4_LOADSTORE_TILE_BUFFER_COLOR) {
+		switch (format) {
+		case VC4_LOADSTORE_TILE_BUFFER_BGR565:
+		case VC4_LOADSTORE_TILE_BUFFER_BGR565_DITHER:
+			cpp = 2;
+			break;
+		case VC4_LOADSTORE_TILE_BUFFER_RGBA8888:
+			cpp = 4;
+			break;
+		default:
+			DRM_ERROR("Bad tile buffer format\n");
+			return -EINVAL;
+		}
+	} else { /* any other buffer value (including NONE) is rejected here */
+		DRM_ERROR("Bad load/store buffer %d.\n", buffer);
+		return -EINVAL;
+	}
+
+	if (surf->offset & 0xf) { /* NOTE(review): presumably the low address bits carry packet flags such as EOF - confirm */
+		DRM_ERROR("load/store buffer must be 16b aligned.\n");
+		return -EINVAL;
+	}
+
+	if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling,
+				exec->args->width, exec->args->height, cpp)) { /* checked against the submit's width/height */
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int
+vc4_rcl_ms_surface_setup(struct vc4_exec_info *exec,
+			 struct drm_gem_cma_object **obj,
+			 struct drm_vc4_submit_rcl_surface *surf)
+{
+	uint8_t tiling = VC4_GET_FIELD(surf->bits,
+				       VC4_RENDER_CONFIG_MEMORY_FORMAT);
+	uint8_t format = VC4_GET_FIELD(surf->bits,
+				       VC4_RENDER_CONFIG_FORMAT);
+	int cpp; /* per-pixel size handed to vc4_check_tex_size() */
+
+	if (surf->pad != 0) { /* padding must be zero even for an unused surface */
+		DRM_ERROR("Padding unset\n");
+		return -EINVAL;
+	}
+
+	if (surf->bits & ~(VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK |
+			   VC4_RENDER_CONFIG_FORMAT_MASK)) { /* checked before hindex: bits are written to the RCL regardless */
+		DRM_ERROR("Unknown bits in render config: 0x%04x\n",
+			  surf->bits);
+		return -EINVAL;
+	}
+
+	if (surf->hindex == ~0) /* no surface: succeed and leave *obj untouched */
+		return 0;
+
+	if (!vc4_use_bo(exec, surf->hindex, VC4_MODE_RENDER, obj)) /* fills *obj on success */
+		return -EINVAL;
+
+	if (tiling > VC4_TILING_FORMAT_LT) {
+		DRM_ERROR("Bad tiling format\n");
+		return -EINVAL;
+	}
+
+	switch (format) {
+	case VC4_RENDER_CONFIG_FORMAT_BGR565_DITHERED:
+	case VC4_RENDER_CONFIG_FORMAT_BGR565:
+		cpp = 2;
+		break;
+	case VC4_RENDER_CONFIG_FORMAT_RGBA8888:
+		cpp = 4;
+		break;
+	default:
+		DRM_ERROR("Bad tile buffer format\n");
+		return -EINVAL;
+	}
+
+	if (!vc4_check_tex_size(exec, *obj, surf->offset, tiling,
+				exec->args->width, exec->args->height, cpp)) { /* checked against the submit's width/height */
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+int vc4_get_rcl(struct drm_device *dev, struct vc4_exec_info *exec)
+{
+	struct vc4_rcl_setup setup = {0}; /* all surface pointers start out NULL */
+	struct drm_vc4_submit_cl *args = exec->args;
+	bool has_bin = args->bin_cl_size != 0;
+	int ret;
+
+	if (args->min_x_tile > args->max_x_tile ||
+	    args->min_y_tile > args->max_y_tile) { /* the tile range is inclusive and must not be empty */
+		DRM_ERROR("Bad render tile set (%d,%d)-(%d,%d)\n",
+			  args->min_x_tile, args->min_y_tile,
+			  args->max_x_tile, args->max_y_tile);
+		return -EINVAL;
+	}
+
+	if (has_bin && /* max_*_tile are inclusive indices, bin_tiles_* are counts, hence ">=" */
+	    (args->max_x_tile >= exec->bin_tiles_x ||
+	     args->max_y_tile >= exec->bin_tiles_y)) {
+		DRM_ERROR("Render tiles (%d,%d) outside of bin config (%d,%d)\n",
+			  args->max_x_tile, args->max_y_tile,
+			  exec->bin_tiles_x, exec->bin_tiles_y);
+		return -EINVAL;
+	}
+
+	ret = vc4_rcl_surface_setup(exec, &setup.color_read, &args->color_read);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_ms_surface_setup(exec, &setup.color_ms_write,
+				       &args->color_ms_write);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_surface_setup(exec, &setup.zs_read, &args->zs_read);
+	if (ret)
+		return ret;
+
+	ret = vc4_rcl_surface_setup(exec, &setup.zs_write, &args->zs_write);
+	if (ret)
+		return ret;
+
+	/* We shouldn't even have the job submitted to us if there's no
+	 * surface to write out.
+	 */
+	if (!setup.color_ms_write && !setup.zs_write) {
+		DRM_ERROR("RCL requires color or Z/S write\n");
+		return -EINVAL;
+	}
+
+	return vc4_create_rcl_bo(dev, exec, &setup); /* allocates the BO and sets exec->ct1ca/ct1ea */
+}
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate.c b/src/gallium/drivers/vc4/kernel/vc4_validate.c
index 2d04a4a..a0b67a7 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate.c
@@ -94,7 +94,7 @@ size_is_lt(uint32_t width, uint32_t height, int cpp)
 		height <= 4 * utile_height(cpp));
 }
 
-static bool
+bool
 vc4_use_bo(struct vc4_exec_info *exec,
 	   uint32_t hindex,
 	   enum vc4_bo_mode mode,
@@ -147,33 +147,39 @@ gl_shader_rec_size(uint32_t pointer_bits)
 		return 36 + attribute_count * 8;
 }
 
-static bool
-check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
-	       uint32_t offset, uint8_t tiling_format,
-	       uint32_t width, uint32_t height, uint8_t cpp)
+bool
+vc4_check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
+		   uint32_t offset, uint8_t tiling_format,
+		   uint32_t width, uint32_t height, uint8_t cpp)
 {
 	uint32_t aligned_width, aligned_height, stride, size;
 	uint32_t utile_w = utile_width(cpp);
 	uint32_t utile_h = utile_height(cpp);
 
-	/* The values are limited by the packet/texture parameter bitfields,
-	 * so we don't need to worry as much about integer overflow.
+	/* The shaded vertex format stores signed 12.4 fixed point
+	 * (-2048,2047) offsets from the viewport center, so we should
+	 * never have a render target larger than 4096.  The texture
+	 * unit can only sample from 2048x2048, so it's even more
+	 * restricted.  This lets us avoid worrying about overflow in
+	 * our math.
 	 */
-	BUG_ON(width > 65535);
-	BUG_ON(height > 65535);
+	if (width > 4096 || height > 4096) {
+		DRM_ERROR("Surface dimesions (%d,%d) too large", width, height);
+		return false;
+	}
 
 	switch (tiling_format) {
 	case VC4_TILING_FORMAT_LINEAR:
-		aligned_width = roundup(width, utile_w);
+		aligned_width = round_up(width, utile_w);
 		aligned_height = height;
 		break;
 	case VC4_TILING_FORMAT_T:
-		aligned_width = roundup(width, utile_w * 8);
-		aligned_height = roundup(height, utile_h * 8);
+		aligned_width = round_up(width, utile_w * 8);
+		aligned_height = round_up(height, utile_h * 8);
 		break;
 	case VC4_TILING_FORMAT_LT:
-		aligned_width = roundup(width, utile_w);
-		aligned_height = roundup(height, utile_h);
+		aligned_width = round_up(width, utile_w);
+		aligned_height = round_up(height, utile_h);
 		break;
 	default:
 		DRM_ERROR("buffer tiling %d unsupported\n", tiling_format);
@@ -181,13 +187,6 @@ check_tex_size(struct vc4_exec_info *exec, struct drm_gem_cma_object *fbo,
 	}
 
 	stride = aligned_width * cpp;
-
-	if (INT_MAX / stride < aligned_height) {
-		DRM_ERROR("Overflow in fbo size (%dx%d -> %dx%d)\n",
-			  width, height,
-			  aligned_width, aligned_height);
-		return false;
-	}
 	size = stride * aligned_height;
 
 	if (size + offset < size ||
@@ -249,122 +248,6 @@ validate_increment_semaphore(VALIDATE_ARGS)
 }
 
 static int
-validate_wait_on_semaphore(VALIDATE_ARGS)
-{
-	if (exec->found_wait_on_semaphore_packet) {
-		DRM_ERROR("Duplicate VC4_PACKET_WAIT_ON_SEMAPHORE\n");
-		return -EINVAL;
-	}
-	exec->found_wait_on_semaphore_packet = true;
-
-	if (!exec->found_increment_semaphore_packet) {
-		DRM_ERROR("VC4_PACKET_WAIT_ON_SEMAPHORE without "
-			  "VC4_PACKET_INCREMENT_SEMAPHORE\n");
-		return -EINVAL;
-	}
-
-	return 0;
-}
-
-static int
-validate_branch_to_sublist(VALIDATE_ARGS)
-{
-	struct drm_gem_cma_object *target;
-	uint32_t offset;
-
-	if (!vc4_use_handle(exec, 0, VC4_MODE_TILE_ALLOC, &target))
-		return -EINVAL;
-
-	if (target != exec->tile_alloc_bo) {
-		DRM_ERROR("Jumping to BOs other than tile alloc unsupported\n");
-		return -EINVAL;
-	}
-
-	if (!exec->found_wait_on_semaphore_packet) {
-		DRM_ERROR("Jumping to tile alloc before binning finished.\n");
-		return -EINVAL;
-	}
-
-	offset = *(uint32_t *)(untrusted + 0);
-	if (offset % exec->tile_alloc_init_block_size ||
-	    offset / exec->tile_alloc_init_block_size >=
-	    exec->bin_tiles_x * exec->bin_tiles_y) {
-		DRM_ERROR("VC4_PACKET_BRANCH_TO_SUB_LIST must jump to initial "
-			  "tile allocation space.\n");
-		return -EINVAL;
-	}
-
-	*(uint32_t *)(validated + 0) = target->paddr + offset;
-
-	return 0;
-}
-
-/**
- * validate_loadstore_tile_buffer_general() - Validation for
- * VC4_PACKET_LOAD_TILE_BUFFER_GENERAL and
- * VC4_PACKET_STORE_TILE_BUFFER_GENERAL.
- *
- * The two packets are nearly the same, except for the TLB-clearing management
- * bits not being present for loads.  Additionally, while stores are executed
- * immediately (using the current tile coordinates), loads are queued to be
- * executed when the tile coordinates packet occurs.
- *
- * Note that coordinates packets are validated to be within the declared
- * bin_x/y, which themselves are verified to match the rendering-configuration
- * FB width and height (which the hardware uses to clip loads and stores).
- */
-static int
-validate_loadstore_tile_buffer_general(VALIDATE_ARGS)
-{
-	uint32_t packet_b0 = *(uint8_t *)(untrusted + 0);
-	uint32_t packet_b1 = *(uint8_t *)(untrusted + 1);
-	struct drm_gem_cma_object *fbo;
-	uint32_t buffer_type = packet_b0 & 0xf;
-	uint32_t untrusted_address, offset, cpp;
-
-	switch (buffer_type) {
-	case VC4_LOADSTORE_TILE_BUFFER_NONE:
-		return 0;
-	case VC4_LOADSTORE_TILE_BUFFER_COLOR:
-		if ((packet_b1 & VC4_LOADSTORE_TILE_BUFFER_MASK) ==
-		    VC4_LOADSTORE_TILE_BUFFER_RGBA8888) {
-			cpp = 4;
-		} else {
-			cpp = 2;
-		}
-		break;
-
-	case VC4_LOADSTORE_TILE_BUFFER_Z:
-	case VC4_LOADSTORE_TILE_BUFFER_ZS:
-		cpp = 4;
-		break;
-
-	default:
-		DRM_ERROR("Load/store type %d unsupported\n", buffer_type);
-		return -EINVAL;
-	}
-
-	if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &fbo))
-		return -EINVAL;
-
-	untrusted_address = *(uint32_t *)(untrusted + 2);
-	offset = untrusted_address & ~0xf;
-
-	if (!check_tex_size(exec, fbo, offset,
-			    ((packet_b0 &
-			      VC4_LOADSTORE_TILE_BUFFER_FORMAT_MASK) >>
-			     VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT),
-			    exec->fb_width, exec->fb_height, cpp)) {
-		return -EINVAL;
-	}
-
-	*(uint32_t *)(validated + 2) = (offset + fbo->paddr +
-					(untrusted_address & 0xf));
-
-	return 0;
-}
-
-static int
 validate_indexed_prim_list(VALIDATE_ARGS)
 {
 	struct drm_gem_cma_object *ib;
@@ -492,14 +375,10 @@ validate_nv_shader_state(VALIDATE_ARGS)
 static int
 validate_tile_binning_config(VALIDATE_ARGS)
 {
-	struct drm_gem_cma_object *tile_allocation;
-	struct drm_gem_cma_object *tile_state_data_array;
+	struct drm_device *dev = exec->exec_bo->base.dev;
 	uint8_t flags;
-	uint32_t tile_allocation_size;
-
-	if (!vc4_use_handle(exec, 0, VC4_MODE_TILE_ALLOC, &tile_allocation) ||
-	    !vc4_use_handle(exec, 1, VC4_MODE_TSDA, &tile_state_data_array))
-		return -EINVAL;
+	uint32_t tile_state_size, tile_alloc_size;
+	uint32_t tile_count;
 
 	if (exec->found_tile_binning_mode_config_packet) {
 		DRM_ERROR("Duplicate VC4_PACKET_TILE_BINNING_MODE_CONFIG\n");
@@ -509,6 +388,7 @@ validate_tile_binning_config(VALIDATE_ARGS)
 
 	exec->bin_tiles_x = *(uint8_t *)(untrusted + 12);
 	exec->bin_tiles_y = *(uint8_t *)(untrusted + 13);
+	tile_count = exec->bin_tiles_x * exec->bin_tiles_y;
 	flags = *(uint8_t *)(untrusted + 14);
 
 	if (exec->bin_tiles_x == 0 ||
@@ -518,15 +398,6 @@ validate_tile_binning_config(VALIDATE_ARGS)
 		return -EINVAL;
 	}
 
-	/* Our validation relies on the user not getting to set up their own
-	 * tile state/tile allocation BO contents.
-	 */
-	if (!(flags & VC4_BIN_CONFIG_AUTO_INIT_TSDA)) {
-		DRM_ERROR("binning config missing "
-			  "VC4_BIN_CONFIG_AUTO_INIT_TSDA\n");
-		return -EINVAL;
-	}
-
 	if (flags & (VC4_BIN_CONFIG_DB_NON_MS |
 		     VC4_BIN_CONFIG_TILE_BUFFER_64BIT |
 		     VC4_BIN_CONFIG_MS_MODE_4X)) {
@@ -534,94 +405,52 @@ validate_tile_binning_config(VALIDATE_ARGS)
 		return -EINVAL;
 	}
 
-	if (*(uint32_t *)(untrusted + 0) != 0) {
-		DRM_ERROR("tile allocation offset != 0 unsupported\n");
-		return -EINVAL;
-	}
-	tile_allocation_size = *(uint32_t *)(untrusted + 4);
-	if (tile_allocation_size > tile_allocation->base.size) {
-		DRM_ERROR("tile allocation size %d > BO size %d\n",
-			  tile_allocation_size, tile_allocation->base.size);
-		return -EINVAL;
-	}
-	*(uint32_t *)validated = tile_allocation->paddr;
-	exec->tile_alloc_bo = tile_allocation;
-
-	exec->tile_alloc_init_block_size = 1 << (5 + ((flags >> 5) & 3));
-	if (exec->bin_tiles_x * exec->bin_tiles_y *
-	    exec->tile_alloc_init_block_size > tile_allocation_size) {
-		DRM_ERROR("tile init exceeds tile alloc size (%d vs %d)\n",
-			  exec->bin_tiles_x * exec->bin_tiles_y *
-			  exec->tile_alloc_init_block_size,
-			  tile_allocation_size);
-		return -EINVAL;
-	}
-	if (*(uint32_t *)(untrusted + 8) != 0) {
-		DRM_ERROR("TSDA offset != 0 unsupported\n");
-		return -EINVAL;
-	}
-	if (exec->bin_tiles_x * exec->bin_tiles_y * 48 >
-	    tile_state_data_array->base.size) {
-		DRM_ERROR("TSDA of %db too small for %dx%d bin config\n",
-			  tile_state_data_array->base.size,
-			  exec->bin_tiles_x, exec->bin_tiles_y);
-	}
-	*(uint32_t *)(validated + 8) = tile_state_data_array->paddr;
-
-	return 0;
-}
-
-static int
-validate_tile_rendering_mode_config(VALIDATE_ARGS)
-{
-	struct drm_gem_cma_object *fbo;
-	uint32_t flags, offset, cpp;
-
-	if (exec->found_tile_rendering_mode_config_packet) {
-		DRM_ERROR("Duplicate VC4_PACKET_TILE_RENDERING_MODE_CONFIG\n");
-		return -EINVAL;
-	}
-	exec->found_tile_rendering_mode_config_packet = true;
-
-	if (!vc4_use_handle(exec, 0, VC4_MODE_RENDER, &fbo))
-		return -EINVAL;
-
-	exec->fb_width = *(uint16_t *)(untrusted + 4);
-	exec->fb_height = *(uint16_t *)(untrusted + 6);
-
-	flags = *(uint16_t *)(untrusted + 8);
-	if ((flags & VC4_RENDER_CONFIG_FORMAT_MASK) ==
-	    VC4_RENDER_CONFIG_FORMAT_RGBA8888) {
-		cpp = 4;
-	} else {
-		cpp = 2;
-	}
-
-	offset = *(uint32_t *)untrusted;
-	if (!check_tex_size(exec, fbo, offset,
-			    ((flags &
-			      VC4_RENDER_CONFIG_MEMORY_FORMAT_MASK) >>
-			     VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT),
-			    exec->fb_width, exec->fb_height, cpp)) {
-		return -EINVAL;
-	}
-
-	*(uint32_t *)validated = fbo->paddr + offset;
-
-	return 0;
-}
-
-static int
-validate_tile_coordinates(VALIDATE_ARGS)
-{
-	uint8_t tile_x = *(uint8_t *)(untrusted + 0);
-	uint8_t tile_y = *(uint8_t *)(untrusted + 1);
+	/* The tile state data array is 48 bytes per tile, and we put it at
+	 * the start of a BO containing both it and the tile alloc.
+	 */
+	tile_state_size = 48 * tile_count;
+
+	/* Since the tile alloc array will follow us, align. */
+	exec->tile_alloc_offset = roundup(tile_state_size, 4096);
+
+	*(uint8_t *)(validated + 14) =
+		((flags & ~(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_MASK |
+			    VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_MASK)) |
+		 VC4_BIN_CONFIG_AUTO_INIT_TSDA |
+		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32,
+			       VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE) |
+		 VC4_SET_FIELD(VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_128,
+			       VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE));
+
+	/* Initial block size. */
+	tile_alloc_size = 32 * tile_count;
+
+	/*
+	 * The initial allocation gets rounded to the next 256 bytes before
+	 * the hardware starts fulfilling further allocations.
+	 */
+	tile_alloc_size = roundup(tile_alloc_size, 256);
 
-	if (tile_x * 64 >= exec->fb_width || tile_y * 64 >= exec->fb_height) {
-		DRM_ERROR("Tile coordinates %d,%d > render config %dx%d\n",
-			  tile_x, tile_y, exec->fb_width, exec->fb_height);
-		return -EINVAL;
-	}
+	/* Add space for the extra allocations.  This is what gets used first,
+	 * before overflow memory.  It must have at least 4096 bytes, but we
+	 * want to avoid overflow memory usage if possible.
+	 */
+	tile_alloc_size += 1024 * 1024;
+
+	exec->tile_bo = drm_gem_cma_create(dev, exec->tile_alloc_offset +
+					   tile_alloc_size);
+	if (!exec->tile_bo)
+		return -ENOMEM;
+	list_addtail(&to_vc4_bo(&exec->tile_bo->base)->unref_head,
+		     &exec->unref_list);
+
+	/* tile alloc address. */
+	*(uint32_t *)(validated + 0) = (exec->tile_bo->paddr +
+					exec->tile_alloc_offset);
+	/* tile alloc size. */
+	*(uint32_t *)(validated + 4) = tile_alloc_size;
+	/* tile state address. */
+	*(uint32_t *)(validated + 8) = exec->tile_bo->paddr;
 
 	return 0;
 }
@@ -633,78 +462,60 @@ validate_gem_handles(VALIDATE_ARGS)
 	return 0;
 }
 
+#define VC4_DEFINE_PACKET(packet, name, func) \
+	[packet] = { packet ## _SIZE, name, func }
+
 static const struct cmd_info {
-	bool bin;
-	bool render;
 	uint16_t len;
 	const char *name;
 	int (*func)(struct vc4_exec_info *exec, void *validated,
 		    void *untrusted);
 } cmd_info[] = {
-	[VC4_PACKET_HALT] = { 1, 1, 1, "halt", NULL },
-	[VC4_PACKET_NOP] = { 1, 1, 1, "nop", NULL },
-	[VC4_PACKET_FLUSH] = { 1, 1, 1, "flush", NULL },
-	[VC4_PACKET_FLUSH_ALL] = { 1, 0, 1, "flush all state", validate_flush_all },
-	[VC4_PACKET_START_TILE_BINNING] = { 1, 0, 1, "start tile binning", validate_start_tile_binning },
-	[VC4_PACKET_INCREMENT_SEMAPHORE] = { 1, 0, 1, "increment semaphore", validate_increment_semaphore },
-	[VC4_PACKET_WAIT_ON_SEMAPHORE] = { 0, 1, 1, "wait on semaphore", validate_wait_on_semaphore },
-	/* BRANCH_TO_SUB_LIST is actually supported in the binner as well, but
-	 * we only use it from the render CL in order to jump into the tile
-	 * allocation BO.
-	 */
-	[VC4_PACKET_BRANCH_TO_SUB_LIST] = { 0, 1, 5, "branch to sublist", validate_branch_to_sublist },
-	[VC4_PACKET_STORE_MS_TILE_BUFFER] = { 0, 1, 1, "store MS resolved tile color buffer", NULL },
-	[VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF] = { 0, 1, 1, "store MS resolved tile color buffer and EOF", NULL },
+	VC4_DEFINE_PACKET(VC4_PACKET_HALT, "halt", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_NOP, "nop", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH, "flush", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLUSH_ALL, "flush all state", validate_flush_all),
+	VC4_DEFINE_PACKET(VC4_PACKET_START_TILE_BINNING, "start tile binning", validate_start_tile_binning),
+	VC4_DEFINE_PACKET(VC4_PACKET_INCREMENT_SEMAPHORE, "increment semaphore", validate_increment_semaphore),
 
-	[VC4_PACKET_STORE_TILE_BUFFER_GENERAL] = { 0, 1, 7, "Store Tile Buffer General", validate_loadstore_tile_buffer_general },
-	[VC4_PACKET_LOAD_TILE_BUFFER_GENERAL] = { 0, 1, 7, "Load Tile Buffer General", validate_loadstore_tile_buffer_general },
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_INDEXED_PRIMITIVE, "Indexed Primitive List", validate_indexed_prim_list),
 
-	[VC4_PACKET_GL_INDEXED_PRIMITIVE] = { 1, 1, 14, "Indexed Primitive List", validate_indexed_prim_list },
-
-	[VC4_PACKET_GL_ARRAY_PRIMITIVE] = { 1, 1, 10, "Vertex Array Primitives", validate_gl_array_primitive },
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_ARRAY_PRIMITIVE, "Vertex Array Primitives", validate_gl_array_primitive),
 
 	/* This is only used by clipped primitives (packets 48 and 49), which
 	 * we don't support parsing yet.
 	 */
-	[VC4_PACKET_PRIMITIVE_LIST_FORMAT] = { 1, 1, 2, "primitive list format", NULL },
-
-	[VC4_PACKET_GL_SHADER_STATE] = { 1, 1, 5, "GL Shader State", validate_gl_shader_state },
-	[VC4_PACKET_NV_SHADER_STATE] = { 1, 1, 5, "NV Shader State", validate_nv_shader_state },
-
-	[VC4_PACKET_CONFIGURATION_BITS] = { 1, 1, 4, "configuration bits", NULL },
-	[VC4_PACKET_FLAT_SHADE_FLAGS] = { 1, 1, 5, "flat shade flags", NULL },
-	[VC4_PACKET_POINT_SIZE] = { 1, 1, 5, "point size", NULL },
-	[VC4_PACKET_LINE_WIDTH] = { 1, 1, 5, "line width", NULL },
-	[VC4_PACKET_RHT_X_BOUNDARY] = { 1, 1, 3, "RHT X boundary", NULL },
-	[VC4_PACKET_DEPTH_OFFSET] = { 1, 1, 5, "Depth Offset", NULL },
-	[VC4_PACKET_CLIP_WINDOW] = { 1, 1, 9, "Clip Window", NULL },
-	[VC4_PACKET_VIEWPORT_OFFSET] = { 1, 1, 5, "Viewport Offset", NULL },
-	[VC4_PACKET_CLIPPER_XY_SCALING] = { 1, 1, 9, "Clipper XY Scaling", NULL },
+	VC4_DEFINE_PACKET(VC4_PACKET_PRIMITIVE_LIST_FORMAT, "primitive list format", NULL),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_GL_SHADER_STATE, "GL Shader State", validate_gl_shader_state),
+	VC4_DEFINE_PACKET(VC4_PACKET_NV_SHADER_STATE, "NV Shader State", validate_nv_shader_state),
+
+	VC4_DEFINE_PACKET(VC4_PACKET_CONFIGURATION_BITS, "configuration bits", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_FLAT_SHADE_FLAGS, "flat shade flags", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_POINT_SIZE, "point size", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_LINE_WIDTH, "line width", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_RHT_X_BOUNDARY, "RHT X boundary", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_DEPTH_OFFSET, "Depth Offset", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIP_WINDOW, "Clip Window", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_VIEWPORT_OFFSET, "Viewport Offset", NULL),
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_XY_SCALING, "Clipper XY Scaling", NULL),
 	/* Note: The docs say this was also 105, but it was 106 in the
 	 * initial userland code drop.
 	 */
-	[VC4_PACKET_CLIPPER_Z_SCALING] = { 1, 1, 9, "Clipper Z Scale and Offset", NULL },
-
-	[VC4_PACKET_TILE_BINNING_MODE_CONFIG] = { 1, 0, 16, "tile binning configuration", validate_tile_binning_config },
+	VC4_DEFINE_PACKET(VC4_PACKET_CLIPPER_Z_SCALING, "Clipper Z Scale and Offset", NULL),
 
-	[VC4_PACKET_TILE_RENDERING_MODE_CONFIG] = { 0, 1, 11, "tile rendering mode configuration", validate_tile_rendering_mode_config},
+	VC4_DEFINE_PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG, "tile binning configuration", validate_tile_binning_config),
 
-	[VC4_PACKET_CLEAR_COLORS] = { 0, 1, 14, "Clear Colors", NULL },
-
-	[VC4_PACKET_TILE_COORDINATES] = { 0, 1, 3, "Tile Coordinates", validate_tile_coordinates },
-
-	[VC4_PACKET_GEM_HANDLES] = { 1, 1, 9, "GEM handles", validate_gem_handles },
+	VC4_DEFINE_PACKET(VC4_PACKET_GEM_HANDLES, "GEM handles", validate_gem_handles),
 };
 
 int
-vc4_validate_cl(struct drm_device *dev,
-		void *validated,
-		void *unvalidated,
-		uint32_t len,
-		bool is_bin,
-		bool has_bin,
-		struct vc4_exec_info *exec)
+vc4_validate_bin_cl(struct drm_device *dev,
+		    void *validated,
+		    void *unvalidated,
+		    struct vc4_exec_info *exec)
 {
+	uint32_t len = exec->args->bin_cl_size;
 	uint32_t dst_offset = 0;
 	uint32_t src_offset = 0;
 
@@ -732,14 +543,6 @@ vc4_validate_cl(struct drm_device *dev,
 			 src_offset, cmd, info->name, info->len);
 #endif
 
-		if ((is_bin && !info->bin) ||
-		    (!is_bin && !info->render)) {
-			DRM_ERROR("0x%08x: packet %d (%s) invalid for %s\n",
-				  src_offset, cmd, info->name,
-				  is_bin ? "binner" : "render");
-			return -EINVAL;
-		}
-
 		if (src_offset + info->len > len) {
 			DRM_ERROR("0x%08x: packet %d (%s) length 0x%08x "
 				  "exceeds bounds (0x%08x)\n",
@@ -770,30 +573,16 @@ vc4_validate_cl(struct drm_device *dev,
 			break;
 	}
 
-	if (is_bin) {
-		exec->ct0ea = exec->ct0ca + dst_offset;
+	exec->ct0ea = exec->ct0ca + dst_offset;
 
-		if (has_bin && !exec->found_start_tile_binning_packet) {
-			DRM_ERROR("Bin CL missing VC4_PACKET_START_TILE_BINNING\n");
-			return -EINVAL;
-		}
-	} else {
-		if (!exec->found_tile_rendering_mode_config_packet) {
-			DRM_ERROR("Render CL missing VC4_PACKET_TILE_RENDERING_MODE_CONFIG\n");
-			return -EINVAL;
-		}
+	if (!exec->found_start_tile_binning_packet) {
+		DRM_ERROR("Bin CL missing VC4_PACKET_START_TILE_BINNING\n");
+		return -EINVAL;
+	}
 
-		/* Make sure that they actually consumed the semaphore
-		 * increment from the bin CL.  Otherwise a later submit would
-		 * have render execute immediately.
-		 */
-		if (exec->found_wait_on_semaphore_packet != has_bin) {
-			DRM_ERROR("Render CL %s VC4_PACKET_WAIT_ON_SEMAPHORE\n",
-				  exec->found_wait_on_semaphore_packet ?
-				  "has" : "missing");
-			return -EINVAL;
-		}
-		exec->ct1ea = exec->ct1ca + dst_offset;
+	if (!exec->found_increment_semaphore_packet) {
+		DRM_ERROR("Bin CL missing VC4_PACKET_INCREMENT_SEMAPHORE\n");
+		return -EINVAL;
 	}
 
 	return 0;
@@ -814,10 +603,10 @@ reloc_tex(struct vc4_exec_info *exec,
 	uint32_t p3 = (sample->p_offset[3] != ~0 ?
 		       *(uint32_t *)(uniform_data_u + sample->p_offset[3]) : 0);
 	uint32_t *validated_p0 = exec->uniforms_v + sample->p_offset[0];
-	uint32_t offset = p0 & ~0xfff;
-	uint32_t miplevels = (p0 & 15);
-	uint32_t width = (p1 >> 8) & 2047;
-	uint32_t height = (p1 >> 20) & 2047;
+	uint32_t offset = p0 & VC4_TEX_P0_OFFSET_MASK;
+	uint32_t miplevels = VC4_GET_FIELD(p0, VC4_TEX_P0_MIPLVLS);
+	uint32_t width = VC4_GET_FIELD(p1, VC4_TEX_P1_WIDTH);
+	uint32_t height = VC4_GET_FIELD(p1, VC4_TEX_P1_HEIGHT);
 	uint32_t cpp, tiling_format, utile_w, utile_h;
 	uint32_t i;
 	uint32_t cube_map_stride = 0;
@@ -845,16 +634,18 @@ reloc_tex(struct vc4_exec_info *exec,
 	if (height == 0)
 		height = 2048;
 
-	if (p0 & (1 << 9)) {
-		if ((p2 & (3 << 30)) == (1 << 30))
-			cube_map_stride = p2 & 0x3ffff000;
-		if ((p3 & (3 << 30)) == (1 << 30)) {
+	if (p0 & VC4_TEX_P0_CMMODE_MASK) {
+		if (VC4_GET_FIELD(p2, VC4_TEX_P2_PTYPE) ==
+		    VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE)
+			cube_map_stride = p2 & VC4_TEX_P2_CMST_MASK;
+		if (VC4_GET_FIELD(p3, VC4_TEX_P2_PTYPE) ==
+		    VC4_TEX_P2_PTYPE_CUBE_MAP_STRIDE) {
 			if (cube_map_stride) {
 				DRM_ERROR("Cube map stride set twice\n");
 				return false;
 			}
 
-			cube_map_stride = p3 & 0x3ffff000;
+			cube_map_stride = p3 & VC4_TEX_P2_CMST_MASK;
 		}
 		if (!cube_map_stride) {
 			DRM_ERROR("Cube map stride not set\n");
@@ -862,7 +653,8 @@ reloc_tex(struct vc4_exec_info *exec,
 		}
 	}
 
-	type = ((p0 >> 4) & 15) | ((p1 >> 31) << 4);
+	type = (VC4_GET_FIELD(p0, VC4_TEX_P0_TYPE) |
+		(VC4_GET_FIELD(p1, VC4_TEX_P1_TYPE4) << 4));
 
 	switch (type) {
 	case VC4_TEXTURE_TYPE_RGBA8888:
@@ -905,8 +697,8 @@ reloc_tex(struct vc4_exec_info *exec,
 			tiling_format = VC4_TILING_FORMAT_T;
 	}
 
-	if (!check_tex_size(exec, tex, offset + cube_map_stride * 5,
-			    tiling_format, width, height, cpp)) {
+	if (!vc4_check_tex_size(exec, tex, offset + cube_map_stride * 5,
+				tiling_format, width, height, cpp)) {
 		return false;
 	}
 
@@ -927,15 +719,15 @@ reloc_tex(struct vc4_exec_info *exec,
 
 		switch (tiling_format) {
 		case VC4_TILING_FORMAT_T:
-			aligned_width = roundup(level_width, utile_w * 8);
-			aligned_height = roundup(level_height, utile_h * 8);
+			aligned_width = round_up(level_width, utile_w * 8);
+			aligned_height = round_up(level_height, utile_h * 8);
 			break;
 		case VC4_TILING_FORMAT_LT:
-			aligned_width = roundup(level_width, utile_w);
-			aligned_height = roundup(level_height, utile_h);
+			aligned_width = round_up(level_width, utile_w);
+			aligned_height = round_up(level_height, utile_h);
 			break;
 		default:
-			aligned_width = roundup(level_width, utile_w);
+			aligned_width = round_up(level_width, utile_w);
 			aligned_height = level_height;
 			break;
 		}
diff --git a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
index e5a75c5..ab9a651 100644
--- a/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
+++ b/src/gallium/drivers/vc4/kernel/vc4_validate_shaders.c
@@ -58,7 +58,8 @@ struct vc4_shader_validation_state {
 	 *
 	 * This is used for the validation of direct address memory reads.
 	 */
-	uint32_t live_clamp_offsets[32 + 32 + 4];
+	uint32_t live_min_clamp_offsets[32 + 32 + 4];
+	bool live_max_clamp_regs[32 + 32 + 4];
 };
 
 static uint32_t
@@ -77,6 +78,25 @@ waddr_to_live_reg_index(uint32_t waddr, bool is_b)
 	}
 }
 
+/* Returns the live-register tracking index for the ADD op's A argument:
+ * raddr_a for mux A, 32 + raddr_b for mux B (unless the signal is a small
+ * immediate), 64 + mux for muxes up to r3, and ~0 for anything else.
+ */
+static uint32_t
+raddr_add_a_to_live_reg_index(uint64_t inst)
+{
+	uint32_t mux = QPU_GET_FIELD(inst, QPU_ADD_A);
+
+	if (mux == QPU_MUX_A)
+		return QPU_GET_FIELD(inst, QPU_RADDR_A);
+	if (mux == QPU_MUX_B &&
+	    QPU_GET_FIELD(inst, QPU_SIG) != QPU_SIG_SMALL_IMM)
+		return 32 + QPU_GET_FIELD(inst, QPU_RADDR_B);
+	if (mux <= QPU_MUX_R3)
+		return 64 + mux;
+	return ~0;
+}
+
 static bool
 is_tmu_submit(uint32_t waddr)
 {
@@ -136,9 +156,8 @@ check_tmu_write(uint64_t inst,
 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
 
 	if (is_direct) {
-		uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
 		uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
-		uint32_t clamp_offset = ~0;
+		uint32_t clamp_reg, clamp_offset;
 
 		if (sig == QPU_SIG_SMALL_IMM) {
 			DRM_ERROR("direct TMU read used small immediate\n");
@@ -159,14 +178,13 @@ check_tmu_write(uint64_t inst,
 		 * This is arbitrary, but simpler than supporting flipping the
 		 * two either way.
 		 */
-		if (add_a == QPU_MUX_A) {
-			clamp_offset = validation_state->live_clamp_offsets[raddr_a];
-		} else if (add_a == QPU_MUX_B) {
-			clamp_offset = validation_state->live_clamp_offsets[32 + raddr_b];
-		} else if (add_a <= QPU_MUX_R4) {
-			clamp_offset = validation_state->live_clamp_offsets[64 + add_a];
+		clamp_reg = raddr_add_a_to_live_reg_index(inst);
+		if (clamp_reg == ~0) {
+			DRM_ERROR("direct TMU load wasn't clamped\n");
+			return false;
 		}
 
+		clamp_offset = validation_state->live_min_clamp_offsets[clamp_reg];
 		if (clamp_offset == ~0) {
 			DRM_ERROR("direct TMU load wasn't clamped\n");
 			return false;
@@ -229,8 +247,6 @@ check_register_write(uint64_t inst,
 	uint32_t waddr = (is_mul ?
 			  QPU_GET_FIELD(inst, QPU_WADDR_MUL) :
 			  QPU_GET_FIELD(inst, QPU_WADDR_ADD));
-	bool is_b = is_mul != ((inst & QPU_WS) != 0);
-	uint32_t live_reg_index;
 
 	switch (waddr) {
 	case QPU_W_UNIFORMS_ADDRESS:
@@ -285,14 +301,6 @@ check_register_write(uint64_t inst,
                 return true;
 	}
 
-	/* Clear out the live offset clamp tracking for the written register.
-	 * If this particular instruction is setting up an offset clamp, it'll
-	 * get tracked immediately after we return.
-	 */
-	live_reg_index = waddr_to_live_reg_index(waddr, is_b);
-	if (live_reg_index != ~0)
-		validation_state->live_clamp_offsets[live_reg_index] = ~0;
-
 	return true;
 }
 
@@ -301,26 +309,72 @@ track_live_clamps(uint64_t inst,
 		  struct vc4_validated_shader_info *validated_shader,
 		  struct vc4_shader_validation_state *validation_state)
 {
+	uint32_t op_add = QPU_GET_FIELD(inst, QPU_OP_ADD);
 	uint32_t waddr_add = QPU_GET_FIELD(inst, QPU_WADDR_ADD);
+	uint32_t waddr_mul = QPU_GET_FIELD(inst, QPU_WADDR_MUL);
+	uint32_t cond_add = QPU_GET_FIELD(inst, QPU_COND_ADD);
+	uint32_t add_a = QPU_GET_FIELD(inst, QPU_ADD_A);
 	uint32_t add_b = QPU_GET_FIELD(inst, QPU_ADD_B);
 	uint32_t raddr_a = QPU_GET_FIELD(inst, QPU_RADDR_A);
 	uint32_t raddr_b = QPU_GET_FIELD(inst, QPU_RADDR_B);
 	uint32_t sig = QPU_GET_FIELD(inst, QPU_SIG);
-	bool is_b = inst & QPU_WS;
-	uint32_t live_reg_index;
+	bool ws = inst & QPU_WS;
+	uint32_t lri_add_a, lri_add, lri_mul;
+	bool add_a_is_min_0;
 
-	if (QPU_GET_FIELD(inst, QPU_OP_ADD) != QPU_A_MIN)
+	/* Check whether OP_ADD's A argument comes from a live MAX(x, 0),
+	 * before we clear previous live state.
+	 */
+	lri_add_a = raddr_add_a_to_live_reg_index(inst);
+	add_a_is_min_0 = (lri_add_a != ~0 &&
+			  validation_state->live_max_clamp_regs[lri_add_a]);
+
+	/* Clear live state for registers written by our instruction. */
+	lri_add = waddr_to_live_reg_index(waddr_add, ws);
+	lri_mul = waddr_to_live_reg_index(waddr_mul, !ws);
+	if (lri_mul != ~0) {
+		validation_state->live_max_clamp_regs[lri_mul] = false;
+		validation_state->live_min_clamp_offsets[lri_mul] = ~0;
+	}
+	if (lri_add != ~0) {
+		validation_state->live_max_clamp_regs[lri_add] = false;
+		validation_state->live_min_clamp_offsets[lri_add] = ~0;
+	} else {
+		/* Nothing further to do for live tracking, since only ADDs
+		 * generate new live clamp registers.
+		 */
 		return;
+	}
+
+	/* Now, handle remaining live clamp tracking for the ADD operation. */
 
-	if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
-	    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
-	      sig != QPU_SIG_SMALL_IMM)) {
+	if (cond_add != QPU_COND_ALWAYS)
 		return;
-	}
 
-	live_reg_index = waddr_to_live_reg_index(waddr_add, is_b);
-	if (live_reg_index != ~0) {
-		validation_state->live_clamp_offsets[live_reg_index] =
+	if (op_add == QPU_A_MAX) {
+		/* Track live clamps of a value to a minimum of 0 (in either
+		 * arg).
+		 */
+		if (sig != QPU_SIG_SMALL_IMM || raddr_b != 0 ||
+		    (add_a != QPU_MUX_B && add_b != QPU_MUX_B)) {
+			return;
+		}
+
+		validation_state->live_max_clamp_regs[lri_add] = true;
+	} else if (op_add == QPU_A_MIN) {
+		/* Track live clamps of a value clamped to a minimum of 0 and
+		 * a maximum of some uniform's offset.
+		 */
+		if (!add_a_is_min_0)
+			return;
+
+		if (!(add_b == QPU_MUX_A && raddr_a == QPU_R_UNIF) &&
+		    !(add_b == QPU_MUX_B && raddr_b == QPU_R_UNIF &&
+		      sig != QPU_SIG_SMALL_IMM)) {
+			return;
+		}
+
+		validation_state->live_min_clamp_offsets[lri_add] =
 			validated_shader->uniforms_size;
 	}
 }
@@ -382,8 +436,8 @@ vc4_validate_shader(struct drm_gem_cma_object *shader_obj)
 
 	for (i = 0; i < 8; i++)
 		validation_state.tmu_setup[i / 4].p_offset[i % 4] = ~0;
-	for (i = 0; i < ARRAY_SIZE(validation_state.live_clamp_offsets); i++)
-		validation_state.live_clamp_offsets[i] = ~0;
+	for (i = 0; i < ARRAY_SIZE(validation_state.live_min_clamp_offsets); i++)
+		validation_state.live_min_clamp_offsets[i] = ~0;
 
 	shader = shader_obj->vaddr;
 	max_ip = shader_obj->base.size / sizeof(uint64_t);
diff --git a/src/gallium/drivers/vc4/vc4_blit.c b/src/gallium/drivers/vc4/vc4_blit.c
index 2d524c4..d29e2c9 100644
--- a/src/gallium/drivers/vc4/vc4_blit.c
+++ b/src/gallium/drivers/vc4/vc4_blit.c
@@ -26,86 +26,7 @@
 #include "util/u_blitter.h"
 #include "vc4_context.h"
 
-static void
-vc4_tile_blit_color_rcl(struct vc4_context *vc4,
-                        struct vc4_surface *dst_surf,
-                        struct vc4_surface *src_surf)
-{
-        struct vc4_resource *src = vc4_resource(src_surf->base.texture);
-        struct vc4_resource *dst = vc4_resource(dst_surf->base.texture);
-
-        uint32_t min_x_tile = 0;
-        uint32_t min_y_tile = 0;
-        uint32_t max_x_tile = (dst_surf->base.width - 1) / 64;
-        uint32_t max_y_tile = (dst_surf->base.height - 1) / 64;
-        uint32_t xtiles = max_x_tile - min_x_tile + 1;
-        uint32_t ytiles = max_y_tile - min_y_tile + 1;
-        uint32_t reloc_size = 9;
-        uint32_t config_size = 11 + reloc_size;
-        uint32_t loadstore_size = 7 + reloc_size;
-        uint32_t tilecoords_size = 3;
-        cl_ensure_space(&vc4->rcl,
-                        config_size +
-                        xtiles * ytiles * (loadstore_size * 2 +
-                                           tilecoords_size * 1));
-        cl_ensure_space(&vc4->bo_handles, 2 * sizeof(uint32_t));
-        cl_ensure_space(&vc4->bo_pointers, 2 * sizeof(struct vc4_bo *));
-
-        cl_start_reloc(&vc4->rcl, 1);
-        cl_u8(&vc4->rcl, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
-        cl_reloc(vc4, &vc4->rcl, dst->bo, dst_surf->offset);
-        cl_u16(&vc4->rcl, dst_surf->base.width);
-        cl_u16(&vc4->rcl, dst_surf->base.height);
-        cl_u16(&vc4->rcl, ((dst_surf->tiling <<
-                            VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT) |
-                           (vc4_rt_format_is_565(dst_surf->base.format) ?
-                            VC4_RENDER_CONFIG_FORMAT_BGR565 :
-                            VC4_RENDER_CONFIG_FORMAT_RGBA8888)));
-
-        uint32_t src_hindex = vc4_gem_hindex(vc4, src->bo);
-
-        for (int y = min_y_tile; y <= max_y_tile; y++) {
-                for (int x = min_x_tile; x <= max_x_tile; x++) {
-                        bool end_of_frame = (x == max_x_tile &&
-                                             y == max_y_tile);
-
-                        cl_start_reloc(&vc4->rcl, 1);
-                        cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-                        cl_u8(&vc4->rcl,
-                              VC4_LOADSTORE_TILE_BUFFER_COLOR |
-                              (src_surf->tiling <<
-                               VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
-                        cl_u8(&vc4->rcl,
-                              vc4_rt_format_is_565(src_surf->base.format) ?
-                              VC4_LOADSTORE_TILE_BUFFER_BGR565 :
-                              VC4_LOADSTORE_TILE_BUFFER_RGBA8888);
-                        cl_reloc_hindex(&vc4->rcl, src_hindex,
-                                        src_surf->offset);
-
-                        cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES);
-                        cl_u8(&vc4->rcl, x);
-                        cl_u8(&vc4->rcl, y);
-
-                        if (end_of_frame) {
-                                cl_u8(&vc4->rcl,
-                                      VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF);
-                        } else {
-                                cl_u8(&vc4->rcl,
-                                      VC4_PACKET_STORE_MS_TILE_BUFFER);
-                        }
-                }
-        }
-
-        vc4->draw_min_x = 0;
-        vc4->draw_min_y = 0;
-        vc4->draw_max_x = dst_surf->base.width;
-        vc4->draw_max_y = dst_surf->base.height;
-
-        dst->writes++;
-        vc4->needs_flush = true;
-}
-
-static struct vc4_surface *
+static struct pipe_surface *
 vc4_get_blit_surface(struct pipe_context *pctx,
                      struct pipe_resource *prsc, unsigned level)
 {
@@ -117,7 +38,7 @@ vc4_get_blit_surface(struct pipe_context *pctx,
         tmpl.u.tex.first_layer = 0;
         tmpl.u.tex.last_layer = 0;
 
-        return vc4_surface(pctx->create_surface(pctx, prsc, &tmpl));
+        return pctx->create_surface(pctx, prsc, &tmpl);
 }
 
 static bool
@@ -141,17 +62,28 @@ vc4_tile_blit(struct pipe_context *pctx, const struct pipe_blit_info *info)
         if (info->dst.resource->format != info->src.resource->format)
                 return false;
 
-        struct vc4_surface *dst_surf =
+        vc4_flush(pctx);
+
+        struct pipe_surface *dst_surf =
                 vc4_get_blit_surface(pctx, info->dst.resource, info->dst.level);
-        struct vc4_surface *src_surf =
+        struct pipe_surface *src_surf =
                 vc4_get_blit_surface(pctx, info->src.resource, info->src.level);
 
-        vc4_flush(pctx);
-        vc4_tile_blit_color_rcl(vc4, dst_surf, src_surf);
+        pipe_surface_reference(&vc4->color_read, src_surf);
+        pipe_surface_reference(&vc4->color_write, dst_surf);
+        pipe_surface_reference(&vc4->zs_read, NULL);
+        pipe_surface_reference(&vc4->zs_write, NULL);
+        vc4->draw_min_x = 0;
+        vc4->draw_min_y = 0;
+        vc4->draw_max_x = dst_surf->width;
+        vc4->draw_max_y = dst_surf->height;
+        vc4->draw_width = dst_surf->width;
+        vc4->draw_height = dst_surf->height;
+        vc4->needs_flush = true;
         vc4_job_submit(vc4);
 
-        pctx->surface_destroy(pctx, &dst_surf->base);
-        pctx->surface_destroy(pctx, &src_surf->base);
+        pipe_surface_reference(&dst_surf, NULL);
+        pipe_surface_reference(&src_surf, NULL);
 
         return true;
 }
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 4bb2c71..cbdb9e8 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -34,8 +34,46 @@
 #include "vc4_context.h"
 #include "vc4_screen.h"
 
-#define container_of(ptr, type, field) \
-   (type*)((char*)ptr - offsetof(type, field))
+static bool dump_stats = false;
+
+/* Prints the screen's and the BO cache's BO counts and sizes to stderr, plus
+ * the free times at both ends of a non-empty cache time list and "now". */
+static void
+vc4_bo_dump_stats(struct vc4_screen *screen)
+{
+        struct vc4_bo_cache *cache = &screen->bo_cache;
+
+        fprintf(stderr, "  BOs allocated:   %d\n", screen->bo_count);
+        fprintf(stderr, "  BOs size:        %dkb\n", screen->bo_size / 1024);
+        fprintf(stderr, "  BOs cached:      %d\n", cache->bo_count);
+        fprintf(stderr, "  BOs cached size: %dkb\n", cache->bo_size / 1024);
+
+        if (!list_empty(&cache->time_list)) {
+                struct vc4_bo *first = LIST_ENTRY(struct vc4_bo,
+                                cache->time_list.next, time_list);
+                struct vc4_bo *last = LIST_ENTRY(struct vc4_bo,
+                                cache->time_list.prev, time_list);
+                struct timespec time;
+
+                fprintf(stderr, "  oldest cache time: %ld\n",
+                        (long)first->free_time);
+                fprintf(stderr, "  newest cache time: %ld\n",
+                        (long)last->free_time);
+                /* tv_sec is a time_t; cast it to match the %ld conversion. */
+                clock_gettime(CLOCK_MONOTONIC, &time);
+                fprintf(stderr, "  now:               %ld\n",
+                        (long)time.tv_sec);
+        }
+}
+
+static void
+vc4_bo_remove_from_cache(struct vc4_bo_cache *cache, struct vc4_bo *bo)
+{
+        cache->bo_size -= bo->size;     /* keep the cache totals in sync */
+        cache->bo_count--;
+        list_del(&bo->size_list);       /* unlink from the per-size list */
+        list_del(&bo->time_list);       /* unlink from the by-age list */
+}
 
 static struct vc4_bo *
 vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
@@ -48,12 +86,21 @@ vc4_bo_from_cache(struct vc4_screen *screen, uint32_t size, const char *name)
 
         struct vc4_bo *bo = NULL;
         pipe_mutex_lock(cache->lock);
-        if (!is_empty_list(&cache->size_list[page_index])) {
-                struct simple_node *node = last_elem(&cache->size_list[page_index]);
-                bo = container_of(node, struct vc4_bo, size_list);
+        if (!list_empty(&cache->size_list[page_index])) {
+                bo = LIST_ENTRY(struct vc4_bo, cache->size_list[page_index].next,
+                                size_list);
+
+                /* Check that the BO has gone idle.  If not, then we want to
+                 * allocate something new instead, since we assume that the
+                 * user will proceed to CPU map it and fill it with stuff.
+                 */
+                if (!vc4_bo_wait(bo, 0)) {
+                        pipe_mutex_unlock(cache->lock);
+                        return NULL;
+                }
+
                 pipe_reference_init(&bo->reference, 1);
-                remove_from_list(&bo->time_list);
-                remove_from_list(&bo->size_list);
+                vc4_bo_remove_from_cache(cache, bo);
 
                 bo->name = name;
         }
@@ -70,8 +117,14 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
         size = align(size, 4096);
 
         bo = vc4_bo_from_cache(screen, size, name);
-        if (bo)
+        if (bo) {
+                if (dump_stats) {
+                        fprintf(stderr, "Allocated %s %dkb from cache:\n",
+                                name, size / 1024);
+                        vc4_bo_dump_stats(screen);
+                }
                 return bo;
+        }
 
         bo = CALLOC_STRUCT(vc4_bo);
         if (!bo)
@@ -108,6 +161,13 @@ vc4_bo_alloc(struct vc4_screen *screen, uint32_t size, const char *name)
                 abort();
         }
 
+        screen->bo_count++;
+        screen->bo_size += bo->size;
+        if (dump_stats) {
+                fprintf(stderr, "Allocated %s %dkb:\n", name, size / 1024);
+                vc4_bo_dump_stats(screen);
+        }
+
         return bo;
 }
 
@@ -145,26 +205,47 @@ vc4_bo_free(struct vc4_bo *bo)
         if (ret != 0)
                 fprintf(stderr, "close object %d: %s\n", bo->handle, strerror(errno));
 
+        screen->bo_count--;
+        screen->bo_size -= bo->size;
+
+        if (dump_stats) {
+                fprintf(stderr, "Freed %s%s%dkb:\n",
+                        bo->name ? bo->name : "",
+                        bo->name ? " " : "",
+                        bo->size / 1024);
+                vc4_bo_dump_stats(screen);
+        }
+
         free(bo);
 }
 
 static void
 free_stale_bos(struct vc4_screen *screen, time_t time)
 {
-        while (!is_empty_list(&screen->bo_cache.time_list)) {
-                struct simple_node *node =
-                        first_elem(&screen->bo_cache.time_list);
-                struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list);
+        struct vc4_bo_cache *cache = &screen->bo_cache;
+        bool freed_any = false;
+
+        list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
+                                 time_list) {
+                if (dump_stats && !freed_any) {
+                        fprintf(stderr, "Freeing stale BOs:\n");
+                        vc4_bo_dump_stats(screen);
+                        freed_any = true;
+                }
 
                 /* If it's more than a second old, free it. */
                 if (time - bo->free_time > 2) {
-                        remove_from_list(&bo->time_list);
-                        remove_from_list(&bo->size_list);
+                        vc4_bo_remove_from_cache(cache, bo);
                         vc4_bo_free(bo);
                 } else {
                         break;
                 }
         }
+
+        if (dump_stats && freed_any) {
+                fprintf(stderr, "Freed stale BOs:\n");
+                vc4_bo_dump_stats(screen);
+        }
 }
 
 void
@@ -180,16 +261,16 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
         }
 
         if (cache->size_list_size <= page_index) {
-                struct simple_node *new_list =
-                        ralloc_array(screen, struct simple_node, page_index + 1);
+                struct list_head *new_list =
+                        ralloc_array(screen, struct list_head, page_index + 1);
 
                 /* Move old list contents over (since the array has moved, and
-                 * therefore the pointers to the list heads have to change.
+                 * therefore the pointers to the list heads have to change).
                  */
                 for (int i = 0; i < cache->size_list_size; i++) {
-                        struct simple_node *old_head = &cache->size_list[i];
-                        if (is_empty_list(old_head))
-                                make_empty_list(&new_list[i]);
+                        struct list_head *old_head = &cache->size_list[i];
+                        if (list_empty(old_head))
+                                list_inithead(&new_list[i]);
                         else {
                                 new_list[i].next = old_head->next;
                                 new_list[i].prev = old_head->prev;
@@ -198,15 +279,23 @@ vc4_bo_last_unreference_locked_timed(struct vc4_bo *bo, time_t time)
                         }
                 }
                 for (int i = cache->size_list_size; i < page_index + 1; i++)
-                        make_empty_list(&new_list[i]);
+                        list_inithead(&new_list[i]);
 
                 cache->size_list = new_list;
                 cache->size_list_size = page_index + 1;
         }
 
         bo->free_time = time;
-        insert_at_tail(&cache->size_list[page_index], &bo->size_list);
-        insert_at_tail(&cache->time_list, &bo->time_list);
+        list_addtail(&bo->size_list, &cache->size_list[page_index]);
+        list_addtail(&bo->time_list, &cache->time_list);
+        cache->bo_count++;
+        cache->bo_size += bo->size;
+        if (dump_stats) {
+                fprintf(stderr, "Freed %s %dkb to cache:\n",
+                        bo->name, bo->size / 1024);
+                vc4_bo_dump_stats(screen);
+        }
+        bo->name = NULL;
 
         free_stale_bos(screen, time);
 }
@@ -286,6 +375,7 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo)
                         bo->handle);
                 return -1;
         }
+        bo->private = false;
 
         return fd;
 }
@@ -342,15 +432,17 @@ vc4_wait_seqno(struct vc4_screen *screen, uint64_t seqno, uint64_t timeout_ns)
                 ret = 0;
         }
 
-        if (ret == -ETIME) {
-                return false;
-        } else if (ret != 0) {
-                fprintf(stderr, "wait failed\n");
-                abort();
-        } else {
+        if (ret == 0) {
                 screen->finished_seqno = wait.seqno;
                 return true;
         }
+
+        if (errno != ETIME) {
+                fprintf(stderr, "wait failed: %d\n", ret);
+                abort();
+        }
+
+        return false;
 }
 
 bool
@@ -369,14 +461,15 @@ vc4_bo_wait(struct vc4_bo *bo, uint64_t timeout_ns)
         else
                 ret = 0;
 
-        if (ret == -ETIME) {
-                return false;
-        } else if (ret != 0) {
-                fprintf(stderr, "wait failed\n");
-                abort();
-        } else {
+        if (ret == 0)
                 return true;
+
+        if (errno != ETIME) {
+                fprintf(stderr, "wait failed: %d\n", ret);
+                abort();
         }
+
+        return false;
 }
 
 void *
@@ -437,12 +530,14 @@ vc4_bufmgr_destroy(struct pipe_screen *pscreen)
         struct vc4_screen *screen = vc4_screen(pscreen);
         struct vc4_bo_cache *cache = &screen->bo_cache;
 
-        while (!is_empty_list(&cache->time_list)) {
-                struct simple_node *node = first_elem(&cache->time_list);
-                struct vc4_bo *bo = container_of(node, struct vc4_bo, time_list);
-
-                remove_from_list(&bo->time_list);
-                remove_from_list(&bo->size_list);
+        list_for_each_entry_safe(struct vc4_bo, bo, &cache->time_list,
+                                 time_list) {
+                vc4_bo_remove_from_cache(cache, bo);
                 vc4_bo_free(bo);
         }
+
+        if (dump_stats) {
+                fprintf(stderr, "BO stats after screen destroy:\n");
+                vc4_bo_dump_stats(screen);
+        }
 }
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h
index f9559e9..7320695 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.h
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.h
@@ -44,9 +44,9 @@ struct vc4_bo {
 #endif
 
         /** Entry in the linked list of buffers freed, by age. */
-        struct simple_node time_list;
+        struct list_head time_list;
         /** Entry in the per-page-count linked list of buffers freed (by age). */
-        struct simple_node size_list;
+        struct list_head size_list;
         /** Approximate second when the bo was freed. */
         time_t free_time;
         /**
diff --git a/src/gallium/drivers/vc4/vc4_cl.h b/src/gallium/drivers/vc4/vc4_cl.h
index 32a2e71..4a50e79 100644
--- a/src/gallium/drivers/vc4/vc4_cl.h
+++ b/src/gallium/drivers/vc4/vc4_cl.h
@@ -29,7 +29,7 @@
 #include "util/u_math.h"
 #include "util/macros.h"
 
-#include "vc4_packet.h"
+#include "kernel/vc4_packet.h"
 
 struct vc4_bo;
 
diff --git a/src/gallium/drivers/vc4/vc4_cl_dump.c b/src/gallium/drivers/vc4/vc4_cl_dump.c
index 1423984..6905508 100644
--- a/src/gallium/drivers/vc4/vc4_cl_dump.c
+++ b/src/gallium/drivers/vc4/vc4_cl_dump.c
@@ -174,6 +174,37 @@ dump_VC4_PACKET_CLIPPER_Z_SCALING(void *cl, uint32_t offset, uint32_t hw_offset)
 }
 
 static void
+dump_VC4_PACKET_TILE_BINNING_MODE_CONFIG(void *cl, uint32_t offset, uint32_t hw_offset)
+{
+        /* Payload as read below: three 32-bit words at +0/+4/+8 (tile alloc
+         * address, tile alloc size, tile state address), then single bytes
+         * at +12/+13 (printed as the tile pair) and +14 (flags).
+         */
+        uint32_t *words = cl + offset;
+        uint8_t *bytes = cl + offset + 12;
+
+        fprintf(stderr, "0x%08x 0x%08x:       tile alloc addr 0x%08x\n",
+                offset, hw_offset,
+                words[0]);
+
+        fprintf(stderr, "0x%08x 0x%08x:       tile alloc size %db\n",
+                offset + 4, hw_offset + 4,
+                words[1]);
+
+        fprintf(stderr, "0x%08x 0x%08x:       tile state addr 0x%08x\n",
+                offset + 8, hw_offset + 8,
+                words[2]);
+
+        fprintf(stderr, "0x%08x 0x%08x:       tiles (%d, %d)\n",
+                offset + 12, hw_offset + 12,
+                bytes[0], bytes[1]);
+
+        fprintf(stderr, "0x%08x 0x%08x:       flags 0x%02x\n",
+                offset + 14, hw_offset + 14,
+                bytes[2]);
+}
+
+static void
 dump_VC4_PACKET_TILE_RENDERING_MODE_CONFIG(void *cl, uint32_t offset, uint32_t hw_offset)
 {
         uint32_t *render_offset = cl + offset;
@@ -311,7 +342,7 @@ static const struct packet_info {
         PACKET_DUMP(VC4_PACKET_CLIPPER_XY_SCALING, 9),
         PACKET_DUMP(VC4_PACKET_CLIPPER_Z_SCALING, 9),
 
-        PACKET(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16),
+        PACKET_DUMP(VC4_PACKET_TILE_BINNING_MODE_CONFIG, 16),
         PACKET_DUMP(VC4_PACKET_TILE_RENDERING_MODE_CONFIG, 11),
         PACKET(VC4_PACKET_CLEAR_COLORS, 14),
         PACKET_DUMP(VC4_PACKET_TILE_COORDINATES, 3),
diff --git a/src/gallium/drivers/vc4/vc4_context.c b/src/gallium/drivers/vc4/vc4_context.c
index b394c18..630f8e6 100644
--- a/src/gallium/drivers/vc4/vc4_context.c
+++ b/src/gallium/drivers/vc4/vc4_context.c
@@ -29,6 +29,7 @@
 #include "util/u_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_blitter.h"
+#include "util/u_upload_mgr.h"
 #include "indices/u_primconvert.h"
 #include "pipe/p_screen.h"
 
@@ -36,270 +37,12 @@
 #include "vc4_context.h"
 #include "vc4_resource.h"
 
-/**
- * Emits a no-op STORE_TILE_BUFFER_GENERAL.
- *
- * If we emit a PACKET_TILE_COORDINATES, it must be followed by a store of
- * some sort before another load is triggered.
- */
-static void
-vc4_store_before_load(struct vc4_context *vc4, bool *coords_emitted)
-{
-        if (!*coords_emitted)
-                return;
-
-        cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
-        cl_u8(&vc4->rcl, VC4_LOADSTORE_TILE_BUFFER_NONE);
-        cl_u8(&vc4->rcl, (VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR |
-                          VC4_STORE_TILE_BUFFER_DISABLE_ZS_CLEAR |
-                          VC4_STORE_TILE_BUFFER_DISABLE_VG_MASK_CLEAR));
-        cl_u32(&vc4->rcl, 0); /* no address, since we're in None mode */
-
-        *coords_emitted = false;
-}
-
-/**
- * Emits a PACKET_TILE_COORDINATES if one isn't already pending.
- *
- * The tile coordinates packet triggers a pending load if there is one, are
- * used for clipping during rendering, and determine where loads/stores happen
- * relative to their base address.
- */
-static void
-vc4_tile_coordinates(struct vc4_context *vc4, uint32_t x, uint32_t y,
-                       bool *coords_emitted)
-{
-        if (*coords_emitted)
-                return;
-
-        cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES);
-        cl_u8(&vc4->rcl, x);
-        cl_u8(&vc4->rcl, y);
-
-        *coords_emitted = true;
-}
-
-static void
-vc4_setup_rcl(struct vc4_context *vc4)
-{
-        struct vc4_surface *csurf = vc4_surface(vc4->framebuffer.cbufs[0]);
-        struct vc4_resource *ctex = csurf ? vc4_resource(csurf->base.texture) : NULL;
-        struct vc4_surface *zsurf = vc4_surface(vc4->framebuffer.zsbuf);
-        struct vc4_resource *ztex = zsurf ? vc4_resource(zsurf->base.texture) : NULL;
-
-        if (!csurf)
-                vc4->resolve &= ~PIPE_CLEAR_COLOR0;
-        if (!zsurf)
-                vc4->resolve &= ~(PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL);
-        uint32_t resolve_uncleared = vc4->resolve & ~vc4->cleared;
-        uint32_t width = vc4->framebuffer.width;
-        uint32_t height = vc4->framebuffer.height;
-        uint32_t stride_in_tiles = align(width, 64) / 64;
-
-        assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0);
-        uint32_t min_x_tile = vc4->draw_min_x / 64;
-        uint32_t min_y_tile = vc4->draw_min_y / 64;
-        uint32_t max_x_tile = (vc4->draw_max_x - 1) / 64;
-        uint32_t max_y_tile = (vc4->draw_max_y - 1) / 64;
-        uint32_t xtiles = max_x_tile - min_x_tile + 1;
-        uint32_t ytiles = max_y_tile - min_y_tile + 1;
-
-#if 0
-        fprintf(stderr, "RCL: resolve 0x%x clear 0x%x resolve uncleared 0x%x\n",
-                vc4->resolve,
-                vc4->cleared,
-                resolve_uncleared);
-#endif
-
-        uint32_t reloc_size = 9;
-        uint32_t clear_size = 14;
-        uint32_t config_size = 11 + reloc_size;
-        uint32_t loadstore_size = 7 + reloc_size;
-        uint32_t tilecoords_size = 3;
-        uint32_t branch_size = 5 + reloc_size;
-        uint32_t color_store_size = 1;
-        uint32_t semaphore_size = 1;
-        cl_ensure_space(&vc4->rcl,
-                        clear_size +
-                        config_size +
-                        loadstore_size +
-                        semaphore_size +
-                        xtiles * ytiles * (loadstore_size * 4 +
-                                           tilecoords_size * 3 +
-                                           branch_size +
-                                           color_store_size));
-
-        if (vc4->cleared) {
-                cl_u8(&vc4->rcl, VC4_PACKET_CLEAR_COLORS);
-                cl_u32(&vc4->rcl, vc4->clear_color[0]);
-                cl_u32(&vc4->rcl, vc4->clear_color[1]);
-                cl_u32(&vc4->rcl, vc4->clear_depth);
-                cl_u8(&vc4->rcl, vc4->clear_stencil);
-        }
-
-        /* The rendering mode config determines the pointer that's used for
-         * VC4_PACKET_STORE_MS_TILE_BUFFER address computations.  The kernel
-         * could handle a no-relocation rendering mode config and deny those
-         * packets, but instead we just tell the kernel we're doing our color
-         * rendering to the Z buffer, and just don't emit any of those
-         * packets.
-         */
-        struct vc4_surface *render_surf = csurf ? csurf : zsurf;
-        struct vc4_resource *render_tex = vc4_resource(render_surf->base.texture);
-        cl_start_reloc(&vc4->rcl, 1);
-        cl_u8(&vc4->rcl, VC4_PACKET_TILE_RENDERING_MODE_CONFIG);
-        cl_reloc(vc4, &vc4->rcl, render_tex->bo, render_surf->offset);
-        cl_u16(&vc4->rcl, width);
-        cl_u16(&vc4->rcl, height);
-        cl_u16(&vc4->rcl, ((render_surf->tiling <<
-                            VC4_RENDER_CONFIG_MEMORY_FORMAT_SHIFT) |
-                           (vc4_rt_format_is_565(render_surf->base.format) ?
-                            VC4_RENDER_CONFIG_FORMAT_BGR565 :
-                            VC4_RENDER_CONFIG_FORMAT_RGBA8888)));
-
-        /* The tile buffer normally gets cleared when the previous tile is
-         * stored.  If the clear values changed between frames, then the tile
-         * buffer has stale clear values in it, so we have to do a store in
-         * None mode (no writes) so that we trigger the tile buffer clear.
-         *
-         * Excess clearing is only a performance cost, since per-tile contents
-         * will be loaded/stored in the loop below.
-         */
-        if (vc4->cleared & (PIPE_CLEAR_COLOR0 |
-                            PIPE_CLEAR_DEPTH |
-                            PIPE_CLEAR_STENCIL)) {
-                cl_u8(&vc4->rcl, VC4_PACKET_TILE_COORDINATES);
-                cl_u8(&vc4->rcl, 0);
-                cl_u8(&vc4->rcl, 0);
-
-                cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
-                cl_u16(&vc4->rcl, VC4_LOADSTORE_TILE_BUFFER_NONE);
-                cl_u32(&vc4->rcl, 0); /* no address, since we're in None mode */
-        }
-
-        uint32_t color_hindex = ctex ? vc4_gem_hindex(vc4, ctex->bo) : 0;
-        uint32_t depth_hindex = ztex ? vc4_gem_hindex(vc4, ztex->bo) : 0;
-        uint32_t tile_alloc_hindex = vc4_gem_hindex(vc4, vc4->tile_alloc);
-
-        for (int y = min_y_tile; y <= max_y_tile; y++) {
-                for (int x = min_x_tile; x <= max_x_tile; x++) {
-                        bool end_of_frame = (x == max_x_tile &&
-                                             y == max_y_tile);
-                        bool coords_emitted = false;
-
-                        /* Note that the load doesn't actually occur until the
-                         * tile coords packet is processed, and only one load
-                         * may be outstanding at a time.
-                         */
-                        if (resolve_uncleared & PIPE_CLEAR_COLOR) {
-                                vc4_store_before_load(vc4, &coords_emitted);
-
-                                cl_start_reloc(&vc4->rcl, 1);
-                                cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-                                cl_u8(&vc4->rcl,
-                                      VC4_LOADSTORE_TILE_BUFFER_COLOR |
-                                      (csurf->tiling <<
-                                       VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
-                                cl_u8(&vc4->rcl,
-                                      vc4_rt_format_is_565(csurf->base.format) ?
-                                      VC4_LOADSTORE_TILE_BUFFER_BGR565 :
-                                      VC4_LOADSTORE_TILE_BUFFER_RGBA8888);
-                                cl_reloc_hindex(&vc4->rcl, color_hindex,
-                                                csurf->offset);
-
-                                vc4_tile_coordinates(vc4, x, y, &coords_emitted);
-                        }
-
-                        if (resolve_uncleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
-                                vc4_store_before_load(vc4, &coords_emitted);
-
-                                cl_start_reloc(&vc4->rcl, 1);
-                                cl_u8(&vc4->rcl, VC4_PACKET_LOAD_TILE_BUFFER_GENERAL);
-                                cl_u8(&vc4->rcl,
-                                      VC4_LOADSTORE_TILE_BUFFER_ZS |
-                                      (zsurf->tiling <<
-                                       VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
-                                cl_u8(&vc4->rcl, 0);
-                                cl_reloc_hindex(&vc4->rcl, depth_hindex,
-                                                zsurf->offset);
-
-                                vc4_tile_coordinates(vc4, x, y, &coords_emitted);
-                        }
-
-                        /* Clipping depends on tile coordinates having been
-                         * emitted, so make sure it's happened even if
-                         * everything was cleared to start.
-                         */
-                        vc4_tile_coordinates(vc4, x, y, &coords_emitted);
-
-                        /* Wait for the binner before jumping to the first
-                         * tile's lists.
-                         */
-                        if (x == min_x_tile && y == min_y_tile)
-                                cl_u8(&vc4->rcl, VC4_PACKET_WAIT_ON_SEMAPHORE);
-
-                        cl_start_reloc(&vc4->rcl, 1);
-                        cl_u8(&vc4->rcl, VC4_PACKET_BRANCH_TO_SUB_LIST);
-                        cl_reloc_hindex(&vc4->rcl, tile_alloc_hindex,
-                                        (y * stride_in_tiles + x) * 32);
-
-                        if (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL)) {
-                                vc4_tile_coordinates(vc4, x, y, &coords_emitted);
-
-                                cl_start_reloc(&vc4->rcl, 1);
-                                cl_u8(&vc4->rcl, VC4_PACKET_STORE_TILE_BUFFER_GENERAL);
-                                cl_u8(&vc4->rcl,
-                                      VC4_LOADSTORE_TILE_BUFFER_ZS |
-                                      (zsurf->tiling <<
-                                       VC4_LOADSTORE_TILE_BUFFER_FORMAT_SHIFT));
-                                cl_u8(&vc4->rcl,
-                                      VC4_STORE_TILE_BUFFER_DISABLE_COLOR_CLEAR);
-                                cl_reloc_hindex(&vc4->rcl, depth_hindex,
-                                                zsurf->offset |
-                                                ((end_of_frame &&
-                                                  !(vc4->resolve & PIPE_CLEAR_COLOR0)) ?
-                                                 VC4_LOADSTORE_TILE_BUFFER_EOF : 0));
-
-                                coords_emitted = false;
-                        }
-
-                        if (vc4->resolve & PIPE_CLEAR_COLOR0) {
-                                vc4_tile_coordinates(vc4, x, y, &coords_emitted);
-                                if (end_of_frame) {
-                                        cl_u8(&vc4->rcl,
-                                              VC4_PACKET_STORE_MS_TILE_BUFFER_AND_EOF);
-                                } else {
-                                        cl_u8(&vc4->rcl,
-                                              VC4_PACKET_STORE_MS_TILE_BUFFER);
-                                }
-
-                                coords_emitted = false;
-                        }
-
-                        /* One of the bits needs to have been set that would
-                         * have triggered an EOF.
-                         */
-                        assert(vc4->resolve & (PIPE_CLEAR_COLOR0 |
-                                               PIPE_CLEAR_DEPTH |
-                                               PIPE_CLEAR_STENCIL));
-                        /* Any coords emitted must also have been consumed by
-                         * a store.
-                         */
-                        assert(!coords_emitted);
-                }
-        }
-
-        if (vc4->resolve & PIPE_CLEAR_COLOR0)
-                ctex->writes++;
-
-        if (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))
-                ztex->writes++;
-}
-
 void
 vc4_flush(struct pipe_context *pctx)
 {
         struct vc4_context *vc4 = vc4_context(pctx);
+        struct pipe_surface *cbuf = vc4->framebuffer.cbufs[0];
+        struct pipe_surface *zsbuf = vc4->framebuffer.zsbuf;
 
         if (!vc4->needs_flush)
                 return;
@@ -322,7 +65,31 @@ vc4_flush(struct pipe_context *pctx)
         /* The FLUSH caps all of our bin lists with a VC4_PACKET_RETURN. */
         cl_u8(&vc4->bcl, VC4_PACKET_FLUSH);
 
-        vc4_setup_rcl(vc4);
+        /* Write resolved surfaces; read them back only if not cleared. */
+        if (cbuf && (vc4->resolve & PIPE_CLEAR_COLOR0)) {
+                pipe_surface_reference(&vc4->color_write, cbuf);
+                if (!(vc4->cleared & PIPE_CLEAR_COLOR0)) {
+                        pipe_surface_reference(&vc4->color_read, cbuf);
+                } else {
+                        pipe_surface_reference(&vc4->color_read, NULL);
+                }
+        } else { /* not resolved: release both color references */
+                pipe_surface_reference(&vc4->color_write, NULL);
+                pipe_surface_reference(&vc4->color_read, NULL);
+        }
+
+        if (zsbuf &&
+            (vc4->resolve & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
+                pipe_surface_reference(&vc4->zs_write, zsbuf);
+                if (!(vc4->cleared & (PIPE_CLEAR_DEPTH | PIPE_CLEAR_STENCIL))) {
+                        pipe_surface_reference(&vc4->zs_read, zsbuf);
+                } else {
+                        pipe_surface_reference(&vc4->zs_read, NULL);
+                }
+        } else { /* not resolved: release both Z/S references */
+                pipe_surface_reference(&vc4->zs_write, NULL);
+                pipe_surface_reference(&vc4->zs_read, NULL);
+        }
 
         vc4_job_submit(vc4);
 }
@@ -410,12 +177,19 @@ vc4_context_destroy(struct pipe_context *pctx)
         if (vc4->primconvert)
                 util_primconvert_destroy(vc4->primconvert);
 
+        if (vc4->uploader)
+                u_upload_destroy(vc4->uploader);
+
         util_slab_destroy(&vc4->transfer_pool);
 
         pipe_surface_reference(&vc4->framebuffer.cbufs[0], NULL);
         pipe_surface_reference(&vc4->framebuffer.zsbuf, NULL);
-        vc4_bo_unreference(&vc4->tile_alloc);
-        vc4_bo_unreference(&vc4->tile_state);
+
+        /* Drop the surface references vc4_flush() holds for job submission. */
+        pipe_surface_reference(&vc4->color_read, NULL);
+        pipe_surface_reference(&vc4->color_write, NULL);
+        pipe_surface_reference(&vc4->zs_read, NULL);
+        pipe_surface_reference(&vc4->zs_write, NULL);
 
         vc4_program_fini(pctx);
 
@@ -466,6 +234,12 @@ vc4_context_create(struct pipe_screen *pscreen, void *priv)
         if (!vc4->primconvert)
                 goto fail;
 
+        /* Upload manager for index-buffer data; creation may return NULL. */
+        vc4->uploader = u_upload_create(pctx, 16 * 1024, 4,
+                                        PIPE_BIND_INDEX_BUFFER);
+        if (!vc4->uploader)
+                goto fail;
+
         vc4_debug |= saved_shaderdb_flag;
 
         return &vc4->base;
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index d89f197..d5d6be1 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -178,12 +178,18 @@ struct vc4_context {
         struct vc4_screen *screen;
 
         struct vc4_cl bcl;
-        struct vc4_cl rcl;
         struct vc4_cl shader_rec;
         struct vc4_cl uniforms;
         struct vc4_cl bo_handles;
         struct vc4_cl bo_pointers;
         uint32_t shader_rec_count;
+
+        /** @{ Surfaces to submit rendering for. */
+        struct pipe_surface *color_read;
+        struct pipe_surface *color_write;
+        struct pipe_surface *zs_read;
+        struct pipe_surface *zs_write;
+        /** @} */
         /** @{
          * Bounding box of the scissor across all queued drawing.
          *
@@ -194,9 +200,13 @@ struct vc4_context {
         uint32_t draw_max_x;
         uint32_t draw_max_y;
         /** @} */
-
-        struct vc4_bo *tile_alloc;
-        struct vc4_bo *tile_state;
+        /** @{
+         * Width/height of the color framebuffer being rendered to,
+         * for VC4_PACKET_TILE_RENDERING_MODE_CONFIG.
+         */
+        uint32_t draw_width;
+        uint32_t draw_height;
+        /** @} */
 
         struct util_slab_mempool transfer_pool;
         struct blitter_context *blitter;
@@ -243,6 +253,8 @@ struct vc4_context {
         /** Seqno of the last CL flush's job. */
         uint64_t last_emit_seqno;
 
+        struct u_upload_mgr *uploader;
+
         /** @{ Current pipeline state objects */
         struct pipe_scissor_state scissor;
         struct pipe_blend_state *blend;
diff --git a/src/gallium/drivers/vc4/vc4_draw.c b/src/gallium/drivers/vc4/vc4_draw.c
index 16418bf..5e6d70d 100644
--- a/src/gallium/drivers/vc4/vc4_draw.c
+++ b/src/gallium/drivers/vc4/vc4_draw.c
@@ -72,44 +72,15 @@ vc4_start_draw(struct vc4_context *vc4)
         uint32_t tilew = align(width, 64) / 64;
         uint32_t tileh = align(height, 64) / 64;
 
-        /* Tile alloc memory setup: We use an initial alloc size of 32b.  The
-         * hardware then aligns that to 256b (we use 4096, because all of our
-         * BO allocations align to that anyway), then for some reason the
-         * simulator wants an extra page available, even if you have overflow
-         * memory set up.
-         *
-         * XXX: The binner only does 28-bit addressing math, so the tile alloc
-         * and tile state should be in the same BO and that BO needs to not
-         * cross a 256MB boundary, somehow.
-         */
-        uint32_t tile_alloc_size = 32 * tilew * tileh;
-        tile_alloc_size = align(tile_alloc_size, 4096);
-        tile_alloc_size += 4096;
-        uint32_t tile_state_size = 48 * tilew * tileh;
-        if (!vc4->tile_alloc || vc4->tile_alloc->size < tile_alloc_size) {
-                vc4_bo_unreference(&vc4->tile_alloc);
-                vc4->tile_alloc = vc4_bo_alloc(vc4->screen, tile_alloc_size,
-                                               "tile_alloc");
-        }
-        if (!vc4->tile_state || vc4->tile_state->size < tile_state_size) {
-                vc4_bo_unreference(&vc4->tile_state);
-                vc4->tile_state = vc4_bo_alloc(vc4->screen, tile_state_size,
-                                               "tile_state");
-        }
-
         //   Tile state data is 48 bytes per tile, I think it can be thrown away
         //   as soon as binning is finished.
-        cl_start_reloc(&vc4->bcl, 2);
         cl_u8(&vc4->bcl, VC4_PACKET_TILE_BINNING_MODE_CONFIG);
-        cl_reloc(vc4, &vc4->bcl, vc4->tile_alloc, 0);
-        cl_u32(&vc4->bcl, vc4->tile_alloc->size);
-        cl_reloc(vc4, &vc4->bcl, vc4->tile_state, 0);
+        cl_u32(&vc4->bcl, 0); /* tile alloc addr, filled by kernel */
+        cl_u32(&vc4->bcl, 0); /* tile alloc size, filled by kernel */
+        cl_u32(&vc4->bcl, 0); /* tile state addr, filled by kernel */
         cl_u8(&vc4->bcl, tilew);
         cl_u8(&vc4->bcl, tileh);
-        cl_u8(&vc4->bcl,
-              VC4_BIN_CONFIG_AUTO_INIT_TSDA |
-              VC4_BIN_CONFIG_ALLOC_BLOCK_SIZE_32 |
-              VC4_BIN_CONFIG_ALLOC_INIT_BLOCK_SIZE_32);
+        cl_u8(&vc4->bcl, 0); /* flags, filled by kernel. */
 
         /* START_TILE_BINNING resets the statechange counters in the hardware,
          * which are what is used when a primitive is binned to a tile to
@@ -129,6 +100,8 @@ vc4_start_draw(struct vc4_context *vc4)
 
         vc4->needs_flush = true;
         vc4->draw_call_queued = true;
+        vc4->draw_width = width;
+        vc4->draw_height = height;
 }
 
 static void
@@ -266,13 +239,17 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
          * definitions, up to but not including QUADS.
          */
         if (info->indexed) {
-                struct vc4_resource *rsc = vc4_resource(vc4->indexbuf.buffer);
                 uint32_t offset = vc4->indexbuf.offset;
                 uint32_t index_size = vc4->indexbuf.index_size;
-                if (rsc->shadow_parent) {
-                        vc4_update_shadow_index_buffer(pctx, &vc4->indexbuf);
-                        offset = 0;
+                struct pipe_resource *prsc;
+                if (vc4->indexbuf.index_size == 4) {
+                        prsc = vc4_get_shadow_index_buffer(pctx, &vc4->indexbuf,
+                                                           info->count, &offset);
+                        index_size = 2;
+                } else {
+                        prsc = vc4->indexbuf.buffer;
                 }
+                struct vc4_resource *rsc = vc4_resource(prsc);
 
                 cl_start_reloc(&vc4->bcl, 1);
                 cl_u8(&vc4->bcl, VC4_PACKET_GL_INDEXED_PRIMITIVE);
@@ -284,6 +261,9 @@ vc4_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info)
                 cl_u32(&vc4->bcl, info->count);
                 cl_reloc(vc4, &vc4->bcl, rsc->bo, offset);
                 cl_u32(&vc4->bcl, max_index);
+
+                if (vc4->indexbuf.index_size == 4)
+                        pipe_resource_reference(&prsc, NULL);
         } else {
                 cl_u8(&vc4->bcl, VC4_PACKET_GL_ARRAY_PRIMITIVE);
                 cl_u8(&vc4->bcl, info->mode);
diff --git a/src/gallium/drivers/vc4/vc4_drm.h b/src/gallium/drivers/vc4/vc4_drm.h
index 062fd3b..5f1ee4f 100644
--- a/src/gallium/drivers/vc4/vc4_drm.h
+++ b/src/gallium/drivers/vc4/vc4_drm.h
@@ -38,6 +38,15 @@
 #define DRM_IOCTL_VC4_CREATE_BO           DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_CREATE_BO, struct drm_vc4_create_bo)
 #define DRM_IOCTL_VC4_MMAP_BO             DRM_IOWR( DRM_COMMAND_BASE + DRM_VC4_MMAP_BO, struct drm_vc4_mmap_bo)
 
+struct drm_vc4_submit_rcl_surface {
+	uint32_t hindex; /* Handle index, or ~0 if not present. */
+	uint32_t offset; /* Offset to start of buffer. */
+	/*
+	 * Bits for either render config (color_ms_write) or load/store packet.
+	 */
+	uint16_t bits;
+	uint16_t pad;
+};
 
 /**
  * struct drm_vc4_submit_cl - ioctl argument for submitting commands to the 3D
@@ -62,16 +71,6 @@ struct drm_vc4_submit_cl {
 	 */
 	uint64_t bin_cl;
 
-	/* Pointer to the render command list.
-	 *
-	 * The render command list contains a set of packets to load the
-	 * current tile's state (reading from memory, or just clearing it)
-	 * into the GPU, then call into the tile allocation BO to run the
-	 * stored rendering for that tile, then store the tile's state back to
-	 * memory.
-	 */
-	uint64_t render_cl;
-
 	/* Pointer to the shader records.
 	 *
 	 * Shader records are the structures read by the hardware that contain
@@ -102,8 +101,6 @@ struct drm_vc4_submit_cl {
 
 	/* Size in bytes of the binner command list. */
 	uint32_t bin_cl_size;
-	/* Size in bytes of the render command list */
-	uint32_t render_cl_size;
 	/* Size in bytes of the set of shader records. */
 	uint32_t shader_rec_size;
 	/* Number of shader records.
@@ -119,8 +116,25 @@ struct drm_vc4_submit_cl {
 	/* Number of BO handles passed in (size is that times 4). */
 	uint32_t bo_handle_count;
 
+	/* RCL setup: */
+	uint16_t width;
+	uint16_t height;
+	uint8_t min_x_tile;
+	uint8_t min_y_tile;
+	uint8_t max_x_tile;
+	uint8_t max_y_tile;
+	struct drm_vc4_submit_rcl_surface color_read;
+	struct drm_vc4_submit_rcl_surface color_ms_write;
+	struct drm_vc4_submit_rcl_surface zs_read;
+	struct drm_vc4_submit_rcl_surface zs_write;
+	uint32_t clear_color[2];
+	uint32_t clear_z;
+	uint8_t clear_s;
+
+	uint32_t pad:24;
+
+#define VC4_SUBMIT_CL_USE_CLEAR_COLOR			(1 << 0)
 	uint32_t flags;
-	uint32_t pad;
 
 	/* Returned value of the seqno of this render job (for the
 	 * wait ioctl).
diff --git a/src/gallium/drivers/vc4/vc4_job.c b/src/gallium/drivers/vc4/vc4_job.c
index 7603716..dcade15 100644
--- a/src/gallium/drivers/vc4/vc4_job.c
+++ b/src/gallium/drivers/vc4/vc4_job.c
@@ -33,7 +33,6 @@ void
 vc4_job_init(struct vc4_context *vc4)
 {
         vc4_init_cl(vc4, &vc4->bcl);
-        vc4_init_cl(vc4, &vc4->rcl);
         vc4_init_cl(vc4, &vc4->shader_rec);
         vc4_init_cl(vc4, &vc4->uniforms);
         vc4_init_cl(vc4, &vc4->bo_handles);
@@ -50,7 +49,6 @@ vc4_job_reset(struct vc4_context *vc4)
                 vc4_bo_unreference(&referenced_bos[i]);
         }
         vc4_reset_cl(&vc4->bcl);
-        vc4_reset_cl(&vc4->rcl);
         vc4_reset_cl(&vc4->shader_rec);
         vc4_reset_cl(&vc4->uniforms);
         vc4_reset_cl(&vc4->bo_handles);
@@ -75,6 +73,70 @@ vc4_job_reset(struct vc4_context *vc4)
         vc4->draw_max_y = 0;
 }
 
+static void
+vc4_submit_setup_rcl_surface(struct vc4_context *vc4,
+                             struct drm_vc4_submit_rcl_surface *submit_surf,
+                             struct pipe_surface *psurf,
+                             bool is_depth, bool is_write)
+{
+        struct vc4_surface *surf = vc4_surface(psurf);
+
+        if (!surf) {
+                submit_surf->hindex = ~0; /* ~0 marks the slot as absent */
+                return;
+        }
+
+        struct vc4_resource *rsc = vc4_resource(psurf->texture);
+        submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
+        submit_surf->offset = surf->offset;
+
+        if (is_depth) { /* Z/S: only the buffer-select field is set */
+                submit_surf->bits =
+                        VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_ZS,
+                                      VC4_LOADSTORE_TILE_BUFFER_BUFFER);
+
+        } else { /* color: buffer select plus 565/8888 pixel format */
+                submit_surf->bits =
+                        VC4_SET_FIELD(VC4_LOADSTORE_TILE_BUFFER_COLOR,
+                                      VC4_LOADSTORE_TILE_BUFFER_BUFFER) |
+                        VC4_SET_FIELD(vc4_rt_format_is_565(psurf->format) ?
+                                      VC4_LOADSTORE_TILE_BUFFER_BGR565 :
+                                      VC4_LOADSTORE_TILE_BUFFER_RGBA8888,
+                                      VC4_LOADSTORE_TILE_BUFFER_FORMAT);
+        }
+        submit_surf->bits |= /* tiling is OR'd in for both kinds */
+                VC4_SET_FIELD(surf->tiling, VC4_LOADSTORE_TILE_BUFFER_TILING);
+
+        if (is_write)
+                rsc->writes++; /* bump the resource's write counter */
+}
+
+static void
+vc4_submit_setup_ms_rcl_surface(struct vc4_context *vc4,
+                                struct drm_vc4_submit_rcl_surface *submit_surf,
+                                struct pipe_surface *psurf)
+{
+        struct vc4_surface *surf = vc4_surface(psurf);
+
+        if (!surf) {
+                submit_surf->hindex = ~0; /* ~0 marks the slot as absent */
+                return;
+        }
+
+        struct vc4_resource *rsc = vc4_resource(psurf->texture);
+        submit_surf->hindex = vc4_gem_hindex(vc4, rsc->bo);
+        submit_surf->offset = surf->offset;
+
+        submit_surf->bits = /* render-config encoding, not load/store */
+                VC4_SET_FIELD(vc4_rt_format_is_565(surf->base.format) ?
+                              VC4_RENDER_CONFIG_FORMAT_BGR565 :
+                              VC4_RENDER_CONFIG_FORMAT_RGBA8888,
+                              VC4_RENDER_CONFIG_FORMAT) |
+                VC4_SET_FIELD(surf->tiling, VC4_RENDER_CONFIG_MEMORY_FORMAT);
+
+        rsc->writes++; /* unconditional: bump the resource's write counter */
+}
+
 /**
  * Submits the job to the kernel and then reinitializes it.
  */
@@ -84,26 +146,49 @@ vc4_job_submit(struct vc4_context *vc4)
         if (vc4_debug & VC4_DEBUG_CL) {
                 fprintf(stderr, "BCL:\n");
                 vc4_dump_cl(vc4->bcl.base, vc4->bcl.next - vc4->bcl.base, false);
-                fprintf(stderr, "RCL:\n");
-                vc4_dump_cl(vc4->rcl.base, vc4->rcl.next - vc4->rcl.base, true);
         }
 
         struct drm_vc4_submit_cl submit;
         memset(&submit, 0, sizeof(submit));
 
+        cl_ensure_space(&vc4->bo_handles, 4 * sizeof(uint32_t));
+        cl_ensure_space(&vc4->bo_pointers, 4 * sizeof(struct vc4_bo *));
+
+        vc4_submit_setup_rcl_surface(vc4, &submit.color_read,
+                                     vc4->color_read, false, false);
+        vc4_submit_setup_ms_rcl_surface(vc4, &submit.color_ms_write,
+                                        vc4->color_write);
+        vc4_submit_setup_rcl_surface(vc4, &submit.zs_read,
+                                     vc4->zs_read, true, false);
+        vc4_submit_setup_rcl_surface(vc4, &submit.zs_write,
+                                     vc4->zs_write, true, true);
+
         submit.bo_handles = (uintptr_t)vc4->bo_handles.base;
         submit.bo_handle_count = (vc4->bo_handles.next -
                                   vc4->bo_handles.base) / 4;
         submit.bin_cl = (uintptr_t)vc4->bcl.base;
         submit.bin_cl_size = vc4->bcl.next - vc4->bcl.base;
-        submit.render_cl = (uintptr_t)vc4->rcl.base;
-        submit.render_cl_size = vc4->rcl.next - vc4->rcl.base;
         submit.shader_rec = (uintptr_t)vc4->shader_rec.base;
         submit.shader_rec_size = vc4->shader_rec.next - vc4->shader_rec.base;
         submit.shader_rec_count = vc4->shader_rec_count;
         submit.uniforms = (uintptr_t)vc4->uniforms.base;
         submit.uniforms_size = vc4->uniforms.next - vc4->uniforms.base;
 
+        assert(vc4->draw_min_x != ~0 && vc4->draw_min_y != ~0);
+        submit.min_x_tile = vc4->draw_min_x / 64;
+        submit.min_y_tile = vc4->draw_min_y / 64;
+        submit.max_x_tile = (vc4->draw_max_x - 1) / 64;
+        submit.max_y_tile = (vc4->draw_max_y - 1) / 64;
+        submit.width = vc4->draw_width;
+        submit.height = vc4->draw_height;
+        if (vc4->cleared) {
+                submit.flags |= VC4_SUBMIT_CL_USE_CLEAR_COLOR;
+                submit.clear_color[0] = vc4->clear_color[0];
+                submit.clear_color[1] = vc4->clear_color[1];
+                submit.clear_z = vc4->clear_depth;
+                submit.clear_s = vc4->clear_stencil;
+        }
+
         if (!(vc4_debug & VC4_DEBUG_NORAST)) {
                 int ret;
 
diff --git a/src/gallium/drivers/vc4/vc4_opt_algebraic.c b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
index e40e0f3..7978ea1 100644
--- a/src/gallium/drivers/vc4/vc4_opt_algebraic.c
+++ b/src/gallium/drivers/vc4/vc4_opt_algebraic.c
@@ -136,11 +136,8 @@ bool
 qir_opt_algebraic(struct vc4_compile *c)
 {
         bool progress = false;
-        struct simple_node *node;
-
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
 
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 switch (inst->op) {
                 case QOP_SEL_X_Y_ZS:
                 case QOP_SEL_X_Y_ZC:
diff --git a/src/gallium/drivers/vc4/vc4_opt_constant_folding.c b/src/gallium/drivers/vc4/vc4_opt_constant_folding.c
index ac9be5c..15ec9f0 100644
--- a/src/gallium/drivers/vc4/vc4_opt_constant_folding.c
+++ b/src/gallium/drivers/vc4/vc4_opt_constant_folding.c
@@ -98,10 +98,8 @@ bool
 qir_opt_constant_folding(struct vc4_compile *c)
 {
         bool progress = false;
-        struct simple_node *node;
 
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 if (constant_fold(c, inst))
                         progress = true;
         }
diff --git a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
index 5189a40..d6d2fbf 100644
--- a/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
+++ b/src/gallium/drivers/vc4/vc4_opt_copy_propagation.c
@@ -38,13 +38,10 @@ bool
 qir_opt_copy_propagation(struct vc4_compile *c)
 {
         bool progress = false;
-        struct simple_node *node;
         bool debug = false;
         struct qreg *movs = calloc(c->num_temps, sizeof(struct qreg));
 
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
-
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                         int index = inst->src[i].index;
                         if (inst->src[i].file == QFILE_TEMP &&
diff --git a/src/gallium/drivers/vc4/vc4_opt_cse.c b/src/gallium/drivers/vc4/vc4_opt_cse.c
index 71794f7..92c8260 100644
--- a/src/gallium/drivers/vc4/vc4_opt_cse.c
+++ b/src/gallium/drivers/vc4/vc4_opt_cse.c
@@ -121,7 +121,6 @@ bool
 qir_opt_cse(struct vc4_compile *c)
 {
         bool progress = false;
-        struct simple_node *node, *t;
         uint32_t sf_count = 0, r4_count = 0;
 
         struct hash_table *ht = _mesa_hash_table_create(NULL, NULL,
@@ -129,9 +128,7 @@ qir_opt_cse(struct vc4_compile *c)
         if (!ht)
                 return false;
 
-        foreach_s(node, t, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
-
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 if (qir_has_side_effects(c, inst) ||
                     qir_has_side_effect_reads(c, inst)) {
                         continue;
diff --git a/src/gallium/drivers/vc4/vc4_opt_dead_code.c b/src/gallium/drivers/vc4/vc4_opt_dead_code.c
index e4ead46..ffd4242 100644
--- a/src/gallium/drivers/vc4/vc4_opt_dead_code.c
+++ b/src/gallium/drivers/vc4/vc4_opt_dead_code.c
@@ -86,7 +86,7 @@ qir_opt_dead_code(struct vc4_compile *c)
         /* Whether we're eliminating texture setup currently. */
         bool dce_tex = false;
 
-        struct simple_node *node, *t;
+        struct list_head *node, *t;
         for (node = c->instructions.prev, t = node->prev;
              &c->instructions != node;
              node = t, t = t->prev) {
diff --git a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
index a329ac6..d6e98f0 100644
--- a/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
+++ b/src/gallium/drivers/vc4/vc4_opt_small_immediates.c
@@ -37,11 +37,8 @@ bool
 qir_opt_small_immediates(struct vc4_compile *c)
 {
         bool progress = false;
-        struct simple_node *node;
-
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
 
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 /* The small immediate value sits in the raddr B field, so we
                  * can't have 2 small immediates in one instruction (unless
                  * they're the same value, but that should be optimized away
diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
index e9711f2..e04f028 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c
@@ -37,15 +37,12 @@ qir_opt_vpm_writes(struct vc4_compile *c)
                 return false;
 
         bool progress = false;
-        struct simple_node *node;
         struct qinst *vpm_writes[64] = { 0 };
         uint32_t use_count[c->num_temps];
         uint32_t vpm_write_count = 0;
         memset(&use_count, 0, sizeof(use_count));
 
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
-
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 switch (inst->dst.file) {
                 case QFILE_VPM:
                         vpm_writes[vpm_write_count++] = inst;
@@ -102,7 +99,8 @@ qir_opt_vpm_writes(struct vc4_compile *c)
                  * to maintain the order of the VPM writes.
                  */
                 assert(!vpm_writes[i]->sf);
-                move_to_tail(&vpm_writes[i]->link, &inst->link);
+                list_del(&inst->link);
+                list_addtail(&inst->link, &vpm_writes[i]->link);
                 qir_remove_instruction(c, vpm_writes[i]);
 
                 c->defs[inst->dst.index] = NULL;
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index bf156f9..ba47c51 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -147,6 +147,9 @@ indirect_uniform_load(struct vc4_compile *c,
         indirect_offset = qir_ADD(c, indirect_offset,
                                   qir_uniform_ui(c, (range->dst_offset +
                                                      offset)));
+
+        /* Clamp to [0, dst_offset + size - 4].  MIN/MAX are signed. */
+        indirect_offset = qir_MAX(c, indirect_offset, qir_uniform_ui(c, 0));
         indirect_offset = qir_MIN(c, indirect_offset,
                                   qir_uniform_ui(c, (range->dst_offset +
                                                      range->size - 4)));
@@ -322,7 +325,9 @@ ntq_emit_tex(struct vc4_compile *c, nir_tex_instr *instr)
                 switch (instr->src[i].src_type) {
                 case nir_tex_src_coord:
                         s = ntq_get_src(c, instr->src[i].src, 0);
-                        if (instr->sampler_dim != GLSL_SAMPLER_DIM_1D)
+                        if (instr->sampler_dim == GLSL_SAMPLER_DIM_1D)
+                                t = qir_uniform_f(c, 0.5);
+                        else
                                 t = ntq_get_src(c, instr->src[i].src, 1);
                         if (instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
                                 r = ntq_get_src(c, instr->src[i].src, 2);
@@ -1849,8 +1854,6 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
 
         switch (instr->intrinsic) {
         case nir_intrinsic_load_uniform:
-                assert(instr->const_index[1] == 1);
-
                 for (int i = 0; i < instr->num_components; i++) {
                         dest[i] = qir_uniform(c, QUNIFORM_UNIFORM,
                                               instr->const_index[0] * 4 + i);
@@ -1858,8 +1861,6 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_uniform_indirect:
-                assert(instr->const_index[1] == 1);
-
                 for (int i = 0; i < instr->num_components; i++) {
                         dest[i] = indirect_uniform_load(c,
                                                         ntq_get_src(c, instr->src[0], 0),
@@ -1870,8 +1871,6 @@ ntq_emit_intrinsic(struct vc4_compile *c, nir_intrinsic_instr *instr)
                 break;
 
         case nir_intrinsic_load_input:
-                assert(instr->const_index[1] == 1);
-
                 for (int i = 0; i < instr->num_components; i++)
                         dest[i] = c->inputs[instr->const_index[0] * 4 + i];
 
@@ -2215,11 +2214,9 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
         shader->program_id = vc4->next_compiled_program_id++;
         if (stage == QSTAGE_FRAG) {
                 bool input_live[c->num_input_semantics];
-                struct simple_node *node;
 
                 memset(input_live, 0, sizeof(input_live));
-                foreach(node, &c->instructions) {
-                        struct qinst *inst = (struct qinst *)node;
+                list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                         for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                                 if (inst->src[i].file == QFILE_VARY)
                                         input_live[inst->src[i].index] = true;
diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c
index e2e6a5c..1c96ef4 100644
--- a/src/gallium/drivers/vc4/vc4_qir.c
+++ b/src/gallium/drivers/vc4/vc4_qir.c
@@ -22,7 +22,6 @@
  */
 
 #include "util/u_memory.h"
-#include "util/simple_list.h"
 #include "util/ralloc.h"
 
 #include "vc4_qir.h"
@@ -301,10 +300,7 @@ qir_dump_inst(struct vc4_compile *c, struct qinst *inst)
 void
 qir_dump(struct vc4_compile *c)
 {
-        struct simple_node *node;
-
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 qir_dump_inst(c, inst);
                 fprintf(stderr, "\n");
         }
@@ -370,7 +366,7 @@ qir_emit(struct vc4_compile *c, struct qinst *inst)
         if (inst->dst.file == QFILE_TEMP)
                 c->defs[inst->dst.index] = inst;
 
-        insert_at_tail(&c->instructions, &inst->link);
+        list_addtail(&inst->link, &c->instructions);
 }
 
 bool
@@ -384,7 +380,7 @@ qir_compile_init(void)
 {
         struct vc4_compile *c = rzalloc(NULL, struct vc4_compile);
 
-        make_empty_list(&c->instructions);
+        list_inithead(&c->instructions);
 
         c->output_position_index = -1;
         c->output_clipvertex_index = -1;
@@ -403,7 +399,7 @@ qir_remove_instruction(struct vc4_compile *c, struct qinst *qinst)
         if (qinst->dst.file == QFILE_TEMP)
                 c->defs[qinst->dst.index] = NULL;
 
-        remove_from_list(&qinst->link);
+        list_del(&qinst->link);
         free(qinst->src);
         free(qinst);
 }
@@ -420,9 +416,9 @@ qir_follow_movs(struct vc4_compile *c, struct qreg reg)
 void
 qir_compile_destroy(struct vc4_compile *c)
 {
-        while (!is_empty_list(&c->instructions)) {
+        while (!list_empty(&c->instructions)) {
                 struct qinst *qinst =
-                        (struct qinst *)first_elem(&c->instructions);
+                        (struct qinst *)c->instructions.next;
                 qir_remove_instruction(c, qinst);
         }
 
@@ -478,7 +474,7 @@ void
 qir_SF(struct vc4_compile *c, struct qreg src)
 {
         struct qinst *last_inst = NULL;
-        if (!is_empty_list(&c->instructions))
+        if (!list_empty(&c->instructions))
                 last_inst = (struct qinst *)c->instructions.prev;
 
         if (!last_inst ||
diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h
index adc2c89..732cfd0 100644
--- a/src/gallium/drivers/vc4/vc4_qir.h
+++ b/src/gallium/drivers/vc4/vc4_qir.h
@@ -33,7 +33,7 @@
 
 #include "util/macros.h"
 #include "glsl/nir/nir.h"
-#include "util/simple_list.h"
+#include "util/list.h"
 #include "util/u_math.h"
 
 enum qfile {
@@ -162,12 +162,12 @@ enum qop {
 };
 
 struct queued_qpu_inst {
-        struct simple_node link;
+        struct list_head link;
         uint64_t inst;
 };
 
 struct qinst {
-        struct simple_node link;
+        struct list_head link;
 
         enum qop op;
         struct qreg dst;
@@ -356,10 +356,10 @@ struct vc4_compile {
         struct qreg undef;
         enum qstage stage;
         uint32_t num_temps;
-        struct simple_node instructions;
+        struct list_head instructions;
         uint32_t immediates[1024];
 
-        struct simple_node qpu_inst_list;
+        struct list_head qpu_inst_list;
         uint64_t *qpu_insts;
         uint32_t qpu_inst_count;
         uint32_t qpu_inst_size;
diff --git a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
index 63f5eb2..910c89d 100644
--- a/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_qir_lower_uniforms.c
@@ -88,7 +88,6 @@ is_lowerable_uniform(struct qinst *inst, int i)
 void
 qir_lower_uniforms(struct vc4_compile *c)
 {
-        struct simple_node *node;
         struct hash_table *ht =
                 _mesa_hash_table_create(c, index_hash, index_compare);
 
@@ -96,8 +95,7 @@ qir_lower_uniforms(struct vc4_compile *c)
          * than one uniform referenced, and add those uniform values to the
          * ht.
          */
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 uint32_t nsrc = qir_get_op_nsrc(inst->op);
 
                 uint32_t count = 0;
@@ -137,10 +135,9 @@ qir_lower_uniforms(struct vc4_compile *c)
                 struct qreg temp = qir_get_temp(c);
                 struct qreg unif = { QFILE_UNIF, max_index };
                 struct qinst *mov = qir_inst(QOP_MOV, temp, unif, c->undef);
-                insert_at_head(&c->instructions, &mov->link);
+                list_add(&mov->link, &c->instructions);
                 c->defs[temp.index] = mov;
-                foreach(node, &c->instructions) {
-                        struct qinst *inst = (struct qinst *)node;
+                list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                         uint32_t nsrc = qir_get_op_nsrc(inst->op);
 
                         uint32_t count = 0;
diff --git a/src/gallium/drivers/vc4/vc4_qpu_emit.c b/src/gallium/drivers/vc4/vc4_qpu_emit.c
index eeb8d3a..99afe4b 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_emit.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_emit.c
@@ -47,14 +47,14 @@ queue(struct vc4_compile *c, uint64_t inst)
 {
         struct queued_qpu_inst *q = rzalloc(c, struct queued_qpu_inst);
         q->inst = inst;
-        insert_at_tail(&c->qpu_inst_list, &q->link);
+        list_addtail(&q->link, &c->qpu_inst_list);
 }
 
 static uint64_t *
 last_inst(struct vc4_compile *c)
 {
         struct queued_qpu_inst *q =
-                (struct queued_qpu_inst *)last_elem(&c->qpu_inst_list);
+                (struct queued_qpu_inst *)c->qpu_inst_list.prev;
         return &q->inst;
 }
 
@@ -117,11 +117,11 @@ fixup_raddr_conflict(struct vc4_compile *c,
                 return;
 
         if (mux0 == QPU_MUX_A) {
-                queue(c, qpu_a_MOV(qpu_rb(31), *src1));
-                *src1 = qpu_rb(31);
+                queue(c, qpu_a_MOV(qpu_rb(31), *src0));
+                *src0 = qpu_rb(31);
         } else {
-                queue(c, qpu_a_MOV(qpu_ra(31), *src1));
-                *src1 = qpu_ra(31);
+                queue(c, qpu_a_MOV(qpu_ra(31), *src0));
+                *src0 = qpu_ra(31);
         }
 }
 
@@ -144,7 +144,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 QPU_UNPACK_16B_TO_F32,
         };
 
-        make_empty_list(&c->qpu_inst_list);
+        list_inithead(&c->qpu_inst_list);
 
         switch (c->stage) {
         case QSTAGE_VERT:
@@ -170,10 +170,7 @@ vc4_generate_code(struct vc4_context *vc4, struct vc4_compile *c)
                 break;
         }
 
-        struct simple_node *node;
-        foreach(node, &c->instructions) {
-                struct qinst *qinst = (struct qinst *)node;
-
+        list_for_each_entry(struct qinst, qinst, &c->instructions, link) {
 #if 0
                 fprintf(stderr, "translating qinst to qpu: ");
                 qir_dump_inst(qinst);
diff --git a/src/gallium/drivers/vc4/vc4_qpu_schedule.c b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
index f523b4c..19cbf7b 100644
--- a/src/gallium/drivers/vc4/vc4_qpu_schedule.c
+++ b/src/gallium/drivers/vc4/vc4_qpu_schedule.c
@@ -43,7 +43,7 @@ static bool debug;
 struct schedule_node_child;
 
 struct schedule_node {
-        struct simple_node link;
+        struct list_head link;
         struct queued_qpu_inst *inst;
         struct schedule_node_child *children;
         uint32_t child_count;
@@ -400,22 +400,21 @@ calculate_deps(struct schedule_state *state, struct schedule_node *n)
 }
 
 static void
-calculate_forward_deps(struct vc4_compile *c, struct simple_node *schedule_list)
+calculate_forward_deps(struct vc4_compile *c, struct list_head *schedule_list)
 {
-        struct simple_node *node;
         struct schedule_state state;
 
         memset(&state, 0, sizeof(state));
         state.dir = F;
 
-        foreach(node, schedule_list)
-                calculate_deps(&state, (struct schedule_node *)node);
+        list_for_each_entry(struct schedule_node, node, schedule_list, link)
+                calculate_deps(&state, node);
 }
 
 static void
-calculate_reverse_deps(struct vc4_compile *c, struct simple_node *schedule_list)
+calculate_reverse_deps(struct vc4_compile *c, struct list_head *schedule_list)
 {
-        struct simple_node *node;
+        struct list_head *node;
         struct schedule_state state;
 
         memset(&state, 0, sizeof(state));
@@ -507,15 +506,13 @@ get_instruction_priority(uint64_t inst)
 
 static struct schedule_node *
 choose_instruction_to_schedule(struct choose_scoreboard *scoreboard,
-                               struct simple_node *schedule_list,
+                               struct list_head *schedule_list,
                                struct schedule_node *prev_inst)
 {
         struct schedule_node *chosen = NULL;
-        struct simple_node *node;
         int chosen_prio = 0;
 
-        foreach(node, schedule_list) {
-                struct schedule_node *n = (struct schedule_node *)node;
+        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                 uint64_t inst = n->inst->inst;
 
                 /* "An instruction must not read from a location in physical
@@ -596,14 +593,11 @@ update_scoreboard_for_chosen(struct choose_scoreboard *scoreboard,
 }
 
 static void
-dump_state(struct simple_node *schedule_list)
+dump_state(struct list_head *schedule_list)
 {
-        struct simple_node *node;
-
         uint32_t i = 0;
-        foreach(node, schedule_list) {
-                struct schedule_node *n = (struct schedule_node *)node;
 
+        list_for_each_entry(struct schedule_node, n, schedule_list, link) {
                 fprintf(stderr, "%3d: ", i++);
                 vc4_qpu_disasm(&n->inst->inst, 1);
                 fprintf(stderr, "\n");
@@ -639,7 +633,7 @@ compute_delay(struct schedule_node *n)
 }
 
 static void
-mark_instruction_scheduled(struct simple_node *schedule_list,
+mark_instruction_scheduled(struct list_head *schedule_list,
                            struct schedule_node *node,
                            bool war_only)
 {
@@ -658,16 +652,15 @@ mark_instruction_scheduled(struct simple_node *schedule_list,
 
                 child->parent_count--;
                 if (child->parent_count == 0)
-                        insert_at_head(schedule_list, &child->link);
+                        list_add(&child->link, schedule_list);
 
                 node->children[i].node = NULL;
         }
 }
 
 static void
-schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
+schedule_instructions(struct vc4_compile *c, struct list_head *schedule_list)
 {
-        struct simple_node *node, *t;
         struct choose_scoreboard scoreboard;
 
         /* We reorder the uniforms as we schedule instructions, so save the
@@ -693,14 +686,12 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
         }
 
         /* Remove non-DAG heads from the list. */
-        foreach_s(node, t, schedule_list) {
-                struct schedule_node *n = (struct schedule_node *)node;
-
+        list_for_each_entry_safe(struct schedule_node, n, schedule_list, link) {
                 if (n->parent_count != 0)
-                        remove_from_list(&n->link);
+                        list_del(&n->link);
         }
 
-        while (!is_empty_list(schedule_list)) {
+        while (!list_empty(schedule_list)) {
                 struct schedule_node *chosen =
                         choose_instruction_to_schedule(&scoreboard,
                                                        schedule_list,
@@ -724,7 +715,7 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
                  * find an instruction to pair with it.
                  */
                 if (chosen) {
-                        remove_from_list(&chosen->link);
+                        list_del(&chosen->link);
                         mark_instruction_scheduled(schedule_list, chosen, true);
                         if (chosen->uniform != -1) {
                                 c->uniform_data[next_uniform] =
@@ -738,7 +729,7 @@ schedule_instructions(struct vc4_compile *c, struct simple_node *schedule_list)
                                                                schedule_list,
                                                                chosen);
                         if (merge) {
-                                remove_from_list(&merge->link);
+                                list_del(&merge->link);
                                 inst = qpu_merge_inst(inst, merge->inst->inst);
                                 assert(inst != 0);
                                 if (merge->uniform != -1) {
@@ -813,16 +804,14 @@ void
 qpu_schedule_instructions(struct vc4_compile *c)
 {
         void *mem_ctx = ralloc_context(NULL);
-        struct simple_node schedule_list;
-        struct simple_node *node;
+        struct list_head schedule_list;
 
-        make_empty_list(&schedule_list);
+        list_inithead(&schedule_list);
 
         if (debug) {
                 fprintf(stderr, "Pre-schedule instructions\n");
-                foreach(node, &c->qpu_inst_list) {
-                        struct queued_qpu_inst *q =
-                                (struct queued_qpu_inst *)node;
+                list_for_each_entry(struct queued_qpu_inst, q,
+                                    &c->qpu_inst_list, link) {
                         vc4_qpu_disasm(&q->inst, 1);
                         fprintf(stderr, "\n");
                 }
@@ -831,7 +820,7 @@ qpu_schedule_instructions(struct vc4_compile *c)
 
         /* Wrap each instruction in a scheduler structure. */
         uint32_t next_uniform = 0;
-        while (!is_empty_list(&c->qpu_inst_list)) {
+        while (!list_empty(&c->qpu_inst_list)) {
                 struct queued_qpu_inst *inst =
                         (struct queued_qpu_inst *)c->qpu_inst_list.next;
                 struct schedule_node *n = rzalloc(mem_ctx, struct schedule_node);
@@ -844,16 +833,15 @@ qpu_schedule_instructions(struct vc4_compile *c)
                 } else {
                         n->uniform = -1;
                 }
-                remove_from_list(&inst->link);
-                insert_at_tail(&schedule_list, &n->link);
+                list_del(&inst->link);
+                list_addtail(&n->link, &schedule_list);
         }
         assert(next_uniform == c->num_uniforms);
 
         calculate_forward_deps(c, &schedule_list);
         calculate_reverse_deps(c, &schedule_list);
 
-        foreach(node, &schedule_list) {
-                struct schedule_node *n = (struct schedule_node *)node;
+        list_for_each_entry(struct schedule_node, n, &schedule_list, link) {
                 compute_delay(n);
         }
 
diff --git a/src/gallium/drivers/vc4/vc4_query.c b/src/gallium/drivers/vc4/vc4_query.c
index 1792bec..270832e 100644
--- a/src/gallium/drivers/vc4/vc4_query.c
+++ b/src/gallium/drivers/vc4/vc4_query.c
@@ -50,9 +50,10 @@ vc4_destroy_query(struct pipe_context *ctx, struct pipe_query *query)
         free(query);
 }
 
-static void
+static boolean
 vc4_begin_query(struct pipe_context *ctx, struct pipe_query *query)
 {
+        return true;
 }
 
 static void
diff --git a/src/gallium/drivers/vc4/vc4_register_allocate.c b/src/gallium/drivers/vc4/vc4_register_allocate.c
index f40547b..3b0b890 100644
--- a/src/gallium/drivers/vc4/vc4_register_allocate.c
+++ b/src/gallium/drivers/vc4/vc4_register_allocate.c
@@ -161,7 +161,6 @@ node_to_temp_priority(const void *in_a, const void *in_b)
 struct qpu_reg *
 vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
 {
-        struct simple_node *node;
         struct node_to_temp_map map[c->num_temps];
         uint32_t temp_to_node[c->num_temps];
         uint32_t def[c->num_temps];
@@ -189,9 +188,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         /* Compute the live ranges so we can figure out interference.
          */
         uint32_t ip = 0;
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
-
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 if (inst->dst.file == QFILE_TEMP) {
                         def[inst->dst.index] = ip;
                         use[inst->dst.index] = ip;
@@ -227,9 +224,7 @@ vc4_register_allocate(struct vc4_context *vc4, struct vc4_compile *c)
         }
 
         /* Figure out our register classes and preallocated registers*/
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
-
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 switch (inst->op) {
                 case QOP_FRAG_Z:
                         ra_set_node_reg(g, temp_to_node[inst->dst.index],
diff --git a/src/gallium/drivers/vc4/vc4_reorder_uniforms.c b/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
index 10972436..7f11fba 100644
--- a/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
+++ b/src/gallium/drivers/vc4/vc4_reorder_uniforms.c
@@ -42,10 +42,8 @@ qir_reorder_uniforms(struct vc4_compile *c)
         uint32_t *uniform_index = NULL;
         uint32_t uniform_index_size = 0;
         uint32_t next_uniform = 0;
-        struct simple_node *node;
-        foreach(node, &c->instructions) {
-                struct qinst *inst = (struct qinst *)node;
 
+        list_for_each_entry(struct qinst, inst, &c->instructions, link) {
                 for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) {
                         if (inst->src[i].file != QFILE_UNIF)
                                 continue;
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 3f180d5..cab7640 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -26,6 +26,7 @@
 #include "util/u_format.h"
 #include "util/u_inlines.h"
 #include "util/u_surface.h"
+#include "util/u_upload_mgr.h"
 
 #include "vc4_screen.h"
 #include "vc4_context.h"
@@ -161,6 +162,8 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
                 /* We need to align the box to utile boundaries, since that's
                  * what load/store operate on.
                  */
+                uint32_t orig_width = ptrans->box.width;
+                uint32_t orig_height = ptrans->box.height;
                 uint32_t box_start_x = ptrans->box.x & (utile_w - 1);
                 uint32_t box_start_y = ptrans->box.y & (utile_h - 1);
                 ptrans->box.width += box_start_x;
@@ -174,7 +177,9 @@ vc4_resource_transfer_map(struct pipe_context *pctx,
                 ptrans->layer_stride = ptrans->stride;
 
                 trans->map = malloc(ptrans->stride * ptrans->box.height);
-                if (usage & PIPE_TRANSFER_READ) {
+                if (usage & PIPE_TRANSFER_READ ||
+                    ptrans->box.width != orig_width ||
+                    ptrans->box.height != orig_height) {
                         vc4_load_tiled_image(trans->map, ptrans->stride,
                                              buf + slice->offset +
                                              box->z * rsc->cube_map_stride,
@@ -638,41 +643,37 @@ vc4_update_shadow_baselevel_texture(struct pipe_context *pctx,
  * was in user memory, it would be nice to not have uploaded it to a VBO
  * before translating.
  */
-void
-vc4_update_shadow_index_buffer(struct pipe_context *pctx,
-                               const struct pipe_index_buffer *ib)
+struct pipe_resource *
+vc4_get_shadow_index_buffer(struct pipe_context *pctx,
+                            const struct pipe_index_buffer *ib,
+                            uint32_t count,
+                            uint32_t *shadow_offset)
 {
-        struct vc4_resource *shadow = vc4_resource(ib->buffer);
-        struct vc4_resource *orig = vc4_resource(shadow->shadow_parent);
-        uint32_t count = shadow->base.b.width0 / 2;
-
-        if (shadow->writes == orig->writes)
-                return;
-
+        struct vc4_context *vc4 = vc4_context(pctx);
+        struct vc4_resource *orig = vc4_resource(ib->buffer);
         perf_debug("Fallback conversion for %d uint indices\n", count);
 
+        void *data;
+        struct pipe_resource *shadow_rsc = NULL;
+        u_upload_alloc(vc4->uploader, 0, count * 2,
+                       shadow_offset, &shadow_rsc, &data);
+        uint16_t *dst = data;
+
         struct pipe_transfer *src_transfer;
         uint32_t *src = pipe_buffer_map_range(pctx, &orig->base.b,
                                               ib->offset,
                                               count * 4,
                                               PIPE_TRANSFER_READ, &src_transfer);
 
-        struct pipe_transfer *dst_transfer;
-        uint16_t *dst = pipe_buffer_map_range(pctx, &shadow->base.b,
-                                              0,
-                                              count * 2,
-                                              PIPE_TRANSFER_WRITE, &dst_transfer);
-
         for (int i = 0; i < count; i++) {
                 uint32_t src_index = src[i];
                 assert(src_index <= 0xffff);
                 dst[i] = src_index;
         }
 
-        pctx->transfer_unmap(pctx, dst_transfer);
         pctx->transfer_unmap(pctx, src_transfer);
 
-        shadow->writes = orig->writes;
+        return shadow_rsc;
 }
 
 void
diff --git a/src/gallium/drivers/vc4/vc4_resource.h b/src/gallium/drivers/vc4/vc4_resource.h
index 2ed848b..ab8f5d3 100644
--- a/src/gallium/drivers/vc4/vc4_resource.h
+++ b/src/gallium/drivers/vc4/vc4_resource.h
@@ -26,7 +26,7 @@
 #define VC4_RESOURCE_H
 
 #include "vc4_screen.h"
-#include "vc4_packet.h"
+#include "kernel/vc4_packet.h"
 #include "util/u_transfer.h"
 
 struct vc4_transfer {
@@ -45,7 +45,6 @@ struct vc4_resource_slice {
 struct vc4_surface {
         struct pipe_surface base;
         uint32_t offset;
-        uint32_t stride;
         uint8_t tiling;
 };
 
@@ -107,8 +106,10 @@ struct pipe_resource *vc4_resource_create(struct pipe_screen *pscreen,
                                           const struct pipe_resource *tmpl);
 void vc4_update_shadow_baselevel_texture(struct pipe_context *pctx,
                                          struct pipe_sampler_view *view);
-void vc4_update_shadow_index_buffer(struct pipe_context *pctx,
-                                    const struct pipe_index_buffer *ib);
+struct pipe_resource *vc4_get_shadow_index_buffer(struct pipe_context *pctx,
+                                                  const struct pipe_index_buffer *ib,
+                                                  uint32_t count,
+                                                  uint32_t *offset);
 void vc4_dump_surface(struct pipe_surface *psurf);
 
 #endif /* VC4_RESOURCE_H */
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 84aae91..f63bead 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -175,6 +175,7 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
         case PIPE_CAP_POLYGON_OFFSET_CLAMP:
         case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
         case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+        case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
                 return 0;
 
                 /* Stream output. */
@@ -322,6 +323,7 @@ vc4_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
         case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED:
         case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
+        case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE:
                 return 0;
         case PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS:
         case PIPE_SHADER_CAP_MAX_SAMPLER_VIEWS:
@@ -458,7 +460,7 @@ vc4_screen_create(int fd)
         pscreen->is_format_supported = vc4_screen_is_format_supported;
 
         screen->fd = fd;
-        make_empty_list(&screen->bo_cache.time_list);
+        list_inithead(&screen->bo_cache.time_list);
 
         vc4_fence_init(screen);
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h
index 6062628..5992e37 100644
--- a/src/gallium/drivers/vc4/vc4_screen.h
+++ b/src/gallium/drivers/vc4/vc4_screen.h
@@ -27,7 +27,7 @@
 #include "pipe/p_screen.h"
 #include "os/os_thread.h"
 #include "state_tracker/drm_driver.h"
-#include "vc4_qir.h"
+#include "util/list.h"
 
 struct vc4_bo;
 
@@ -61,13 +61,19 @@ struct vc4_screen {
 
         struct vc4_bo_cache {
                 /** List of struct vc4_bo freed, by age. */
-                struct simple_node time_list;
+                struct list_head time_list;
                 /** List of struct vc4_bo freed, per size, by age. */
-                struct simple_node *size_list;
+                struct list_head *size_list;
                 uint32_t size_list_size;
 
                 pipe_mutex lock;
+
+                uint32_t bo_size;
+                uint32_t bo_count;
         } bo_cache;
+
+        uint32_t bo_size;
+        uint32_t bo_count;
 };
 
 static inline struct vc4_screen *
diff --git a/src/gallium/drivers/vc4/vc4_simulator.c b/src/gallium/drivers/vc4/vc4_simulator.c
index 2f72e72..b58013d 100644
--- a/src/gallium/drivers/vc4/vc4_simulator.c
+++ b/src/gallium/drivers/vc4/vc4_simulator.c
@@ -39,11 +39,13 @@ vc4_wrap_bo_with_cma(struct drm_device *dev, struct vc4_bo *bo)
 {
         struct vc4_context *vc4 = dev->vc4;
         struct vc4_screen *screen = vc4->screen;
-        struct drm_gem_cma_object *obj = CALLOC_STRUCT(drm_gem_cma_object);
+        struct drm_vc4_bo *drm_bo = CALLOC_STRUCT(drm_vc4_bo);
+        struct drm_gem_cma_object *obj = &drm_bo->base;
         uint32_t size = align(bo->size, 4096);
 
-        obj->bo = bo;
+        drm_bo->bo = bo;
         obj->base.size = size;
+        obj->base.dev = dev;
         obj->vaddr = screen->simulator_mem_base + dev->simulator_mem_next;
         obj->paddr = simpenrose_hw_addr(obj->vaddr);
 
@@ -94,7 +96,7 @@ vc4_simulator_unpin_bos(struct vc4_exec_info *exec)
 {
         for (int i = 0; i < exec->bo_count; i++) {
                 struct drm_gem_cma_object *obj = exec->bo[i].bo;
-                struct vc4_bo *bo = obj->bo;
+                struct vc4_bo *bo = to_vc4_bo(&obj->base)->bo;
 
                 memcpy(bo->map, obj->vaddr, bo->size);
 
@@ -124,6 +126,7 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
         int ret;
 
         memset(&exec, 0, sizeof(exec));
+        list_inithead(&exec.unref_list);
 
         if (ctex && ctex->bo->simulator_winsys_map) {
 #if 0
@@ -176,8 +179,12 @@ vc4_simulator_flush(struct vc4_context *vc4, struct drm_vc4_submit_cl *args)
         if (ret)
                 return ret;
 
-        vc4_bo_unreference(&exec.exec_bo->bo);
-        free(exec.exec_bo);
+        list_for_each_entry_safe(struct drm_vc4_bo, bo, &exec.unref_list,
+                                 unref_head) { /* _safe: bo is freed below */
+                list_del(&bo->unref_head);
+                vc4_bo_unreference(&bo->bo);
+                free(bo);
+        }
 
         if (ctex && ctex->bo->simulator_winsys_map) {
                 for (int y = 0; y < ctex->base.b.height0; y++) {
diff --git a/src/gallium/drivers/vc4/vc4_simulator_validate.h b/src/gallium/drivers/vc4/vc4_simulator_validate.h
index 1f0c6b6..2bb36b2 100644
--- a/src/gallium/drivers/vc4/vc4_simulator_validate.h
+++ b/src/gallium/drivers/vc4/vc4_simulator_validate.h
@@ -43,6 +43,7 @@ struct vc4_exec_info;
 #define kfree(ptr) free(ptr)
 #define krealloc(ptr, size, args) realloc(ptr, size)
 #define roundup(x, y) align(x, y)
+#define round_up(x, y) align(x, y)
 #define max(x, y) MAX2(x, y)
 #define min(x, y) MiN2(x, y)
 #define BUG_ON(condition) assert(!(condition))
@@ -63,16 +64,27 @@ struct drm_device {
         uint32_t simulator_mem_next;
 };
 
-struct drm_gem_cma_object {
-        struct vc4_bo *bo;
+struct drm_gem_object {
+        uint32_t size;          /* vc4_wrap_bo_with_cma(): 4096-aligned BO size */
+        struct drm_device *dev; /* vc4_wrap_bo_with_cma(): the owning device */
+};
 
-        struct {
-                uint32_t size;
-        } base;
+struct drm_gem_cma_object {
+        struct drm_gem_object base;
         uint32_t paddr;
         void *vaddr;
 };
 
+struct drm_vc4_bo {
+        struct drm_gem_cma_object base; /* at offset 0; to_vc4_bo() relies on it */
+        struct vc4_bo *bo;              /* filled in by vc4_wrap_bo_with_cma() */
+        struct list_head unref_head;    /* link used when walking unref_list */
+};
+
+static inline struct drm_vc4_bo *to_vc4_bo(struct drm_gem_object *obj)
+{
+        return (struct drm_vc4_bo *)obj; /* obj must be &drm_bo->base.base */
+}
 
 struct drm_gem_cma_object *
 drm_gem_cma_create(struct drm_device *dev, size_t size);
diff --git a/src/gallium/drivers/vc4/vc4_state.c b/src/gallium/drivers/vc4/vc4_state.c
index 80e963e..4a1d4c3 100644
--- a/src/gallium/drivers/vc4/vc4_state.c
+++ b/src/gallium/drivers/vc4/vc4_state.c
@@ -304,24 +304,8 @@ vc4_set_index_buffer(struct pipe_context *pctx,
 
         if (ib) {
                 assert(!ib->user_buffer);
-
-                if (ib->index_size == 4) {
-                        struct pipe_resource tmpl = *ib->buffer;
-                        assert(tmpl.format == PIPE_FORMAT_R8_UNORM);
-                        assert(tmpl.height0 == 1);
-                        tmpl.width0 = (tmpl.width0 - ib->offset) / 2;
-                        struct pipe_resource *pshadow =
-                                vc4_resource_create(&vc4->screen->base, &tmpl);
-                        struct vc4_resource *shadow = vc4_resource(pshadow);
-                        pipe_resource_reference(&shadow->shadow_parent, ib->buffer);
-
-                        pipe_resource_reference(&vc4->indexbuf.buffer, NULL);
-                        vc4->indexbuf.buffer = pshadow;
-                        vc4->indexbuf.index_size = 2;
-                } else {
-                        pipe_resource_reference(&vc4->indexbuf.buffer, ib->buffer);
-                        vc4->indexbuf.index_size = ib->index_size;
-                }
+                pipe_resource_reference(&vc4->indexbuf.buffer, ib->buffer);
+                vc4->indexbuf.index_size = ib->index_size;
                 vc4->indexbuf.offset = ib->offset;
         } else {
                 pipe_resource_reference(&vc4->indexbuf.buffer, NULL);
@@ -538,6 +522,7 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
                 struct pipe_resource tmpl = shadow_parent->base.b;
                 struct vc4_resource *clone;
 
+                tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
                 tmpl.width0 = u_minify(tmpl.width0, so->u.tex.first_level);
                 tmpl.height0 = u_minify(tmpl.height0, so->u.tex.first_level);
                 tmpl.last_level = so->u.tex.last_level - so->u.tex.first_level;
@@ -547,6 +532,8 @@ vc4_create_sampler_view(struct pipe_context *pctx, struct pipe_resource *prsc,
                 clone->shadow_parent = &shadow_parent->base.b;
                 /* Flag it as needing update of the contents from the parent. */
                 clone->writes = shadow_parent->writes - 1;
+
+                assert(clone->vc4_format != VC4_TEXTURE_TYPE_RGBA32R);
         }
         so->texture = prsc;
         so->reference.count = 1;
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index adff67a..c2eedf8 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -170,6 +170,16 @@ struct pipe_context {
    void   (*bind_gs_state)(struct pipe_context *, void *);
    void   (*delete_gs_state)(struct pipe_context *, void *);
 
+   void * (*create_tcs_state)(struct pipe_context *,
+                              const struct pipe_shader_state *);
+   void   (*bind_tcs_state)(struct pipe_context *, void *);
+   void   (*delete_tcs_state)(struct pipe_context *, void *);
+
+   void * (*create_tes_state)(struct pipe_context *,
+                              const struct pipe_shader_state *);
+   void   (*bind_tes_state)(struct pipe_context *, void *);
+   void   (*delete_tes_state)(struct pipe_context *, void *);
+
    void * (*create_vertex_elements_state)(struct pipe_context *,
                                           unsigned num_elements,
                                           const struct pipe_vertex_element *);
@@ -221,6 +231,10 @@ struct pipe_context {
                              unsigned start_slot, unsigned num_views,
                              struct pipe_sampler_view **);
 
+   void (*set_tess_state)(struct pipe_context *,
+                          const float default_outer_level[4],
+                          const float default_inner_level[2]);
+
    /**
     * Bind an array of shader resources that will be used by the
     * graphics pipeline.  Any resources that were previously bound to
@@ -562,6 +576,10 @@ struct pipe_context {
    void (*invalidate_resource)(struct pipe_context *ctx,
                                struct pipe_resource *resource);
 
+   /**
+    * Return information about unexpected device resets.
+    */
+   enum pipe_reset_status (*get_device_reset_status)(struct pipe_context *ctx);
 };
 
 
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 8a16fde..88b7b76 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -404,8 +404,10 @@ enum pipe_flush_flags
 #define PIPE_SHADER_VERTEX   0
 #define PIPE_SHADER_FRAGMENT 1
 #define PIPE_SHADER_GEOMETRY 2
-#define PIPE_SHADER_COMPUTE  3
-#define PIPE_SHADER_TYPES    4
+#define PIPE_SHADER_TESS_CTRL 3
+#define PIPE_SHADER_TESS_EVAL 4
+#define PIPE_SHADER_COMPUTE  5
+#define PIPE_SHADER_TYPES    6
 
 
 /**
@@ -425,10 +427,18 @@ enum pipe_flush_flags
 #define PIPE_PRIM_LINE_STRIP_ADJACENCY     11
 #define PIPE_PRIM_TRIANGLES_ADJACENCY      12
 #define PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY 13
-#define PIPE_PRIM_MAX                      14
+#define PIPE_PRIM_PATCHES                  14
+#define PIPE_PRIM_MAX                      15
 
 
 /**
+ * Tessellator spacing types
+ */
+#define PIPE_TESS_SPACING_FRACTIONAL_ODD    0
+#define PIPE_TESS_SPACING_FRACTIONAL_EVEN   1
+#define PIPE_TESS_SPACING_EQUAL             2
+
+/**
  * Query object types
  */
 #define PIPE_QUERY_OCCLUSION_COUNTER     0
@@ -476,111 +486,125 @@ enum pipe_flush_flags
 
 #define PIPE_TIMEOUT_INFINITE 0xffffffffffffffffull
 
+
+/**
+ * Device reset status (see pipe_context::get_device_reset_status).
+ */
+enum pipe_reset_status
+{
+   PIPE_NO_RESET = 0,
+   PIPE_GUILTY_CONTEXT_RESET = 1,
+   PIPE_INNOCENT_CONTEXT_RESET = 2,
+   PIPE_UNKNOWN_CONTEXT_RESET = 3
+};
+
+
 /**
  * Implementation capabilities/limits which are queried through
  * pipe_screen::get_param()
  */
 enum pipe_cap
 {
-   PIPE_CAP_NPOT_TEXTURES = 1,
-   PIPE_CAP_TWO_SIDED_STENCIL = 2,
-   PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS = 4,
-   PIPE_CAP_ANISOTROPIC_FILTER = 5,
-   PIPE_CAP_POINT_SPRITE = 6,
-   PIPE_CAP_MAX_RENDER_TARGETS = 7,
-   PIPE_CAP_OCCLUSION_QUERY = 8,
-   PIPE_CAP_QUERY_TIME_ELAPSED = 9,
-   PIPE_CAP_TEXTURE_SHADOW_MAP = 10,
-   PIPE_CAP_TEXTURE_SWIZZLE = 11,
-   PIPE_CAP_MAX_TEXTURE_2D_LEVELS = 12,
-   PIPE_CAP_MAX_TEXTURE_3D_LEVELS = 13,
-   PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS = 14,
-   PIPE_CAP_TEXTURE_MIRROR_CLAMP = 25,
-   PIPE_CAP_BLEND_EQUATION_SEPARATE = 28,
-   PIPE_CAP_SM3 = 29,  /*< Shader Model, supported */
-   PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS = 30,
-   PIPE_CAP_PRIMITIVE_RESTART = 31,
+   PIPE_CAP_NPOT_TEXTURES,
+   PIPE_CAP_TWO_SIDED_STENCIL,
+   PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS,
+   PIPE_CAP_ANISOTROPIC_FILTER,
+   PIPE_CAP_POINT_SPRITE,
+   PIPE_CAP_MAX_RENDER_TARGETS,
+   PIPE_CAP_OCCLUSION_QUERY,
+   PIPE_CAP_QUERY_TIME_ELAPSED,
+   PIPE_CAP_TEXTURE_SHADOW_MAP,
+   PIPE_CAP_TEXTURE_SWIZZLE,
+   PIPE_CAP_MAX_TEXTURE_2D_LEVELS,
+   PIPE_CAP_MAX_TEXTURE_3D_LEVELS,
+   PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS,
+   PIPE_CAP_TEXTURE_MIRROR_CLAMP,
+   PIPE_CAP_BLEND_EQUATION_SEPARATE,
+   PIPE_CAP_SM3,
+   PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS,
+   PIPE_CAP_PRIMITIVE_RESTART,
    /** blend enables and write masks per rendertarget */
-   PIPE_CAP_INDEP_BLEND_ENABLE = 33,
+   PIPE_CAP_INDEP_BLEND_ENABLE,
    /** different blend funcs per rendertarget */
-   PIPE_CAP_INDEP_BLEND_FUNC = 34,
-   PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS = 36,
-   PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT = 37,
-   PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT = 38,
-   PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER = 39,
-   PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER = 40,
-   PIPE_CAP_DEPTH_CLIP_DISABLE = 41,
-   PIPE_CAP_SHADER_STENCIL_EXPORT = 42,
-   PIPE_CAP_TGSI_INSTANCEID = 43,
-   PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR = 44,
-   PIPE_CAP_FRAGMENT_COLOR_CLAMPED = 45,
-   PIPE_CAP_MIXED_COLORBUFFER_FORMATS = 46,
-   PIPE_CAP_SEAMLESS_CUBE_MAP = 47,
-   PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE = 48,
-   PIPE_CAP_MIN_TEXEL_OFFSET = 50,
-   PIPE_CAP_MAX_TEXEL_OFFSET = 51,
-   PIPE_CAP_CONDITIONAL_RENDER = 52,
-   PIPE_CAP_TEXTURE_BARRIER = 53,
-   PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS = 55,
-   PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS = 56,
-   PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME = 57,
-   PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS = 59, /* temporary */
-   PIPE_CAP_VERTEX_COLOR_UNCLAMPED = 60,
-   PIPE_CAP_VERTEX_COLOR_CLAMPED = 61,
-   PIPE_CAP_GLSL_FEATURE_LEVEL = 62,
-   PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION = 63,
-   PIPE_CAP_USER_VERTEX_BUFFERS = 64,
-   PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY = 65,
-   PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY = 66,
-   PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY = 67,
-   PIPE_CAP_COMPUTE = 68,
-   PIPE_CAP_USER_INDEX_BUFFERS = 69,
-   PIPE_CAP_USER_CONSTANT_BUFFERS = 70,
-   PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT = 71,
-   PIPE_CAP_START_INSTANCE = 72,
-   PIPE_CAP_QUERY_TIMESTAMP = 73,
-   PIPE_CAP_TEXTURE_MULTISAMPLE = 74,
-   PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT = 75,
-   PIPE_CAP_CUBE_MAP_ARRAY = 76,
-   PIPE_CAP_TEXTURE_BUFFER_OBJECTS = 77,
-   PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT = 78,
-   PIPE_CAP_TGSI_TEXCOORD = 79,
-   PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER = 80,
-   PIPE_CAP_QUERY_PIPELINE_STATISTICS = 81,
-   PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK = 82,
-   PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE = 83,
-   PIPE_CAP_MAX_VIEWPORTS = 84,
-   PIPE_CAP_ENDIANNESS = 85,
-   PIPE_CAP_MIXED_FRAMEBUFFER_SIZES = 86,
-   PIPE_CAP_TGSI_VS_LAYER_VIEWPORT = 87,
-   PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES = 88,
-   PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS = 89,
-   PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS = 90,
-   PIPE_CAP_TEXTURE_GATHER_SM5 = 91,
-   PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT = 92,
-   PIPE_CAP_FAKE_SW_MSAA = 93,
-   PIPE_CAP_TEXTURE_QUERY_LOD = 94,
-   PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET = 95,
-   PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET = 96,
-   PIPE_CAP_SAMPLE_SHADING = 97,
-   PIPE_CAP_TEXTURE_GATHER_OFFSETS = 98,
-   PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION = 99,
-   PIPE_CAP_MAX_VERTEX_STREAMS = 100,
-   PIPE_CAP_DRAW_INDIRECT = 101,
-   PIPE_CAP_TGSI_FS_FINE_DERIVATIVE = 102,
-   PIPE_CAP_VENDOR_ID = 103,
-   PIPE_CAP_DEVICE_ID = 104,
-   PIPE_CAP_ACCELERATED = 105,
-   PIPE_CAP_VIDEO_MEMORY = 106,
-   PIPE_CAP_UMA = 107,
-   PIPE_CAP_CONDITIONAL_RENDER_INVERTED = 108,
-   PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE = 109,
-   PIPE_CAP_SAMPLER_VIEW_TARGET = 110,
-   PIPE_CAP_CLIP_HALFZ = 111,
-   PIPE_CAP_VERTEXID_NOBASE = 112,
-   PIPE_CAP_POLYGON_OFFSET_CLAMP = 113,
-   PIPE_CAP_MULTISAMPLE_Z_RESOLVE = 114,
-   PIPE_CAP_RESOURCE_FROM_USER_MEMORY = 115,
+   PIPE_CAP_INDEP_BLEND_FUNC,
+   PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS,
+   PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT,
+   PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT,
+   PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER,
+   PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER,
+   PIPE_CAP_DEPTH_CLIP_DISABLE,
+   PIPE_CAP_SHADER_STENCIL_EXPORT,
+   PIPE_CAP_TGSI_INSTANCEID,
+   PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR,
+   PIPE_CAP_FRAGMENT_COLOR_CLAMPED,
+   PIPE_CAP_MIXED_COLORBUFFER_FORMATS,
+   PIPE_CAP_SEAMLESS_CUBE_MAP,
+   PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE,
+   PIPE_CAP_MIN_TEXEL_OFFSET,
+   PIPE_CAP_MAX_TEXEL_OFFSET,
+   PIPE_CAP_CONDITIONAL_RENDER,
+   PIPE_CAP_TEXTURE_BARRIER,
+   PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS,
+   PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS,
+   PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME,
+   PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS,
+   PIPE_CAP_VERTEX_COLOR_UNCLAMPED,
+   PIPE_CAP_VERTEX_COLOR_CLAMPED,
+   PIPE_CAP_GLSL_FEATURE_LEVEL,
+   PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION,
+   PIPE_CAP_USER_VERTEX_BUFFERS,
+   PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY,
+   PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY,
+   PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY,
+   PIPE_CAP_COMPUTE,
+   PIPE_CAP_USER_INDEX_BUFFERS,
+   PIPE_CAP_USER_CONSTANT_BUFFERS,
+   PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT,
+   PIPE_CAP_START_INSTANCE,
+   PIPE_CAP_QUERY_TIMESTAMP,
+   PIPE_CAP_TEXTURE_MULTISAMPLE,
+   PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT,
+   PIPE_CAP_CUBE_MAP_ARRAY,
+   PIPE_CAP_TEXTURE_BUFFER_OBJECTS,
+   PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT,
+   PIPE_CAP_TGSI_TEXCOORD,
+   PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER,
+   PIPE_CAP_QUERY_PIPELINE_STATISTICS,
+   PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK,
+   PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE,
+   PIPE_CAP_MAX_VIEWPORTS,
+   PIPE_CAP_ENDIANNESS,
+   PIPE_CAP_MIXED_FRAMEBUFFER_SIZES,
+   PIPE_CAP_TGSI_VS_LAYER_VIEWPORT,
+   PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES,
+   PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS,
+   PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS,
+   PIPE_CAP_TEXTURE_GATHER_SM5,
+   PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT,
+   PIPE_CAP_FAKE_SW_MSAA,
+   PIPE_CAP_TEXTURE_QUERY_LOD,
+   PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET,
+   PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET,
+   PIPE_CAP_SAMPLE_SHADING,
+   PIPE_CAP_TEXTURE_GATHER_OFFSETS,
+   PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION,
+   PIPE_CAP_MAX_VERTEX_STREAMS,
+   PIPE_CAP_DRAW_INDIRECT,
+   PIPE_CAP_TGSI_FS_FINE_DERIVATIVE,
+   PIPE_CAP_VENDOR_ID,
+   PIPE_CAP_DEVICE_ID,
+   PIPE_CAP_ACCELERATED,
+   PIPE_CAP_VIDEO_MEMORY,
+   PIPE_CAP_UMA,
+   PIPE_CAP_CONDITIONAL_RENDER_INVERTED,
+   PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE,
+   PIPE_CAP_SAMPLER_VIEW_TARGET,
+   PIPE_CAP_CLIP_HALFZ,
+   PIPE_CAP_VERTEXID_NOBASE,
+   PIPE_CAP_POLYGON_OFFSET_CLAMP,
+   PIPE_CAP_MULTISAMPLE_Z_RESOLVE,
+   PIPE_CAP_RESOURCE_FROM_USER_MEMORY,
+   PIPE_CAP_DEVICE_RESET_STATUS_QUERY,
 };
 
 #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0)
@@ -645,6 +669,7 @@ enum pipe_shader_cap
    PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED, /* all rounding modes */
    PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED,
    PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED,
+   PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE
 };
 
 /**
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index c14bcbc..bb57e80 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -43,7 +43,9 @@ struct tgsi_header
 #define TGSI_PROCESSOR_FRAGMENT  0
 #define TGSI_PROCESSOR_VERTEX    1
 #define TGSI_PROCESSOR_GEOMETRY  2
-#define TGSI_PROCESSOR_COMPUTE   3
+#define TGSI_PROCESSOR_TESS_CTRL 3
+#define TGSI_PROCESSOR_TESS_EVAL 4
+#define TGSI_PROCESSOR_COMPUTE   5
 
 struct tgsi_processor
 {
@@ -178,7 +180,12 @@ struct tgsi_declaration_interp
 #define TGSI_SEMANTIC_INVOCATIONID 27
 #define TGSI_SEMANTIC_VERTEXID_NOBASE 28
 #define TGSI_SEMANTIC_BASEVERTEX 29
-#define TGSI_SEMANTIC_COUNT      30 /**< number of semantic values */
+#define TGSI_SEMANTIC_PATCH      30 /**< generic per-patch semantic */
+#define TGSI_SEMANTIC_TESSCOORD  31 /**< coordinate being processed by tess */
+#define TGSI_SEMANTIC_TESSOUTER  32 /**< outer tessellation levels */
+#define TGSI_SEMANTIC_TESSINNER  33 /**< inner tessellation levels */
+#define TGSI_SEMANTIC_VERTICESIN 34 /**< number of input vertices */
+#define TGSI_SEMANTIC_COUNT      35 /**< number of semantic values */
 
 struct tgsi_declaration_semantic
 {
@@ -255,7 +262,12 @@ union tgsi_immediate_data
 #define TGSI_PROPERTY_VS_PROHIBIT_UCPS       7
 #define TGSI_PROPERTY_GS_INVOCATIONS         8
 #define TGSI_PROPERTY_VS_WINDOW_SPACE_POSITION 9
-#define TGSI_PROPERTY_COUNT                  10
+#define TGSI_PROPERTY_TCS_VERTICES_OUT       10
+#define TGSI_PROPERTY_TES_PRIM_MODE          11
+#define TGSI_PROPERTY_TES_SPACING            12
+#define TGSI_PROPERTY_TES_VERTEX_ORDER_CW    13
+#define TGSI_PROPERTY_TES_POINT_MODE         14
+#define TGSI_PROPERTY_COUNT                  15
 
 struct tgsi_property {
    unsigned Type         : 4;  /**< TGSI_TOKEN_TYPE_PROPERTY */
@@ -526,10 +538,6 @@ struct tgsi_property_data {
 #define TGSI_OPCODE_DSSG                222
 #define TGSI_OPCODE_LAST                223
 
-#define TGSI_SAT_NONE            0  /* do not saturate */
-#define TGSI_SAT_ZERO_ONE        1  /* clamp to [0,1] */
-#define TGSI_SAT_MINUS_PLUS_ONE  2  /* clamp to [-1,1] */
-
 /**
  * Opcode is the operation code to execute. A given operation defines the
  * semantics how the source registers (if any) are interpreted and what is
@@ -549,13 +557,13 @@ struct tgsi_instruction
    unsigned Type       : 4;  /* TGSI_TOKEN_TYPE_INSTRUCTION */
    unsigned NrTokens   : 8;  /* UINT */
    unsigned Opcode     : 8;  /* TGSI_OPCODE_ */
-   unsigned Saturate   : 2;  /* TGSI_SAT_ */
+   unsigned Saturate   : 1;  /* BOOL */
    unsigned NumDstRegs : 2;  /* UINT */
    unsigned NumSrcRegs : 4;  /* UINT */
    unsigned Predicate  : 1;  /* BOOL */
    unsigned Label      : 1;
    unsigned Texture    : 1;
-   unsigned Padding    : 1;
+   unsigned Padding    : 2;
 };
 
 /*
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index e15860c..a18f12e 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -1,8 +1,8 @@
 /**************************************************************************
- * 
+ *
  * Copyright 2007 VMware, Inc.
  * All Rights Reserved.
- * 
+ *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
  * "Software"), to deal in the Software without restriction, including
@@ -10,11 +10,11 @@
  * distribute, sub license, and/or sell copies of the Software, and to
  * permit persons to whom the Software is furnished to do so, subject to
  * the following conditions:
- * 
+ *
  * The above copyright notice and this permission notice (including the
  * next paragraph) shall be included in all copies or substantial portions
  * of the Software.
- * 
+ *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
@@ -22,13 +22,13 @@
  * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
+ *
  **************************************************************************/
 
 
 /**
  * @file
- * 
+ *
  * Abstract graphics pipe state objects.
  *
  * Basic notes:
@@ -58,8 +58,8 @@ extern "C" {
 #define PIPE_MAX_COLOR_BUFS        8
 #define PIPE_MAX_CONSTANT_BUFFERS 32
 #define PIPE_MAX_SAMPLERS         18 /* 16 public + 2 driver internal */
-#define PIPE_MAX_SHADER_INPUTS    32
-#define PIPE_MAX_SHADER_OUTPUTS   48 /* 32 GENERICs + POS, PSIZE, FOG, etc. */
+#define PIPE_MAX_SHADER_INPUTS    80 /* 32 GENERIC + 32 PATCH + 16 others */
+#define PIPE_MAX_SHADER_OUTPUTS   80 /* 32 GENERIC + 32 PATCH + 16 others */
 #define PIPE_MAX_SHADER_SAMPLER_VIEWS 32
 #define PIPE_MAX_SHADER_RESOURCES 32
 #define PIPE_MAX_TEXTURE_LEVELS   16
@@ -217,7 +217,7 @@ struct pipe_shader_state
 };
 
 
-struct pipe_depth_state 
+struct pipe_depth_state
 {
    unsigned enabled:1;         /**< depth test enabled? */
    unsigned writemask:1;       /**< allow depth buffer writes? */
@@ -268,6 +268,7 @@ struct pipe_rt_blend_state
    unsigned colormask:4;         /**< bitmask of PIPE_MASK_R/G/B/A */
 };
 
+
 struct pipe_blend_state
 {
    unsigned independent_blend_enable:1;
@@ -285,11 +286,13 @@ struct pipe_blend_color
    float color[4];
 };
 
+
 struct pipe_stencil_ref
 {
    ubyte ref_value[2];
 };
 
+
 struct pipe_framebuffer_state
 {
    unsigned width, height;
@@ -367,10 +370,10 @@ struct pipe_sampler_view
    struct pipe_context *context; /**< context this view belongs to */
    union {
       struct {
-         unsigned first_layer:16;     /**< first layer to use for array textures */
-         unsigned last_layer:16;      /**< last layer to use for array textures */
-         unsigned first_level:8;      /**< first mipmap level to use */
-         unsigned last_level:8;       /**< last mipmap level to use */
+         unsigned first_layer:16;  /**< first layer to use for array textures */
+         unsigned last_layer:16;   /**< last layer to use for array textures */
+         unsigned first_level:8;   /**< first mipmap level to use */
+         unsigned last_level:8;    /**< last mipmap level to use */
       } tex;
       struct {
          unsigned first_element;
@@ -455,7 +458,8 @@ struct pipe_vertex_buffer
  * A constant buffer.  A subrange of an existing buffer can be set
  * as a constant buffer.
  */
-struct pipe_constant_buffer {
+struct pipe_constant_buffer
+{
    struct pipe_resource *buffer; /**< the actual buffer */
    unsigned buffer_offset; /**< offset to start of data in buffer, in bytes */
    unsigned buffer_size;   /**< how much data can be read in shader */
@@ -474,8 +478,8 @@ struct pipe_constant_buffer {
  * and the CPU actually doesn't have to query it.
  *
  * Note that the buffer_size variable is actually specifying the available
- * space in the buffer, not the size of the attached buffer. 
- * In other words in majority of cases buffer_size would simply be 
+ * space in the buffer, not the size of the attached buffer.
+ * In other words in majority of cases buffer_size would simply be
  * 'buffer->width0 - buffer_offset', so buffer_size refers to the size
  * of the buffer left, after accounting for buffer offset, for stream output
  * to write to.
@@ -511,7 +515,7 @@ struct pipe_vertex_element
     * this attribute live in?
     */
    unsigned vertex_buffer_index;
- 
+
    enum pipe_format src_format;
 };
 
@@ -543,6 +547,8 @@ struct pipe_draw_info
    unsigned start_instance; /**< first instance id */
    unsigned instance_count; /**< number of instances */
 
+   unsigned vertices_per_patch; /**< the number of vertices per patch */
+
    /**
     * For indexed drawing, these fields apply after index lookup.
     */
@@ -640,5 +646,5 @@ struct pipe_compute_state
 #ifdef __cplusplus
 }
 #endif
-   
+
 #endif
diff --git a/src/gallium/include/state_tracker/st_api.h b/src/gallium/include/state_tracker/st_api.h
index 86fdc69..ecf1c07 100644
--- a/src/gallium/include/state_tracker/st_api.h
+++ b/src/gallium/include/state_tracker/st_api.h
@@ -89,6 +89,7 @@ enum st_api_feature
 #define ST_CONTEXT_FLAG_DEBUG               (1 << 0)
 #define ST_CONTEXT_FLAG_FORWARD_COMPATIBLE  (1 << 1)
 #define ST_CONTEXT_FLAG_ROBUST_ACCESS       (1 << 2)
+#define ST_CONTEXT_FLAG_RESET_NOTIFICATION_ENABLED (1 << 3)
 
 /**
  * Reasons that context creation might fail.
diff --git a/src/gallium/state_trackers/clover/api/interop.cpp b/src/gallium/state_trackers/clover/api/interop.cpp
index ea0c7c7..b96069f 100644
--- a/src/gallium/state_trackers/clover/api/interop.cpp
+++ b/src/gallium/state_trackers/clover/api/interop.cpp
@@ -31,7 +31,12 @@ extern "C" {
 PUBLIC bool
 opencl_dri_event_add_ref(cl_event event)
 {
-   return clRetainEvent(event) == CL_SUCCESS;
+   /* This should fail if the event hasn't been created by
+    * clEnqueueReleaseGLObjects or clEnqueueReleaseEGLObjects.
+    * Until that check exists, every event is rejected.
+    * TODO: implement the CL functions
+    */
+   return false; /*return clRetainEvent(event) == CL_SUCCESS;*/
 }
 
 PUBLIC bool
diff --git a/src/gallium/state_trackers/clover/core/error.hpp b/src/gallium/state_trackers/clover/core/error.hpp
index eb65d62..780b973 100644
--- a/src/gallium/state_trackers/clover/core/error.hpp
+++ b/src/gallium/state_trackers/clover/core/error.hpp
@@ -26,6 +26,7 @@
 #include "CL/cl.h"
 
 #include <stdexcept>
+#include <string>
 
 namespace clover {
    class command_queue;
diff --git a/src/gallium/state_trackers/clover/core/event.cpp b/src/gallium/state_trackers/clover/core/event.cpp
index 58de888..e1f9de0 100644
--- a/src/gallium/state_trackers/clover/core/event.cpp
+++ b/src/gallium/state_trackers/clover/core/event.cpp
@@ -27,7 +27,7 @@ using namespace clover;
 
 event::event(clover::context &ctx, const ref_vector<event> &deps,
              action action_ok, action action_fail) :
-   context(ctx), _status(0), wait_count(1),
+   context(ctx), wait_count(1), _status(0),
    action_ok(action_ok), action_fail(action_fail) {
    for (auto &ev : deps)
       ev.chain(*this);
@@ -36,36 +36,69 @@ event::event(clover::context &ctx, const ref_vector<event> &deps,
 event::~event() {
 }
 
+std::vector<intrusive_ref<event>>
+event::trigger_self() {
+   std::lock_guard<std::mutex> lock(mutex);
+   std::vector<intrusive_ref<event>> evs;
+
+   if (!--wait_count)           // only the call that reaches zero...
+      std::swap(_chain, evs);   // ...takes over the chained events
+
+   return evs;                  // trigger() walks these outside this lock
+}
+
 void
 event::trigger() {
-   if (!--wait_count) {
-      action_ok(*this);
+   auto evs = trigger_self();   // non-empty only if this call hit zero
 
-      while (!_chain.empty()) {
-         _chain.back()().trigger();
-         _chain.pop_back();
-      }
+   if (signalled()) {     // NOTE(review): wait_count is re-read under a new
+      action_ok(*this);   // lock acquisition here; confirm that two racing
+      cv.notify_all();    // trigger() calls cannot both run action_ok.
    }
+
+   for (event &ev : evs)        // each chained event takes its own mutex
+      ev.trigger();
+}
+
+std::vector<intrusive_ref<event>>
+event::abort_self(cl_int status) {
+   std::lock_guard<std::mutex> lock(mutex);
+   std::vector<intrusive_ref<event>> evs;
+
+   _status = status;         // NOTE(review): wait_count and cv are left
+   std::swap(_chain, evs);   // untouched on this path; confirm a thread
+                             // blocked in event::wait() is woken on abort.
+   return evs;
 }
 
 void
 event::abort(cl_int status) {
-   _status = status;
+   auto evs = abort_self(status);   // stores status, detaches _chain (locked)
+
    action_fail(*this);
 
-   while (!_chain.empty()) {
-      _chain.back()().abort(status);
-      _chain.pop_back();
-   }
+   for (event &ev : evs)            // same status is passed down the chain
+      ev.abort(status);
 }
 
 bool
 event::signalled() const {
+   std::lock_guard<std::mutex> lock(mutex);   // mutex is a mutable member
    return !wait_count;
 }
 
+cl_int
+event::status() const {
+   std::lock_guard<std::mutex> lock(mutex);
+   return _status;   // 0 from the constructor; abort_self() overwrites it
+}
+
 void
 event::chain(event &ev) {
+   std::unique_lock<std::mutex> lock(mutex, std::defer_lock);
+   std::unique_lock<std::mutex> lock_ev(ev.mutex, std::defer_lock);
+   std::lock(lock, lock_ev);
+
    if (wait_count) {
       ev.wait_count++;
       _chain.push_back(ev);
@@ -73,6 +106,15 @@ event::chain(event &ev) {
    ev.deps.push_back(*this);
 }
 
+void
+event::wait() const {
+   for (event &ev : deps)   // dependencies first, before taking our mutex
+      ev.wait();
+
+   std::unique_lock<std::mutex> lock(mutex);
+   cv.wait(lock, [=]{ return !wait_count; });   // until wait_count is zero
+}
+
 hard_event::hard_event(command_queue &q, cl_command_type command,
                        const ref_vector<event> &deps, action action) :
    event(q.context(), deps, profile(q, action), [](event &ev){}),
@@ -93,8 +135,8 @@ cl_int
 hard_event::status() const {
    pipe_screen *screen = queue()->device().pipe;
 
-   if (_status < 0)
-      return _status;
+   if (event::status() < 0)
+      return event::status();
 
    else if (!_fence)
       return CL_QUEUED;
@@ -120,6 +162,8 @@ void
 hard_event::wait() const {
    pipe_screen *screen = queue()->device().pipe;
 
+   event::wait();
+
    if (status() == CL_QUEUED)
       queue()->flush();
 
@@ -182,8 +226,8 @@ soft_event::soft_event(clover::context &ctx, const ref_vector<event> &deps,
 
 cl_int
 soft_event::status() const {
-   if (_status < 0)
-      return _status;
+   if (event::status() < 0)
+      return event::status();
 
    else if (!signalled() ||
             any_of([](const event &ev) {
@@ -207,8 +251,7 @@ soft_event::command() const {
 
 void
 soft_event::wait() const {
-   for (event &ev : deps)
-      ev.wait();
+   event::wait();
 
    if (status() != CL_COMPLETE)
       throw error(CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST);
diff --git a/src/gallium/state_trackers/clover/core/event.hpp b/src/gallium/state_trackers/clover/core/event.hpp
index d407c80..6469e48 100644
--- a/src/gallium/state_trackers/clover/core/event.hpp
+++ b/src/gallium/state_trackers/clover/core/event.hpp
@@ -23,6 +23,7 @@
 #ifndef CLOVER_CORE_EVENT_HPP
 #define CLOVER_CORE_EVENT_HPP
 
+#include <condition_variable>
 #include <functional>
 
 #include "core/object.hpp"
@@ -65,10 +66,10 @@ namespace clover {
       void abort(cl_int status);
       bool signalled() const;
 
-      virtual cl_int status() const = 0;
+      virtual cl_int status() const;
       virtual command_queue *queue() const = 0;
       virtual cl_command_type command() const = 0;
-      virtual void wait() const = 0;
+      virtual void wait() const;
 
       virtual struct pipe_fence_handle *fence() const {
          return NULL;
@@ -79,14 +80,19 @@ namespace clover {
    protected:
       void chain(event &ev);
 
-      cl_int _status;
       std::vector<intrusive_ref<event>> deps;
 
    private:
+      std::vector<intrusive_ref<event>> trigger_self();
+      std::vector<intrusive_ref<event>> abort_self(cl_int status);
+
       unsigned wait_count;
+      cl_int _status;
       action action_ok;
       action action_fail;
       std::vector<intrusive_ref<event>> _chain;
+      mutable std::condition_variable cv;
+      mutable std::mutex mutex;
    };
 
    ///
diff --git a/src/gallium/state_trackers/clover/core/memory.cpp b/src/gallium/state_trackers/clover/core/memory.cpp
index 905ebc0..055336a 100644
--- a/src/gallium/state_trackers/clover/core/memory.cpp
+++ b/src/gallium/state_trackers/clover/core/memory.cpp
@@ -30,7 +30,7 @@ memory_obj::memory_obj(clover::context &ctx, cl_mem_flags flags,
                        size_t size, void *host_ptr) :
    context(ctx), _flags(flags),
    _size(size), _host_ptr(host_ptr) {
-   if (flags & (CL_MEM_COPY_HOST_PTR | CL_MEM_USE_HOST_PTR))
+   if (flags & CL_MEM_COPY_HOST_PTR)
       data.append((char *)host_ptr, size);
 }
 
diff --git a/src/gallium/state_trackers/clover/core/queue.cpp b/src/gallium/state_trackers/clover/core/queue.cpp
index 24f9326..87f9dcc 100644
--- a/src/gallium/state_trackers/clover/core/queue.cpp
+++ b/src/gallium/state_trackers/clover/core/queue.cpp
@@ -44,6 +44,7 @@ command_queue::flush() {
    pipe_screen *screen = device().pipe;
    pipe_fence_handle *fence = NULL;
 
+   std::lock_guard<std::mutex> lock(queued_events_mutex);
    if (!queued_events.empty()) {
       pipe->flush(pipe, &fence, 0);
 
@@ -69,6 +70,7 @@ command_queue::profiling_enabled() const {
 
 void
 command_queue::sequence(hard_event &ev) {
+   std::lock_guard<std::mutex> lock(queued_events_mutex);
    if (!queued_events.empty())
       queued_events.back()().chain(ev);
 
diff --git a/src/gallium/state_trackers/clover/core/queue.hpp b/src/gallium/state_trackers/clover/core/queue.hpp
index b7166e6..bddb86c 100644
--- a/src/gallium/state_trackers/clover/core/queue.hpp
+++ b/src/gallium/state_trackers/clover/core/queue.hpp
@@ -24,6 +24,7 @@
 #define CLOVER_CORE_QUEUE_HPP
 
 #include <deque>
+#include <mutex>
 
 #include "core/object.hpp"
 #include "core/context.hpp"
@@ -69,6 +70,7 @@ namespace clover {
 
       cl_command_queue_properties props;
       pipe_context *pipe;
+      std::mutex queued_events_mutex;
       std::deque<intrusive_ref<hard_event>> queued_events;
    };
 }
diff --git a/src/gallium/state_trackers/clover/core/resource.cpp b/src/gallium/state_trackers/clover/core/resource.cpp
index bcf87e1..78ebafb 100644
--- a/src/gallium/state_trackers/clover/core/resource.cpp
+++ b/src/gallium/state_trackers/clover/core/resource.cpp
@@ -118,6 +118,8 @@ root_resource::root_resource(clover::device &dev, memory_obj &obj,
                              command_queue &q, const std::string &data) :
    resource(dev, obj) {
    pipe_resource info {};
+   const bool user_ptr_support = dev.pipe->get_param(dev.pipe,
+         PIPE_CAP_RESOURCE_FROM_USER_MEMORY);
 
    if (image *img = dynamic_cast<image *>(&obj)) {
       info.format = translate_format(img->format());
@@ -137,16 +139,29 @@ root_resource::root_resource(clover::device &dev, memory_obj &obj,
                 PIPE_BIND_TRANSFER_READ |
                 PIPE_BIND_TRANSFER_WRITE);
 
+   if (obj.flags() & CL_MEM_USE_HOST_PTR && user_ptr_support) {
+      // Page alignment is normally required for this; just try it, hope for
+      // the best, and fall back if it fails.
+      pipe = dev.pipe->resource_from_user_memory(dev.pipe, &info, obj.host_ptr());
+      if (pipe)
+         return;
+   }
+
+   if (obj.flags() & (CL_MEM_ALLOC_HOST_PTR | CL_MEM_USE_HOST_PTR)) {
+      info.usage = PIPE_USAGE_STAGING;
+   }
+
    pipe = dev.pipe->resource_create(dev.pipe, &info);
    if (!pipe)
       throw error(CL_OUT_OF_RESOURCES);
 
-   if (!data.empty()) {
+   if (obj.flags() & (CL_MEM_USE_HOST_PTR | CL_MEM_COPY_HOST_PTR)) {
+      const void *data_ptr = !data.empty() ? data.data() : obj.host_ptr();
       box rect { {{ 0, 0, 0 }}, {{ info.width0, info.height0, info.depth0 }} };
       unsigned cpp = util_format_get_blocksize(info.format);
 
       q.pipe->transfer_inline_write(q.pipe, pipe, 0, PIPE_TRANSFER_WRITE,
-                                    rect, data.data(), cpp * info.width0,
+                                    rect, data_ptr, cpp * info.width0,
                                     cpp * info.width0 * info.height0);
    }
 }
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index 7d2d941..9b91fee 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -709,7 +709,7 @@ clover::compile_program_llvm(const std::string &source,
    llvm_ctx.setDiagnosticHandler(diagnostic_handler, &r_log);
 
    if (get_debug_flags() & DBG_CLC)
-      debug_log(source, ".cl");
+      debug_log("// Build options: " + opts + '\n' + source, ".cl");
 
    // The input file name must have the .cl extension in order for the
    // CompilerInvocation class to recognize it as an OpenCL source file.
diff --git a/src/gallium/state_trackers/dri/Android.mk b/src/gallium/state_trackers/dri/Android.mk
new file mode 100644
index 0000000..188e4a1
--- /dev/null
+++ b/src/gallium/state_trackers/dri/Android.mk
@@ -0,0 +1,64 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C) 2015 Chih-Wei Huang <cwhuang@linux.org.tw>
+# Copyright (C) 2015 Android-x86 Open Source Project
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(common_SOURCES)
+
+LOCAL_CFLAGS := \
+	-DGALLIUM_STATIC_TARGETS=1 \
+
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/src/mapi \
+	$(MESA_TOP)/src/mesa \
+
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+	$(LOCAL_PATH) \
+	$(LOCAL_C_INCLUDES) \
+
+LOCAL_STATIC_LIBRARIES := \
+	libmesa_dri_common \
+
+ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DGALLIUM_SOFTPIPE
+LOCAL_SRC_FILES += $(drisw_SOURCES)
+endif
+
+# swrast is the only driver: define __NOT_HAVE_DRM_H; otherwise add dri2 + libdrm.
+ifeq ($(MESA_GPU_DRIVERS),swrast)
+LOCAL_CFLAGS += -D__NOT_HAVE_DRM_H
+else
+LOCAL_SRC_FILES += $(dri2_SOURCES)
+LOCAL_SHARED_LIBRARIES := libdrm
+endif
+
+LOCAL_MODULE := libmesa_st_dri
+
+LOCAL_GENERATED_SOURCES := $(MESA_DRI_OPTIONS_H)
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index 8b6fe67..8d93f78 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -1399,6 +1399,10 @@ static __DRI2fenceExtension dri2FenceExtension = {
    .server_wait_sync = dri2_server_wait_sync
 };
 
+static const __DRIrobustnessExtension dri2Robustness = {
+   .base = { __DRI2_ROBUSTNESS, 1 }
+};
+
 /*
  * Backend function init_screen.
  */
@@ -1414,6 +1418,18 @@ static const __DRIextension *dri_screen_extensions[] = {
    NULL
 };
 
+static const __DRIextension *dri_robust_screen_extensions[] = {
+   &driTexBufferExtension.base,
+   &dri2FlushExtension.base,
+   &dri2ImageExtension.base,
+   &dri2RendererQueryExtension.base,
+   &dri2ConfigQueryExtension.base,
+   &dri2ThrottleExtension.base,
+   &dri2FenceExtension.base,
+   &dri2Robustness.base,
+   NULL
+};
+
 /**
  * This is the driver specific part of the createNewScreen entry point.
  *
@@ -1467,7 +1483,12 @@ dri2_init_screen(__DRIscreen * sPriv)
       }
    }
 
-   sPriv->extensions = dri_screen_extensions;
+   if (pscreen && pscreen->get_param(pscreen, PIPE_CAP_DEVICE_RESET_STATUS_QUERY)) {
+      sPriv->extensions = dri_robust_screen_extensions;
+      screen->has_reset_status_query = true;
+   }
+   else
+      sPriv->extensions = dri_screen_extensions;
 
    /* dri_init_screen_helper checks pscreen for us */
 
diff --git a/src/gallium/state_trackers/dri/dri_context.c b/src/gallium/state_trackers/dri/dri_context.c
index 8ac81b7..3d8af65 100644
--- a/src/gallium/state_trackers/dri/dri_context.c
+++ b/src/gallium/state_trackers/dri/dri_context.c
@@ -56,6 +56,21 @@ dri_create_context(gl_api api, const struct gl_config * visual,
    struct st_context_iface *st_share = NULL;
    struct st_context_attribs attribs;
    enum st_context_error ctx_err = 0;
+   unsigned allowed_flags = __DRI_CTX_FLAG_DEBUG |
+                            __DRI_CTX_FLAG_FORWARD_COMPATIBLE;
+
+   if (screen->has_reset_status_query)
+      allowed_flags |= __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS;
+
+   if (flags & ~allowed_flags) {
+      *error = __DRI_CTX_ERROR_UNKNOWN_FLAG;
+      goto fail;
+   }
+
+   if (!screen->has_reset_status_query && notify_reset) {
+      *error = __DRI_CTX_ERROR_UNKNOWN_ATTRIBUTE;
+      goto fail;
+   }
 
    memset(&attribs, 0, sizeof(attribs));
    switch (api) {
@@ -83,15 +98,11 @@ dri_create_context(gl_api api, const struct gl_config * visual,
    if ((flags & __DRI_CTX_FLAG_DEBUG) != 0)
       attribs.flags |= ST_CONTEXT_FLAG_DEBUG;
 
-   if (flags & ~(__DRI_CTX_FLAG_DEBUG | __DRI_CTX_FLAG_FORWARD_COMPATIBLE)) {
-      *error = __DRI_CTX_ERROR_UNKNOWN_FLAG;
-      goto fail;
-   }
+   if (flags & __DRI_CTX_FLAG_ROBUST_BUFFER_ACCESS)
+      attribs.flags |= ST_CONTEXT_FLAG_ROBUST_ACCESS;
 
-   if (notify_reset) {
-      *error = __DRI_CTX_ERROR_UNKNOWN_ATTRIBUTE;
-      goto fail;
-   }
+   if (notify_reset)
+      attribs.flags |= ST_CONTEXT_FLAG_RESET_NOTIFICATION_ENABLED;
 
    if (sharedContextPrivate) {
       st_share = ((struct dri_context *)sharedContextPrivate)->st;
@@ -233,11 +244,10 @@ dri_make_current(__DRIcontext * cPriv,
 
    ctx->stapi->make_current(ctx->stapi, ctx->st, &draw->base, &read->base);
 
-   // This is ok to call here. If they are already init, it's a no-op.
-   if (draw->textures[ST_ATTACHMENT_BACK_LEFT] && draw->textures[ST_ATTACHMENT_DEPTH_STENCIL]
-      && ctx->pp)
-         pp_init_fbos(ctx->pp, draw->textures[ST_ATTACHMENT_BACK_LEFT]->width0,
-            draw->textures[ST_ATTACHMENT_BACK_LEFT]->height0);
+   /* This is ok to call here. If they are already init, it's a no-op. */
+   if (ctx->pp && draw->textures[ST_ATTACHMENT_BACK_LEFT])
+      pp_init_fbos(ctx->pp, draw->textures[ST_ATTACHMENT_BACK_LEFT]->width0,
+                   draw->textures[ST_ATTACHMENT_BACK_LEFT]->height0);
 
    return GL_TRUE;
 }
diff --git a/src/gallium/state_trackers/dri/dri_screen.h b/src/gallium/state_trackers/dri/dri_screen.h
index bdab74f..173f403 100644
--- a/src/gallium/state_trackers/dri/dri_screen.h
+++ b/src/gallium/state_trackers/dri/dri_screen.h
@@ -82,6 +82,7 @@ struct dri_screen
    boolean d_depth_bits_last;
    boolean sd_depth_bits_last;
    boolean auto_fake_front;
+   boolean has_reset_status_query;
    enum pipe_texture_target target;
 
    /* hooks filled in by dri2 & drisw */
diff --git a/src/gallium/state_trackers/dri/drisw.c b/src/gallium/state_trackers/dri/drisw.c
index 5f69a2d..4a2c1bb 100644
--- a/src/gallium/state_trackers/dri/drisw.c
+++ b/src/gallium/state_trackers/dri/drisw.c
@@ -333,6 +333,7 @@ drisw_update_tex_buffer(struct dri_drawable *drawable,
 static const __DRIextension *drisw_screen_extensions[] = {
    &driTexBufferExtension.base,
    &dri2RendererQueryExtension.base,
+   &dri2ConfigQueryExtension.base,
    NULL
 };
 
diff --git a/src/gallium/state_trackers/glx/xlib/glx_api.c b/src/gallium/state_trackers/glx/xlib/glx_api.c
index 0508255..0456d44 100644
--- a/src/gallium/state_trackers/glx/xlib/glx_api.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_api.c
@@ -40,6 +40,13 @@
 
 #include "xm_api.h"
 
+/* glxproto.h misspelled this opcode "Atrribs" until it was fixed in Nov 2014.
+ * Map the corrected name onto the old one when only the old header exists.
+ */
+#if !defined(X_GLXCreateContextAttribsARB) && \
+     defined(X_GLXCreateContextAtrribsARB)
+#define X_GLXCreateContextAttribsARB X_GLXCreateContextAtrribsARB
+#endif
 
 /* This indicates the client-side GLX API and GLX encoder version. */
 #define CLIENT_MAJOR_VERSION 1
@@ -2168,7 +2175,7 @@ glXQueryDrawable(Display *dpy, GLXDrawable draw, int attribute,
 #endif
 
       default:
-         generate_error(dpy, BadValue, 0, X_GLXCreateContextAtrribsARB, true);
+         generate_error(dpy, BadValue, 0, X_GLXCreateContextAttribsARB, true);
          return;
    }
 }
@@ -2762,14 +2769,14 @@ glXCreateContextAttribsARB(Display *dpy, GLXFBConfig config,
          break;
       default:
          /* bad attribute */
-         generate_error(dpy, BadValue, 0, X_GLXCreateContextAtrribsARB, True);
+         generate_error(dpy, BadValue, 0, X_GLXCreateContextAttribsARB, True);
          return NULL;
       }
    }
 
    /* check contextFlags */
    if (contextFlags & ~contextFlagsAll) {
-      generate_error(dpy, BadValue, 0, X_GLXCreateContextAtrribsARB, True);
+      generate_error(dpy, BadValue, 0, X_GLXCreateContextAttribsARB, True);
       return NULL;
    }
 
@@ -2777,14 +2784,14 @@ glXCreateContextAttribsARB(Display *dpy, GLXFBConfig config,
    if (profileMask != GLX_CONTEXT_CORE_PROFILE_BIT_ARB &&
        profileMask != GLX_CONTEXT_COMPATIBILITY_PROFILE_BIT_ARB &&
        profileMask != GLX_CONTEXT_ES_PROFILE_BIT_EXT) {
-      generate_error(dpy, GLXBadProfileARB, 0, X_GLXCreateContextAtrribsARB, False);
+      generate_error(dpy, GLXBadProfileARB, 0, X_GLXCreateContextAttribsARB, False);
       return NULL;
    }
 
    /* check renderType */
    if (renderType != GLX_RGBA_TYPE &&
        renderType != GLX_COLOR_INDEX_TYPE) {
-      generate_error(dpy, BadValue, 0, X_GLXCreateContextAtrribsARB, True);
+      generate_error(dpy, BadValue, 0, X_GLXCreateContextAttribsARB, True);
       return NULL;
    }
 
@@ -2797,7 +2804,7 @@ glXCreateContextAttribsARB(Display *dpy, GLXFBConfig config,
          (majorVersion == 3 && minorVersion > 3) ||
          (majorVersion == 4 && minorVersion > 5) ||
          majorVersion > 4))) {
-      generate_error(dpy, BadMatch, 0, X_GLXCreateContextAtrribsARB, True);
+      generate_error(dpy, BadMatch, 0, X_GLXCreateContextAttribsARB, True);
       return NULL;
    }
    if (profileMask == GLX_CONTEXT_ES_PROFILE_BIT_EXT &&
@@ -2809,18 +2816,18 @@ glXCreateContextAttribsARB(Display *dpy, GLXFBConfig config,
        * different error code for invalid ES versions, but this is what NVIDIA
        * does and piglit expects.
        */
-      generate_error(dpy, GLXBadProfileARB, 0, X_GLXCreateContextAtrribsARB, False);
+      generate_error(dpy, GLXBadProfileARB, 0, X_GLXCreateContextAttribsARB, False);
       return NULL;
    }
 
    if ((contextFlags & GLX_CONTEXT_FORWARD_COMPATIBLE_BIT_ARB) &&
        majorVersion < 3) {
-      generate_error(dpy, BadMatch, 0, X_GLXCreateContextAtrribsARB, True);
+      generate_error(dpy, BadMatch, 0, X_GLXCreateContextAttribsARB, True);
       return NULL;
    }
 
    if (renderType == GLX_COLOR_INDEX_TYPE && majorVersion >= 3) {
-      generate_error(dpy, BadMatch, 0, X_GLXCreateContextAtrribsARB, True);
+      generate_error(dpy, BadMatch, 0, X_GLXCreateContextAttribsARB, True);
       return NULL;
    }
 
@@ -2830,7 +2837,7 @@ glXCreateContextAttribsARB(Display *dpy, GLXFBConfig config,
                         majorVersion, minorVersion,
                         profileMask, contextFlags);
    if (!ctx) {
-      generate_error(dpy, GLXBadFBConfig, 0, X_GLXCreateContextAtrribsARB, False);
+      generate_error(dpy, GLXBadFBConfig, 0, X_GLXCreateContextAttribsARB, False);
    }
 
    return ctx;
diff --git a/src/gallium/state_trackers/hgl/hgl.c b/src/gallium/state_trackers/hgl/hgl.c
index b75dc26..1e804c0 100644
--- a/src/gallium/state_trackers/hgl/hgl.c
+++ b/src/gallium/state_trackers/hgl/hgl.c
@@ -7,16 +7,18 @@
  *      Alexander von Gluck IV, kallisti5@unixzen.com
  */
 
+#include "hgl_context.h"
 
-#include "GLView.h"
+#include <stdio.h>
 
 #include "pipe/p_format.h"
 #include "util/u_atomic.h"
 #include "util/u_format.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
+#include "state_tracker/st_gl_api.h" /* for st_gl_api_create */
 
-#include "hgl_context.h"
+#include "GLView.h"
 
 
 #ifdef DEBUG
@@ -91,7 +93,7 @@ hgl_st_framebuffer_validate_textures(struct st_framebuffer_iface *stfbi,
 		for (i = 0; i < ST_ATTACHMENT_COUNT; i++)
 			pipe_resource_reference(&buffer->textures[i], NULL);
 	}
-	
+
 	memset(&templat, 0, sizeof(templat));
 	templat.target = buffer->target;
 	templat.width0 = width;
@@ -256,6 +258,19 @@ hgl_create_st_framebuffer(struct hgl_context* context)
 }
 
 
+/*
+ * Create the state tracker API object for hgl by forwarding to
+ * st_gl_api_create().  Declared with (void) so the definition is a real
+ * prototype matching the declaration in hgl_context.h.
+ */
+struct st_api*
+hgl_create_st_api(void)
+{
+	CALLED();
+	return st_gl_api_create();
+}
+
+
 struct st_manager *
 hgl_create_st_manager(struct hgl_context* context)
 {
diff --git a/src/gallium/state_trackers/hgl/hgl_context.h b/src/gallium/state_trackers/hgl/hgl_context.h
index 4840d9e..d2ec7fb 100644
--- a/src/gallium/state_trackers/hgl/hgl_context.h
+++ b/src/gallium/state_trackers/hgl/hgl_context.h
@@ -9,9 +9,6 @@
 #define HGL_CONTEXT_H
 
 
-#ifdef __cplusplus
-extern "C" {
-#endif
 #include "state_tracker/st_api.h"
 #include "state_tracker/st_manager.h"
 #include "pipe/p_compiler.h"
@@ -20,8 +17,10 @@ extern "C" {
 #include "os/os_thread.h"
 
 #include "bitmap_wrapper.h"
+
+
 #ifdef __cplusplus
-}
+extern "C" {
 #endif
 
 
@@ -82,6 +81,9 @@ struct hgl_context
 };
 
 
+// hgl state_tracker api
+struct st_api* hgl_create_st_api(void);
+
 // hgl state_tracker framebuffer
 struct hgl_buffer* hgl_create_st_framebuffer(struct hgl_context* context);
 
@@ -94,4 +96,8 @@ struct st_visual* hgl_create_st_visual(ulong options);
 void hgl_destroy_st_visual(struct st_visual* visual);
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* HGL_CONTEXT_H */
diff --git a/src/gallium/state_trackers/nine/nine_ff.c b/src/gallium/state_trackers/nine/nine_ff.c
index e6f2b21..c2213e6 100644
--- a/src/gallium/state_trackers/nine/nine_ff.c
+++ b/src/gallium/state_trackers/nine/nine_ff.c
@@ -422,13 +422,15 @@ nine_ff_build_vs(struct NineDevice9 *device, struct vs_build_ctx *vs)
     oCol[1] = ureg_saturate(ureg_DECL_output(ureg, TGSI_SEMANTIC_COLOR, 1));
 
     if (key->vertexpointsize || key->pointscale) {
-        oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0, TGSI_WRITEMASK_X);
+        oPsz = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_PSIZE, 0,
+                                       TGSI_WRITEMASK_X, 0, 1);
         oPsz = ureg_writemask(oPsz, TGSI_WRITEMASK_X);
     }
     if (key->fog_mode) {
         /* We apply fog to the vertex colors, oFog is for programmable shaders only ?
          */
-        oFog = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_FOG, 0, TGSI_WRITEMASK_X);
+        oFog = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_FOG, 0,
+                                       TGSI_WRITEMASK_X, 0, 1);
         oFog = ureg_writemask(oFog, TGSI_WRITEMASK_X);
     }
 
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index fd0f76e..22a5882 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -1098,7 +1098,7 @@ _tx_dst_param(struct shader_translator *tx, const struct sm1_dst_param *param)
         if (ureg_dst_is_undef(tx->regs.oDepth))
            tx->regs.oDepth =
               ureg_DECL_output_masked(tx->ureg, TGSI_SEMANTIC_POSITION, 0,
-                                      TGSI_WRITEMASK_Z);
+                                      TGSI_WRITEMASK_Z, 0, 1);
         dst = tx->regs.oDepth; /* XXX: must write .z component */
         break;
     case D3DSPR_PREDICATE:
@@ -1966,7 +1966,7 @@ DECL_SPECIAL(DCL)
                 tx->info->position_t = TRUE;
             assert(sem.reg.idx < Elements(tx->regs.o));
             tx->regs.o[sem.reg.idx] = ureg_DECL_output_masked(
-                ureg, tgsi.Name, tgsi.Index, sem.reg.mask);
+                ureg, tgsi.Name, tgsi.Index, sem.reg.mask, 0, 1);
 
             if (tgsi.Name == TGSI_SEMANTIC_PSIZE)
                 tx->regs.oPts = tx->regs.o[sem.reg.idx];
@@ -1979,12 +1979,13 @@ DECL_SPECIAL(DCL)
                 ureg, tgsi.Name, tgsi.Index,
                 nine_tgsi_to_interp_mode(&tgsi),
                 0, /* cylwrap */
-                sem.reg.mod & NINED3DSPDM_CENTROID);
+                sem.reg.mod & NINED3DSPDM_CENTROID, 0, 1);
         } else
         if (!is_input && 0) { /* declare in COLOROUT/DEPTHOUT case */
             /* FragColor or FragDepth */
             assert(sem.reg.mask != 0);
-            ureg_DECL_output_masked(ureg, tgsi.Name, tgsi.Index, sem.reg.mask);
+            ureg_DECL_output_masked(ureg, tgsi.Name, tgsi.Index, sem.reg.mask,
+                                    0, 1);
         }
     }
     return D3D_OK;
@@ -2312,7 +2313,8 @@ DECL_SPECIAL(TEXM3x2DEPTH)
     ureg_CMP(ureg, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_negate(ureg_abs(ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y))),
              ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X), ureg_imm1f(ureg, 1.0f));
     /* replace the depth for depth testing with the result */
-    tx->regs.oDepth = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_POSITION, 0, TGSI_WRITEMASK_Z);
+    tx->regs.oDepth = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                              TGSI_WRITEMASK_Z, 0, 1);
     ureg_MOV(ureg, tx->regs.oDepth, ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X));
     /* note that we write nothing to the destination, since it's disallowed to use it afterward */
     return D3D_OK;
@@ -2410,7 +2412,8 @@ DECL_SPECIAL(TEXDEPTH)
     ureg_CMP(ureg, ureg_writemask(r5, TGSI_WRITEMASK_X), ureg_negate(ureg_abs(r5g)),
              r5r, ureg_imm1f(ureg, 1.0f));
     /* replace the depth for depth testing with the result */
-    tx->regs.oDepth = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_POSITION, 0, TGSI_WRITEMASK_Z);
+    tx->regs.oDepth = ureg_DECL_output_masked(ureg, TGSI_SEMANTIC_POSITION, 0,
+                                              TGSI_WRITEMASK_Z, 0, 1);
     ureg_MOV(ureg, tx->regs.oDepth, r5r);
 
     return D3D_OK;
diff --git a/src/gallium/state_trackers/wgl/Makefile.sources b/src/gallium/state_trackers/wgl/Makefile.sources
index d204efd..8c463d5 100644
--- a/src/gallium/state_trackers/wgl/Makefile.sources
+++ b/src/gallium/state_trackers/wgl/Makefile.sources
@@ -8,6 +8,7 @@ C_SOURCES := \
 	stw_ext_swapinterval.c \
 	stw_framebuffer.c \
 	stw_getprocaddress.c \
+	stw_nopfuncs.c \
 	stw_pixelformat.c \
 	stw_st.c \
 	stw_tls.c \
diff --git a/src/gallium/state_trackers/wgl/stw_context.c b/src/gallium/state_trackers/wgl/stw_context.c
index 2ed6c2b..3e99cc4 100644
--- a/src/gallium/state_trackers/wgl/stw_context.c
+++ b/src/gallium/state_trackers/wgl/stw_context.c
@@ -226,14 +226,13 @@ stw_create_context_attribs(HDC hdc, INT iLayerPlane, DHGLRC hShareContext,
        *         be implemented, as determined by the implementation.
        *       * The core profile of version 3.2 or greater."
        *
-       * and because Mesa doesn't support GL_ARB_compatibility, the only chance to
-       * honour a 3.1 context is through core profile.
+       * But Mesa doesn't support GL_ARB_compatibility, while the most
+       * prevalent Windows OpenGL implementations do.  Unfortunately many
+       * Windows applications never check whether the context they receive
+       * actually has GL_ARB_compatibility, so returning a core profile here
+       * does more harm than good.
        */
-      if (majorVersion == 3 && minorVersion == 1) {
-         attribs.profile = ST_PROFILE_OPENGL_CORE;
-      } else {
-         attribs.profile = ST_PROFILE_DEFAULT;
-      }
+      attribs.profile = ST_PROFILE_DEFAULT;
       break;
    case WGL_CONTEXT_ES_PROFILE_BIT_EXT:
       if (majorVersion >= 2) {
diff --git a/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c b/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
index 91682d1..e38086e 100644
--- a/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_ext_pixelformat.c
@@ -88,7 +88,12 @@ stw_query_attrib(
       return TRUE;
 
    case WGL_SWAP_METHOD_ARB:
-      *pvalue = pfi->pfd.dwFlags & PFD_SWAP_COPY ? WGL_SWAP_COPY_ARB : WGL_SWAP_UNDEFINED_ARB;
+      if (pfi->pfd.dwFlags & PFD_SWAP_COPY)
+         *pvalue = WGL_SWAP_COPY_ARB;
+      else if (pfi->pfd.dwFlags & PFD_SWAP_EXCHANGE)
+         *pvalue = WGL_SWAP_EXCHANGE_EXT;
+      else
+         *pvalue = WGL_SWAP_UNDEFINED_ARB;
       return TRUE;
 
    case WGL_SWAP_LAYER_BUFFERS_ARB:
@@ -232,7 +237,7 @@ stw_query_attrib(
       break;
 
    case WGL_SAMPLE_BUFFERS_ARB:
-      *pvalue = 1;
+      *pvalue = (pfi->stvis.samples > 1);
       break;
 
    case WGL_SAMPLES_ARB:
diff --git a/src/gallium/state_trackers/wgl/stw_getprocaddress.c b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
index 2ffeec1..33949b6 100644
--- a/src/gallium/state_trackers/wgl/stw_getprocaddress.c
+++ b/src/gallium/state_trackers/wgl/stw_getprocaddress.c
@@ -35,6 +35,7 @@
 #include "glapi/glapi.h"
 #include "stw_device.h"
 #include "stw_icd.h"
+#include "stw_nopfuncs.h"
 
 struct stw_extension_entry
 {
@@ -79,6 +80,7 @@ DrvGetProcAddress(
    LPCSTR lpszProc )
 {
    const struct stw_extension_entry *entry;
+   PROC p;
 
    if (!stw_dev)
       return NULL;
@@ -88,8 +90,23 @@ DrvGetProcAddress(
          if (strcmp( lpszProc, entry->name ) == 0)
             return entry->proc;
 
-   if (lpszProc[0] == 'g' && lpszProc[1] == 'l')
-      return (PROC) _glapi_get_proc_address( lpszProc );
-
+   if (lpszProc[0] == 'g' && lpszProc[1] == 'l') {
+      p = (PROC) _glapi_get_proc_address(lpszProc);
+      if (p)
+         return p;
+   }
+
+   /* If we get here, we'd normally just return NULL, but since some apps
+    * (like Viewperf12) crash when they try to use the null pointer, try
+    * returning a pointer to a no-op function instead.
+    */
+   p = stw_get_nop_function(lpszProc);
+   if (p) {
+      debug_printf("wglGetProcAddress(\"%s\") returning no-op function\n",
+                   lpszProc);
+      return p;
+   }
+
+   debug_printf("wglGetProcAddress(\"%s\") returning NULL\n", lpszProc);
    return NULL;
 }
diff --git a/src/gallium/state_trackers/wgl/stw_nopfuncs.c b/src/gallium/state_trackers/wgl/stw_nopfuncs.c
new file mode 100644
index 0000000..d69c013
--- /dev/null
+++ b/src/gallium/state_trackers/wgl/stw_nopfuncs.c
@@ -0,0 +1,464 @@
+/**************************************************************************
+ *
+ * Copyright 2015 VMware, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * No-op GL API functions.
+ *
+ * Some OpenGL apps (like Viewperf12) call wglGetProcAddress() to get
+ * a pointer to an extension function, get a NULL pointer, but don't bother
+ * to check for NULL before jumping through the pointer.  This causes a
+ * crash.
+ *
+ * As a work-around we provide some no-op functions here to avoid those
+ * crashes.
+ */
+
+#include <GL/gl.h>
+#include "stw_nopfuncs.h"
+#include "util/u_debug.h"
+
+
+static void
+warning(const char *name)
+{
+   /* callers pass __func__ ("nop_glFoo"); +4 skips the "nop_" prefix */
+   _debug_printf("Application calling unsupported %s function\n", name+4);
+}
+
+static void APIENTRY
+nop_glBindMultiTextureEXT(GLenum texunit, GLenum target, GLuint texture)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glColor3hNV(GLhalfNV red, GLhalfNV green, GLhalfNV blue)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glColor3hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glColor4hNV(GLhalfNV red, GLhalfNV green, GLhalfNV blue, GLhalfNV alpha)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glColor4hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glDisableClientStateIndexedEXT(GLenum array, GLuint index)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glEnableClientStateIndexedEXT(GLenum array, GLuint index)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glFogCoordhNV(GLhalfNV fog)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glFogCoordhvNV(const GLhalfNV *fog)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glGetNamedBufferParameterivEXT(GLuint buffer, GLenum pname, GLint *params)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glGetNamedBufferSubDataEXT(GLuint buffer, GLintptr offset, GLsizeiptr size, void *data)
+{
+   warning(__func__);
+}
+
+static void *APIENTRY
+nop_glMapNamedBufferEXT(GLuint buffer, GLenum access)
+{
+   warning(__func__);
+   return NULL;
+}
+
+static void APIENTRY
+nop_glMatrixLoadfEXT(GLenum mode, const GLfloat *m)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMatrixLoadIdentityEXT(GLenum mode)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexCoord1hNV(GLenum target, GLhalfNV s)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexCoord1hvNV(GLenum target, const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexCoord2hNV(GLenum target, GLhalfNV s, GLhalfNV t)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexCoord2hvNV(GLenum target, const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexCoord3hNV(GLenum target, GLhalfNV s, GLhalfNV t, GLhalfNV r)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexCoord3hvNV(GLenum target, const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexCoord4hNV(GLenum target, GLhalfNV s, GLhalfNV t, GLhalfNV r, GLhalfNV q)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexCoord4hvNV(GLenum target, const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexCoordPointerEXT(GLenum texunit, GLint size, GLenum type, GLsizei stride, const void *pointer)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexEnvfEXT(GLenum texunit, GLenum target, GLenum pname, GLfloat param)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexEnvfvEXT(GLenum texunit, GLenum target, GLenum pname, const GLfloat *params)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexEnviEXT(GLenum texunit, GLenum target, GLenum pname, GLint param)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexGenfvEXT(GLenum texunit, GLenum coord, GLenum pname, const GLfloat *params)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glMultiTexGeniEXT(GLenum texunit, GLenum coord, GLenum pname, GLint param)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glNamedBufferDataEXT(GLuint buffer, GLsizeiptr size, const void *data, GLenum usage)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glNamedBufferSubDataEXT(GLuint buffer, GLintptr offset, GLsizeiptr size, const void *data)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glNamedProgramLocalParameter4fvEXT(GLuint program, GLenum target, GLuint index, const GLfloat *params)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glNamedProgramLocalParameters4fvEXT(GLuint program, GLenum target, GLuint index, GLsizei count, const GLfloat *params)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glNormal3hNV(GLhalfNV nx, GLhalfNV ny, GLhalfNV nz)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glNormal3hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glPatchParameterfv(GLenum pname, const GLfloat *values)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glPatchParameteri(GLenum pname, GLint value)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glSecondaryColor3hNV(GLhalfNV red, GLhalfNV green, GLhalfNV blue)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glSecondaryColor3hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTexCoord1hNV(GLhalfNV s)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTexCoord1hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTexCoord2hNV(GLhalfNV s, GLhalfNV t)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTexCoord2hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTexCoord3hNV(GLhalfNV s, GLhalfNV t, GLhalfNV r)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTexCoord3hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTexCoord4hNV(GLhalfNV s, GLhalfNV t, GLhalfNV r, GLhalfNV q)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTexCoord4hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTextureParameterfEXT(GLuint texture, GLenum target, GLenum pname, GLfloat param)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTextureParameterfvEXT(GLuint texture, GLenum target, GLenum pname, const GLfloat *params)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glTextureParameteriEXT(GLuint texture, GLenum target, GLenum pname, GLint param)
+{
+   warning(__func__);
+}
+
+static GLboolean APIENTRY
+nop_glUnmapNamedBufferEXT(GLuint buffer)
+{
+   warning(__func__);
+   return GL_FALSE;
+}
+
+static void APIENTRY
+nop_glVertex2hNV(GLhalfNV x, GLhalfNV y)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glVertex2hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glVertex3hNV(GLhalfNV x, GLhalfNV y, GLhalfNV z)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glVertex3hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glVertex4hNV(GLhalfNV x, GLhalfNV y, GLhalfNV z, GLhalfNV w)
+{
+   warning(__func__);
+}
+
+static void APIENTRY
+nop_glVertex4hvNV(const GLhalfNV *v)
+{
+   warning(__func__);
+}
+
+/** Return a no-op stub for the named GL function, or NULL if none exists. */
+PROC
+stw_get_nop_function(const char *name)
+{
+   static const struct {
+      const char *name;
+      PROC p;
+   } table[] = {
+      { "glBindMultiTextureEXT", (PROC) nop_glBindMultiTextureEXT },
+      { "glColor3hNV", (PROC) nop_glColor3hNV },
+      { "glColor3hvNV", (PROC) nop_glColor3hvNV },
+      { "glColor4hNV", (PROC) nop_glColor4hNV },
+      { "glColor4hvNV", (PROC) nop_glColor4hvNV },
+      { "glDisableClientStateIndexedEXT", (PROC) nop_glDisableClientStateIndexedEXT },
+      { "glEnableClientStateIndexedEXT", (PROC) nop_glEnableClientStateIndexedEXT },
+      { "glFogCoordhNV", (PROC) nop_glFogCoordhNV },
+      { "glFogCoordhvNV", (PROC) nop_glFogCoordhvNV },
+      { "glGetNamedBufferParameterivEXT", (PROC) nop_glGetNamedBufferParameterivEXT },
+      { "glGetNamedBufferSubDataEXT", (PROC) nop_glGetNamedBufferSubDataEXT },
+      { "glMapNamedBufferEXT", (PROC) nop_glMapNamedBufferEXT },
+      { "glMatrixLoadfEXT", (PROC) nop_glMatrixLoadfEXT },
+      { "glMatrixLoadIdentityEXT", (PROC) nop_glMatrixLoadIdentityEXT },
+      { "glMultiTexCoord1hNV", (PROC) nop_glMultiTexCoord1hNV },
+      { "glMultiTexCoord1hvNV", (PROC) nop_glMultiTexCoord1hvNV },
+      { "glMultiTexCoord2hNV", (PROC) nop_glMultiTexCoord2hNV },
+      { "glMultiTexCoord2hvNV", (PROC) nop_glMultiTexCoord2hvNV },
+      { "glMultiTexCoord3hNV", (PROC) nop_glMultiTexCoord3hNV },
+      { "glMultiTexCoord3hvNV", (PROC) nop_glMultiTexCoord3hvNV },
+      { "glMultiTexCoord4hNV", (PROC) nop_glMultiTexCoord4hNV },
+      { "glMultiTexCoord4hvNV", (PROC) nop_glMultiTexCoord4hvNV },
+      { "glMultiTexCoordPointerEXT", (PROC) nop_glMultiTexCoordPointerEXT },
+      { "glMultiTexEnvfEXT", (PROC) nop_glMultiTexEnvfEXT },
+      { "glMultiTexEnvfvEXT", (PROC) nop_glMultiTexEnvfvEXT },
+      { "glMultiTexEnviEXT", (PROC) nop_glMultiTexEnviEXT },
+      { "glMultiTexGenfvEXT", (PROC) nop_glMultiTexGenfvEXT },
+      { "glMultiTexGeniEXT", (PROC) nop_glMultiTexGeniEXT },
+      { "glNamedBufferDataEXT", (PROC) nop_glNamedBufferDataEXT },
+      { "glNamedBufferSubDataEXT", (PROC) nop_glNamedBufferSubDataEXT },
+      { "glNamedProgramLocalParameter4fvEXT", (PROC) nop_glNamedProgramLocalParameter4fvEXT },
+      { "glNamedProgramLocalParameters4fvEXT", (PROC) nop_glNamedProgramLocalParameters4fvEXT },
+      { "glNormal3hNV", (PROC) nop_glNormal3hNV },
+      { "glNormal3hvNV", (PROC) nop_glNormal3hvNV },
+      { "glPatchParameterfv", (PROC) nop_glPatchParameterfv },
+      { "glPatchParameteri", (PROC) nop_glPatchParameteri },
+      { "glSecondaryColor3hNV", (PROC) nop_glSecondaryColor3hNV },
+      { "glSecondaryColor3hvNV", (PROC) nop_glSecondaryColor3hvNV },
+      { "glTexCoord1hNV", (PROC) nop_glTexCoord1hNV },
+      { "glTexCoord1hvNV", (PROC) nop_glTexCoord1hvNV },
+      { "glTexCoord2hNV", (PROC) nop_glTexCoord2hNV },
+      { "glTexCoord2hvNV", (PROC) nop_glTexCoord2hvNV },
+      { "glTexCoord3hNV", (PROC) nop_glTexCoord3hNV },
+      { "glTexCoord3hvNV", (PROC) nop_glTexCoord3hvNV },
+      { "glTexCoord4hNV", (PROC) nop_glTexCoord4hNV },
+      { "glTexCoord4hvNV", (PROC) nop_glTexCoord4hvNV },
+      { "glTextureParameterfEXT", (PROC) nop_glTextureParameterfEXT },
+      { "glTextureParameterfvEXT", (PROC) nop_glTextureParameterfvEXT },
+      { "glTextureParameteriEXT", (PROC) nop_glTextureParameteriEXT },
+      { "glUnmapNamedBufferEXT", (PROC) nop_glUnmapNamedBufferEXT },
+      { "glVertex2hNV", (PROC) nop_glVertex2hNV },
+      { "glVertex2hvNV", (PROC) nop_glVertex2hvNV },
+      { "glVertex3hNV", (PROC) nop_glVertex3hNV },
+      { "glVertex3hvNV", (PROC) nop_glVertex3hvNV },
+      { "glVertex4hNV", (PROC) nop_glVertex4hNV },
+      { "glVertex4hvNV", (PROC) nop_glVertex4hvNV },
+      { NULL, NULL }
+   };
+
+   int i;
+
+   for (i = 0; table[i].name; i++) {
+      if (strcmp(table[i].name, name) == 0)
+         return table[i].p;
+   }
+   return NULL;
+}
diff --git a/src/gallium/state_trackers/wgl/stw_nopfuncs.h b/src/gallium/state_trackers/wgl/stw_nopfuncs.h
new file mode 100644
index 0000000..f00d420
--- /dev/null
+++ b/src/gallium/state_trackers/wgl/stw_nopfuncs.h
@@ -0,0 +1,11 @@
+#ifndef STW_NOPFUNCS_H
+#define STW_NOPFUNCS_H
+
+#include <windows.h>  /* PROC */
+
+/* Return a no-op stub for the named GL function, or NULL if none exists. */
+PROC
+stw_get_nop_function(const char *name);
+
+
+#endif /* STW_NOPFUNCS_H */
diff --git a/src/gallium/state_trackers/wgl/stw_pixelformat.c b/src/gallium/state_trackers/wgl/stw_pixelformat.c
index b0cd5ab..db6cf8e 100644
--- a/src/gallium/state_trackers/wgl/stw_pixelformat.c
+++ b/src/gallium/state_trackers/wgl/stw_pixelformat.c
@@ -113,7 +113,9 @@ stw_pf_doublebuffer[] = {
 const unsigned 
 stw_pf_multisample[] = {
    0,
-   4
+   4,
+   8,
+   16
 };
 
 
@@ -222,23 +224,32 @@ add_color_format_variants(const struct stw_pf_color_info *color,
    unsigned ms, db, ds, acc;
    unsigned bind_flags = PIPE_BIND_RENDER_TARGET;
    unsigned num_added = 0;
+   int force_samples = 0;
 
-   if (!extended) {
-      bind_flags |= PIPE_BIND_DISPLAY_TARGET;
+   /* Since GLUT for Windows doesn't support MSAA we have an env var
+    * to force all pixel formats to have a particular number of samples.
+    */
+   {
+      const char *samples= getenv("SVGA_FORCE_MSAA");
+      if (samples)
+         force_samples = atoi(samples);
    }
 
-   if (!screen->is_format_supported(screen, color->format,
-                                    PIPE_TEXTURE_2D, 0, bind_flags)) {
-      return 0;
+   if (!extended) {
+      bind_flags |= PIPE_BIND_DISPLAY_TARGET;
    }
 
    for (ms = 0; ms < Elements(stw_pf_multisample); ms++) {
       unsigned samples = stw_pf_multisample[ms];
 
-      /* FIXME: re-enabled MSAA when we can query it */
-      if (samples)
+      if (force_samples && samples != force_samples)
          continue;
 
+      if (!screen->is_format_supported(screen, color->format,
+                                       PIPE_TEXTURE_2D, samples, bind_flags)) {
+         continue;
+      }
+
       for (db = 0; db < Elements(stw_pf_doublebuffer); db++) {
          unsigned doublebuffer = stw_pf_doublebuffer[db];
 
@@ -246,7 +257,7 @@ add_color_format_variants(const struct stw_pf_color_info *color,
             const struct stw_pf_depth_info *depth = &stw_pf_depth_stencil[ds];
 
             if (!screen->is_format_supported(screen, depth->format,
-                                             PIPE_TEXTURE_2D, 0,
+                                             PIPE_TEXTURE_2D, samples,
                                              PIPE_BIND_DEPTH_STENCIL)) {
                continue;
             }
diff --git a/src/gallium/state_trackers/wgl/stw_st.c b/src/gallium/state_trackers/wgl/stw_st.c
index e95c37f..0a9116c 100644
--- a/src/gallium/state_trackers/wgl/stw_st.c
+++ b/src/gallium/state_trackers/wgl/stw_st.c
@@ -77,6 +77,7 @@ stw_st_framebuffer_validate_locked(struct st_framebuffer_iface *stfb,
    templ.depth0 = 1;
    templ.array_size = 1;
    templ.last_level = 0;
+   templ.nr_samples = stwfb->stvis.samples;
 
    for (i = 0; i < ST_ATTACHMENT_COUNT; i++) {
       enum pipe_format format;
@@ -95,6 +96,7 @@ stw_st_framebuffer_validate_locked(struct st_framebuffer_iface *stfb,
       case ST_ATTACHMENT_BACK_LEFT:
          format = stwfb->stvis.color_format;
          bind = PIPE_BIND_DISPLAY_TARGET |
+                PIPE_BIND_SAMPLER_VIEW |
                 PIPE_BIND_RENDER_TARGET;
          break;
       case ST_ATTACHMENT_DEPTH_STENCIL:
diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c
index 8901998..f69ac8e 100644
--- a/src/gallium/state_trackers/xa/xa_tracker.c
+++ b/src/gallium/state_trackers/xa/xa_tracker.c
@@ -535,15 +535,3 @@ xa_surface_format(const struct xa_surface *srf)
 {
     return srf->fdesc.xa_format;
 }
-
-/*
- * _mesa_error_no_memory() is expected by NIR to be provided by the
- * user.  Normally this is in mesa st, but other state trackers
- * must provide their own.
- */
-void _mesa_error_no_memory(const char *caller);
-void
-_mesa_error_no_memory(const char *caller)
-{
-	debug_printf("Mesa error: out of memory in %s", caller);
-}
diff --git a/src/gallium/targets/d3dadapter9/Makefile.am b/src/gallium/targets/d3dadapter9/Makefile.am
index 1dc55f5..591978f 100644
--- a/src/gallium/targets/d3dadapter9/Makefile.am
+++ b/src/gallium/targets/d3dadapter9/Makefile.am
@@ -74,6 +74,8 @@ endif # HAVE_LD_VERSION_SCRIPT
 d3dadapter9_la_LIBADD = \
 	$(top_builddir)/src/gallium/auxiliary/libgalliumvl_stub.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
+	$(top_builddir)/src/glsl/libnir.la \
+	$(top_builddir)/src/libglsl_util.la \
 	$(top_builddir)/src/gallium/state_trackers/nine/libninetracker.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(top_builddir)/src/gallium/winsys/sw/wrapper/libwsw.la \
diff --git a/src/gallium/targets/dri/Android.mk b/src/gallium/targets/dri/Android.mk
new file mode 100644
index 0000000..5ba129b
--- /dev/null
+++ b/src/gallium/targets/dri/Android.mk
@@ -0,0 +1,127 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C) 2015 Chih-Wei Huang <cwhuang@linux.org.tw>
+# Copyright (C) 2015 Android-x86 Open Source Project
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := gallium_dri
+
+ifeq ($(MESA_LOLLIPOP_BUILD),true)
+LOCAL_MODULE_RELATIVE_PATH := $(notdir $(MESA_DRI_MODULE_PATH))
+else
+LOCAL_MODULE_PATH := $(MESA_DRI_MODULE_PATH)
+endif
+
+LOCAL_SRC_FILES := target.c
+LOCAL_CFLAGS := -DDRI_TARGET -DHAVE_LIBDRM
+# Start from an empty list: CLEAR_VARS does not reset non-LOCAL_ variables.
+gallium_DRIVERS :=
+LOCAL_SHARED_LIBRARIES := \
+	libdl \
+	libglapi \
+	libexpat \
+
+# swrast only?
+ifeq ($(MESA_GPU_DRIVERS),swrast)
+LOCAL_CFLAGS += -D__NOT_HAVE_DRM_H
+else
+LOCAL_SHARED_LIBRARIES += libdrm
+endif
+
+ifneq ($(filter freedreno,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DGALLIUM_FREEDRENO
+gallium_DRIVERS += libmesa_winsys_freedreno libmesa_pipe_freedreno
+LOCAL_SHARED_LIBRARIES += libdrm_freedreno
+endif
+ifneq ($(filter i915g,$(MESA_GPU_DRIVERS)),)
+gallium_DRIVERS += libmesa_winsys_i915 libmesa_pipe_i915
+LOCAL_SHARED_LIBRARIES += libdrm_intel
+LOCAL_CFLAGS += -DGALLIUM_I915
+endif
+ifneq ($(filter ilo,$(MESA_GPU_DRIVERS)),)
+gallium_DRIVERS += libmesa_winsys_intel libmesa_pipe_ilo
+LOCAL_SHARED_LIBRARIES += libdrm_intel
+LOCAL_CFLAGS += -DGALLIUM_ILO
+endif
+ifneq ($(filter nouveau,$(MESA_GPU_DRIVERS)),)
+gallium_DRIVERS +=  libmesa_winsys_nouveau libmesa_pipe_nouveau
+LOCAL_CFLAGS += -DGALLIUM_NOUVEAU
+LOCAL_SHARED_LIBRARIES += libdrm_nouveau
+endif
+ifneq ($(filter r%,$(MESA_GPU_DRIVERS)),)
+ifneq ($(filter r300g,$(MESA_GPU_DRIVERS)),)
+gallium_DRIVERS += libmesa_pipe_r300
+LOCAL_CFLAGS += -DGALLIUM_R300
+endif
+ifneq ($(filter r600g,$(MESA_GPU_DRIVERS)),)
+gallium_DRIVERS += libmesa_pipe_r600
+LOCAL_CFLAGS += -DGALLIUM_R600
+endif
+ifneq ($(filter radeonsi,$(MESA_GPU_DRIVERS)),)
+gallium_DRIVERS += libmesa_pipe_radeonsi
+LOCAL_SHARED_LIBRARIES += libLLVM
+LOCAL_CFLAGS += -DGALLIUM_RADEONSI
+endif
+gallium_DRIVERS += libmesa_winsys_radeon libmesa_pipe_radeon
+LOCAL_SHARED_LIBRARIES += libdrm_radeon
+endif
+ifneq ($(filter swrast,$(MESA_GPU_DRIVERS)),)
+gallium_DRIVERS += libmesa_pipe_softpipe libmesa_winsys_sw_dri libmesa_winsys_sw_kms_dri
+LOCAL_CFLAGS += -DGALLIUM_SOFTPIPE
+endif
+ifneq ($(filter vc4,$(MESA_GPU_DRIVERS)),)
+LOCAL_CFLAGS += -DGALLIUM_VC4
+gallium_DRIVERS += libmesa_winsys_vc4 libmesa_pipe_vc4
+endif
+ifneq ($(filter vmwgfx,$(MESA_GPU_DRIVERS)),)
+gallium_DRIVERS += libmesa_winsys_svga libmesa_pipe_svga
+LOCAL_CFLAGS += -DGALLIUM_VMWGFX
+endif
+ifneq ($(filter nouveau r600g,$(MESA_GPU_DRIVERS)),)
+LOCAL_SHARED_LIBRARIES += $(if $(filter true,$(MESA_LOLLIPOP_BUILD)),libc++,libstlport)
+endif
+
+LOCAL_STATIC_LIBRARIES := \
+	$(gallium_DRIVERS) \
+	libmesa_st_dri \
+	libmesa_st_mesa \
+	libmesa_glsl \
+	libmesa_dri_common \
+	libmesa_megadriver_stub \
+	libmesa_gallium \
+	libmesa_util \
+	libmesa_loader \
+
+ifeq ($(MESA_ENABLE_LLVM),true)
+LOCAL_STATIC_LIBRARIES += \
+	libLLVMR600CodeGen \
+	libLLVMR600Desc \
+	libLLVMR600Info \
+	libLLVMR600AsmPrinter \
+	libelf
+LOCAL_LDLIBS += $(if $(filter true,$(MESA_LOLLIPOP_BUILD)),-lgcc)
+endif
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_SHARED_LIBRARY)
diff --git a/src/gallium/targets/dri/Makefile.am b/src/gallium/targets/dri/Makefile.am
index f9e4ada..9648396 100644
--- a/src/gallium/targets/dri/Makefile.am
+++ b/src/gallium/targets/dri/Makefile.am
@@ -53,12 +53,6 @@ gallium_dri_la_LIBADD = \
 	$(LIBDRM_LIBS) \
 	$(GALLIUM_COMMON_LIB_DEPS)
 
-# XXX: Temporary allow duplicated symbols, as the loader pulls in xmlconfig.c
-# which already provides driParse* and driQuery* amongst others.
-# Remove this hack as we come up with a cleaner solution.
-gallium_dri_la_LDFLAGS += \
-	-Wl,--allow-multiple-definition
-
 EXTRA_gallium_dri_la_DEPENDENCIES = \
 	dri.sym \
 	$(top_srcdir)/src/gallium/targets/dri-vdpau.dyn
diff --git a/src/gallium/targets/haiku-softpipe/GalliumContext.cpp b/src/gallium/targets/haiku-softpipe/GalliumContext.cpp
index f9d7dfc..1e3874b 100644
--- a/src/gallium/targets/haiku-softpipe/GalliumContext.cpp
+++ b/src/gallium/targets/haiku-softpipe/GalliumContext.cpp
@@ -10,17 +10,18 @@
 
 #include "GalliumContext.h"
 
+#include <stdio.h>
+
 #include "GLView.h"
 
 #include "bitmap_wrapper.h"
-extern "C" {
+
 #include "glapi/glapi.h"
 #include "pipe/p_format.h"
-#include "state_tracker/st_cb_fbo.h"
-#include "state_tracker/st_cb_flush.h"
+//#include "state_tracker/st_cb_fbo.h"
+//#include "state_tracker/st_cb_flush.h"
 #include "state_tracker/st_context.h"
 #include "state_tracker/st_gl_api.h"
-#include "state_tracker/st_manager.h"
 #include "state_tracker/sw_winsys.h"
 #include "sw/hgl/hgl_sw_winsys.h"
 #include "util/u_atomic.h"
@@ -28,7 +29,6 @@ extern "C" {
 
 #include "target-helpers/inline_sw_helper.h"
 #include "target-helpers/inline_debug_helper.h"
-}
 
 
 #ifdef DEBUG
@@ -125,7 +125,8 @@ GalliumContext::CreateContext(Bitmap *bitmap)
 	context->read = NULL;
 	context->st = NULL;
 
-	context->api = st_gl_api_create();
+	// Create st_gl_api
+	context->api = hgl_create_st_api();
 	if (!context->api) {
 		ERROR("%s: Couldn't obtain Mesa state tracker API!\n", __func__);
 		return -1;
@@ -157,12 +158,10 @@ GalliumContext::CreateContext(Bitmap *bitmap)
 	attribs.minor = 0;
 	//attribs.flags |= ST_CONTEXT_FLAG_DEBUG;
 
-	struct st_api* api = context->api;
-
 	// Create context using state tracker api call
 	enum st_context_error result;
-	context->st = api->create_context(api, context->manager, &attribs,
-		&result, context->st);
+	context->st = context->api->create_context(context->api, context->manager,
+		&attribs, &result, context->st);
 
 	if (!context->st) {
 		ERROR("%s: Couldn't create mesa state tracker context!\n",
@@ -287,10 +286,8 @@ GalliumContext::SetCurrentContext(Bitmap *bitmap, context_id contextID)
 		return B_ERROR;
 	}
 
-	struct st_api* api = context->api;
-
 	if (!bitmap) {
-		api->make_current(context->api, NULL, NULL, NULL);
+		context->api->make_current(context->api, NULL, NULL, NULL);
 		return B_OK;
 	}
 
@@ -303,7 +300,7 @@ GalliumContext::SetCurrentContext(Bitmap *bitmap, context_id contextID)
 	}
 
 	// We need to lock and unlock framebuffers before accessing them
-	api->make_current(context->api, context->st, context->draw->stfbi,
+	context->api->make_current(context->api, context->st, context->draw->stfbi,
 		context->read->stfbi);
 
 	//if (context->textures[ST_ATTACHMENT_BACK_LEFT]
diff --git a/src/gallium/targets/haiku-softpipe/GalliumContext.h b/src/gallium/targets/haiku-softpipe/GalliumContext.h
index b50d528..22076cb 100644
--- a/src/gallium/targets/haiku-softpipe/GalliumContext.h
+++ b/src/gallium/targets/haiku-softpipe/GalliumContext.h
@@ -12,14 +12,10 @@
 #include <stddef.h>
 #include <kernel/image.h>
 
-extern "C" {
-//#include "state_tracker/st_api.h"
 #include "pipe/p_compiler.h"
 #include "pipe/p_screen.h"
 #include "postprocess/filters.h"
-#include "os/os_thread.h"
 #include "hgl_context.h"
-}
 
 #include "bitmap_wrapper.h"
 
@@ -56,6 +52,6 @@ private:
 		context_id			fCurrentContext;
 		pipe_mutex			fMutex;
 };
-	
+
 
 #endif /* GALLIUMCONTEXT_H */
diff --git a/src/gallium/targets/libgl-xlib/Makefile.am b/src/gallium/targets/libgl-xlib/Makefile.am
index 33b0d13..d99caae 100644
--- a/src/gallium/targets/libgl-xlib/Makefile.am
+++ b/src/gallium/targets/libgl-xlib/Makefile.am
@@ -24,6 +24,11 @@ GL_MAJOR = 1
 GL_MINOR = 5
 GL_TINY = $(MESA_MAJOR)$(MESA_MINOR)0$(MESA_TINY)
 
+if HAVE_SHARED_GLAPI
+SHARED_GLAPI_CFLAGS = -DGLX_SHARED_GLAPI
+SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
+endif
+
 AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src \
@@ -35,6 +40,7 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/gallium/state_trackers/glx/xlib \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/gallium/winsys \
+	$(SHARED_GLAPI_CFLAGS) \
 	-DGALLIUM_SOFTPIPE \
 	-DGALLIUM_RBUG \
 	-DGALLIUM_TRACE
@@ -65,6 +71,7 @@ lib@GL_LIB@_la_LIBADD = \
 	$(top_builddir)/src/mapi/glapi/libglapi.la \
 	$(top_builddir)/src/mesa/libmesagallium.la \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
+	$(SHARED_GLAPI_LIB) \
 	$(GL_LIB_DEPS) \
 	$(CLOCK_LIB)
 
diff --git a/src/gallium/targets/osmesa/Makefile.am b/src/gallium/targets/osmesa/Makefile.am
index 2c09736..38e515f 100644
--- a/src/gallium/targets/osmesa/Makefile.am
+++ b/src/gallium/targets/osmesa/Makefile.am
@@ -42,7 +42,6 @@ nodist_EXTRA_lib@OSMESA_LIB@_la_SOURCES = dummy.cpp
 lib@OSMESA_LIB@_la_SOURCES = target.c
 
 lib@OSMESA_LIB@_la_LDFLAGS = \
-	-module \
 	-no-undefined \
 	-version-number @OSMESA_VERSION@ \
 	$(GC_SECTIONS) \
diff --git a/src/gallium/targets/pipe-loader/Makefile.am b/src/gallium/targets/pipe-loader/Makefile.am
index 967cdb7..e4048b5 100644
--- a/src/gallium/targets/pipe-loader/Makefile.am
+++ b/src/gallium/targets/pipe-loader/Makefile.am
@@ -52,6 +52,8 @@ endif
 
 PIPE_LIBS += \
 	$(top_builddir)/src/gallium/auxiliary/libgallium.la \
+	$(top_builddir)/src/glsl/libnir.la \
+	$(top_builddir)/src/libglsl_util.la \
 	$(top_builddir)/src/util/libmesautil.la \
 	$(top_builddir)/src/gallium/drivers/rbug/librbug.la \
 	$(top_builddir)/src/gallium/drivers/trace/libtrace.la \
diff --git a/src/gallium/tests/trivial/quad-tex.c b/src/gallium/tests/trivial/quad-tex.c
index abecedb..daae577 100644
--- a/src/gallium/tests/trivial/quad-tex.c
+++ b/src/gallium/tests/trivial/quad-tex.c
@@ -270,7 +270,9 @@ static void init_prog(struct program *p)
 	}
 
 	/* fragment shader */
-	p->fs = util_make_fragment_tex_shader(p->pipe, TGSI_TEXTURE_2D, TGSI_INTERPOLATE_LINEAR);
+	p->fs = util_make_fragment_tex_shader(p->pipe, TGSI_TEXTURE_2D,
+	                                      TGSI_INTERPOLATE_LINEAR,
+	                                      TGSI_RETURN_TYPE_FLOAT);
 }
 
 static void close_prog(struct program *p)
diff --git a/src/gallium/winsys/sw/android/android_sw_winsys.cpp b/src/gallium/winsys/sw/android/android_sw_winsys.cpp
deleted file mode 100644
index 4b1040c..0000000
--- a/src/gallium/winsys/sw/android/android_sw_winsys.cpp
+++ /dev/null
@@ -1,264 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2010-2011 LunarG Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Chia-I Wu <olv@lunarg.com>
- */
-
-#include "pipe/p_compiler.h"
-#include "pipe/p_state.h"
-#include "util/u_memory.h"
-#include "util/u_format.h"
-#include "state_tracker/sw_winsys.h"
-
-#include <hardware/gralloc.h>
-#include <utils/Errors.h>
-
-#if ANDROID_VERSION < 0x0300
-#include <private/ui/sw_gralloc_handle.h>
-#endif
-
-#include "android_sw_winsys.h"
-
-struct android_sw_winsys
-{
-   struct sw_winsys base;
-
-   const gralloc_module_t *grmod;
-};
-
-struct android_sw_displaytarget
-{
-   buffer_handle_t handle;
-   int stride;
-   int width, height;
-   int usage; /* gralloc usage */
-
-   void *mapped;
-};
-
-static INLINE struct android_sw_winsys *
-android_sw_winsys(struct sw_winsys *ws)
-{
-   return (struct android_sw_winsys *) ws;
-}
-
-static INLINE struct android_sw_displaytarget *
-android_sw_displaytarget(struct sw_displaytarget *dt)
-{
-   return (struct android_sw_displaytarget *) dt;
-}
-
-namespace android {
-
-static void
-android_displaytarget_display(struct sw_winsys *ws,
-                              struct sw_displaytarget *dt,
-                              void *context_private,
-                              struct pipe_box *box)
-{
-}
-
-static struct sw_displaytarget *
-android_displaytarget_create(struct sw_winsys *ws,
-                             unsigned tex_usage,
-                             enum pipe_format format,
-                             unsigned width, unsigned height,
-                             unsigned alignment,
-                             unsigned *stride)
-{
-   return NULL;
-}
-
-static void
-android_displaytarget_destroy(struct sw_winsys *ws,
-                              struct sw_displaytarget *dt)
-{
-   struct android_sw_displaytarget *adt = android_sw_displaytarget(dt);
-
-   assert(!adt->mapped);
-   FREE(adt);
-}
-
-static void
-android_displaytarget_unmap(struct sw_winsys *ws,
-                            struct sw_displaytarget *dt)
-{
-   struct android_sw_winsys *droid = android_sw_winsys(ws);
-   struct android_sw_displaytarget *adt = android_sw_displaytarget(dt);
-
-#if ANDROID_VERSION < 0x0300
-   /* try sw_gralloc first */
-   if (adt->mapped && sw_gralloc_handle_t::validate(adt->handle) >= 0) {
-      adt->mapped = NULL;
-      return;
-   }
-#endif
-
-   if (adt->mapped) {
-      droid->grmod->unlock(droid->grmod, adt->handle);
-      adt->mapped = NULL;
-   }
-}
-
-static void *
-android_displaytarget_map(struct sw_winsys *ws,
-                          struct sw_displaytarget *dt,
-                          unsigned flags)
-{
-   struct android_sw_winsys *droid = android_sw_winsys(ws);
-   struct android_sw_displaytarget *adt = android_sw_displaytarget(dt);
-
-#if ANDROID_VERSION < 0x0300
-   /* try sw_gralloc first */
-   if (sw_gralloc_handle_t::validate(adt->handle) >= 0) {
-      const sw_gralloc_handle_t *swhandle =
-         reinterpret_cast<const sw_gralloc_handle_t *>(adt->handle);
-      adt->mapped = reinterpret_cast<void *>(swhandle->base);
-
-      return adt->mapped;
-   }
-#endif
-
-   if (!adt->mapped) {
-      /* lock the buffer for CPU access */
-      droid->grmod->lock(droid->grmod, adt->handle,
-            adt->usage, 0, 0, adt->width, adt->height, &adt->mapped);
-   }
-
-   return adt->mapped;
-}
-
-static struct sw_displaytarget *
-android_displaytarget_from_handle(struct sw_winsys *ws,
-                                  const struct pipe_resource *templ,
-                                  struct winsys_handle *whandle,
-                                  unsigned *stride)
-{
-   struct android_winsys_handle *ahandle =
-      (struct android_winsys_handle *) whandle;
-   struct android_sw_displaytarget *adt;
-
-   adt = CALLOC_STRUCT(android_sw_displaytarget);
-   if (!adt)
-      return NULL;
-
-   adt->handle = ahandle->handle;
-   adt->stride = ahandle->stride;
-   adt->width = templ->width0;
-   adt->height = templ->height0;
-
-   if (templ->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_TRANSFER_WRITE))
-      adt->usage |= GRALLOC_USAGE_SW_WRITE_OFTEN;
-   if (templ->bind & (PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_TRANSFER_READ))
-      adt->usage |= GRALLOC_USAGE_SW_READ_OFTEN;
-
-   if (stride)
-      *stride = adt->stride;
-
-   return reinterpret_cast<struct sw_displaytarget *>(adt);
-}
-
-static boolean
-android_displaytarget_get_handle(struct sw_winsys *ws,
-                                 struct sw_displaytarget *dt,
-                                 struct winsys_handle *whandle)
-{
-   return FALSE;
-}
-
-static boolean
-android_is_displaytarget_format_supported(struct sw_winsys *ws,
-                                          unsigned tex_usage,
-                                          enum pipe_format format)
-{
-   struct android_sw_winsys *droid = android_sw_winsys(ws);
-   int fmt = -1;
-
-   switch (format) {
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      fmt = HAL_PIXEL_FORMAT_RGBA_8888;
-      break;
-   case PIPE_FORMAT_R8G8B8X8_UNORM:
-      fmt = HAL_PIXEL_FORMAT_RGBX_8888;
-      break;
-   case PIPE_FORMAT_R8G8B8_UNORM:
-      fmt = HAL_PIXEL_FORMAT_RGB_888;
-      break;
-   case PIPE_FORMAT_B5G6R5_UNORM:
-      fmt = HAL_PIXEL_FORMAT_RGB_565;
-      break;
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      fmt = HAL_PIXEL_FORMAT_BGRA_8888;
-      break;
-   default:
-      break;
-   }
-
-   return (fmt != -1);
-}
-
-static void
-android_destroy(struct sw_winsys *ws)
-{
-   struct android_sw_winsys *droid = android_sw_winsys(ws);
-
-   FREE(droid);
-}
-
-}; /* namespace android */
-
-using namespace android;
-
-struct sw_winsys *
-android_create_sw_winsys(void)
-{
-   struct android_sw_winsys *droid;
-   const hw_module_t *mod;
-
-   droid = CALLOC_STRUCT(android_sw_winsys);
-   if (!droid)
-      return NULL;
-
-   if (hw_get_module(GRALLOC_HARDWARE_MODULE_ID, &mod)) {
-      FREE(droid);
-      return NULL;
-   }
-
-   droid->grmod = (const gralloc_module_t *) mod;
-
-   droid->base.destroy = android_destroy;
-   droid->base.is_displaytarget_format_supported =
-      android_is_displaytarget_format_supported;
-
-   droid->base.displaytarget_create = android_displaytarget_create;
-   droid->base.displaytarget_destroy = android_displaytarget_destroy;
-   droid->base.displaytarget_from_handle = android_displaytarget_from_handle;
-   droid->base.displaytarget_get_handle = android_displaytarget_get_handle;
-
-   droid->base.displaytarget_map = android_displaytarget_map;
-   droid->base.displaytarget_unmap = android_displaytarget_unmap;
-   droid->base.displaytarget_display = android_displaytarget_display;
-
-   return &droid->base;
-}
diff --git a/src/gallium/winsys/sw/android/android_sw_winsys.h b/src/gallium/winsys/sw/android/android_sw_winsys.h
deleted file mode 100644
index 24c85ed..0000000
--- a/src/gallium/winsys/sw/android/android_sw_winsys.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * Mesa 3-D graphics library
- *
- * Copyright (C) 2010-2011 LunarG Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
- * DEALINGS IN THE SOFTWARE.
- *
- * Authors:
- *    Chia-I Wu <olv@lunarg.com>
- */
-
-#ifndef ANDROID_SW_WINSYS
-#define ANDROID_SW_WINSYS
-
-#include <sys/cdefs.h>
-#include <hardware/gralloc.h>
-
-__BEGIN_DECLS
-
-struct sw_winsys;
-
-struct android_winsys_handle {
-   buffer_handle_t handle;
-   int stride;
-};
-
-struct sw_winsys *
-android_create_sw_winsys(void);
-
-__END_DECLS
-
-#endif /* ANDROID_SW_WINSYS */
diff --git a/src/gallium/winsys/sw/android/Android.mk b/src/gallium/winsys/sw/dri/Android.mk
index 4fb2715..72fb920 100644
--- a/src/gallium/winsys/sw/android/Android.mk
+++ b/src/gallium/winsys/sw/dri/Android.mk
@@ -1,7 +1,7 @@
 # Mesa 3-D graphics library
 #
-# Copyright (C) 2010-2011 Chia-I Wu <olvaffe@gmail.com>
-# Copyright (C) 2010-2011 LunarG Inc.
+# Copyright (C) 2015 Chih-Wei Huang <cwhuang@linux.org.tw>
+# Copyright (C) 2015 Android-x86 Open Source Project
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -23,12 +23,13 @@
 
 LOCAL_PATH := $(call my-dir)
 
+include $(LOCAL_PATH)/Makefile.sources
+
 include $(CLEAR_VARS)
 
-LOCAL_SRC_FILES := \
-	android_sw_winsys.cpp
+LOCAL_SRC_FILES := $(C_SOURCES)
 
-LOCAL_MODULE := libmesa_winsys_sw_android
+LOCAL_MODULE := libmesa_winsys_sw_dri
 
 include $(GALLIUM_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/winsys/sw/hgl/hgl_sw_winsys.h b/src/gallium/winsys/sw/hgl/hgl_sw_winsys.h
index bdcddfb..a81f890 100644
--- a/src/gallium/winsys/sw/hgl/hgl_sw_winsys.h
+++ b/src/gallium/winsys/sw/hgl/hgl_sw_winsys.h
@@ -27,9 +27,16 @@
 #ifndef _HGL_SOFTWAREWINSYS_H
 #define _HGL_SOFTWAREWINSYS_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 struct sw_winsys;
 
 struct sw_winsys* hgl_create_sw_winsys(void);
 
+#ifdef __cplusplus
+}
+#endif
 
 #endif
diff --git a/src/gallium/winsys/sw/kms-dri/Android.mk b/src/gallium/winsys/sw/kms-dri/Android.mk
new file mode 100644
index 0000000..b065242
--- /dev/null
+++ b/src/gallium/winsys/sw/kms-dri/Android.mk
@@ -0,0 +1,37 @@
+# Mesa 3-D graphics library
+#
+# Copyright (C) 2015 Chih-Wei Huang <cwhuang@linux.org.tw>
+# Copyright (C) 2015 Android-x86 Open Source Project
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(C_SOURCES)
+
+LOCAL_MODULE := libmesa_winsys_sw_kms_dri
+
+LOCAL_SHARED_LIBRARIES := libdrm
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gallium/winsys/vc4/drm/Android.mk b/src/gallium/winsys/vc4/drm/Android.mk
new file mode 100644
index 0000000..55edc17
--- /dev/null
+++ b/src/gallium/winsys/vc4/drm/Android.mk
@@ -0,0 +1,34 @@
+# Copyright (C) 2014 Emil Velikov <emil.l.velikov@gmail.com>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+LOCAL_PATH := $(call my-dir)
+
+# get C_SOURCES
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := $(C_SOURCES)
+
+LOCAL_SHARED_LIBRARIES := libdrm
+LOCAL_MODULE := libmesa_winsys_vc4
+
+include $(GALLIUM_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
diff --git a/src/gbm/Makefile.am b/src/gbm/Makefile.am
index dbd4f83..918fdf7 100644
--- a/src/gbm/Makefile.am
+++ b/src/gbm/Makefile.am
@@ -52,7 +52,8 @@ libgbm_dri_la_CFLAGS = \
 	$(LIBDRM_CFLAGS)
 
 libgbm_la_LIBADD += \
-	libgbm_dri.la $(top_builddir)/src/mapi/shared-glapi/libglapi.la $(LIBDRM_LIBS)
+	libgbm_dri.la \
+	$(LIBDRM_LIBS)
 endif
 
 TESTS = gbm-symbols-check
diff --git a/src/gbm/backends/dri/gbm_dri.c b/src/gbm/backends/dri/gbm_dri.c
index 62bdf89..ccc3cc6 100644
--- a/src/gbm/backends/dri/gbm_dri.c
+++ b/src/gbm/backends/dri/gbm_dri.c
@@ -311,6 +311,14 @@ dri_open_driver(struct gbm_dri_device *dri)
    if (search_paths == NULL)
       search_paths = DEFAULT_DRIVER_DIR;
 
+   /* Temporarily work around dri driver libs that need symbols in libglapi
+    * but don't automatically link it in.
+    */
+   /* XXX: The library name differs on a per-platform basis. Update this as
+    * osx/cygwin/windows/bsd get support for GBM.
+    */
+   dlopen("libglapi.so.0", RTLD_LAZY | RTLD_GLOBAL);
+
    dri->driver = NULL;
    end = search_paths + strlen(search_paths);
    for (p = search_paths; p < end && dri->driver == NULL; p = next + 1) {
diff --git a/src/glsl/Android.mk b/src/glsl/Android.mk
index f20741e..f63b7da 100644
--- a/src/glsl/Android.mk
+++ b/src/glsl/Android.mk
@@ -46,7 +46,6 @@ LOCAL_C_INCLUDES := \
 
 LOCAL_MODULE := libmesa_glsl
 
-include external/stlport/libstlport.mk
 include $(LOCAL_PATH)/Android.gen.mk
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
diff --git a/src/glsl/Makefile.am b/src/glsl/Makefile.am
index 7af9a70..74da9e5 100644
--- a/src/glsl/Makefile.am
+++ b/src/glsl/Makefile.am
@@ -89,8 +89,7 @@ tests_general_ir_test_SOURCES =		\
 	tests/builtin_variable_test.cpp			\
 	tests/invalidate_locations_test.cpp		\
 	tests/general_ir_test.cpp			\
-	tests/varyings_test.cpp				\
-	tests/common.c
+	tests/varyings_test.cpp
 tests_general_ir_test_CFLAGS =				\
 	$(PTHREAD_CFLAGS)
 tests_general_ir_test_LDADD =				\
@@ -103,8 +102,7 @@ tests_uniform_initializer_test_SOURCES =		\
 	tests/copy_constant_to_storage_tests.cpp	\
 	tests/set_uniform_initializer_tests.cpp		\
 	tests/uniform_initializer_utils.cpp		\
-	tests/uniform_initializer_utils.h		\
-	tests/common.c
+	tests/uniform_initializer_utils.h
 tests_uniform_initializer_test_CFLAGS =			\
 	$(PTHREAD_CFLAGS)
 tests_uniform_initializer_test_LDADD =			\
@@ -114,8 +112,7 @@ tests_uniform_initializer_test_LDADD =			\
 	$(PTHREAD_LIBS)
 
 tests_sampler_types_test_SOURCES =			\
-	tests/sampler_types_test.cpp			\
-	tests/common.c
+	tests/sampler_types_test.cpp
 tests_sampler_types_test_CFLAGS =			\
 	$(PTHREAD_CFLAGS)
 tests_sampler_types_test_LDADD =			\
@@ -133,8 +130,7 @@ libglcpp_la_SOURCES =					\
 	$(LIBGLCPP_FILES)
 
 glcpp_glcpp_SOURCES =					\
-	glcpp/glcpp.c					\
-	tests/common.c
+	glcpp/glcpp.c
 glcpp_glcpp_LDADD =					\
 	libglcpp.la					\
 	$(top_builddir)/src/libglsl_util.la		\
@@ -174,7 +170,6 @@ spirv2nir_LDADD =					\
 
 glsl_test_SOURCES = \
 	standalone_scaffolding.cpp \
-	tests/common.c \
 	test.cpp \
 	test_optpass.cpp \
 	test_optpass.h
@@ -257,21 +252,21 @@ dist-hook:
 	$(RM) glcpp/tests/subtest*/*.out
 
 nir/nir_builder_opcodes.h: nir/nir_opcodes.py nir/nir_builder_opcodes_h.py
-	$(MKDIR_P) nir;							\
-	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_builder_opcodes_h.py > $@
+	$(AM_V_at)$(MKDIR_P) nir
+	$(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_builder_opcodes_h.py > $@
 
 nir/nir_constant_expressions.c: nir/nir_opcodes.py nir/nir_constant_expressions.py nir/nir_constant_expressions.h
-	$(MKDIR_P) nir;							\
-	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_constant_expressions.py > $@
+	$(AM_V_at)$(MKDIR_P) nir
+	$(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_constant_expressions.py > $@
 
 nir/nir_opcodes.h: nir/nir_opcodes.py nir/nir_opcodes_h.py
-	$(MKDIR_P) nir;							\
-	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opcodes_h.py > $@
+	$(AM_V_at)$(MKDIR_P) nir
+	$(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opcodes_h.py > $@
 
 nir/nir_opcodes.c: nir/nir_opcodes.py nir/nir_opcodes_c.py
-	$(MKDIR_P) nir;							\
-	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opcodes_c.py > $@
+	$(AM_V_at)$(MKDIR_P) nir
+	$(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opcodes_c.py > $@
 
 nir/nir_opt_algebraic.c: nir/nir_opt_algebraic.py nir/nir_algebraic.py
-	$(MKDIR_P) nir;							\
-	$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opt_algebraic.py > $@
+	$(AM_V_at)$(MKDIR_P) nir
+	$(AM_V_GEN)$(PYTHON2) $(PYTHON_FLAGS) $(srcdir)/nir/nir_opt_algebraic.py > $@
diff --git a/src/glsl/SConscript b/src/glsl/SConscript
index 284b375..89c6035 100644
--- a/src/glsl/SConscript
+++ b/src/glsl/SConscript
@@ -71,6 +71,7 @@ env.Command('imports.c', '#src/mesa/main/imports.c', Copy('$TARGET', '$SOURCE'))
 env.Prepend(CPPPATH = ['#src/mesa/program'])
 env.Command('prog_hash_table.c', '#src/mesa/program/prog_hash_table.c', Copy('$TARGET', '$SOURCE'))
 env.Command('symbol_table.c', '#src/mesa/program/symbol_table.c', Copy('$TARGET', '$SOURCE'))
+env.Command('dummy_errors.c', '#src/mesa/program/dummy_errors.c', Copy('$TARGET', '$SOURCE'))
 
 compiler_objs = env.StaticObject(source_lists['GLSL_COMPILER_CXX_FILES'])
 
@@ -78,6 +79,7 @@ mesa_objs = env.StaticObject([
     'imports.c',
     'prog_hash_table.c',
     'symbol_table.c',
+    'dummy_errors.c',
 ])
 
 compiler_objs += mesa_objs
@@ -115,6 +117,6 @@ env.Alias('glsl_compiler', glsl_compiler)
 
 glcpp = env.Program(
     target = 'glcpp/glcpp',
-    source = ['glcpp/glcpp.c', 'tests/common.c'] + mesa_objs,
+    source = ['glcpp/glcpp.c'] + mesa_objs,
 )
 env.Alias('glcpp', glcpp)
diff --git a/src/glsl/ast_array_index.cpp b/src/glsl/ast_array_index.cpp
index ecef651..752d86f 100644
--- a/src/glsl/ast_array_index.cpp
+++ b/src/glsl/ast_array_index.cpp
@@ -225,7 +225,7 @@ _mesa_ast_array_index_to_hir(void *mem_ctx,
        * values *do* diverge, then the behavior of the operation requiring a
        * dynamically uniform expression is undefined.
        */
-      if (array->type->element_type()->is_sampler()) {
+      if (array->type->without_array()->is_sampler()) {
 	 if (!state->is_version(130, 100)) {
 	    if (state->es_shader) {
 	       _mesa_glsl_warning(&loc, state,
diff --git a/src/glsl/ast_function.cpp b/src/glsl/ast_function.cpp
index 7583613..92e26bf 100644
--- a/src/glsl/ast_function.cpp
+++ b/src/glsl/ast_function.cpp
@@ -863,7 +863,7 @@ process_array_constructor(exec_list *instructions,
 
    if (is_unsized_array) {
       constructor_type =
-	 glsl_type::get_array_instance(constructor_type->element_type(),
+	 glsl_type::get_array_instance(constructor_type->fields.array,
 				       parameter_count);
       assert(constructor_type != NULL);
       assert(constructor_type->length == parameter_count);
@@ -876,7 +876,7 @@ process_array_constructor(exec_list *instructions,
       ir_rvalue *result = ir;
 
       const glsl_base_type element_base_type =
-         constructor_type->element_type()->base_type;
+         constructor_type->fields.array->base_type;
 
       /* Apply implicit conversions (not the scalar constructor rules!). See
        * the spec quote above. */
@@ -896,10 +896,10 @@ process_array_constructor(exec_list *instructions,
 	 }
       }
 
-      if (result->type != constructor_type->element_type()) {
+      if (result->type != constructor_type->fields.array) {
 	 _mesa_glsl_error(loc, state, "type error in array constructor: "
 			  "expected: %s, found %s",
-			  constructor_type->element_type()->name,
+			  constructor_type->fields.array->name,
 			  result->type->name);
          return ir_rvalue::error_value(ctx);
       }
@@ -993,11 +993,15 @@ emit_inline_vector_constructor(const glsl_type *type,
    ir_variable *var = new(ctx) ir_variable(type, "vec_ctor", ir_var_temporary);
    instructions->push_tail(var);
 
-   /* There are two kinds of vector constructors.
+   /* There are three kinds of vector constructors.
     *
     *  - Construct a vector from a single scalar by replicating that scalar to
     *    all components of the vector.
     *
+    *  - Construct a vector from at least a matrix. This case should already
+    *    have been taken care of in ast_function_expression::hir by breaking
+    *    down the matrix into a series of column vectors.
+    *
     *  - Construct a vector from an arbirary combination of vectors and
     *    scalars.  The components of the constructor parameters are assigned
     *    to the vector in order until the vector is full.
@@ -1091,6 +1095,14 @@ emit_inline_vector_constructor(const glsl_type *type,
 	    rhs_components = lhs_components - base_component;
 	 }
 
+	 /* If we do not have any components left to copy, break out of the
+	  * loop. This can happen when initializing a vec4 with a mat3 as the
+	  * mat3 would have been broken into a series of column vectors.
+	  */
+	 if (rhs_components == 0) {
+	    break;
+	 }
+
 	 const ir_constant *const c = param->as_constant();
 	 if (c == NULL) {
 	    /* Mask of fields to be written in the assignment.
@@ -1681,11 +1693,11 @@ ast_function_expression::hir(exec_list *instructions,
 	 return ir_rvalue::error_value(ctx);
       }
 
-      /* Later, we cast each parameter to the same base type as the
-       * constructor.  Since there are no non-floating point matrices, we
-       * need to break them up into a series of column vectors.
+      /* Matrices can never be consumed as is by any constructor but matrix
+       * constructors. If the constructor type is not matrix, always break the
+       * matrix up into a series of column vectors.
        */
-      if (constructor_type->base_type != GLSL_TYPE_FLOAT) {
+      if (!constructor_type->is_matrix()) {
 	 foreach_in_list_safe(ir_rvalue, matrix, &actual_parameters) {
 	    if (!matrix->type->is_matrix())
 	       continue;
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index cd6a068..6896b70 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -678,7 +678,7 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
     * is handled by ir_dereference::is_lvalue.
     */
    if (lhs_type->is_unsized_array() && rhs->type->is_array()
-       && (lhs_type->element_type() == rhs->type->element_type())) {
+       && (lhs_type->fields.array == rhs->type->fields.array)) {
       if (is_initializer) {
          return rhs;
       } else {
@@ -820,7 +820,7 @@ do_assignment(exec_list *instructions, struct _mesa_glsl_parse_state *state,
                              var->data.max_array_access);
          }
 
-         var->type = glsl_type::get_array_instance(lhs->type->element_type(),
+         var->type = glsl_type::get_array_instance(lhs->type->fields.array,
                                                    rhs->type->array_size());
          d->type = var->type;
       }
@@ -2087,7 +2087,7 @@ validate_binding_qualifier(struct _mesa_glsl_parse_state *state,
        *  with an array of size N, all elements of the array from binding
        *  through binding + N - 1 must be within this range."
        */
-      unsigned limit = ctx->Const.Program[state->stage].MaxTextureImageUnits;
+      unsigned limit = ctx->Const.MaxCombinedTextureImageUnits;
 
       if (max_index >= limit) {
          _mesa_glsl_error(loc, state, "layout(binding = %d) for %d samplers "
@@ -2331,8 +2331,7 @@ apply_image_qualifier_to_variable(const struct ast_type_qualifier *qual,
                                   struct _mesa_glsl_parse_state *state,
                                   YYLTYPE *loc)
 {
-   const glsl_type *base_type =
-      (var->type->is_array() ? var->type->element_type() : var->type);
+   const glsl_type *base_type = var->type->without_array();
 
    if (base_type->is_image()) {
       if (var->data.mode != ir_var_uniform &&
@@ -2740,7 +2739,7 @@ apply_type_qualifier_to_variable(const struct ast_type_qualifier *qual,
     *    GL_ARB_conservative_depth
     *    GL_ARB_gpu_shader5
     *    GL_ARB_separate_shader_objects
-    *    GL_ARB_tesselation_shader
+    *    GL_ARB_tessellation_shader
     *    GL_ARB_transform_feedback3
     *    GL_ARB_uniform_buffer_object
     *
@@ -2865,7 +2864,7 @@ get_variable_being_redeclared(ir_variable *var, YYLTYPE loc,
     *  type and specify a size."
     */
    if (earlier->type->is_unsized_array() && var->type->is_array()
-       && (var->type->element_type() == earlier->type->element_type())) {
+       && (var->type->fields.array == earlier->type->fields.array)) {
       /* FINISHME: This doesn't match the qualifiers on the two
        * FINISHME: declarations.  It's not 100% clear whether this is
        * FINISHME: required or not.
@@ -3618,6 +3617,51 @@ ast_declarator_list::hir(exec_list *instructions,
             }
 
             handle_geometry_shader_input_decl(state, loc, var);
+         } else if (state->stage == MESA_SHADER_FRAGMENT) {
+            /* From section 4.3.4 (Input Variables) of the GLSL ES 3.10 spec:
+             *
+             *     It is a compile-time error to declare a fragment shader
+             *     input with, or that contains, any of the following types:
+             *
+             *     * A boolean type
+             *     * An opaque type
+             *     * An array of arrays
+             *     * An array of structures
+             *     * A structure containing an array
+             *     * A structure containing a structure
+             */
+            if (state->es_shader) {
+               const glsl_type *check_type = var->type->without_array();
+               if (check_type->is_boolean() ||
+                   check_type->contains_opaque()) {
+                  _mesa_glsl_error(&loc, state,
+                                   "fragment shader input cannot have type %s",
+                                   check_type->name);
+               }
+               if (var->type->is_array() &&
+                   var->type->fields.array->is_array()) {
+                  _mesa_glsl_error(&loc, state,
+                                   "%s shader input "
+                                   "cannot have an array of arrays",
+                                   _mesa_shader_stage_to_string(state->stage));
+               }
+               if (var->type->is_array() &&
+                   var->type->fields.array->is_record()) {
+                  _mesa_glsl_error(&loc, state,
+                                   "fragment shader input "
+                                   "cannot have an array of structs");
+               }
+               if (var->type->is_record()) {
+                  for (unsigned i = 0; i < var->type->length; i++) {
+                     if (var->type->fields.structure[i].type->is_array() ||
+                         var->type->fields.structure[i].type->is_record())
+                        _mesa_glsl_error(&loc, state,
+                                         "fragment shader input cannot have "
+                                         "a struct that contains an "
+                                         "array or struct");
+                  }
+               }
+            }
          }
       } else if (var->data.mode == ir_var_shader_out) {
          const glsl_type *check_type = var->type->without_array();
@@ -3652,7 +3696,7 @@ ast_declarator_list::hir(exec_list *instructions,
             if (check_type->is_record() || check_type->is_matrix())
                _mesa_glsl_error(&loc, state,
                                 "fragment shader output "
-                                "cannot have struct or array type");
+                                "cannot have struct or matrix type");
             switch (check_type->base_type) {
             case GLSL_TYPE_UINT:
             case GLSL_TYPE_INT:
@@ -3664,6 +3708,55 @@ ast_declarator_list::hir(exec_list *instructions,
                                 "type %s", check_type->name);
             }
          }
+
+         /* From section 4.3.6 (Output Variables) of the GLSL ES 3.10 spec:
+          *
+          *     It is a compile-time error to declare a vertex shader output
+          *     with, or that contains, any of the following types:
+          *
+          *     * A boolean type
+          *     * An opaque type
+          *     * An array of arrays
+          *     * An array of structures
+          *     * A structure containing an array
+          *     * A structure containing a structure
+          *
+          *     It is a compile-time error to declare a fragment shader output
+          *     with, or that contains, any of the following types:
+          *
+          *     * A boolean type
+          *     * An opaque type
+          *     * A matrix
+          *     * A structure
+          *     * An array of array
+          */
+         if (state->es_shader) {
+            if (var->type->is_array() &&
+                var->type->fields.array->is_array()) {
+               _mesa_glsl_error(&loc, state,
+                                "%s shader output "
+                                "cannot have an array of arrays",
+                                _mesa_shader_stage_to_string(state->stage));
+            }
+            if (state->stage == MESA_SHADER_VERTEX) {
+               if (var->type->is_array() &&
+                   var->type->fields.array->is_record()) {
+                  _mesa_glsl_error(&loc, state,
+                                   "vertex shader output "
+                                   "cannot have an array of structs");
+               }
+               if (var->type->is_record()) {
+                  for (unsigned i = 0; i < var->type->length; i++) {
+                     if (var->type->fields.structure[i].type->is_array() ||
+                         var->type->fields.structure[i].type->is_record())
+                        _mesa_glsl_error(&loc, state,
+                                         "vertex shader output cannot have a "
+                                         "struct that contains an "
+                                         "array or struct");
+                  }
+               }
+            }
+         }
       }
 
       /* Integer fragment inputs must be qualified with 'flat'.  In GLSL ES,
@@ -5756,6 +5849,17 @@ ast_interface_block::hir(exec_list *instructions,
          const glsl_type *block_array_type =
             process_array_type(&loc, block_type, this->array_specifier, state);
 
+         /* From section 4.3.9 (Interface Blocks) of the GLSL ES 3.10 spec:
+          *
+          *     * Arrays of arrays of blocks are not allowed
+          */
+         if (state->es_shader && block_array_type->is_array() &&
+             block_array_type->fields.array->is_array()) {
+            _mesa_glsl_error(&loc, state,
+                             "arrays of arrays interface blocks are "
+                             "not allowed");
+         }
+
          var = new(state) ir_variable(block_array_type,
                                       this->instance_name,
                                       var_mode);
diff --git a/src/glsl/builtin_functions.cpp b/src/glsl/builtin_functions.cpp
index 97055d8..efab299 100644
--- a/src/glsl/builtin_functions.cpp
+++ b/src/glsl/builtin_functions.cpp
@@ -410,6 +410,13 @@ fp64(const _mesa_glsl_parse_state *state)
    return state->has_double();
 }
 
+static bool
+barrier_supported(const _mesa_glsl_parse_state *state)
+{
+   return state->stage == MESA_SHADER_COMPUTE;
+   /* TODO: || state->stage == MESA_SHADER_TESS_CTRL; */
+}
+
 /** @} */
 
 /******************************************************************************/
@@ -654,6 +661,7 @@ private:
                                             const glsl_type *stream_type);
    ir_function_signature *_EndStreamPrimitive(builtin_available_predicate avail,
                                               const glsl_type *stream_type);
+   B0(barrier)
 
    B2(textureQueryLod);
    B1(textureQueryLevels);
@@ -1933,6 +1941,7 @@ builtin_builder::create_builtins()
                 _EndStreamPrimitive(gs_streams, glsl_type::uint_type),
                 _EndStreamPrimitive(gs_streams, glsl_type::int_type),
                 NULL);
+   add_function("barrier", _barrier(), NULL);
 
    add_function("textureQueryLOD",
                 _textureQueryLod(glsl_type::sampler1D_type,  glsl_type::float_type),
@@ -4296,6 +4305,15 @@ builtin_builder::_EndStreamPrimitive(builtin_available_predicate avail,
 }
 
 ir_function_signature *
+builtin_builder::_barrier()
+{
+   MAKE_SIG(glsl_type::void_type, barrier_supported, 0);
+
+   body.emit(new(mem_ctx) ir_barrier());
+   return sig;
+}
+
+ir_function_signature *
 builtin_builder::_textureQueryLod(const glsl_type *sampler_type,
                                   const glsl_type *coord_type)
 {
diff --git a/src/glsl/builtin_variables.cpp b/src/glsl/builtin_variables.cpp
index 6806aa1..a765d35 100644
--- a/src/glsl/builtin_variables.cpp
+++ b/src/glsl/builtin_variables.cpp
@@ -764,7 +764,8 @@ builtin_variable_generator::generate_constants()
 void
 builtin_variable_generator::generate_uniforms()
 {
-   add_uniform(int_t, "gl_NumSamples");
+   if (state->is_version(400, 0) || state->ARB_sample_shading_enable)
+      add_uniform(int_t, "gl_NumSamples");
    add_uniform(type("gl_DepthRangeParameters"), "gl_DepthRange");
    add_uniform(array(vec4_t, VERT_ATTRIB_MAX), "gl_CurrentAttribVertMESA");
    add_uniform(array(vec4_t, VARYING_SLOT_MAX), "gl_CurrentAttribFragMESA");
@@ -876,9 +877,9 @@ void
 builtin_variable_generator::generate_gs_special_vars()
 {
    add_output(VARYING_SLOT_LAYER, int_t, "gl_Layer");
-   if (state->ARB_viewport_array_enable)
+   if (state->is_version(410, 0) || state->ARB_viewport_array_enable)
       add_output(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex");
-   if (state->ARB_gpu_shader5_enable)
+   if (state->is_version(400, 0) || state->ARB_gpu_shader5_enable)
       add_system_value(SYSTEM_VALUE_INVOCATION_ID, int_t, "gl_InvocationID");
 
    /* Although gl_PrimitiveID appears in tessellation control and tessellation
@@ -946,7 +947,7 @@ builtin_variable_generator::generate_fs_special_vars()
          var->enable_extension_warning("GL_AMD_shader_stencil_export");
    }
 
-   if (state->ARB_sample_shading_enable) {
+   if (state->is_version(400, 0) || state->ARB_sample_shading_enable) {
       add_system_value(SYSTEM_VALUE_SAMPLE_ID, int_t, "gl_SampleID");
       add_system_value(SYSTEM_VALUE_SAMPLE_POS, vec2_t, "gl_SamplePosition");
       /* From the ARB_sample_shading specification:
@@ -959,11 +960,11 @@ builtin_variable_generator::generate_fs_special_vars()
       add_output(FRAG_RESULT_SAMPLE_MASK, array(int_t, 1), "gl_SampleMask");
    }
 
-   if (state->ARB_gpu_shader5_enable) {
+   if (state->is_version(400, 0) || state->ARB_gpu_shader5_enable) {
       add_system_value(SYSTEM_VALUE_SAMPLE_MASK_IN, array(int_t, 1), "gl_SampleMaskIn");
    }
 
-   if (state->ARB_fragment_layer_viewport_enable) {
+   if (state->is_version(430, 0) || state->ARB_fragment_layer_viewport_enable) {
       add_input(VARYING_SLOT_LAYER, int_t, "gl_Layer");
       add_input(VARYING_SLOT_VIEWPORT, int_t, "gl_ViewportIndex");
    }
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index 982ade6..e26931d 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -780,7 +780,7 @@ _mesa_ast_set_aggregate_type(const glsl_type *type,
 
    /* If the aggregate is an array, recursively set its elements' types. */
    if (type->is_array()) {
-      /* Each array element has the type type->element_type().
+      /* Each array element has the type type->fields.array.
        *
        * E.g., if <type> if struct S[2] we want to set each element's type to
        * struct S.
@@ -792,7 +792,7 @@ _mesa_ast_set_aggregate_type(const glsl_type *type,
                                                link);
 
          if (expr->oper == ast_aggregate)
-            _mesa_ast_set_aggregate_type(type->element_type(), expr);
+            _mesa_ast_set_aggregate_type(type->fields.array, expr);
       }
 
    /* If the aggregate is a struct, recursively set its fields' types. */
diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index 0d83ee6..37406b8 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -251,7 +251,7 @@ glsl_type::contains_opaque() const {
    case GLSL_TYPE_ATOMIC_UINT:
       return true;
    case GLSL_TYPE_ARRAY:
-      return element_type()->contains_opaque();
+      return fields.array->contains_opaque();
    case GLSL_TYPE_STRUCT:
       for (unsigned int i = 0; i < length; i++) {
          if (fields.structure[i].type->contains_opaque())
diff --git a/src/glsl/glsl_types.h b/src/glsl/glsl_types.h
index 2d47185..836259a 100644
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -229,18 +229,6 @@ struct glsl_type {
    const glsl_type *get_scalar_type() const;
 
    /**
-    * Query the type of elements in an array
-    *
-    * \return
-    * Pointer to the type of elements in the array for array types, or \c NULL
-    * for non-array types.
-    */
-   const glsl_type *element_type() const
-   {
-      return is_array() ? fields.array : NULL;
-   }
-
-   /**
     * Get the instance of a built-in scalar, vector, or matrix type
     */
    static const glsl_type *get_instance(unsigned base_type, unsigned rows,
@@ -564,7 +552,7 @@ struct glsl_type {
       if (base_type == GLSL_TYPE_ATOMIC_UINT)
          return ATOMIC_COUNTER_SIZE;
       else if (is_array())
-         return length * element_type()->atomic_size();
+         return length * fields.array->atomic_size();
       else
          return 0;
    }
diff --git a/src/glsl/ir.cpp b/src/glsl/ir.cpp
index 9e32385..dbd064f 100644
--- a/src/glsl/ir.cpp
+++ b/src/glsl/ir.cpp
@@ -912,7 +912,7 @@ ir_constant::zero(void *mem_ctx, const glsl_type *type)
       c->array_elements = ralloc_array(c, ir_constant *, type->length);
 
       for (unsigned i = 0; i < type->length; i++)
-	 c->array_elements[i] = ir_constant::zero(c, type->element_type());
+	 c->array_elements[i] = ir_constant::zero(c, type->fields.array);
    }
 
    if (type->is_record()) {
@@ -1341,7 +1341,7 @@ ir_dereference_array::set_array(ir_rvalue *value)
    const glsl_type *const vt = this->array->type;
 
    if (vt->is_array()) {
-      type = vt->element_type();
+      type = vt->fields.array;
    } else if (vt->is_matrix()) {
       type = vt->column_type();
    } else if (vt->is_vector()) {
diff --git a/src/glsl/ir.h b/src/glsl/ir.h
index fdb5951..5af029b 100644
--- a/src/glsl/ir.h
+++ b/src/glsl/ir.h
@@ -78,6 +78,7 @@ enum ir_node_type {
    ir_type_discard,
    ir_type_emit_vertex,
    ir_type_end_primitive,
+   ir_type_barrier,
    ir_type_max, /**< maximum ir_type enum number, for validation */
    ir_type_unset = ir_type_max
 };
@@ -2408,6 +2409,29 @@ public:
    ir_rvalue *stream;
 };
 
+/**
+ * IR instruction for tessellation control and compute shader barrier.
+ */
+class ir_barrier : public ir_instruction {
+public:
+   ir_barrier()
+      : ir_instruction(ir_type_barrier)
+   {
+   }
+
+   virtual void accept(ir_visitor *v)
+   {
+      v->visit(this);
+   }
+
+   virtual ir_barrier *clone(void *mem_ctx, struct hash_table *) const
+   {
+      return new(mem_ctx) ir_barrier();
+   }
+
+   virtual ir_visitor_status accept(ir_hierarchical_visitor *);
+};
+
 /*@}*/
 
 /**
diff --git a/src/glsl/ir_function.cpp b/src/glsl/ir_function.cpp
index 2b2643c..1319443 100644
--- a/src/glsl/ir_function.cpp
+++ b/src/glsl/ir_function.cpp
@@ -148,9 +148,11 @@ get_parameter_match_type(const ir_variable *param,
    if (from_type == to_type)
       return PARAMETER_EXACT_MATCH;
 
-   /* XXX: When ARB_gpu_shader_fp64 support is added, check for float->double,
-    * and int/uint->double conversions
-    */
+   if (to_type->base_type == GLSL_TYPE_DOUBLE) {
+      if (from_type->base_type == GLSL_TYPE_FLOAT)
+         return PARAMETER_FLOAT_TO_DOUBLE;
+      return PARAMETER_INT_TO_DOUBLE;
+   }
 
    if (to_type->base_type == GLSL_TYPE_FLOAT)
       return PARAMETER_INT_TO_FLOAT;
diff --git a/src/glsl/ir_hierarchical_visitor.cpp b/src/glsl/ir_hierarchical_visitor.cpp
index adb6294..1d23a77 100644
--- a/src/glsl/ir_hierarchical_visitor.cpp
+++ b/src/glsl/ir_hierarchical_visitor.cpp
@@ -80,6 +80,15 @@ ir_hierarchical_visitor::visit(ir_dereference_variable *ir)
 }
 
 ir_visitor_status
+ir_hierarchical_visitor::visit(ir_barrier *ir)
+{
+   if (this->callback_enter != NULL)
+      this->callback_enter(ir, this->data_enter);
+
+   return visit_continue;
+}
+
+ir_visitor_status
 ir_hierarchical_visitor::visit_enter(ir_loop *ir)
 {
    if (this->callback_enter != NULL)
diff --git a/src/glsl/ir_hierarchical_visitor.h b/src/glsl/ir_hierarchical_visitor.h
index faa52fd..28517b6 100644
--- a/src/glsl/ir_hierarchical_visitor.h
+++ b/src/glsl/ir_hierarchical_visitor.h
@@ -59,7 +59,7 @@ enum ir_visitor_status {
  * in the composite's \c accept method.  The \c accept method for a leaf-node
  * class will simply call the \c visit method, as usual, and pass its return
  * value on.  The \c accept method for internal-node classes will call the \c
- * visit_enter method, call the \c accpet method of each child node, and,
+ * visit_enter method, call the \c accept method of each child node, and,
  * finally, call the \c visit_leave method.  If any of these return a value
  * other that \c visit_continue, the correct action must be taken.
  *
@@ -87,6 +87,7 @@ public:
    virtual ir_visitor_status visit(class ir_variable *);
    virtual ir_visitor_status visit(class ir_constant *);
    virtual ir_visitor_status visit(class ir_loop_jump *);
+   virtual ir_visitor_status visit(class ir_barrier *);
 
    /**
     * ir_dereference_variable isn't technically a leaf, but it is treated as a
diff --git a/src/glsl/ir_hv_accept.cpp b/src/glsl/ir_hv_accept.cpp
index be5b3ea..d3662cf 100644
--- a/src/glsl/ir_hv_accept.cpp
+++ b/src/glsl/ir_hv_accept.cpp
@@ -429,3 +429,9 @@ ir_end_primitive::accept(ir_hierarchical_visitor *v)
 
    return (s == visit_stop) ? s : v->visit_leave(this);
 }
+
+ir_visitor_status
+ir_barrier::accept(ir_hierarchical_visitor *v)
+{
+   return v->visit(this);
+}
diff --git a/src/glsl/ir_print_visitor.cpp b/src/glsl/ir_print_visitor.cpp
index 01f52e8..4cbcad4 100644
--- a/src/glsl/ir_print_visitor.cpp
+++ b/src/glsl/ir_print_visitor.cpp
@@ -72,7 +72,7 @@ _mesa_print_ir(FILE *f, exec_list *instructions,
       if (ir->ir_type != ir_type_function)
 	 fprintf(f, "\n");
    }
-   fprintf(f, "\n)");
+   fprintf(f, ")\n");
 }
 
 void
@@ -161,6 +161,10 @@ void ir_print_visitor::visit(ir_variable *ir)
 {
    fprintf(f, "(declare ");
 
+   char loc[256] = {0};
+   if (ir->data.location != -1)
+      snprintf(loc, sizeof(loc), "location=%i ", ir->data.location);
+
    const char *const cent = (ir->data.centroid) ? "centroid " : "";
    const char *const samp = (ir->data.sample) ? "sample " : "";
    const char *const inv = (ir->data.invariant) ? "invariant " : "";
@@ -172,8 +176,8 @@ void ir_print_visitor::visit(ir_variable *ir)
    const char *const interp[] = { "", "smooth", "flat", "noperspective" };
    STATIC_ASSERT(ARRAY_SIZE(interp) == INTERP_QUALIFIER_COUNT);
 
-   fprintf(f, "(%s%s%s%s%s%s) ",
-           cent, samp, inv, mode[ir->data.mode],
+   fprintf(f, "(%s%s%s%s%s%s%s) ",
+           loc, cent, samp, inv, mode[ir->data.mode],
            stream[ir->data.stream],
            interp[ir->data.interpolation]);
 
@@ -573,5 +577,10 @@ ir_print_visitor::visit(ir_end_primitive *ir)
    fprintf(f, "(end-primitive ");
    ir->stream->accept(this);
    fprintf(f, ")\n");
+}
 
+void
+ir_print_visitor::visit(ir_barrier *ir)
+{
+   fprintf(f, "(barrier)\n");
 }
diff --git a/src/glsl/ir_print_visitor.h b/src/glsl/ir_print_visitor.h
index 98f041d..965e63a 100644
--- a/src/glsl/ir_print_visitor.h
+++ b/src/glsl/ir_print_visitor.h
@@ -71,6 +71,7 @@ public:
    virtual void visit(ir_loop_jump *);
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void visit(ir_barrier *);
    /*@}*/
 
 private:
diff --git a/src/glsl/ir_reader.cpp b/src/glsl/ir_reader.cpp
index fd318c0..4eae413 100644
--- a/src/glsl/ir_reader.cpp
+++ b/src/glsl/ir_reader.cpp
@@ -63,6 +63,7 @@ private:
    ir_texture *read_texture(s_expression *);
    ir_emit_vertex *read_emit_vertex(s_expression *);
    ir_end_primitive *read_end_primitive(s_expression *);
+   ir_barrier *read_barrier(s_expression *);
 
    ir_dereference *read_dereference(s_expression *);
    ir_dereference_variable *read_var_ref(s_expression *);
@@ -375,6 +376,8 @@ ir_reader::read_instruction(s_expression *expr, ir_loop *loop_ctx)
       inst = read_emit_vertex(list);
    } else if (strcmp(tag->value(), "end-primitive") == 0) {
       inst = read_end_primitive(list);
+   } else if (strcmp(tag->value(), "barrier") == 0) {
+      inst = read_barrier(list);
    } else {
       inst = read_rvalue(list);
       if (inst == NULL)
@@ -1142,3 +1145,15 @@ ir_reader::read_end_primitive(s_expression *expr)
    ir_read_error(NULL, "when reading end-primitive");
    return NULL;
 }
+
+ir_barrier *
+ir_reader::read_barrier(s_expression *expr)
+{
+   s_pattern pat[] = { "barrier" };
+
+   if (MATCH(expr, pat)) {
+      return new(mem_ctx) ir_barrier();
+   }
+   ir_read_error(NULL, "when reading barrier");
+   return NULL;
+}
diff --git a/src/glsl/ir_uniform.h b/src/glsl/ir_uniform.h
index 21b5d05..e1b8014 100644
--- a/src/glsl/ir_uniform.h
+++ b/src/glsl/ir_uniform.h
@@ -181,6 +181,11 @@ struct gl_uniform_storage {
     * via the API.
     */
    bool hidden;
+
+   /**
+    * This is a built-in uniform that should not be modified through any gl API.
+    */
+   bool builtin;
 };
 
 #ifdef __cplusplus
diff --git a/src/glsl/ir_visitor.h b/src/glsl/ir_visitor.h
index 40f96ff..7c38481 100644
--- a/src/glsl/ir_visitor.h
+++ b/src/glsl/ir_visitor.h
@@ -65,6 +65,7 @@ public:
    virtual void visit(class ir_loop_jump *) = 0;
    virtual void visit(class ir_emit_vertex *) = 0;
    virtual void visit(class ir_end_primitive *) = 0;
+   virtual void visit(class ir_barrier *) = 0;
    /*@}*/
 };
 
@@ -85,6 +86,7 @@ public:
    virtual void visit(class ir_call *) {}
    virtual void visit(class ir_emit_vertex *) {}
    virtual void visit(class ir_end_primitive *) {}
+   virtual void visit(class ir_barrier *) {}
 };
 #endif /* __cplusplus */
 
diff --git a/src/glsl/link_atomics.cpp b/src/glsl/link_atomics.cpp
index 603873a..100d03c 100644
--- a/src/glsl/link_atomics.cpp
+++ b/src/glsl/link_atomics.cpp
@@ -207,7 +207,7 @@ link_assign_atomic_counter_resources(struct gl_context *ctx,
          storage->atomic_buffer_index = i;
          storage->offset = var->data.atomic.offset;
          storage->array_stride = (var->type->is_array() ?
-                                  var->type->element_type()->atomic_size() : 0);
+                                  var->type->without_array()->atomic_size() : 0);
       }
 
       /* Assign stage-specific fields. */
diff --git a/src/glsl/link_uniform_initializers.cpp b/src/glsl/link_uniform_initializers.cpp
index 60bfc9c..5f57079 100644
--- a/src/glsl/link_uniform_initializers.cpp
+++ b/src/glsl/link_uniform_initializers.cpp
@@ -104,7 +104,7 @@ void
 set_sampler_binding(gl_shader_program *prog, const char *name, int binding)
 {
    struct gl_uniform_storage *const storage =
-      get_storage(prog->UniformStorage, prog->NumUserUniformStorage, name);
+      get_storage(prog->UniformStorage, prog->NumUniformStorage, name);
 
    if (storage == NULL) {
       assert(storage != NULL);
@@ -194,7 +194,7 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog,
 
    struct gl_uniform_storage *const storage =
       get_storage(prog->UniformStorage,
-		  prog->NumUserUniformStorage,
+                  prog->NumUniformStorage,
 		  name);
    if (storage == NULL) {
       assert(storage != NULL);
diff --git a/src/glsl/link_uniforms.cpp b/src/glsl/link_uniforms.cpp
index 2c928e1..11ae06f 100644
--- a/src/glsl/link_uniforms.cpp
+++ b/src/glsl/link_uniforms.cpp
@@ -589,12 +589,13 @@ private:
       handle_samplers(base_type, &this->uniforms[id]);
       handle_images(base_type, &this->uniforms[id]);
 
-      /* If there is already storage associated with this uniform, it means
-       * that it was set while processing an earlier shader stage.  For
-       * example, we may be processing the uniform in the fragment shader, but
-       * the uniform was already processed in the vertex shader.
+      /* If there is already storage associated with this uniform or if the
+       * uniform is set as builtin, it means that it was set while processing
+       * an earlier shader stage.  For example, we may be processing the
+       * uniform in the fragment shader, but the uniform was already processed
+       * in the vertex shader.
        */
-      if (this->uniforms[id].storage != NULL) {
+      if (this->uniforms[id].storage != NULL || this->uniforms[id].builtin) {
          return;
       }
 
@@ -619,10 +620,15 @@ private:
       this->uniforms[id].initialized = 0;
       this->uniforms[id].num_driver_storage = 0;
       this->uniforms[id].driver_storage = NULL;
-      this->uniforms[id].storage = this->values;
       this->uniforms[id].atomic_buffer_index = -1;
       this->uniforms[id].hidden =
          current_var->data.how_declared == ir_var_hidden;
+      this->uniforms[id].builtin = is_gl_identifier(name);
+
+      /* Do not assign storage if the uniform is builtin */
+      if (!this->uniforms[id].builtin)
+         this->uniforms[id].storage = this->values;
+
       if (this->ubo_block_index != -1) {
 	 this->uniforms[id].block_index = this->ubo_block_index;
 
@@ -894,7 +900,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
 {
    ralloc_free(prog->UniformStorage);
    prog->UniformStorage = NULL;
-   prog->NumUserUniformStorage = 0;
+   prog->NumUniformStorage = 0;
 
    if (prog->UniformHash != NULL) {
       prog->UniformHash->clear();
@@ -940,14 +946,6 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
 	 if ((var == NULL) || (var->data.mode != ir_var_uniform))
 	    continue;
 
-	 /* FINISHME: Update code to process built-in uniforms!
-	  */
-	 if (is_gl_identifier(var->name)) {
-	    uniform_size.num_shader_uniform_components +=
-	       var->type->component_slots();
-	    continue;
-	 }
-
 	 uniform_size.process(var);
       }
 
@@ -962,16 +960,16 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       }
    }
 
-   const unsigned num_user_uniforms = uniform_size.num_active_uniforms;
+   const unsigned num_uniforms = uniform_size.num_active_uniforms;
    const unsigned num_data_slots = uniform_size.num_values;
 
    /* On the outside chance that there were no uniforms, bail out.
     */
-   if (num_user_uniforms == 0)
+   if (num_uniforms == 0)
       return;
 
    struct gl_uniform_storage *uniforms =
-      rzalloc_array(prog, struct gl_uniform_storage, num_user_uniforms);
+      rzalloc_array(prog, struct gl_uniform_storage, num_uniforms);
    union gl_constant_value *data =
       rzalloc_array(uniforms, union gl_constant_value, num_data_slots);
 #ifndef NDEBUG
@@ -992,11 +990,6 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
 	 if ((var == NULL) || (var->data.mode != ir_var_uniform))
 	    continue;
 
-	 /* FINISHME: Update code to process built-in uniforms!
-	  */
-	 if (is_gl_identifier(var->name))
-	    continue;
-
 	 parcel.set_and_process(prog, var);
       }
 
@@ -1009,10 +1002,10 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
    }
 
    const unsigned hidden_uniforms =
-      move_hidden_uniforms_to_end(prog, uniforms, num_user_uniforms);
+      move_hidden_uniforms_to_end(prog, uniforms, num_uniforms);
 
    /* Reserve all the explicit locations of the active uniforms. */
-   for (unsigned i = 0; i < num_user_uniforms; i++) {
+   for (unsigned i = 0; i < num_uniforms; i++) {
       if (uniforms[i].remap_location != UNMAPPED_UNIFORM_LOC) {
          /* How many new entries for this uniform? */
          const unsigned entries = MAX2(1, uniforms[i].array_elements);
@@ -1028,7 +1021,11 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
    }
 
    /* Reserve locations for rest of the uniforms. */
-   for (unsigned i = 0; i < num_user_uniforms; i++) {
+   for (unsigned i = 0; i < num_uniforms; i++) {
+
+      /* Built-in uniforms should not get any location. */
+      if (uniforms[i].builtin)
+         continue;
 
       /* Explicit ones have been set already. */
       if (uniforms[i].remap_location != UNMAPPED_UNIFORM_LOC)
@@ -1055,14 +1052,14 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
    }
 
 #ifndef NDEBUG
-   for (unsigned i = 0; i < num_user_uniforms; i++) {
-      assert(uniforms[i].storage != NULL);
+   for (unsigned i = 0; i < num_uniforms; i++) {
+      assert(uniforms[i].storage != NULL || uniforms[i].builtin);
    }
 
    assert(parcel.values == data_end);
 #endif
 
-   prog->NumUserUniformStorage = num_user_uniforms;
+   prog->NumUniformStorage = num_uniforms;
    prog->NumHiddenUniforms = hidden_uniforms;
    prog->UniformStorage = uniforms;
 
diff --git a/src/glsl/link_varyings.cpp b/src/glsl/link_varyings.cpp
index 605748a..278a778 100644
--- a/src/glsl/link_varyings.cpp
+++ b/src/glsl/link_varyings.cpp
@@ -56,7 +56,7 @@ cross_validate_types_and_qualifiers(struct gl_shader_program *prog,
    const glsl_type *type_to_match = input->type;
    if (consumer_stage == MESA_SHADER_GEOMETRY) {
       assert(type_to_match->is_array()); /* Enforced by ast_to_hir */
-      type_to_match = type_to_match->element_type();
+      type_to_match = type_to_match->fields.array;
    }
    if (type_to_match != output->type) {
       /* There is a bit of a special case for gl_TexCoord.  This
@@ -1540,13 +1540,15 @@ check_against_output_limit(struct gl_context *ctx,
    const unsigned output_components = output_vectors * 4;
    if (output_components > max_output_components) {
       if (ctx->API == API_OPENGLES2 || prog->IsES)
-         linker_error(prog, "shader uses too many output vectors "
+         linker_error(prog, "%s shader uses too many output vectors "
                       "(%u > %u)\n",
+                      _mesa_shader_stage_to_string(producer->Stage),
                       output_vectors,
                       max_output_components / 4);
       else
-         linker_error(prog, "shader uses too many output components "
+         linker_error(prog, "%s shader uses too many output components "
                       "(%u > %u)\n",
+                      _mesa_shader_stage_to_string(producer->Stage),
                       output_components,
                       max_output_components);
 
@@ -1579,13 +1581,15 @@ check_against_input_limit(struct gl_context *ctx,
    const unsigned input_components = input_vectors * 4;
    if (input_components > max_input_components) {
       if (ctx->API == API_OPENGLES2 || prog->IsES)
-         linker_error(prog, "shader uses too many input vectors "
+         linker_error(prog, "%s shader uses too many input vectors "
                       "(%u > %u)\n",
+                      _mesa_shader_stage_to_string(consumer->Stage),
                       input_vectors,
                       max_input_components / 4);
       else
-         linker_error(prog, "shader uses too many input components "
+         linker_error(prog, "%s shader uses too many input components "
                       "(%u > %u)\n",
+                      _mesa_shader_stage_to_string(consumer->Stage),
                       input_components,
                       max_input_components);
 
diff --git a/src/glsl/linker.cpp b/src/glsl/linker.cpp
index ea73c6f..4a726d4 100644
--- a/src/glsl/linker.cpp
+++ b/src/glsl/linker.cpp
@@ -224,7 +224,7 @@ public:
          return visit_continue;
       }
 
-      var->type = glsl_type::get_array_instance(var->type->element_type(),
+      var->type = glsl_type::get_array_instance(var->type->fields.array,
                                                 this->num_vertices);
       var->data.max_array_access = this->num_vertices - 1;
 
@@ -245,7 +245,7 @@ public:
    {
       const glsl_type *const vt = ir->array->type;
       if (vt->is_array())
-         ir->type = vt->element_type();
+         ir->type = vt->fields.array;
       return visit_continue;
    }
 };
@@ -1400,8 +1400,8 @@ link_fs_input_layout_qualifiers(struct gl_shader_program *prog,
                       "layout qualifiers for gl_FragCoord\n");
       }
 
-      /* Update the linked shader state.  Note that uses_gl_fragcoord should
-       * accumulate the results.  The other values should replace.  If there
+      /* Update the linked shader state.  Note that uses_gl_fragcoord should
+       * accumulate the results.  The other values should replace.  If there
        * are multiple redeclarations, all the fields except uses_gl_fragcoord
        * are already known to be the same.
        */
@@ -2355,6 +2355,13 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
    unsigned total_uniform_blocks = 0;
 
    for (unsigned i = 0; i < prog->NumUniformBlocks; i++) {
+      if (prog->UniformBlocks[i].UniformBufferSize > ctx->Const.MaxUniformBlockSize) {
+         linker_error(prog, "Uniform block %s too big (%d/%d)\n",
+                      prog->UniformBlocks[i].Name,
+                      prog->UniformBlocks[i].UniformBufferSize,
+                      ctx->Const.MaxUniformBlockSize);
+      }
+
       for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
 	 if (prog->UniformBlockStageIndex[j][i] != -1) {
 	    blocks[j]++;
@@ -2693,13 +2700,23 @@ build_program_resource_list(struct gl_context *ctx,
    }
 
    /* Add uniforms from uniform storage. */
-   for (unsigned i = 0; i < shProg->NumUserUniformStorage; i++) {
+   for (unsigned i = 0; i < shProg->NumUniformStorage; i++) {
       /* Do not add uniforms internally used by Mesa. */
       if (shProg->UniformStorage[i].hidden)
          continue;
 
       uint8_t stageref =
          build_stageref(shProg, shProg->UniformStorage[i].name);
+
+      /* Add stage references for uniforms in a uniform block. */
+      int block_index = shProg->UniformStorage[i].block_index;
+      if (block_index != -1) {
+         for (unsigned j = 0; j < MESA_SHADER_STAGES; j++) {
+             if (shProg->UniformBlockStageIndex[j][block_index] != -1)
+                stageref |= (1 << j);
+         }
+      }
+
       if (!add_program_resource(shProg, GL_UNIFORM,
                                 &shProg->UniformStorage[i], stageref))
          return;
@@ -2819,8 +2836,11 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
             link_intrastage_shaders(mem_ctx, ctx, prog, shader_list[stage],
                                     num_shaders[stage]);
 
-         if (!prog->LinkStatus)
+         if (!prog->LinkStatus) {
+            if (sh)
+               ctx->Driver.DeleteShader(ctx, sh);
             goto done;
+         }
 
          switch (stage) {
          case MESA_SHADER_VERTEX:
@@ -2833,8 +2853,11 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
             validate_fragment_shader_executable(prog, sh);
             break;
          }
-         if (!prog->LinkStatus)
+         if (!prog->LinkStatus) {
+            if (sh)
+               ctx->Driver.DeleteShader(ctx, sh);
             goto done;
+         }
 
          _mesa_reference_shader(ctx, &prog->_LinkedShaders[stage], sh);
       }
diff --git a/src/glsl/lower_clip_distance.cpp b/src/glsl/lower_clip_distance.cpp
index 2d6138d..01f028b 100644
--- a/src/glsl/lower_clip_distance.cpp
+++ b/src/glsl/lower_clip_distance.cpp
@@ -114,7 +114,7 @@ lower_clip_distance_visitor::visit(ir_variable *ir)
       return visit_continue;
    assert (ir->type->is_array());
 
-   if (!ir->type->element_type()->is_array()) {
+   if (!ir->type->fields.array->is_array()) {
       /* 1D gl_ClipDistance (used for vertex and geometry output, and fragment
        * input).
        */
@@ -123,7 +123,7 @@ lower_clip_distance_visitor::visit(ir_variable *ir)
 
       this->progress = true;
       this->old_clip_distance_1d_var = ir;
-      assert (ir->type->element_type() == glsl_type::float_type);
+      assert (ir->type->fields.array == glsl_type::float_type);
       unsigned new_size = (ir->type->array_size() + 3) / 4;
 
       /* Clone the old var so that we inherit all of its properties */
@@ -148,8 +148,8 @@ lower_clip_distance_visitor::visit(ir_variable *ir)
 
       this->progress = true;
       this->old_clip_distance_2d_var = ir;
-      assert (ir->type->element_type()->element_type() == glsl_type::float_type);
-      unsigned new_size = (ir->type->element_type()->array_size() + 3) / 4;
+      assert (ir->type->fields.array->fields.array == glsl_type::float_type);
+      unsigned new_size = (ir->type->fields.array->array_size() + 3) / 4;
 
       /* Clone the old var so that we inherit all of its properties */
       this->new_clip_distance_2d_var = ir->clone(ralloc_parent(ir), NULL);
diff --git a/src/glsl/main.cpp b/src/glsl/main.cpp
index fc54ddd..2341298 100644
--- a/src/glsl/main.cpp
+++ b/src/glsl/main.cpp
@@ -276,7 +276,7 @@ usage_fail(const char *name)
       "usage: %s [options] <file.vert | file.geom | file.frag>\n"
       "\n"
       "Possible options are:\n";
-   printf(header, name, name);
+   printf(header, name);
    for (const struct option *o = compiler_opts; o->name != 0; ++o) {
       printf("    --%s\n", o->name);
    }
diff --git a/src/glsl/nir/glsl_to_nir.cpp b/src/glsl/nir/glsl_to_nir.cpp
index 7c30be3..0338af6 100644
--- a/src/glsl/nir/glsl_to_nir.cpp
+++ b/src/glsl/nir/glsl_to_nir.cpp
@@ -65,6 +65,7 @@ public:
    virtual void visit(ir_dereference_variable *);
    virtual void visit(ir_dereference_record *);
    virtual void visit(ir_dereference_array *);
+   virtual void visit(ir_barrier *);
 
    void create_function(ir_function *ir);
 
@@ -615,27 +616,135 @@ nir_visitor::visit(ir_call *ir)
          op = nir_intrinsic_atomic_counter_inc_var;
       } else if (strcmp(ir->callee_name(), "__intrinsic_atomic_predecrement") == 0) {
          op = nir_intrinsic_atomic_counter_dec_var;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_load") == 0) {
+         op = nir_intrinsic_image_load;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_store") == 0) {
+         op = nir_intrinsic_image_store;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_atomic_add") == 0) {
+         op = nir_intrinsic_image_atomic_add;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_atomic_min") == 0) {
+         op = nir_intrinsic_image_atomic_min;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_atomic_max") == 0) {
+         op = nir_intrinsic_image_atomic_max;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_atomic_and") == 0) {
+         op = nir_intrinsic_image_atomic_and;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_atomic_or") == 0) {
+         op = nir_intrinsic_image_atomic_or;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_atomic_xor") == 0) {
+         op = nir_intrinsic_image_atomic_xor;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_atomic_exchange") == 0) {
+         op = nir_intrinsic_image_atomic_exchange;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_image_atomic_comp_swap") == 0) {
+         op = nir_intrinsic_image_atomic_comp_swap;
+      } else if (strcmp(ir->callee_name(), "__intrinsic_memory_barrier") == 0) {
+         op = nir_intrinsic_memory_barrier;
       } else {
          unreachable("not reached");
       }
 
       nir_intrinsic_instr *instr = nir_intrinsic_instr_create(shader, op);
-      ir_dereference *param =
-         (ir_dereference *) ir->actual_parameters.get_head();
-      instr->variables[0] = evaluate_deref(&instr->instr, param);
-      nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+
+      switch (op) {
+      case nir_intrinsic_atomic_counter_read_var:
+      case nir_intrinsic_atomic_counter_inc_var:
+      case nir_intrinsic_atomic_counter_dec_var: {
+         ir_dereference *param =
+            (ir_dereference *) ir->actual_parameters.get_head();
+         instr->variables[0] = evaluate_deref(&instr->instr, param);
+         nir_ssa_dest_init(&instr->instr, &instr->dest, 1, NULL);
+         break;
+      }
+      case nir_intrinsic_image_load:
+      case nir_intrinsic_image_store:
+      case nir_intrinsic_image_atomic_add:
+      case nir_intrinsic_image_atomic_min:
+      case nir_intrinsic_image_atomic_max:
+      case nir_intrinsic_image_atomic_and:
+      case nir_intrinsic_image_atomic_or:
+      case nir_intrinsic_image_atomic_xor:
+      case nir_intrinsic_image_atomic_exchange:
+      case nir_intrinsic_image_atomic_comp_swap: {
+         nir_ssa_undef_instr *instr_undef =
+            nir_ssa_undef_instr_create(shader, 1);
+         nir_instr_insert_after_cf_list(this->cf_node_list,
+                                        &instr_undef->instr);
+
+         /* Set the image variable dereference. */
+         exec_node *param = ir->actual_parameters.get_head();
+         ir_dereference *image = (ir_dereference *)param;
+         const glsl_type *type =
+            image->variable_referenced()->type->without_array();
+
+         instr->variables[0] = evaluate_deref(&instr->instr, image);
+         param = param->get_next();
+
+         /* Set the address argument, extending the coordinate vector to four
+          * components.
+          */
+         const nir_src src_addr = evaluate_rvalue((ir_dereference *)param);
+         nir_alu_instr *instr_addr = nir_alu_instr_create(shader, nir_op_vec4);
+         nir_ssa_dest_init(&instr_addr->instr, &instr_addr->dest.dest, 4, NULL);
+
+         for (int i = 0; i < 4; i++) {
+            if (i < type->coordinate_components()) {
+               instr_addr->src[i].src = src_addr;
+               instr_addr->src[i].swizzle[0] = i;
+            } else {
+               instr_addr->src[i].src = nir_src_for_ssa(&instr_undef->def);
+            }
+         }
+
+         nir_instr_insert_after_cf_list(cf_node_list, &instr_addr->instr);
+         instr->src[0] = nir_src_for_ssa(&instr_addr->dest.dest.ssa);
+         param = param->get_next();
+
+         /* Set the sample argument, which is undefined for single-sample
+          * images.
+          */
+         if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_MS) {
+            instr->src[1] = evaluate_rvalue((ir_dereference *)param);
+            param = param->get_next();
+         } else {
+            instr->src[1] = nir_src_for_ssa(&instr_undef->def);
+         }
+
+         /* Set the intrinsic parameters. */
+         if (!param->is_tail_sentinel()) {
+            instr->src[2] = evaluate_rvalue((ir_dereference *)param);
+            param = param->get_next();
+         }
+
+         if (!param->is_tail_sentinel()) {
+            instr->src[3] = evaluate_rvalue((ir_dereference *)param);
+            param = param->get_next();
+         }
+
+         /* Set the intrinsic destination. */
+         if (ir->return_deref)
+            nir_ssa_dest_init(&instr->instr, &instr->dest,
+                              ir->return_deref->type->vector_elements, NULL);
+         break;
+      }
+      case nir_intrinsic_memory_barrier:
+         break;
+      default:
+         unreachable("not reached");
+      }
 
       nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr);
 
-      nir_intrinsic_instr *store_instr =
-         nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
-      store_instr->num_components = 1;
+      if (ir->return_deref) {
+         nir_intrinsic_instr *store_instr =
+            nir_intrinsic_instr_create(shader, nir_intrinsic_store_var);
+         store_instr->num_components = ir->return_deref->type->vector_elements;
 
-      store_instr->variables[0] = evaluate_deref(&store_instr->instr, ir->return_deref);
-      store_instr->src[0].is_ssa = true;
-      store_instr->src[0].ssa = &instr->dest.ssa;
+         store_instr->variables[0] =
+            evaluate_deref(&store_instr->instr, ir->return_deref);
+         store_instr->src[0] = nir_src_for_ssa(&instr->dest.ssa);
 
-      nir_instr_insert_after_cf_list(this->cf_node_list, &store_instr->instr);
+         nir_instr_insert_after_cf_list(this->cf_node_list,
+                                        &store_instr->instr);
+      }
 
       return;
    }
@@ -823,13 +932,9 @@ nir_visitor::evaluate_rvalue(ir_rvalue* ir)
    }
 
    nir_dest *dest = get_instr_dest(this->result);
-
    assert(dest->is_ssa);
-   nir_src src = NIR_SRC_INIT;
-   src.is_ssa = true;
-   src.ssa = &dest->ssa;
 
-   return src;
+   return nir_src_for_ssa(&dest->ssa);
 }
 
 nir_alu_instr *
@@ -1786,3 +1891,11 @@ nir_visitor::visit(ir_dereference_array *ir)
    ralloc_steal(this->deref_tail, deref);
    this->deref_tail = &deref->deref;
 }
+
+void
+nir_visitor::visit(ir_barrier *ir) /* 'ir' is unused below */
+{
+   nir_intrinsic_instr *instr =
+      nir_intrinsic_instr_create(this->shader, nir_intrinsic_barrier);
+   nir_instr_insert_after_cf_list(this->cf_node_list, &instr->instr); /* no sources, variables or dest are filled in */
+}
diff --git a/src/glsl/nir/nir_intrinsics.h b/src/glsl/nir/nir_intrinsics.h
index 8e28765..bc6e6b8 100644
--- a/src/glsl/nir/nir_intrinsics.h
+++ b/src/glsl/nir/nir_intrinsics.h
@@ -67,7 +67,15 @@ INTRINSIC(interp_var_at_offset, 1, ARR(2), true, 0, 1, 0,
  */
 #define BARRIER(name) INTRINSIC(name, 0, ARR(), false, 0, 0, 0, 0)
 
+BARRIER(barrier)
 BARRIER(discard)
+
+/*
+ * Memory barrier with semantics analogous to the memoryBarrier() GLSL
+ * intrinsic.
+ */
+BARRIER(memory_barrier)
+
 /** A conditional discard, with a single boolean source. */
 INTRINSIC(discard_if, 1, ARR(1), false, 0, 0, 0, 0)
 
@@ -89,6 +97,33 @@ ATOMIC(inc, 0)
 ATOMIC(dec, 0)
 ATOMIC(read, NIR_INTRINSIC_CAN_ELIMINATE)
 
+/*
+ * Image load, store and atomic intrinsics.
+ *
+ * All image intrinsics take an image target passed as a nir_variable.  Image
+ * variables contain a number of memory and layout qualifiers that influence
+ * the semantics of the intrinsic.
+ *
+ * All image intrinsics take a four-coordinate vector and a sample index as
+ * first two sources, determining the location within the image that will be
+ * accessed by the intrinsic.  Components not applicable to the image target
+ * in use are undefined.  Image store takes an additional four-component
+ * argument with the value to be written, and image atomic operations take
+ * either one or two additional scalar arguments with the same meaning as in
+ * the ARB_shader_image_load_store specification.
+ */
+INTRINSIC(image_load, 2, ARR(4, 1), true, 4, 1, 0,
+          NIR_INTRINSIC_CAN_ELIMINATE)
+INTRINSIC(image_store, 3, ARR(4, 1, 4), false, 0, 1, 0, 0)
+INTRINSIC(image_atomic_add, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
+INTRINSIC(image_atomic_min, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
+INTRINSIC(image_atomic_max, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
+INTRINSIC(image_atomic_and, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
+INTRINSIC(image_atomic_or, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
+INTRINSIC(image_atomic_xor, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
+INTRINSIC(image_atomic_exchange, 3, ARR(4, 1, 1), true, 1, 1, 0, 0)
+INTRINSIC(image_atomic_comp_swap, 4, ARR(4, 1, 1, 1), true, 1, 1, 0, 0)
+
 #define SYSTEM_VALUE(name, components) \
    INTRINSIC(load_##name, 0, ARR(), true, components, 0, 0, \
    NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
@@ -104,12 +139,11 @@ SYSTEM_VALUE(sample_mask_in, 1)
 SYSTEM_VALUE(invocation_id, 1)
 
 /*
- * The first index is the address to load from, and the second index is the
- * number of array elements to load.  Indirect loads have an additional
- * register input, which is added to the constant address to compute the
- * final address to load from.  For UBO's (and SSBO's), the first source is
- * the (possibly constant) UBO buffer index and the indirect (if it exists)
- * is the second source.
+ * The first and only index is the base address to load from.  Indirect
+ * loads have an additional register input, which is added to the constant
+ * address to compute the final address to load from.  For UBO's (and
+ * SSBO's), the first source is the (possibly constant) UBO buffer index
+ * and the indirect (if it exists) is the second source.
  *
  * For vector backends, the address is in terms of one vec4, and so each array
  * element is +4 scalar components from the previous array element. For scalar
@@ -118,9 +152,9 @@ SYSTEM_VALUE(invocation_id, 1)
  */
 
 #define LOAD(name, extra_srcs, flags) \
-   INTRINSIC(load_##name, extra_srcs, ARR(1), true, 0, 0, 2, flags) \
+   INTRINSIC(load_##name, extra_srcs, ARR(1), true, 0, 0, 1, flags) \
    INTRINSIC(load_##name##_indirect, extra_srcs + 1, ARR(1, 1), \
-             true, 0, 0, 2, flags)
+             true, 0, 0, 1, flags)
 
 LOAD(uniform, 0, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
 LOAD(ubo, 1, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
@@ -138,7 +172,7 @@ LOAD(input, 0, NIR_INTRINSIC_CAN_ELIMINATE | NIR_INTRINSIC_CAN_REORDER)
    INTRINSIC(store_##name##_indirect, 2, ARR(0, 1), false, 0, 0, \
              num_indices, flags) \
 
-STORE(output, 2, 0)
-/* STORE(ssbo, 3, 0) */
+STORE(output, 1, 0)
+/* STORE(ssbo, 2, 0) */
 
 LAST_INTRINSIC(store_output_indirect)
diff --git a/src/glsl/nir/nir_lower_atomics.c b/src/glsl/nir/nir_lower_atomics.c
index e82df01..0457de6 100644
--- a/src/glsl/nir/nir_lower_atomics.c
+++ b/src/glsl/nir/nir_lower_atomics.c
@@ -78,7 +78,8 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl)
          nir_deref_as_array(instr->variables[0]->deref.child);
       assert(deref_array->deref.child == NULL);
 
-      offset_const->value.u[0] += deref_array->base_offset;
+      offset_const->value.u[0] +=
+         deref_array->base_offset * ATOMIC_COUNTER_SIZE;
 
       if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
          nir_load_const_instr *atomic_counter_size =
@@ -108,7 +109,7 @@ lower_instr(nir_intrinsic_instr *instr, nir_function_impl *impl)
    }
 
    new_instr->src[0].is_ssa = true;
-   new_instr->src[0].ssa = offset_def;;
+   new_instr->src[0].ssa = offset_def;
 
    if (instr->dest.is_ssa) {
       nir_ssa_dest_init(&new_instr->instr, &new_instr->dest,
diff --git a/src/glsl/nir/nir_lower_io.c b/src/glsl/nir/nir_lower_io.c
index 561bebd..4c59298 100644
--- a/src/glsl/nir/nir_lower_io.c
+++ b/src/glsl/nir/nir_lower_io.c
@@ -289,7 +289,6 @@ nir_lower_io_block(nir_block *block, void *void_state)
          offset += intrin->variables[0]->var->data.driver_location;
 
          load->const_index[0] = offset;
-         load->const_index[1] = 1;
 
          if (has_indirect)
             load->src[0] = indirect;
@@ -332,7 +331,6 @@ nir_lower_io_block(nir_block *block, void *void_state)
          offset += intrin->variables[0]->var->data.driver_location;
 
          store->const_index[0] = offset;
-         store->const_index[1] = 1;
 
          nir_src_copy(&store->src[0], &intrin->src[0], state->mem_ctx);
 
diff --git a/src/glsl/nir/nir_lower_phis_to_scalar.c b/src/glsl/nir/nir_lower_phis_to_scalar.c
index 4bdb800..a57d253 100644
--- a/src/glsl/nir/nir_lower_phis_to_scalar.c
+++ b/src/glsl/nir/nir_lower_phis_to_scalar.c
@@ -153,6 +153,11 @@ should_lower_phi(nir_phi_instr *phi, struct lower_phis_to_scalar_state *state)
          break;
    }
 
+   /* The hash table entry for 'phi' may have changed while recursing the
+    * dependence graph, so we need to look it up again before writing to it */
+   entry = _mesa_hash_table_search(state->phi_table, phi);
+   assert(entry);
+
    entry->data = (void *)(intptr_t)scalarizable;
 
    return scalarizable;
diff --git a/src/glsl/nir/nir_lower_samplers.cpp b/src/glsl/nir/nir_lower_samplers.cpp
index 6ed5a4c..9a9cdd1 100644
--- a/src/glsl/nir/nir_lower_samplers.cpp
+++ b/src/glsl/nir/nir_lower_samplers.cpp
@@ -94,34 +94,45 @@ lower_sampler(nir_tex_instr *instr, const struct gl_shader_program *shader_progr
       case nir_deref_type_array: {
          nir_deref_array *deref_array = nir_deref_as_array(deref->child);
 
+         assert(deref_array->deref_array_type != nir_deref_array_type_wildcard);
+
+         if (deref_array->deref.child) {
+            ralloc_asprintf_append(&name, "[%u]",
+               deref_array->deref_array_type == nir_deref_array_type_direct ?
+                  deref_array->base_offset : 0);
+         } else {
+            assert(deref->child->type->base_type == GLSL_TYPE_SAMPLER);
+            instr->sampler_index = deref_array->base_offset;
+         }
+
          /* XXX: We're assuming here that the indirect is the last array
           * thing we have.  This should be ok for now as we don't support
           * arrays_of_arrays yet.
           */
-
-         instr->sampler_index *= glsl_get_length(deref->type);
-         switch (deref_array->deref_array_type) {
-         case nir_deref_array_type_direct:
-            instr->sampler_index += deref_array->base_offset;
-            if (deref_array->deref.child)
-               ralloc_asprintf_append(&name, "[%u]", deref_array->base_offset);
-            break;
-         case nir_deref_array_type_indirect: {
-            add_indirect_to_tex(instr, deref_array->indirect);
-            nir_instr_rewrite_src(&instr->instr, &deref_array->indirect,
-                                  NIR_SRC_INIT);
+         if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+            /* First, we have to resize the array of texture sources */
+            nir_tex_src *new_srcs = rzalloc_array(instr, nir_tex_src,
+                                                  instr->num_srcs + 1);
+
+            for (unsigned i = 0; i < instr->num_srcs; i++) {
+               new_srcs[i].src_type = instr->src[i].src_type;
+               nir_instr_move_src(&instr->instr, &new_srcs[i].src,
+                                  &instr->src[i].src);
+            }
+
+            ralloc_free(instr->src);
+            instr->src = new_srcs;
+
+            /* Now we can go ahead and move the source over to being a
+             * first-class texture source.
+             */
+            instr->src[instr->num_srcs].src_type = nir_tex_src_sampler_offset;
+            instr->num_srcs++;
+            nir_instr_move_src(&instr->instr,
+                               &instr->src[instr->num_srcs - 1].src,
+                               &deref_array->indirect);
 
             instr->sampler_array_size = glsl_get_length(deref->type);
-
-            if (deref_array->deref.child)
-               ralloc_strcat(&name, "[0]");
-            break;
-         }
-
-         case nir_deref_array_type_wildcard:
-            unreachable("Cannot copy samplers");
-         default:
-            unreachable("Invalid deref array type");
          }
          break;
       }
diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
index fa03922..eace791 100644
--- a/src/glsl/nir/nir_opt_algebraic.py
+++ b/src/glsl/nir/nir_opt_algebraic.py
@@ -156,6 +156,8 @@ optimizations = [
    (('fpow', a, 2.0), ('fmul', a, a)),
    (('fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
    (('fpow', 2.0, a), ('fexp2', a)),
+   (('fpow', ('fpow', a, 2.2), 0.454545), a),
+   (('fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
    (('fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
    (('frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
    (('frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
diff --git a/src/glsl/nir/nir_opt_peephole_ffma.c b/src/glsl/nir/nir_opt_peephole_ffma.c
index b430eac..798506b 100644
--- a/src/glsl/nir/nir_opt_peephole_ffma.c
+++ b/src/glsl/nir/nir_opt_peephole_ffma.c
@@ -73,7 +73,8 @@ are_all_uses_fadd(nir_ssa_def *def)
 }
 
 static nir_alu_instr *
-get_mul_for_src(nir_alu_src *src, uint8_t swizzle[4], bool *negate, bool *abs)
+get_mul_for_src(nir_alu_src *src, int num_components,
+                uint8_t swizzle[4], bool *negate, bool *abs)
 {
    assert(src->src.is_ssa && !src->abs && !src->negate);
 
@@ -85,16 +86,16 @@ get_mul_for_src(nir_alu_src *src, uint8_t swizzle[4], bool *negate, bool *abs)
    switch (alu->op) {
    case nir_op_imov:
    case nir_op_fmov:
-      alu = get_mul_for_src(&alu->src[0], swizzle, negate, abs);
+      alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
       break;
 
    case nir_op_fneg:
-      alu = get_mul_for_src(&alu->src[0], swizzle, negate, abs);
+      alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
       *negate = !*negate;
       break;
 
    case nir_op_fabs:
-      alu = get_mul_for_src(&alu->src[0], swizzle, negate, abs);
+      alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
       *negate = false;
       *abs = true;
       break;
@@ -115,12 +116,8 @@ get_mul_for_src(nir_alu_src *src, uint8_t swizzle[4], bool *negate, bool *abs)
    if (!alu)
       return NULL;
 
-   for (unsigned i = 0; i < 4; i++) {
-      if (!(alu->dest.write_mask & (1 << i)))
-         break;
-
+   for (unsigned i = 0; i < num_components; i++)
       swizzle[i] = swizzle[src->swizzle[i]];
-   }
 
    return alu;
 }
@@ -160,7 +157,9 @@ nir_opt_peephole_ffma_block(nir_block *block, void *void_state)
          negate = false;
          abs = false;
 
-         mul = get_mul_for_src(&add->src[add_mul_src], swizzle, &negate, &abs);
+         mul = get_mul_for_src(&add->src[add_mul_src],
+                               add->dest.dest.ssa.num_components,
+                               swizzle, &negate, &abs);
 
          if (mul != NULL)
             break;
diff --git a/src/glsl/nir/nir_opt_peephole_select.c b/src/glsl/nir/nir_opt_peephole_select.c
index 82c65bb..ef7c977 100644
--- a/src/glsl/nir/nir_opt_peephole_select.c
+++ b/src/glsl/nir/nir_opt_peephole_select.c
@@ -86,7 +86,9 @@ block_check_for_allowed_instrs(nir_block *block)
          nir_alu_instr *mov = nir_instr_as_alu(instr);
          if (mov->op != nir_op_fmov && mov->op != nir_op_imov &&
              mov->op != nir_op_fneg && mov->op != nir_op_ineg &&
-             mov->op != nir_op_fabs && mov->op != nir_op_iabs)
+             mov->op != nir_op_fabs && mov->op != nir_op_iabs &&
+             mov->op != nir_op_vec2 && mov->op != nir_op_vec3 &&
+             mov->op != nir_op_vec4)
             return false;
 
          /* Can't handle saturate */
diff --git a/src/glsl/standalone_scaffolding.cpp b/src/glsl/standalone_scaffolding.cpp
index 6e1ecec..050e733 100644
--- a/src/glsl/standalone_scaffolding.cpp
+++ b/src/glsl/standalone_scaffolding.cpp
@@ -95,7 +95,7 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
 {
    unsigned i;
 
-   shProg->NumUserUniformStorage = 0;
+   shProg->NumUniformStorage = 0;
    shProg->UniformStorage = NULL;
    shProg->NumUniformRemapTable = 0;
    shProg->UniformRemapTable = NULL;
diff --git a/src/glsl/tests/set_uniform_initializer_tests.cpp b/src/glsl/tests/set_uniform_initializer_tests.cpp
index d3fdeb3..91227d9 100644
--- a/src/glsl/tests/set_uniform_initializer_tests.cpp
+++ b/src/glsl/tests/set_uniform_initializer_tests.cpp
@@ -110,7 +110,7 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
 
    prog->UniformStorage = rzalloc_array(prog, struct gl_uniform_storage,
 					num_storage);
-   prog->NumUserUniformStorage = num_storage;
+   prog->NumUniformStorage = num_storage;
 
    prog->UniformStorage[index_to_set].name = (char *) name;
    prog->UniformStorage[index_to_set].type = type;
@@ -155,7 +155,7 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
 static void
 verify_initialization(struct gl_shader_program *prog, unsigned actual_index)
 {
-   for (unsigned i = 0; i < prog->NumUserUniformStorage; i++) {
+   for (unsigned i = 0; i < prog->NumUniformStorage; i++) {
       if (i == actual_index) {
 	 EXPECT_TRUE(prog->UniformStorage[actual_index].initialized);
       } else {
diff --git a/src/glx/SConscript b/src/glx/SConscript
index b91c0bd..619e4c3 100644
--- a/src/glx/SConscript
+++ b/src/glx/SConscript
@@ -125,7 +125,7 @@ env.CodeGenerate(
     target = 'indirect_size.h',
     script = GLAPI + 'gen/glX_proto_size.py',
     source = sources,
-    command = python_cmd + ' $SCRIPT -f $SOURCE -m size_h --only-set -h _INDIRECT_SIZE_H > $TARGET'
+    command = python_cmd + ' $SCRIPT -f $SOURCE -m size_h --only-set --header-tag _INDIRECT_SIZE_H > $TARGET'
 )
 
 env.CodeGenerate(
diff --git a/src/glx/dri2_glx.c b/src/glx/dri2_glx.c
index 538cf1a..27ea952 100644
--- a/src/glx/dri2_glx.c
+++ b/src/glx/dri2_glx.c
@@ -1183,15 +1183,7 @@ dri2CreateScreen(int screen, struct glx_display * priv)
       return NULL;
    }
 
-#ifdef O_CLOEXEC
-   psc->fd = open(deviceName, O_RDWR | O_CLOEXEC);
-   if (psc->fd == -1 && errno == EINVAL)
-#endif
-   {
-      psc->fd = open(deviceName, O_RDWR);
-      if (psc->fd != -1)
-         fcntl(psc->fd, F_SETFD, fcntl(psc->fd, F_GETFD) | FD_CLOEXEC);
-   }
+   psc->fd = loader_open_device(deviceName);
    if (psc->fd < 0) {
       ErrorMessageF("failed to open drm device: %s\n", strerror(errno));
       goto handle_error;
diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c
index ff77a91..dfb0093 100644
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -1985,6 +1985,11 @@ dri3_create_screen(int screen, struct glx_display * priv)
       goto handle_error;
    }
 
+   if (psc->is_different_gpu && !psc->image->blitImage) {
+      ErrorMessageF("Different GPU, but blitImage not implemented for this driver\n");
+      goto handle_error;
+   }
+
    if (!psc->is_different_gpu && (
        !psc->texBuffer || psc->texBuffer->base.version < 2 ||
        !psc->texBuffer->setTexBuffer2
diff --git a/src/hgl/GLDispatcher.cpp b/src/hgl/GLDispatcher.cpp
index 46b91d5..a1e9053 100644
--- a/src/hgl/GLDispatcher.cpp
+++ b/src/hgl/GLDispatcher.cpp
@@ -1,6 +1,6 @@
 /*
  * Copyright 1998-1999 Precision Insight, Inc., Cedar Park, Texas.
- * Copyright 2000-2012 Haiku, Inc. All Rights Reserved.
+ * Copyright 2000-2015 Haiku, Inc. All Rights Reserved.
  * Distributed under the terms of the MIT License.
  *
  * Authors:
@@ -10,10 +10,11 @@
  */
 
 
-extern "C" {
 #include "glapi/glapi.h"
 #include "glapi/glapi_priv.h"
 
+
+extern "C" {
 /*
  * NOTE: this file portion implements C-based dispatch of the OpenGL entrypoints
  * (glAccum, glBegin, etc).
diff --git a/src/hgl/GLDispatcher.h b/src/hgl/GLDispatcher.h
index 44bca8c..7ee095d 100644
--- a/src/hgl/GLDispatcher.h
+++ b/src/hgl/GLDispatcher.h
@@ -1,6 +1,6 @@
 /*
  * Copyright 1998-1999 Precision Insight, Inc., Cedar Park, Texas.
- * Copyright 2000-2012 Haiku, Inc. All Rights Reserved.
+ * Copyright 2000-2015 Haiku, Inc. All Rights Reserved.
  * Distributed under the terms of the MIT License.
  *
  * Authors:
@@ -17,9 +17,7 @@
 
 #include "glheader.h"
 
-extern "C" {
 #include "glapi/glapi.h"
-}
 
 
 class BGLDispatcher
diff --git a/src/hgl/SConscript b/src/hgl/SConscript
index 70db149..71881f5 100644
--- a/src/hgl/SConscript
+++ b/src/hgl/SConscript
@@ -6,6 +6,7 @@ Import('*')
 env = env.Clone()
 
 env.Append(CPPPATH = [
+    '#/src',
     '#/src/mapi',
     '#/src/mesa',
     '#/src/mesa/main',
diff --git a/src/loader/Android.mk b/src/loader/Android.mk
index 8e215de..92d9fd2 100644
--- a/src/loader/Android.mk
+++ b/src/loader/Android.mk
@@ -40,6 +40,8 @@ else
 LOCAL_SHARED_LIBRARIES := libdrm
 endif
 
+LOCAL_EXPORT_C_INCLUDE_DIRS := $(LOCAL_PATH)
+
 LOCAL_MODULE := libmesa_loader
 
 include $(MESA_COMMON_MK)
diff --git a/src/loader/Makefile.am b/src/loader/Makefile.am
index 36ddba8..aef1bd6 100644
--- a/src/loader/Makefile.am
+++ b/src/loader/Makefile.am
@@ -41,15 +41,11 @@ libloader_la_CPPFLAGS += \
 	-I$(top_builddir)/src/mesa/drivers/dri/common/ \
 	-I$(top_srcdir)/src/mesa/ \
 	-I$(top_srcdir)/src/mapi/ \
-	-DUSE_DRICONF \
-	$(EXPAT_CFLAGS)
+	-DUSE_DRICONF
 
-libloader_la_SOURCES += \
-	$(top_srcdir)/src/mesa/drivers/dri/common/xmlconfig.c
+libloader_la_LIBADD += \
+	$(top_builddir)/src/mesa/drivers/dri/common/libxmlconfig.la
 
-libloader_la_LIBADD += \
-	-lm \
-	$(EXPAT_LIBS)
 endif
 
 if !HAVE_LIBDRM
diff --git a/src/loader/loader.c b/src/loader/loader.c
index 17bf133..fc46815 100644
--- a/src/loader/loader.c
+++ b/src/loader/loader.c
@@ -314,8 +314,8 @@ get_id_path_tag_from_fd(struct udev *udev, int fd)
    return id_path_tag;
 }
 
-static int
-drm_open_device(const char *device_name)
+int
+loader_open_device(const char *device_name)
 {
    int fd;
 #ifdef O_CLOEXEC
@@ -404,7 +404,7 @@ int loader_get_user_preferred_fd(int default_fd, int *different_device)
       goto default_device_clean;
    }
 
-   fd = drm_open_device(device_name);
+   fd = loader_open_device(device_name);
    if (fd >= 0) {
       close(default_fd);
    } else {
diff --git a/src/loader/loader.h b/src/loader/loader.h
index 810e7da..055dc78 100644
--- a/src/loader/loader.h
+++ b/src/loader/loader.h
@@ -27,12 +27,19 @@
 #ifndef LOADER_H
 #define LOADER_H
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* Helpers to figure out driver and device name, eg. from pci-id, etc. */
 
 #define _LOADER_DRI          (1 << 0)
 #define _LOADER_GALLIUM      (1 << 1)
 
 int
+loader_open_device(const char *);
+
+int
 loader_get_pci_id_for_fd(int fd, int *vendor_id, int *chip_id);
 
 char *
@@ -61,4 +68,9 @@ loader_get_user_preferred_fd(int default_fd, int *different_device);
 void
 loader_set_logger(void (*logger)(int level, const char *fmt, ...));
 
+
+#ifdef __cplusplus
+}
+#endif
+
 #endif /* LOADER_H */
diff --git a/src/mapi/glapi/gen/AMD_performance_monitor.xml b/src/mapi/glapi/gen/AMD_performance_monitor.xml
index b96b263..41b5208 100644
--- a/src/mapi/glapi/gen/AMD_performance_monitor.xml
+++ b/src/mapi/glapi/gen/AMD_performance_monitor.xml
@@ -5,13 +5,13 @@
 
 <category name="GL_AMD_performance_monitor" number="360">
 
-    <function name="GetPerfMonitorGroupsAMD" offset="assign">
+    <function name="GetPerfMonitorGroupsAMD">
         <param name="numGroups" type="GLint *"/>
         <param name="groupsSize" type="GLsizei"/>
         <param name="groups" type="GLuint *"/>
     </function>
 
-    <function name="GetPerfMonitorCountersAMD" offset="assign">
+    <function name="GetPerfMonitorCountersAMD">
         <param name="group" type="GLuint"/>
         <param name="numCounters" type="GLint *"/>
         <param name="maxActiveCounters" type="GLint *"/>
@@ -19,14 +19,14 @@
         <param name="counters" type="GLuint *"/>
     </function>
 
-    <function name="GetPerfMonitorGroupStringAMD" offset="assign">
+    <function name="GetPerfMonitorGroupStringAMD">
         <param name="group" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="length" type="GLsizei *"/>
         <param name="groupString" type="GLchar *"/>
     </function>
 
-    <function name="GetPerfMonitorCounterStringAMD" offset="assign">
+    <function name="GetPerfMonitorCounterStringAMD">
         <param name="group" type="GLuint"/>
         <param name="counter" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
@@ -34,24 +34,24 @@
         <param name="counterString" type="GLchar *"/>
     </function>
 
-    <function name="GetPerfMonitorCounterInfoAMD" offset="assign">
+    <function name="GetPerfMonitorCounterInfoAMD">
         <param name="group" type="GLuint"/>
         <param name="counter" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="data" type="GLvoid *"/>
     </function>
 
-    <function name="GenPerfMonitorsAMD" offset="assign">
+    <function name="GenPerfMonitorsAMD">
         <param name="n" type="GLsizei"/>
         <param name="monitors" type="GLuint *"/>
     </function>
 
-    <function name="DeletePerfMonitorsAMD" offset="assign">
+    <function name="DeletePerfMonitorsAMD">
         <param name="n" type="GLsizei"/>
         <param name="monitors" type="GLuint *"/>
     </function>
 
-    <function name="SelectPerfMonitorCountersAMD" offset="assign">
+    <function name="SelectPerfMonitorCountersAMD">
         <param name="monitor" type="GLuint"/>
         <param name="enable" type="GLboolean"/>
         <param name="group" type="GLuint"/>
@@ -59,15 +59,15 @@
         <param name="counterList" type="GLuint *"/>
     </function>
 
-    <function name="BeginPerfMonitorAMD" offset="assign">
+    <function name="BeginPerfMonitorAMD">
         <param name="monitor" type="GLuint"/>
     </function>
 
-    <function name="EndPerfMonitorAMD" offset="assign">
+    <function name="EndPerfMonitorAMD">
         <param name="monitor" type="GLuint"/>
     </function>
 
-    <function name="GetPerfMonitorCounterDataAMD" offset="assign">
+    <function name="GetPerfMonitorCounterDataAMD">
         <param name="monitor" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="dataSize" type="GLsizei"/>
diff --git a/src/mapi/glapi/gen/APPLE_object_purgeable.xml b/src/mapi/glapi/gen/APPLE_object_purgeable.xml
index 62fa64a..829fc26 100644
--- a/src/mapi/glapi/gen/APPLE_object_purgeable.xml
+++ b/src/mapi/glapi/gen/APPLE_object_purgeable.xml
@@ -13,21 +13,21 @@
 
     <enum name="BUFFER_OBJECT_APPLE"          value="0x85B3"/>
 
-    <function name="ObjectPurgeableAPPLE" offset="assign">
+    <function name="ObjectPurgeableAPPLE">
         <param name="objectType" type="GLenum"/>
         <param name="name" type="GLuint"/>
         <param name="option" type="GLenum"/>
 	<return type="GLenum"/>
     </function>
 
-    <function name="ObjectUnpurgeableAPPLE" offset="assign">
+    <function name="ObjectUnpurgeableAPPLE">
         <param name="objectType" type="GLenum"/>
         <param name="name" type="GLuint"/>
         <param name="option" type="GLenum"/>
 	<return type="GLenum"/>
     </function>
 
-    <function name="GetObjectParameterivAPPLE" offset="assign">
+    <function name="GetObjectParameterivAPPLE">
         <param name="objectType" type="GLenum"/>
         <param name="name" type="GLuint"/>
         <param name="pname" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/APPLE_vertex_array_object.xml b/src/mapi/glapi/gen/APPLE_vertex_array_object.xml
index 5eb53b1..7312f9b 100644
--- a/src/mapi/glapi/gen/APPLE_vertex_array_object.xml
+++ b/src/mapi/glapi/gen/APPLE_vertex_array_object.xml
@@ -5,23 +5,21 @@
 <category name="GL_APPLE_vertex_array_object" number="273">
     <enum name="VERTEX_ARRAY_BINDING_APPLE"               value="0x85B5"/>
 
-    <function name="BindVertexArrayAPPLE" offset="assign"
-              static_dispatch="false" deprecated="3.1">
+    <function name="BindVertexArrayAPPLE" deprecated="3.1">
         <param name="array" type="GLuint"/>
     </function>
 
-    <function name="DeleteVertexArraysAPPLE" static_dispatch="false" alias="DeleteVertexArrays">
+    <function name="DeleteVertexArraysAPPLE" alias="DeleteVertexArrays">
         <param name="n" type="GLsizei"/>
 	<param name="arrays" type="const GLuint *"/>
     </function>
 
-    <function name="GenVertexArraysAPPLE" offset="assign"
-              static_dispatch="false" deprecated="3.1">
+    <function name="GenVertexArraysAPPLE" deprecated="3.1">
         <param name="n" type="GLsizei"/>
 	<param name="arrays" type="GLuint *" count="n" output="true"/>
     </function>
 
-    <function name="IsVertexArrayAPPLE" static_dispatch="false" alias="IsVertexArray">
+    <function name="IsVertexArrayAPPLE" alias="IsVertexArray">
         <param name="array" type="GLuint"/>
 	<return type="GLboolean"/>
     </function>
diff --git a/src/mapi/glapi/gen/ARB_ES2_compatibility.xml b/src/mapi/glapi/gen/ARB_ES2_compatibility.xml
index d157366..c96e71c 100644
--- a/src/mapi/glapi/gen/ARB_ES2_compatibility.xml
+++ b/src/mapi/glapi/gen/ARB_ES2_compatibility.xml
@@ -18,17 +18,17 @@
     <enum name="MAX_VARYING_VECTORS"                          value="0x8DFC"/>
     <enum name="MAX_FRAGMENT_UNIFORM_VECTORS"                 value="0x8DFD"/>
 
-    <function name="GetShaderPrecisionFormat" offset="assign" es2="2.0">
+    <function name="GetShaderPrecisionFormat" es2="2.0">
         <param name="shadertype" type="GLenum"/>
         <param name="precisiontype" type="GLenum"/>
         <param name="range" type="GLint *"/>
         <param name="precision" type="GLint *"/>
     </function>
 
-    <function name="ReleaseShaderCompiler" offset="assign" es2="2.0">
+    <function name="ReleaseShaderCompiler" es2="2.0">
     </function>
 
-    <function name="ShaderBinary" offset="assign" es2="2.0">
+    <function name="ShaderBinary" es2="2.0">
         <param name="n" type="GLsizei"/>
         <param name="shaders" type="const GLuint *"/>
         <param name="binaryformat" type="GLenum"/>
@@ -45,11 +45,11 @@
     <enum name="IMPLEMENTATION_COLOR_READ_FORMAT"         value="0x8B9B"/>
 
     <!-- from GL_OES_single_precision -->
-    <function name="ClearDepthf" offset="assign" es1="1.0" es2="2.0">
+    <function name="ClearDepthf" es1="1.0" es2="2.0">
         <param name="depth" type="GLclampf"/>
     </function>
 
-    <function name="DepthRangef" offset="assign" es1="1.0" es2="2.0">
+    <function name="DepthRangef" es1="1.0" es2="2.0">
         <param name="zNear" type="GLclampf"/>
         <param name="zFar" type="GLclampf"/>
     </function>
diff --git a/src/mapi/glapi/gen/ARB_base_instance.xml b/src/mapi/glapi/gen/ARB_base_instance.xml
index 1478e39..56de639 100644
--- a/src/mapi/glapi/gen/ARB_base_instance.xml
+++ b/src/mapi/glapi/gen/ARB_base_instance.xml
@@ -8,8 +8,7 @@
 
 <category name="GL_ARB_base_instance" number="107">
 
-  <function name="DrawArraysInstancedBaseInstance" offset="assign"
-            exec="dynamic">
+  <function name="DrawArraysInstancedBaseInstance" exec="dynamic">
     <param name="mode" type="GLenum"/>
     <param name="first" type="GLint"/>
     <param name="count" type="GLsizei"/>
@@ -17,8 +16,7 @@
     <param name="baseinstance" type="GLuint"/>
   </function>
 
-  <function name="DrawElementsInstancedBaseInstance" offset="assign"
-            exec="dynamic">
+  <function name="DrawElementsInstancedBaseInstance" exec="dynamic">
     <param name="mode" type="GLenum"/>
     <param name="count" type="GLsizei"/>
     <param name="type" type="GLenum"/>
@@ -27,8 +25,7 @@
     <param name="baseinstance" type="GLuint"/>
   </function>
 
-  <function name="DrawElementsInstancedBaseVertexBaseInstance" offset="assign"
-            exec="dynamic">
+  <function name="DrawElementsInstancedBaseVertexBaseInstance" exec="dynamic">
     <param name="mode" type="GLenum"/>
     <param name="count" type="GLsizei"/>
     <param name="type" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/ARB_blend_func_extended.xml b/src/mapi/glapi/gen/ARB_blend_func_extended.xml
index 32adcde..406140f 100644
--- a/src/mapi/glapi/gen/ARB_blend_func_extended.xml
+++ b/src/mapi/glapi/gen/ARB_blend_func_extended.xml
@@ -8,14 +8,14 @@
 
 <category name="GL_ARB_blend_func_extended" number="78">
 
-    <function name="BindFragDataLocationIndexed" offset="assign">
+    <function name="BindFragDataLocationIndexed">
         <param name="program" type="GLuint"/>
         <param name="colorNumber" type="GLuint"/>
         <param name="index" type="GLuint"/>
         <param name="name" type="const GLchar *"/>
     </function>
 
-    <function name="GetFragDataIndex" offset="assign">
+    <function name="GetFragDataIndex">
         <param name="program" type="GLuint"/>
         <param name="name" type="const GLchar *"/>
 	  <return type="GLint"/>
diff --git a/src/mapi/glapi/gen/ARB_clear_buffer_object.xml b/src/mapi/glapi/gen/ARB_clear_buffer_object.xml
index cb97a01..2284eac 100644
--- a/src/mapi/glapi/gen/ARB_clear_buffer_object.xml
+++ b/src/mapi/glapi/gen/ARB_clear_buffer_object.xml
@@ -8,7 +8,7 @@
 
 <category name="GL_ARB_clear_buffer_object" number="121">
 
-    <function name ="ClearBufferData" offset="assign">
+    <function name ="ClearBufferData">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="format" type="GLenum"/>
@@ -16,7 +16,7 @@
         <param name="data" type="const GLvoid *"/>
     </function>
 
-    <function name ="ClearBufferSubData" offset="assign">
+    <function name ="ClearBufferSubData">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
@@ -26,7 +26,7 @@
         <param name="data" type="const GLvoid *"/>
     </function>
 
-<!--    <function name="ClearNamedBufferDataEXT" offset="assign">
+<!--    <function name="ClearNamedBufferDataEXT">
         <param name="buffer" type="GLuint"/>
         <param name="internalformat" type="GLenum"/>
         <param name="format" type="GLenum"/>
@@ -35,7 +35,7 @@
     </function>
 
 
-    <function name="ClearNamedBufferSubDataEXT" offset="assign">
+    <function name="ClearNamedBufferSubDataEXT">
         <param name="buffer" type="GLuint"/>
         <param name="internalformat" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
diff --git a/src/mapi/glapi/gen/ARB_clear_texture.xml b/src/mapi/glapi/gen/ARB_clear_texture.xml
index bd9116f..1ff981e 100644
--- a/src/mapi/glapi/gen/ARB_clear_texture.xml
+++ b/src/mapi/glapi/gen/ARB_clear_texture.xml
@@ -7,7 +7,7 @@
 
     <enum name="CLEAR_TEXTURE" value="0x9365"/>
 
-    <function name ="ClearTexImage" offset="assign">
+    <function name ="ClearTexImage">
         <param name="texture" type="GLuint"/>
         <param name="level" type="GLint"/>
         <param name="format" type="GLenum"/>
@@ -15,7 +15,7 @@
         <param name="data" type="const GLvoid *"/>
     </function>
 
-    <function name ="ClearTexSubImage" offset="assign">
+    <function name ="ClearTexSubImage">
         <param name="texture" type="GLuint"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
diff --git a/src/mapi/glapi/gen/ARB_clip_control.xml b/src/mapi/glapi/gen/ARB_clip_control.xml
index ab1a388..ecce133 100644
--- a/src/mapi/glapi/gen/ARB_clip_control.xml
+++ b/src/mapi/glapi/gen/ARB_clip_control.xml
@@ -14,7 +14,7 @@
     <enum name="CLIP_ORIGIN" value = "0x935C"/>
     <enum name="CLIP_DEPTH_MODE" value = "0x935D"/>
 
-    <function name="ClipControl" offset="assign">
+    <function name="ClipControl">
         <param name="origin" type="GLenum"/>
         <param name="depth" type="GLenum"/>
         <!-- <glx rop="1340"/> -->
diff --git a/src/mapi/glapi/gen/ARB_compute_shader.xml b/src/mapi/glapi/gen/ARB_compute_shader.xml
index 1db373e..c2ec842 100644
--- a/src/mapi/glapi/gen/ARB_compute_shader.xml
+++ b/src/mapi/glapi/gen/ARB_compute_shader.xml
@@ -26,13 +26,13 @@
   <enum name="DISPATCH_INDIRECT_BUFFER_BINDING"                value="0x90EF"/>
   <enum name="COMPUTE_SHADER_BIT"                              value="0x00000020"/>
 
-  <function name="DispatchCompute" offset="assign">
+  <function name="DispatchCompute" es2="3.1">
     <param name="num_groups_x" type="GLuint"/>
     <param name="num_groups_y" type="GLuint"/>
     <param name="num_groups_z" type="GLuint"/>
   </function>
 
-  <function name="DispatchComputeIndirect" offset="assign">
+  <function name="DispatchComputeIndirect" es2="3.1">
     <param name="indirect" type="GLintptr"/>
   </function>
 </category>
diff --git a/src/mapi/glapi/gen/ARB_copy_buffer.xml b/src/mapi/glapi/gen/ARB_copy_buffer.xml
index 6982ed1..d1c6f1f 100644
--- a/src/mapi/glapi/gen/ARB_copy_buffer.xml
+++ b/src/mapi/glapi/gen/ARB_copy_buffer.xml
@@ -11,7 +11,7 @@
     <enum name="COPY_READ_BUFFER"   value="0x8F36"/>
     <enum name="COPY_WRITE_BUFFER"  value="0x8F37"/>
 
-    <function name="CopyBufferSubData" offset="assign" es2="3.0">
+    <function name="CopyBufferSubData" es2="3.0">
         <param name="readTarget" type="GLenum"/>
         <param name="writeTarget" type="GLenum"/>
         <param name="readOffset" type="GLintptr"/>
diff --git a/src/mapi/glapi/gen/ARB_copy_image.xml b/src/mapi/glapi/gen/ARB_copy_image.xml
index 2fbd845..af672cd 100644
--- a/src/mapi/glapi/gen/ARB_copy_image.xml
+++ b/src/mapi/glapi/gen/ARB_copy_image.xml
@@ -5,7 +5,7 @@
 
 <category name="GL_ARB_copy_image" number="123">
 
-    <function name="CopyImageSubData" offset="assign">
+    <function name="CopyImageSubData">
         <param name="srcName" type="GLuint"/>
         <param name="srcTarget" type="GLenum"/>
         <param name="srcLevel" type="GLint"/>
diff --git a/src/mapi/glapi/gen/ARB_direct_state_access.xml b/src/mapi/glapi/gen/ARB_direct_state_access.xml
index 9e0cf2d..4c8f73e 100644
--- a/src/mapi/glapi/gen/ARB_direct_state_access.xml
+++ b/src/mapi/glapi/gen/ARB_direct_state_access.xml
@@ -9,18 +9,18 @@
 
    <!-- Transform Feedback object functions -->
 
-  <function name="CreateTransformFeedbacks" offset="assign">
+  <function name="CreateTransformFeedbacks">
       <param name="n" type="GLsizei" />
       <param name="ids" type="GLuint *" />
    </function>
 
-   <function name="TransformFeedbackBufferBase" offset="assign">
+   <function name="TransformFeedbackBufferBase">
       <param name="xfb" type="GLuint" />
       <param name="index" type="GLuint" />
       <param name="buffer" type="GLuint" />
    </function>
 
-   <function name="TransformFeedbackBufferRange" offset="assign">
+   <function name="TransformFeedbackBufferRange">
       <param name="xfb" type="GLuint" />
       <param name="index" type="GLuint" />
       <param name="buffer" type="GLuint" />
@@ -28,20 +28,20 @@
       <param name="size" type="GLsizeiptr" />
    </function>
 
-   <function name="GetTransformFeedbackiv" offset="assign">
+   <function name="GetTransformFeedbackiv">
       <param name="xfb" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="param" type="GLint *" />
    </function>
 
-   <function name="GetTransformFeedbacki_v" offset="assign">
+   <function name="GetTransformFeedbacki_v">
       <param name="xfb" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="index" type="GLuint" />
       <param name="param" type="GLint *" />
    </function>
 
-   <function name="GetTransformFeedbacki64_v" offset="assign">
+   <function name="GetTransformFeedbacki64_v">
       <param name="xfb" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="index" type="GLuint" />
@@ -50,33 +50,33 @@
 
    <!-- Buffer object functions -->
 
-   <function name="CreateBuffers" offset="assign">
+   <function name="CreateBuffers">
       <param name="n" type="GLsizei" />
       <param name="buffers" type="GLuint *" />
    </function>
 
-   <function name="NamedBufferStorage" offset="assign">
+   <function name="NamedBufferStorage">
       <param name="buffer" type="GLuint" />
       <param name="size" type="GLsizeiptr" />
       <param name="data" type="const GLvoid *" />
       <param name="flags" type="GLbitfield" />
    </function>
 
-   <function name="NamedBufferData" offset="assign">
+   <function name="NamedBufferData">
       <param name="buffer" type="GLuint" />
       <param name="size" type="GLsizeiptr" />
       <param name="data" type="const GLvoid *" />
       <param name="usage" type="GLenum" />
    </function>
 
-   <function name="NamedBufferSubData" offset="assign">
+   <function name="NamedBufferSubData">
       <param name="buffer" type="GLuint" />
       <param name="offset" type="GLintptr" />
       <param name="size" type="GLsizeiptr" />
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="CopyNamedBufferSubData" offset="assign">
+   <function name="CopyNamedBufferSubData">
       <param name="readBuffer" type="GLuint" />
       <param name="writeBuffer" type="GLuint" />
       <param name="readOffset" type="GLintptr" />
@@ -84,7 +84,7 @@
       <param name="size" type="GLsizeiptr" />
    </function>
 
-   <function name="ClearNamedBufferData" offset="assign">
+   <function name="ClearNamedBufferData">
       <param name="buffer" type="GLuint" />
       <param name="internalformat" type="GLenum" />
       <param name="format" type="GLenum" />
@@ -92,7 +92,7 @@
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="ClearNamedBufferSubData" offset="assign">
+   <function name="ClearNamedBufferSubData">
       <param name="buffer" type="GLuint" />
       <param name="internalformat" type="GLenum" />
       <param name="offset" type="GLintptr" />
@@ -102,13 +102,13 @@
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="MapNamedBuffer" offset="assign">
+   <function name="MapNamedBuffer">
       <return type="GLvoid *" />
       <param name="buffer" type="GLuint" />
       <param name="access" type="GLenum" />
    </function>
 
-   <function name="MapNamedBufferRange" offset="assign">
+   <function name="MapNamedBufferRange">
       <return type="GLvoid *" />
       <param name="buffer" type="GLuint" />
       <param name="offset" type="GLintptr" />
@@ -116,57 +116,186 @@
       <param name="access" type="GLbitfield" />
    </function>
 
-   <function name="UnmapNamedBuffer" offset="assign">
+   <function name="UnmapNamedBuffer">
       <return type="GLboolean" />
       <param name="buffer" type="GLuint" />
    </function>
 
-   <function name="FlushMappedNamedBufferRange" offset="assign">
+   <function name="FlushMappedNamedBufferRange">
       <param name="buffer" type="GLuint" />
       <param name="offset" type="GLintptr" />
       <param name="length" type="GLsizeiptr" />
    </function>
 
-   <function name="GetNamedBufferParameteriv" offset="assign">
+   <function name="GetNamedBufferParameteriv">
       <param name="buffer" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLint *" />
    </function>
 
-   <function name="GetNamedBufferParameteri64v" offset="assign">
+   <function name="GetNamedBufferParameteri64v">
       <param name="buffer" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLint64 *" />
    </function>
 
-   <function name="GetNamedBufferPointerv" offset="assign">
+   <function name="GetNamedBufferPointerv">
       <param name="buffer" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLvoid **" />
    </function>
 
-   <function name="GetNamedBufferSubData" offset="assign">
+   <function name="GetNamedBufferSubData">
       <param name="buffer" type="GLuint" />
       <param name="offset" type="GLintptr" />
       <param name="size" type="GLsizeiptr" />
       <param name="data" type="GLvoid *" />
    </function>
 
+   <!-- Framebuffer object functions -->
+
+   <function name="CreateFramebuffers">
+      <param name="n" type="GLsizei" />
+      <param name="framebuffers" type="GLuint *" />
+   </function>
+
+   <function name="NamedFramebufferRenderbuffer">
+      <param name="framebuffer" type="GLuint" />
+      <param name="attachment" type="GLenum" />
+      <param name="renderbuffertarget" type="GLenum" />
+      <param name="renderbuffer" type="GLuint" />
+   </function>
+
+   <function name="NamedFramebufferParameteri">
+      <param name="framebuffer" type="GLuint" />
+      <param name="pname" type="GLenum" />
+      <param name="param" type="GLint" />
+   </function>
+
+   <function name="NamedFramebufferTexture">
+      <param name="framebuffer" type="GLuint" />
+      <param name="attachment" type="GLenum" />
+      <param name="texture" type="GLuint" />
+      <param name="level" type="GLint" />
+   </function>
+
+   <function name="NamedFramebufferTextureLayer">
+      <param name="framebuffer" type="GLuint" />
+      <param name="attachment" type="GLenum" />
+      <param name="texture" type="GLuint" />
+      <param name="level" type="GLint" />
+      <param name="layer" type="GLint" />
+   </function>
+
+   <function name="NamedFramebufferDrawBuffer">
+      <param name="framebuffer" type="GLuint" />
+      <param name="buf" type="GLenum" />
+   </function>
+
+   <function name="NamedFramebufferDrawBuffers">
+      <param name="framebuffer" type="GLuint" />
+      <param name="n" type="GLsizei" />
+      <param name="bufs" type="const GLenum *" />
+   </function>
+
+   <function name="NamedFramebufferReadBuffer">
+      <param name="framebuffer" type="GLuint" />
+      <param name="buf" type="GLenum" />
+   </function>
+
+   <function name="InvalidateNamedFramebufferData">
+      <param name="framebuffer" type="GLuint" />
+      <param name="numAttachments" type="GLsizei" />
+      <param name="attachments" type="const GLenum *" />
+   </function>
+
+   <function name="InvalidateNamedFramebufferSubData">
+      <param name="framebuffer" type="GLuint" />
+      <param name="numAttachments" type="GLsizei" />
+      <param name="attachments" type="const GLenum *" />
+      <param name="x" type="GLint" />
+      <param name="y" type="GLint" />
+      <param name="width" type="GLsizei" />
+      <param name="height" type="GLsizei" />
+   </function>
+
+   <function name="ClearNamedFramebufferiv">
+      <param name="framebuffer" type="GLuint" />
+      <param name="buffer" type="GLenum" />
+      <param name="drawbuffer" type="GLint" />
+      <param name="value" type="const GLint *" />
+   </function>
+
+   <function name="ClearNamedFramebufferuiv">
+      <param name="framebuffer" type="GLuint" />
+      <param name="buffer" type="GLenum" />
+      <param name="drawbuffer" type="GLint" />
+      <param name="value" type="const GLuint *" />
+   </function>
+
+   <function name="ClearNamedFramebufferfv">
+      <param name="framebuffer" type="GLuint" />
+      <param name="buffer" type="GLenum" />
+      <param name="drawbuffer" type="GLint" />
+      <param name="value" type="const GLfloat *" />
+   </function>
+
+   <function name="ClearNamedFramebufferfi">
+      <param name="framebuffer" type="GLuint" />
+      <param name="buffer" type="GLenum" />
+      <param name="depth" type="GLfloat" />
+      <param name="stencil" type="GLint" />
+   </function>
+
+   <function name="BlitNamedFramebuffer">
+      <param name="readFramebuffer" type="GLuint" />
+      <param name="drawFramebuffer" type="GLuint" />
+      <param name="srcX0" type="GLint" />
+      <param name="srcY0" type="GLint" />
+      <param name="srcX1" type="GLint" />
+      <param name="srcY1" type="GLint" />
+      <param name="dstX0" type="GLint" />
+      <param name="dstY0" type="GLint" />
+      <param name="dstX1" type="GLint" />
+      <param name="dstY1" type="GLint" />
+      <param name="mask" type="GLbitfield" />
+      <param name="filter" type="GLenum" />
+   </function>
+
+   <function name="CheckNamedFramebufferStatus">
+      <return type="GLenum" />
+      <param name="framebuffer" type="GLuint" />
+      <param name="target" type="GLenum" />
+   </function>
+
+   <function name="GetNamedFramebufferParameteriv">
+      <param name="framebuffer" type="GLuint" />
+      <param name="pname" type="GLenum" />
+      <param name="param" type="GLint *" />
+   </function>
+
+   <function name="GetNamedFramebufferAttachmentParameteriv">
+      <param name="framebuffer" type="GLuint" />
+      <param name="attachment" type="GLenum" />
+      <param name="pname" type="GLenum" />
+      <param name="params" type="GLint *" />
+   </function>
+
    <!-- Renderbuffer object functions -->
 
-   <function name="CreateRenderbuffers" offset="assign">
+   <function name="CreateRenderbuffers">
       <param name="n" type="GLsizei" />
       <param name="renderbuffers" type="GLuint *" />
    </function>
 
-   <function name="NamedRenderbufferStorage" offset="assign">
+   <function name="NamedRenderbufferStorage">
       <param name="renderbuffer" type="GLuint" />
       <param name="internalformat" type="GLenum" />
       <param name="width" type="GLsizei" />
       <param name="height" type="GLsizei" />
    </function>
 
-   <function name="NamedRenderbufferStorageMultisample" offset="assign">
+   <function name="NamedRenderbufferStorageMultisample">
       <param name="renderbuffer" type="GLuint" />
       <param name="samples" type="GLsizei" />
       <param name="internalformat" type="GLenum" />
@@ -174,7 +303,7 @@
       <param name="height" type="GLsizei" />
    </function>
 
-   <function name="GetNamedRenderbufferParameteriv" offset="assign">
+   <function name="GetNamedRenderbufferParameteriv">
       <param name="renderbuffer" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLint *" />
@@ -182,19 +311,19 @@
 
    <!-- Texture object functions -->
 
-   <function name="CreateTextures" offset="assign">
+   <function name="CreateTextures">
       <param name="target" type="GLenum" />
       <param name="n" type="GLsizei" />
       <param name="textures" type="GLuint *" />
    </function>
 
-   <function name="TextureBuffer" offset="assign">
+   <function name="TextureBuffer">
       <param name="texture" type="GLuint" />
       <param name="internalformat" type="GLenum" />
       <param name="buffer" type="GLuint" />
    </function>
 
-   <function name="TextureBufferRange" offset="assign">
+   <function name="TextureBufferRange">
       <param name="texture" type="GLuint" />
       <param name="internalformat" type="GLenum" />
       <param name="buffer" type="GLuint" />
@@ -202,14 +331,14 @@
       <param name="size" type="GLsizeiptr" />
    </function>
 
-   <function name="TextureStorage1D" offset="assign">
+   <function name="TextureStorage1D">
       <param name="texture" type="GLuint" />
       <param name="levels" type="GLsizei" />
       <param name="internalformat" type="GLenum" />
       <param name="width" type="GLsizei" />
    </function>
 
-   <function name="TextureStorage2D" offset="assign">
+   <function name="TextureStorage2D">
       <param name="texture" type="GLuint" />
       <param name="levels" type="GLsizei" />
       <param name="internalformat" type="GLenum" />
@@ -217,7 +346,7 @@
       <param name="height" type="GLsizei" />
    </function>
 
-   <function name="TextureStorage3D" offset="assign">
+   <function name="TextureStorage3D">
       <param name="texture" type="GLuint" />
       <param name="levels" type="GLsizei" />
       <param name="internalformat" type="GLenum" />
@@ -226,7 +355,7 @@
       <param name="depth" type="GLsizei" />
    </function>
 
-   <function name="TextureStorage2DMultisample" offset="assign">
+   <function name="TextureStorage2DMultisample">
       <param name="texture" type="GLuint" />
       <param name="samples" type="GLsizei" />
       <param name="internalformat" type="GLenum" />
@@ -235,7 +364,7 @@
       <param name="fixedsamplelocations" type="GLboolean" />
    </function>
 
-   <function name="TextureStorage3DMultisample" offset="assign">
+   <function name="TextureStorage3DMultisample">
       <param name="texture" type="GLuint" />
       <param name="samples" type="GLsizei" />
       <param name="internalformat" type="GLenum" />
@@ -245,7 +374,7 @@
       <param name="fixedsamplelocations" type="GLboolean" />
    </function>
 
-   <function name="TextureSubImage1D" offset="assign">
+   <function name="TextureSubImage1D">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -255,7 +384,7 @@
       <param name="pixels" type="const GLvoid *" />
    </function>
 
-   <function name="TextureSubImage2D" offset="assign">
+   <function name="TextureSubImage2D">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -267,7 +396,7 @@
       <param name="pixels" type="const GLvoid *" />
    </function>
 
-   <function name="TextureSubImage3D" offset="assign">
+   <function name="TextureSubImage3D">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -281,7 +410,7 @@
       <param name="pixels" type="const GLvoid *" />
    </function>
 
-   <function name="CompressedTextureSubImage1D" offset="assign">
+   <function name="CompressedTextureSubImage1D">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -291,7 +420,7 @@
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="CompressedTextureSubImage2D" offset="assign">
+   <function name="CompressedTextureSubImage2D">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -303,7 +432,7 @@
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="CompressedTextureSubImage3D" offset="assign">
+   <function name="CompressedTextureSubImage3D">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -317,7 +446,7 @@
       <param name="data" type="const GLvoid *" />
    </function>
 
-   <function name="CopyTextureSubImage1D" offset="assign">
+   <function name="CopyTextureSubImage1D">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -326,7 +455,7 @@
       <param name="width" type="GLsizei" />
    </function>
 
-   <function name="CopyTextureSubImage2D" offset="assign">
+   <function name="CopyTextureSubImage2D">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -337,7 +466,7 @@
       <param name="height" type="GLsizei" />
    </function>
 
-   <function name="CopyTextureSubImage3D" offset="assign">
+   <function name="CopyTextureSubImage3D">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="xoffset" type="GLint" />
@@ -349,52 +478,52 @@
       <param name="height" type="GLsizei" />
    </function>
 
-   <function name="TextureParameterf" offset="assign">
+   <function name="TextureParameterf">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="param" type="GLfloat" />
    </function>
 
-   <function name="TextureParameterfv" offset="assign">
+   <function name="TextureParameterfv">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="param" type="const GLfloat *" />
    </function>
 
-   <function name="TextureParameteri" offset="assign">
+   <function name="TextureParameteri">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="param" type="GLint" />
    </function>
 
-   <function name="TextureParameterIiv" offset="assign">
+   <function name="TextureParameterIiv">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="const GLint *" />
    </function>
 
-   <function name="TextureParameterIuiv" offset="assign">
+   <function name="TextureParameterIuiv">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="const GLuint *" />
    </function>
 
-   <function name="TextureParameteriv" offset="assign">
+   <function name="TextureParameteriv">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="param" type="const GLint *" />
    </function>
 
-   <function name="GenerateTextureMipmap" offset="assign">
+   <function name="GenerateTextureMipmap">
       <param name="texture" type="GLuint" />
    </function>
 
-   <function name="BindTextureUnit" offset="assign">
+   <function name="BindTextureUnit">
       <param name="unit" type="GLuint" />
       <param name="texture" type="GLuint" />
    </function>
 
-   <function name="GetTextureImage" offset="assign">
+   <function name="GetTextureImage">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="format" type="GLenum" />
@@ -403,46 +532,46 @@
       <param name="pixels" type="GLvoid *" />
    </function>
 
-   <function name="GetCompressedTextureImage" offset="assign">
+   <function name="GetCompressedTextureImage">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="bufSize" type="GLsizei" />
       <param name="pixels" type="GLvoid *" />
    </function>
 
-   <function name="GetTextureLevelParameterfv" offset="assign">
+   <function name="GetTextureLevelParameterfv">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLfloat *" />
    </function>
 
-   <function name="GetTextureLevelParameteriv" offset="assign">
+   <function name="GetTextureLevelParameteriv">
       <param name="texture" type="GLuint" />
       <param name="level" type="GLint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLint *" />
    </function>
 
-   <function name="GetTextureParameterfv" offset="assign">
+   <function name="GetTextureParameterfv">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLfloat *" />
    </function>
 
-   <function name="GetTextureParameterIiv" offset="assign">
+   <function name="GetTextureParameterIiv">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLint *" />
    </function>
 
-   <function name="GetTextureParameterIuiv" offset="assign">
+   <function name="GetTextureParameterIuiv">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLuint *" />
    </function>
 
-   <function name="GetTextureParameteriv" offset="assign">
+   <function name="GetTextureParameteriv">
       <param name="texture" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="params" type="GLint *" />
@@ -450,27 +579,27 @@
 
    <!-- Vertex Array object functions -->
 
-   <function name="CreateVertexArrays" offset="assign">
+   <function name="CreateVertexArrays">
       <param name="n" type="GLsizei" />
       <param name="arrays" type="GLuint *" />
    </function>
 
-   <function name="DisableVertexArrayAttrib" offset="assign">
+   <function name="DisableVertexArrayAttrib">
       <param name="vaobj" type="GLuint" />
       <param name="index" type="GLuint" />
    </function>
 
-   <function name="EnableVertexArrayAttrib" offset="assign">
+   <function name="EnableVertexArrayAttrib">
       <param name="vaobj" type="GLuint" />
       <param name="index" type="GLuint" />
    </function>
 
-   <function name="VertexArrayElementBuffer" offset="assign">
+   <function name="VertexArrayElementBuffer">
       <param name="vaobj" type="GLuint" />
       <param name="buffer" type="GLuint" />
    </function>
 
-   <function name="VertexArrayVertexBuffer" offset="assign">
+   <function name="VertexArrayVertexBuffer">
       <param name="vaobj" type="GLuint" />
       <param name="bindingindex" type="GLuint" />
       <param name="buffer" type="GLuint" />
@@ -478,7 +607,7 @@
       <param name="stride" type="GLsizei" />
    </function>
 
-   <function name="VertexArrayVertexBuffers" offset="assign">
+   <function name="VertexArrayVertexBuffers">
       <param name="vaobj" type="GLuint" />
       <param name="first" type="GLuint" />
       <param name="count" type="GLsizei" />
@@ -487,7 +616,7 @@
       <param name="strides" type="const GLsizei *" />
    </function>
 
-   <function name="VertexArrayAttribFormat" offset="assign">
+   <function name="VertexArrayAttribFormat">
       <param name="vaobj" type="GLuint" />
       <param name="attribindex" type="GLuint" />
       <param name="size" type="GLint" />
@@ -496,7 +625,7 @@
       <param name="relativeoffset" type="GLuint" />
    </function>
 
-   <function name="VertexArrayAttribIFormat" offset="assign">
+   <function name="VertexArrayAttribIFormat">
       <param name="vaobj" type="GLuint" />
       <param name="attribindex" type="GLuint" />
       <param name="size" type="GLint" />
@@ -504,7 +633,7 @@
       <param name="relativeoffset" type="GLuint" />
    </function>
 
-   <function name="VertexArrayAttribLFormat" offset="assign">
+   <function name="VertexArrayAttribLFormat">
       <param name="vaobj" type="GLuint" />
       <param name="attribindex" type="GLuint" />
       <param name="size" type="GLint" />
@@ -512,32 +641,32 @@
       <param name="relativeoffset" type="GLuint" />
    </function>
 
-   <function name="VertexArrayAttribBinding" offset="assign">
+   <function name="VertexArrayAttribBinding">
       <param name="vaobj" type="GLuint" />
       <param name="attribindex" type="GLuint" />
       <param name="bindingindex" type="GLuint" />
    </function>
 
-   <function name="VertexArrayBindingDivisor" offset="assign">
+   <function name="VertexArrayBindingDivisor">
       <param name="vaobj" type="GLuint" />
       <param name="bindingindex" type="GLuint" />
       <param name="divisor" type="GLuint" />
    </function>
 
-   <function name="GetVertexArrayiv" offset="assign">
+   <function name="GetVertexArrayiv">
       <param name="vaobj" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="param" type="GLint *" />
    </function>
 
-   <function name="GetVertexArrayIndexediv" offset="assign">
+   <function name="GetVertexArrayIndexediv">
       <param name="vaobj" type="GLuint" />
       <param name="index" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="param" type="GLint *" />
    </function>
 
-   <function name="GetVertexArrayIndexed64iv" offset="assign">
+   <function name="GetVertexArrayIndexed64iv">
       <param name="vaobj" type="GLuint" />
       <param name="index" type="GLuint" />
       <param name="pname" type="GLenum" />
@@ -546,48 +675,48 @@
 
    <!-- Sampler object functions -->
 
-   <function name="CreateSamplers" offset="assign">
+   <function name="CreateSamplers">
       <param name="n" type="GLsizei" />
       <param name="samplers" type="GLuint *" />
    </function>
 
    <!-- Program Pipeline object functions -->
 
-   <function name="CreateProgramPipelines" offset="assign">
+   <function name="CreateProgramPipelines">
       <param name="n" type="GLsizei" />
       <param name="pipelines" type="GLuint *" />
    </function>
 
    <!-- Query object functions -->
 
-   <function name="CreateQueries" offset="assign">
+   <function name="CreateQueries">
       <param name="target" type="GLenum" />
       <param name="n" type="GLsizei" />
       <param name="ids" type="GLuint *" />
    </function>
 
-   <function name="GetQueryBufferObjectiv" offset="assign">
+   <function name="GetQueryBufferObjectiv">
       <param name="id" type="GLuint" />
       <param name="buffer" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="offset" type="GLintptr" />
    </function>
 
-   <function name="GetQueryBufferObjectuiv" offset="assign">
+   <function name="GetQueryBufferObjectuiv">
       <param name="id" type="GLuint" />
       <param name="buffer" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="offset" type="GLintptr" />
    </function>
 
-   <function name="GetQueryBufferObjecti64v" offset="assign">
+   <function name="GetQueryBufferObjecti64v">
       <param name="id" type="GLuint" />
       <param name="buffer" type="GLuint" />
       <param name="pname" type="GLenum" />
       <param name="offset" type="GLintptr" />
    </function>
 
-   <function name="GetQueryBufferObjectui64v" offset="assign">
+   <function name="GetQueryBufferObjectui64v">
       <param name="id" type="GLuint" />
       <param name="buffer" type="GLuint" />
       <param name="pname" type="GLenum" />
diff --git a/src/mapi/glapi/gen/ARB_draw_buffers_blend.xml b/src/mapi/glapi/gen/ARB_draw_buffers_blend.xml
index 0b6947c..8c33fbf 100644
--- a/src/mapi/glapi/gen/ARB_draw_buffers_blend.xml
+++ b/src/mapi/glapi/gen/ARB_draw_buffers_blend.xml
@@ -8,24 +8,24 @@
 
 <category name="GL_ARB_draw_buffers_blend" number="69">
 
-    <function name="BlendEquationiARB" offset="assign">
+    <function name="BlendEquationiARB">
         <param name="buf" type="GLuint"/>
         <param name="mode" type="GLenum"/>
     </function>
 
-    <function name="BlendEquationSeparateiARB" offset="assign">
+    <function name="BlendEquationSeparateiARB">
         <param name="buf" type="GLuint"/>
         <param name="modeRGB" type="GLenum"/>
         <param name="modeA" type="GLenum"/>
     </function>
 
-    <function name="BlendFunciARB" offset="assign">
+    <function name="BlendFunciARB">
         <param name="buf" type="GLuint"/>
         <param name="src" type="GLenum"/>
         <param name="dst" type="GLenum"/>
     </function>
 
-    <function name="BlendFuncSeparateiARB" offset="assign">
+    <function name="BlendFuncSeparateiARB">
         <param name="buf" type="GLuint"/>
         <param name="srcRGB" type="GLenum"/>
         <param name="dstRGB" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/ARB_draw_elements_base_vertex.xml b/src/mapi/glapi/gen/ARB_draw_elements_base_vertex.xml
index 9866548..120bda1 100644
--- a/src/mapi/glapi/gen/ARB_draw_elements_base_vertex.xml
+++ b/src/mapi/glapi/gen/ARB_draw_elements_base_vertex.xml
@@ -8,7 +8,7 @@
 
 <category name="GL_ARB_draw_elements_base_vertex" number="62">
 
-    <function name="DrawElementsBaseVertex" offset="assign" exec="dynamic">
+    <function name="DrawElementsBaseVertex" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="count" type="GLsizei"/>
         <param name="type" type="GLenum"/>
@@ -16,8 +16,7 @@
         <param name="basevertex" type="GLint"/>
     </function>
 
-    <function name="DrawRangeElementsBaseVertex" offset="assign"
-              exec="dynamic">
+    <function name="DrawRangeElementsBaseVertex" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="start" type="GLuint"/>
         <param name="end" type="GLuint"/>
@@ -27,8 +26,7 @@
         <param name="basevertex" type="GLint"/>
     </function>
 
-    <function name="MultiDrawElementsBaseVertex" offset="assign"
-              exec="dynamic">
+    <function name="MultiDrawElementsBaseVertex" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="count" type="const GLsizei *"/>
         <param name="type" type="GLenum"/>
@@ -37,8 +35,7 @@
         <param name="basevertex" type="const GLint *"/>
     </function>
 
-    <function name="DrawElementsInstancedBaseVertex" offset="assign"
-              exec="dynamic">
+    <function name="DrawElementsInstancedBaseVertex" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="count" type="GLsizei"/>
         <param name="type" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/ARB_draw_indirect.xml b/src/mapi/glapi/gen/ARB_draw_indirect.xml
index 7de03cd..3b29d6b 100644
--- a/src/mapi/glapi/gen/ARB_draw_indirect.xml
+++ b/src/mapi/glapi/gen/ARB_draw_indirect.xml
@@ -8,12 +8,12 @@
     <enum name="DRAW_INDIRECT_BUFFER"                   value="0x8F3F"/>
     <enum name="DRAW_INDIRECT_BUFFER_BINDING"           value="0x8F43"/>
 
-    <function name="DrawArraysIndirect" offset="assign" exec="dynamic">
+    <function name="DrawArraysIndirect" exec="dynamic" es2="3.1">
         <param name="mode" type="GLenum"/>
         <param name="indirect" type="const GLvoid *"/>
     </function>
 
-    <function name="DrawElementsIndirect" offset="assign" exec="dynamic">
+    <function name="DrawElementsIndirect" exec="dynamic" es2="3.1">
         <param name="mode" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="indirect" type="const GLvoid *"/>
@@ -24,14 +24,14 @@
 
 <category name="GL_ARB_multi_draw_indirect" number="133">
 
-    <function name="MultiDrawArraysIndirect" offset="assign" exec="dynamic">
+    <function name="MultiDrawArraysIndirect" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="indirect" type="const GLvoid *"/>
         <param name="primcount" type="GLsizei"/>
         <param name="stride" type="GLsizei"/>
     </function>
 
-    <function name="MultiDrawElementsIndirect" offset="assign" exec="dynamic">
+    <function name="MultiDrawElementsIndirect" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="indirect" type="const GLvoid *"/>
diff --git a/src/mapi/glapi/gen/ARB_draw_instanced.xml b/src/mapi/glapi/gen/ARB_draw_instanced.xml
index 7ee7629..b1c8221 100644
--- a/src/mapi/glapi/gen/ARB_draw_instanced.xml
+++ b/src/mapi/glapi/gen/ARB_draw_instanced.xml
@@ -8,14 +8,14 @@
 
 <category name="GL_ARB_draw_instanced" number="44">
 
-  <function name="DrawArraysInstancedARB" offset="assign" exec="dynamic">
+  <function name="DrawArraysInstancedARB" exec="dynamic">
     <param name="mode" type="GLenum"/>
     <param name="first" type="GLint"/>
     <param name="count" type="GLsizei"/>
     <param name="primcount" type="GLsizei"/>
   </function>
 
-  <function name="DrawElementsInstancedARB" offset="assign" exec="dynamic">
+  <function name="DrawElementsInstancedARB" exec="dynamic">
     <param name="mode" type="GLenum"/>
     <param name="count" type="GLsizei"/>
     <param name="type" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/ARB_framebuffer_no_attachments.xml b/src/mapi/glapi/gen/ARB_framebuffer_no_attachments.xml
new file mode 100644
index 0000000..59839a0
--- /dev/null
+++ b/src/mapi/glapi/gen/ARB_framebuffer_no_attachments.xml
@@ -0,0 +1,32 @@
+<?xml version="1.0"?>
+<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd">
+
+<OpenGLAPI>
+
+<category name="GL_ARB_framebuffer_no_attachments" number="130">
+
+   <enum name="FRAMEBUFFER_DEFAULT_WIDTH"                  value="0x9310" />
+   <enum name="FRAMEBUFFER_DEFAULT_HEIGHT"                 value="0x9311" />
+   <enum name="FRAMEBUFFER_DEFAULT_LAYERS"                 value="0x9312" />
+   <enum name="FRAMEBUFFER_DEFAULT_SAMPLES"                value="0x9313" />
+   <enum name="FRAMEBUFFER_DEFAULT_FIXED_SAMPLE_LOCATIONS" value="0x9314" />
+   <enum name="MAX_FRAMEBUFFER_WIDTH"                      value="0x9315" />
+   <enum name="MAX_FRAMEBUFFER_HEIGHT"                     value="0x9316" />
+   <enum name="MAX_FRAMEBUFFER_LAYERS"                     value="0x9317" />
+   <enum name="MAX_FRAMEBUFFER_SAMPLES"                    value="0x9318" />
+
+    <function name="FramebufferParameteri">
+       <param name="target" type="GLenum"/>
+       <param name="pname"  type="GLenum"/>
+       <param name="param"  type="GLint" />
+    </function>
+
+    <function name="GetFramebufferParameteriv">
+       <param name="target" type="GLenum" />
+       <param name="pname"  type="GLenum" />
+       <param name="params" type="GLint *" output="true" />
+    </function>
+
+</category>
+
+</OpenGLAPI>
diff --git a/src/mapi/glapi/gen/ARB_framebuffer_object.xml b/src/mapi/glapi/gen/ARB_framebuffer_object.xml
index 7c547c1..1573e7e 100644
--- a/src/mapi/glapi/gen/ARB_framebuffer_object.xml
+++ b/src/mapi/glapi/gen/ARB_framebuffer_object.xml
@@ -140,33 +140,31 @@
 
 
 
-    <function name="IsRenderbuffer" es2="2.0" offset="assign">
+    <function name="IsRenderbuffer" es2="2.0">
         <param name="renderbuffer" type="GLuint"/>
 	<return type="GLboolean"/>
 	<glx vendorpriv="1422"/>
     </function>
 
-    <function name="BindRenderbuffer" es2="2.0" offset="assign">
+    <function name="BindRenderbuffer" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="renderbuffer" type="GLuint"/>
         <glx rop="235"/>
     </function>
 
-    <function name="DeleteRenderbuffers"
-              es2="2.0" offset="assign">
+    <function name="DeleteRenderbuffers" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="renderbuffers" type="const GLuint *" count="n"/>
 	<glx rop="4317"/>
     </function>
 
-    <function name="GenRenderbuffers" es2="2.0" offset="assign">
+    <function name="GenRenderbuffers" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="renderbuffers" type="GLuint *" count="n" output="true"/>
 	<glx vendorpriv="1423" always_array="true"/>
     </function>
 
-    <function name="RenderbufferStorage"
-              es2="2.0" offset="assign">
+    <function name="RenderbufferStorage" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -174,7 +172,7 @@
 	<glx rop="4318"/>
     </function>
 
-    <function name="RenderbufferStorageMultisample" offset="assign" es2="3.0">
+    <function name="RenderbufferStorageMultisample" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="samples" type="GLsizei"/>
         <param name="internalformat" type="GLenum"/>
@@ -183,46 +181,44 @@
         <glx rop="4331"/>
     </function>
 
-    <function name="GetRenderbufferParameteriv" es2="2.0" offset="assign">
+    <function name="GetRenderbufferParameteriv" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true"/>
 	<glx vendorpriv="1424"/>
     </function>
 
-    <function name="IsFramebuffer" es2="2.0" offset="assign">
+    <function name="IsFramebuffer" es2="2.0">
         <param name="framebuffer" type="GLuint"/>
 	<return type="GLboolean"/>
 	<glx vendorpriv="1425"/>
     </function>
 
-    <function name="BindFramebuffer" es2="2.0" offset="assign">
+    <function name="BindFramebuffer" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="framebuffer" type="GLuint"/>
         <glx rop="236"/>
     </function>
 
-    <function name="DeleteFramebuffers"
-              es2="2.0" offset="assign">
+    <function name="DeleteFramebuffers" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="framebuffers" type="const GLuint *" count="n"/>
 	<glx rop="4320"/>
     </function>
 
-    <function name="GenFramebuffers" es2="2.0" offset="assign">
+    <function name="GenFramebuffers" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="framebuffers" type="GLuint *" count="n" output="true"/>
 	<glx vendorpriv="1426" always_array="true"/>
     </function>
 
-    <function name="CheckFramebufferStatus"
-              es2="2.0" offset="assign">
+    <function name="CheckFramebufferStatus" es2="2.0">
         <param name="target" type="GLenum"/>
 	<return type="GLenum"/>
 	<glx vendorpriv="1427"/>
     </function>
 
-    <function name="FramebufferTexture1D" offset="assign">
+    <function name="FramebufferTexture1D">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="textarget" type="GLenum"/>
@@ -231,8 +227,7 @@
 	<glx rop="4321"/>
     </function>
 
-    <function name="FramebufferTexture2D"
-              es2="2.0" offset="assign">
+    <function name="FramebufferTexture2D" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="textarget" type="GLenum"/>
@@ -241,17 +236,17 @@
 	<glx rop="4322"/>
     </function>
 
-    <function name="FramebufferTexture3D" offset="assign">
+    <function name="FramebufferTexture3D">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="textarget" type="GLenum"/>
         <param name="texture" type="GLuint"/>
         <param name="level" type="GLint"/>
-        <param name="zoffset" type="GLint"/>
+        <param name="layer" type="GLint"/>
 	<glx rop="4323"/>
     </function>
 
-    <function name="FramebufferTextureLayer" es2="3.0" offset="assign">
+    <function name="FramebufferTextureLayer" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="texture" type="GLuint"/>
@@ -260,8 +255,7 @@
 	<glx rop="237"/>
     </function>
 
-    <function name="FramebufferRenderbuffer"
-              es2="2.0" offset="assign">
+    <function name="FramebufferRenderbuffer" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="renderbuffertarget" type="GLenum"/>
@@ -269,7 +263,7 @@
 	<glx rop="4324"/>
     </function>
 
-    <function name="GetFramebufferAttachmentParameteriv" es2="2.0" offset="assign">
+    <function name="GetFramebufferAttachmentParameteriv" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="pname" type="GLenum"/>
@@ -277,7 +271,7 @@
 	<glx vendorpriv="1428"/>
     </function>
 
-    <function name="BlitFramebuffer" es2="3.0" offset="assign">
+    <function name="BlitFramebuffer" es2="3.0">
         <param name="srcX0" type="GLint"/>
         <param name="srcY0" type="GLint"/>
         <param name="srcX1" type="GLint"/>
@@ -291,7 +285,7 @@
         <glx rop="4330"/>
     </function>
 
-    <function name="GenerateMipmap" es2="2.0" offset="assign">
+    <function name="GenerateMipmap" es2="2.0">
         <param name="target" type="GLenum"/>
 	<glx rop="4325"/>
     </function>
diff --git a/src/mapi/glapi/gen/ARB_geometry_shader4.xml b/src/mapi/glapi/gen/ARB_geometry_shader4.xml
index e62047c..280e7a0 100644
--- a/src/mapi/glapi/gen/ARB_geometry_shader4.xml
+++ b/src/mapi/glapi/gen/ARB_geometry_shader4.xml
@@ -45,7 +45,7 @@
         <param name="level" type="GLint"/>
         <param name="layer" type="GLint"/>
     </function>
-    <function name="FramebufferTextureFaceARB" exec="skip" offset="assign">
+    <function name="FramebufferTextureFaceARB" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="texture" type="GLuint"/>
diff --git a/src/mapi/glapi/gen/ARB_get_program_binary.xml b/src/mapi/glapi/gen/ARB_get_program_binary.xml
index e84d067..25e0a37 100644
--- a/src/mapi/glapi/gen/ARB_get_program_binary.xml
+++ b/src/mapi/glapi/gen/ARB_get_program_binary.xml
@@ -11,7 +11,7 @@
     <enum name="NUM_PROGRAM_BINARY_FORMATS"               value="0x87FE"/>
     <enum name="PROGRAM_BINARY_FORMATS"                   value="0x87FF"/>
 
-    <function name="GetProgramBinary" offset="assign" es2="3.0">
+    <function name="GetProgramBinary" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="length" type="GLsizei *"/>
@@ -19,14 +19,14 @@
         <param name="binary" type="GLvoid *"/>
     </function>
 
-    <function name="ProgramBinary" offset="assign" es2="3.0">
+    <function name="ProgramBinary" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="binaryFormat" type="GLenum"/>
         <param name="binary" type="const GLvoid *"/>
         <param name="length" type="GLsizei"/>
     </function>
 
-    <function name="ProgramParameteri" offset="assign" es2="3.0">
+    <function name="ProgramParameteri" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="value" type="GLint"/>
diff --git a/src/mapi/glapi/gen/ARB_gpu_shader_fp64.xml b/src/mapi/glapi/gen/ARB_gpu_shader_fp64.xml
index 4f860ef..fd1ad11 100644
--- a/src/mapi/glapi/gen/ARB_gpu_shader_fp64.xml
+++ b/src/mapi/glapi/gen/ARB_gpu_shader_fp64.xml
@@ -5,25 +5,25 @@
 
 <category name="GL_ARB_gpu_shader_fp64" number="89">
 
-    <function name="Uniform1d" offset="assign">
+    <function name="Uniform1d">
         <param name="location" type="GLint"/>
         <param name="x" type="GLdouble"/>
     </function>
 
-    <function name="Uniform2d" offset="assign">
+    <function name="Uniform2d">
         <param name="location" type="GLint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
     </function>
 
-    <function name="Uniform3d" offset="assign">
+    <function name="Uniform3d">
         <param name="location" type="GLint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
     </function>
 
-    <function name="Uniform4d" offset="assign">
+    <function name="Uniform4d">
         <param name="location" type="GLint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
@@ -31,94 +31,94 @@
         <param name="w" type="GLdouble"/>
     </function>
 
-    <function name="Uniform1dv" offset="assign">
+    <function name="Uniform1dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="Uniform2dv" offset="assign">
+    <function name="Uniform2dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="Uniform3dv" offset="assign">
+    <function name="Uniform3dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="Uniform4dv" offset="assign">
+    <function name="Uniform4dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="UniformMatrix2dv" offset="assign">
+    <function name="UniformMatrix2dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="UniformMatrix3dv" offset="assign">
+    <function name="UniformMatrix3dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="UniformMatrix4dv" offset="assign">
+    <function name="UniformMatrix4dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="UniformMatrix2x3dv" offset="assign">
+    <function name="UniformMatrix2x3dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="UniformMatrix2x4dv" offset="assign">
+    <function name="UniformMatrix2x4dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="UniformMatrix3x2dv" offset="assign">
+    <function name="UniformMatrix3x2dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="UniformMatrix3x4dv" offset="assign">
+    <function name="UniformMatrix3x4dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="UniformMatrix4x2dv" offset="assign">
+    <function name="UniformMatrix4x2dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="UniformMatrix4x3dv" offset="assign">
+    <function name="UniformMatrix4x3dv">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLdouble *"/>
     </function>
 
-    <function name="GetUniformdv" offset="assign">
+    <function name="GetUniformdv">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="params" type="GLdouble *"/>
diff --git a/src/mapi/glapi/gen/ARB_internalformat_query.xml b/src/mapi/glapi/gen/ARB_internalformat_query.xml
index 70a2a31..16d14b1 100644
--- a/src/mapi/glapi/gen/ARB_internalformat_query.xml
+++ b/src/mapi/glapi/gen/ARB_internalformat_query.xml
@@ -8,8 +8,7 @@
 
     <enum name="NUM_SAMPLE_COUNTS"                        value="0x9380"/>
 
-    <function name="GetInternalformativ" offset="assign" static_dispatch="false"
-              es2="3.0">
+    <function name="GetInternalformativ" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="pname" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/ARB_invalidate_subdata.xml b/src/mapi/glapi/gen/ARB_invalidate_subdata.xml
index 31b515c..052816a 100644
--- a/src/mapi/glapi/gen/ARB_invalidate_subdata.xml
+++ b/src/mapi/glapi/gen/ARB_invalidate_subdata.xml
@@ -3,7 +3,7 @@
 
 <OpenGLAPI>
 <category name="GL_ARB_invalidate_subdata" number="666">
-  <function name="InvalidateTexSubImage" offset="assign">
+  <function name="InvalidateTexSubImage">
     <param name="texture" type="GLuint"/>
     <param name="level" type="GLint"/>
     <param name="xoffset" type="GLint"/>
@@ -14,22 +14,22 @@
     <param name="depth" type="GLsizei"/>
   </function>
 
-  <function name="InvalidateTexImage" offset="assign">
+  <function name="InvalidateTexImage">
     <param name="texture" type="GLuint"/>
     <param name="level" type="GLint"/>
   </function>
 
-  <function name="InvalidateBufferSubData" offset="assign">
+  <function name="InvalidateBufferSubData">
     <param name="buffer" type="GLuint"/>
     <param name="offset" type="GLintptr"/>
     <param name="length" type="GLsizeiptr"/>
   </function>
 
-  <function name="InvalidateBufferData" offset="assign">
+  <function name="InvalidateBufferData">
     <param name="buffer" type="GLuint"/>
   </function>
 
-  <function name="InvalidateSubFramebuffer" offset="assign" es2="3.0">
+  <function name="InvalidateSubFramebuffer" es2="3.0">
     <param name="target" type="GLenum"/>
     <param name="numAttachments" type="GLsizei" counter="true"/>
     <param name="attachments" type="const GLenum *" count="numAttachments"/>
@@ -39,7 +39,7 @@
     <param name="height" type="GLsizei"/>
   </function>
 
-  <function name="InvalidateFramebuffer" offset="assign" es2="3.0">
+  <function name="InvalidateFramebuffer" es2="3.0">
     <param name="target" type="GLenum"/>
     <param name="numAttachments" type="GLsizei" counter="true"/>
     <param name="attachments" type="const GLenum *" count="numAttachments"/>
diff --git a/src/mapi/glapi/gen/ARB_map_buffer_range.xml b/src/mapi/glapi/gen/ARB_map_buffer_range.xml
index d874504..cf7b211 100644
--- a/src/mapi/glapi/gen/ARB_map_buffer_range.xml
+++ b/src/mapi/glapi/gen/ARB_map_buffer_range.xml
@@ -15,7 +15,7 @@
     <enum name="MAP_FLUSH_EXPLICIT_BIT"      value="0x0010"/>
     <enum name="MAP_UNSYNCHRONIZED_BIT"      value="0x0020"/>
 
-    <function name="MapBufferRange" offset="assign" es2="3.0">
+    <function name="MapBufferRange" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="length" type="GLsizeiptr"/>
@@ -23,7 +23,7 @@
         <return type="GLvoid *"/>
     </function>
 
-    <function name="FlushMappedBufferRange" offset="assign" es2="3.0">
+    <function name="FlushMappedBufferRange" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="length" type="GLsizeiptr"/>
diff --git a/src/mapi/glapi/gen/ARB_multi_bind.xml b/src/mapi/glapi/gen/ARB_multi_bind.xml
index 4f2f2a2..f42eaa2 100644
--- a/src/mapi/glapi/gen/ARB_multi_bind.xml
+++ b/src/mapi/glapi/gen/ARB_multi_bind.xml
@@ -7,14 +7,14 @@
 
 <category name="GL_ARB_multi_bind" number="147">
 
-    <function name="BindBuffersBase" offset="assign">
+    <function name="BindBuffersBase">
         <param name="target" type="GLenum"/>
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="buffers" type="const GLuint *"/>
     </function>
 
-    <function name="BindBuffersRange" offset="assign">
+    <function name="BindBuffersRange">
         <param name="target" type="GLenum"/>
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
@@ -23,25 +23,25 @@
         <param name="sizes" type="const GLsizeiptr *"/>
     </function>
 
-    <function name="BindTextures" offset="assign">
+    <function name="BindTextures">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="textures" type="const GLuint *"/>
     </function>
 
-    <function name="BindSamplers" offset="assign">
+    <function name="BindSamplers">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="samplers" type="const GLuint *"/>
     </function>
 
-    <function name="BindImageTextures" offset="assign">
+    <function name="BindImageTextures">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="textures" type="const GLuint *"/>
     </function>
 
-    <function name="BindVertexBuffers" offset="assign">
+    <function name="BindVertexBuffers">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="buffers" type="const GLuint *"/>
diff --git a/src/mapi/glapi/gen/ARB_program_interface_query.xml b/src/mapi/glapi/gen/ARB_program_interface_query.xml
index 59eb59c..c3162f5 100644
--- a/src/mapi/glapi/gen/ARB_program_interface_query.xml
+++ b/src/mapi/glapi/gen/ARB_program_interface_query.xml
@@ -56,21 +56,21 @@
     <enum name="NUM_COMPATIBLE_SUBROUTINES"                      value="0x8E4A"/>
     <enum name="COMPATIBLE_SUBROUTINES"                          value="0x8E4B"/>
 
-    <function name="GetProgramInterfaceiv" offset="assign">
+    <function name="GetProgramInterfaceiv" es2="3.1">
         <param name="program" type="GLuint"/>
         <param name="programInterface" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true"/>
     </function>
 
-    <function name="GetProgramResourceIndex" offset="assign">
+    <function name="GetProgramResourceIndex" es2="3.1">
         <param name="program" type="GLuint"/>
         <param name="programInterface" type="GLenum"/>
         <param name="name" type="const GLchar *"/>
         <return type="GLuint"/>
     </function>
 
-    <function name="GetProgramResourceName" offset="assign">
+    <function name="GetProgramResourceName" es2="3.1">
         <param name="program" type="GLuint"/>
         <param name="programInterface" type="GLenum"/>
         <param name="index" type="GLuint"/>
@@ -79,7 +79,7 @@
         <param name="name" type="GLchar *" output="true"/>
     </function>
 
-    <function name="GetProgramResourceiv" offset="assign">
+    <function name="GetProgramResourceiv" es2="3.1">
         <param name="program" type="GLuint"/>
         <param name="programInterface" type="GLenum"/>
         <param name="index" type="GLuint"/>
@@ -90,14 +90,14 @@
         <param name="params" type="GLint *" output="true"/>
     </function>
 
-    <function name="GetProgramResourceLocation" offset="assign">
+    <function name="GetProgramResourceLocation" es2="3.1">
         <param name="program" type="GLuint"/>
         <param name="programInterface" type="GLenum"/>
         <param name="name" type="const GLchar *"/>
         <return type="GLint"/>
     </function>
 
-    <function name="GetProgramResourceLocationIndex" offset="assign">
+    <function name="GetProgramResourceLocationIndex">
         <param name="program" type="GLuint"/>
         <param name="programInterface" type="GLenum"/>
         <param name="name" type="const GLchar *"/>
diff --git a/src/mapi/glapi/gen/ARB_robustness.xml b/src/mapi/glapi/gen/ARB_robustness.xml
index 6584314..9b2f2f0 100644
--- a/src/mapi/glapi/gen/ARB_robustness.xml
+++ b/src/mapi/glapi/gen/ARB_robustness.xml
@@ -20,26 +20,26 @@
 
     <enum name="CONTEXT_FLAG_ROBUST_ACCESS_BIT_ARB"       value="0x00000004"/>
 
-    <function name="GetGraphicsResetStatusARB" offset="assign">
+    <function name="GetGraphicsResetStatusARB">
         <return type="GLenum"/>
     </function>
 
 <!-- OpenGL 1.0 sized buffer queries -->
-    <function name="GetnMapdvARB" offset="assign" deprecated="3.1">
+    <function name="GetnMapdvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="query" type="GLenum"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="v" type="GLdouble *" output="true"/>
     </function>
 
-    <function name="GetnMapfvARB" offset="assign" deprecated="3.1">
+    <function name="GetnMapfvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="query" type="GLenum"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="v" type="GLfloat *" output="true"/>
     </function>
 
-    <function name="GetnMapivARB" offset="assign" deprecated="3.1">
+    <function name="GetnMapivARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="query" type="GLenum"/>
         <param name="bufSize" type="GLsizei"/>
@@ -48,19 +48,19 @@
 
 
 
-    <function name="GetnPixelMapfvARB" offset="assign" deprecated="3.1">
+    <function name="GetnPixelMapfvARB" deprecated="3.1">
         <param name="map" type="GLenum"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="values" type="GLfloat *" output="true"/>
     </function>
 
-    <function name="GetnPixelMapuivARB" offset="assign" deprecated="3.1">
+    <function name="GetnPixelMapuivARB" deprecated="3.1">
         <param name="map" type="GLenum"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="values" type="GLuint *" output="true"/>
     </function>
 
-    <function name="GetnPixelMapusvARB" offset="assign" deprecated="3.1">
+    <function name="GetnPixelMapusvARB" deprecated="3.1">
         <param name="map" type="GLenum"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="values" type="GLushort *" output="true"/>
@@ -68,12 +68,12 @@
 
 
 
-    <function name="GetnPolygonStippleARB" offset="assign">
+    <function name="GetnPolygonStippleARB">
         <param name="bufSize" type="GLsizei"/>
         <param name="pattern" type="GLubyte *" output="true"/>
     </function>
 
-    <function name="GetnTexImageARB" offset="assign">
+    <function name="GetnTexImageARB">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="format" type="GLenum"/>
@@ -82,7 +82,7 @@
         <param name="img" type="GLvoid *" output="true"/>
     </function>
 
-    <function name="ReadnPixelsARB" offset="assign">
+    <function name="ReadnPixelsARB">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="width" type="GLsizei"/>
@@ -95,7 +95,7 @@
 
 
 <!-- ARB_imaging sized buffer queries -->
-    <function name="GetnColorTableARB" offset="assign" deprecated="3.1">
+    <function name="GetnColorTableARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
@@ -103,7 +103,7 @@
         <param name="table" type="GLvoid *" output="true"/>
     </function>
 
-    <function name="GetnConvolutionFilterARB" offset="assign" deprecated="3.1">
+    <function name="GetnConvolutionFilterARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
@@ -111,7 +111,7 @@
         <param name="image" type="GLvoid *" output="true"/>
     </function>
 
-    <function name="GetnSeparableFilterARB" offset="assign" deprecated="3.1">
+    <function name="GetnSeparableFilterARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
@@ -122,7 +122,7 @@
         <param name="span" type="GLvoid *" output="true"/>
     </function>
 
-    <function name="GetnHistogramARB" offset="assign" deprecated="3.1">
+    <function name="GetnHistogramARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="reset" type="GLboolean"/>
         <param name="format" type="GLenum"/>
@@ -131,7 +131,7 @@
         <param name="values" type="GLvoid *" output="true"/>
     </function>
 
-    <function name="GetnMinmaxARB" offset="assign" deprecated="3.1">
+    <function name="GetnMinmaxARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="reset" type="GLboolean"/>
         <param name="format" type="GLenum"/>
@@ -142,7 +142,7 @@
 
 
 <!-- OpenGL 1.3 sized buffer queries -->
-    <function name="GetnCompressedTexImageARB" offset="assign">
+    <function name="GetnCompressedTexImageARB">
         <param name="target" type="GLenum"/>
         <param name="lod" type="GLint"/>
         <param name="bufSize" type="GLsizei"/>
@@ -151,28 +151,28 @@
 
 
 <!-- OpenGL 2.0 sized buffer queries -->
-    <function name="GetnUniformfvARB" offset="assign">
+    <function name="GetnUniformfvARB">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="params" type="GLfloat *" output="true"/>
     </function>
 
-    <function name="GetnUniformivARB" offset="assign">
+    <function name="GetnUniformivARB">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="params" type="GLint *" output="true"/>
     </function>
 
-    <function name="GetnUniformuivARB" offset="assign">
+    <function name="GetnUniformuivARB">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="params" type="GLuint *" output="true"/>
     </function>
 
-    <function name="GetnUniformdvARB" offset="assign">
+    <function name="GetnUniformdvARB">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="bufSize" type="GLsizei"/>
diff --git a/src/mapi/glapi/gen/ARB_sampler_objects.xml b/src/mapi/glapi/gen/ARB_sampler_objects.xml
index 9173dee..bc69e97 100644
--- a/src/mapi/glapi/gen/ARB_sampler_objects.xml
+++ b/src/mapi/glapi/gen/ARB_sampler_objects.xml
@@ -7,81 +7,81 @@
 
 <category name="GL_ARB_sampler_objects" number="81">
 
-    <function name="GenSamplers" offset="assign" es2="3.0">
+    <function name="GenSamplers" es2="3.0">
       <param name="count" type="GLsizei"/>
       <param name="samplers" type="GLuint *"/>
     </function>
 
-    <function name="DeleteSamplers" offset="assign" es2="3.0">
+    <function name="DeleteSamplers" es2="3.0">
       <param name="count" type="GLsizei"/>
       <param name="samplers" type="const GLuint *"/>
     </function>
 
-    <function name="IsSampler" offset="assign" es2="3.0">
+    <function name="IsSampler" es2="3.0">
       <param name="sampler" type="GLuint"/>
       <return type="GLboolean"/>
     </function>
 
-    <function name="BindSampler" offset="assign" es2="3.0">
+    <function name="BindSampler" es2="3.0">
       <param name="unit" type="GLuint"/>
       <param name="sampler" type="GLuint"/>
     </function>
 
-    <function name="SamplerParameteri" offset="assign" es2="3.0">
+    <function name="SamplerParameteri" es2="3.0">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="param" type="GLint"/>
     </function>
 
-    <function name="SamplerParameterf" offset="assign" es2="3.0">
+    <function name="SamplerParameterf" es2="3.0">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="param" type="GLfloat"/>
     </function>
 
-    <function name="SamplerParameteriv" offset="assign" es2="3.0">
+    <function name="SamplerParameteriv" es2="3.0">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="params" type="const GLint *"/>
     </function>
 
-    <function name="SamplerParameterfv" offset="assign" es2="3.0">
+    <function name="SamplerParameterfv" es2="3.0">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="params" type="const GLfloat *"/>
     </function>
 
-    <function name="SamplerParameterIiv" offset="assign">
+    <function name="SamplerParameterIiv">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="params" type="const GLint *"/>
     </function>
 
-    <function name="SamplerParameterIuiv" offset="assign">
+    <function name="SamplerParameterIuiv">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="params" type="const GLuint *"/>
     </function>
 
-    <function name="GetSamplerParameteriv" offset="assign" es2="3.0">
+    <function name="GetSamplerParameteriv" es2="3.0">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="params" type="GLint *"/>
     </function>
 
-    <function name="GetSamplerParameterfv" offset="assign" es2="3.0">
+    <function name="GetSamplerParameterfv" es2="3.0">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="params" type="GLfloat *"/>
     </function>
 
-    <function name="GetSamplerParameterIiv" offset="assign">
+    <function name="GetSamplerParameterIiv">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="params" type="GLint *"/>
     </function>
 
-    <function name="GetSamplerParameterIuiv" offset="assign">
+    <function name="GetSamplerParameterIuiv">
       <param name="sampler" type="GLuint"/>
       <param name="pname" type="GLenum"/>
       <param name="params" type="GLuint *"/>
diff --git a/src/mapi/glapi/gen/ARB_separate_shader_objects.xml b/src/mapi/glapi/gen/ARB_separate_shader_objects.xml
index 96ae2b9..c9f481d 100644
--- a/src/mapi/glapi/gen/ARB_separate_shader_objects.xml
+++ b/src/mapi/glapi/gen/ARB_separate_shader_objects.xml
@@ -15,69 +15,69 @@
       <enum   name="ALL_SHADER_BITS"                              value="0xFFFFFFFF"/>
       <enum   name="PROGRAM_SEPARABLE"                            value="0x8258"/>
 
-      <function name="UseProgramStages" offset="assign" static_dispatch="false">
+      <function name="UseProgramStages" es2="3.1">
          <param name="pipeline" type="GLuint" />
          <param name="stages" type="GLbitfield" />
          <param name="program" type="GLuint" />
       </function>
-      <function name="ActiveShaderProgram" offset="assign" static_dispatch="false">
+      <function name="ActiveShaderProgram" es2="3.1">
          <param name="pipeline" type="GLuint" />
          <param name="program" type="GLuint" />
       </function>
-      <function name="CreateShaderProgramv" offset="assign" static_dispatch="false">
+      <function name="CreateShaderProgramv" es2="3.1">
          <param name="type" type="GLenum" />
          <param name="count" type="GLsizei" />
          <param name="strings" type="const GLchar * const *" />
          <return type="GLuint"/>
       </function>
-      <function name="BindProgramPipeline" offset="assign" static_dispatch="false">
+      <function name="BindProgramPipeline" es2="3.1">
          <param name="pipeline" type="GLuint" />
       </function>
-      <function name="DeleteProgramPipelines" offset="assign" static_dispatch="false">
+      <function name="DeleteProgramPipelines" es2="3.1">
          <param name="n" type="GLsizei" />
          <param name="pipelines" type="const GLuint *" />
       </function>
-      <function name="GenProgramPipelines" offset="assign" static_dispatch="false">
+      <function name="GenProgramPipelines" es2="3.1">
          <param name="n" type="GLsizei" />
          <param name="pipelines" type="GLuint *" />
       </function>
-      <function name="IsProgramPipeline" offset="assign" static_dispatch="false">
+      <function name="IsProgramPipeline" es2="3.1">
          <param name="pipeline" type="GLuint" />
          <return type="GLboolean"/>
       </function>
       <!-- Function already included on ARB_get_program_binary.xml. Keep a commented
       version here for completeness -->
       <!--
-      <function name="ProgramParameteri" offset="assign" es2="3.0" static_dispatch="false">
+      <function name="ProgramParameteri" es2="3.0">
          <param name="program" type="GLuint"/>
          <param name="pname" type="GLenum"/>
          <param name="value" type="GLint"/>
       </function>
       -->
-      <function name="GetProgramPipelineiv" offset="assign" static_dispatch="false">
+      <function name="GetProgramPipelineiv" es2="3.1">
          <param name="pipeline" type="GLuint" />
          <param name="pname" type="GLenum" />
          <param name="params" type="GLint *" />
       </function>
-      <function name="ProgramUniform1i" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform1i" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLint" />
       </function>
-      <function name="ProgramUniform2i" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform2i" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLint" />
          <param name="y" type="GLint" />
       </function>
-      <function name="ProgramUniform3i" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform3i" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLint" />
          <param name="y" type="GLint" />
          <param name="z" type="GLint" />
       </function>
-      <function name="ProgramUniform4i" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform4i" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLint" />
@@ -85,25 +85,25 @@
          <param name="z" type="GLint" />
          <param name="w" type="GLint" />
       </function>
-      <function name="ProgramUniform1ui" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform1ui" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLuint" />
       </function>
-      <function name="ProgramUniform2ui" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform2ui" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLuint" />
          <param name="y" type="GLuint" />
       </function>
-      <function name="ProgramUniform3ui" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform3ui" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLuint" />
          <param name="y" type="GLuint" />
          <param name="z" type="GLuint" />
       </function>
-      <function name="ProgramUniform4ui" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform4ui" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLuint" />
@@ -111,25 +111,25 @@
          <param name="z" type="GLuint" />
          <param name="w" type="GLuint" />
       </function>
-      <function name="ProgramUniform1f" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform1f" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLfloat" />
       </function>
-      <function name="ProgramUniform2f" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform2f" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLfloat" />
          <param name="y" type="GLfloat" />
       </function>
-      <function name="ProgramUniform3f" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform3f" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLfloat" />
          <param name="y" type="GLfloat" />
          <param name="z" type="GLfloat" />
       </function>
-      <function name="ProgramUniform4f" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform4f" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLfloat" />
@@ -137,170 +137,170 @@
          <param name="z" type="GLfloat" />
          <param name="w" type="GLfloat" />
       </function>
-      <function name="ProgramUniform1iv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform1iv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLint *" />
       </function>
-      <function name="ProgramUniform2iv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform2iv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLint *" />
       </function>
-      <function name="ProgramUniform3iv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform3iv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLint *" />
       </function>
-      <function name="ProgramUniform4iv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform4iv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLint *" />
       </function>
-      <function name="ProgramUniform1uiv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform1uiv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLuint *" />
       </function>
-      <function name="ProgramUniform2uiv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform2uiv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLuint *" />
       </function>
-      <function name="ProgramUniform3uiv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform3uiv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLuint *" />
       </function>
-      <function name="ProgramUniform4uiv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform4uiv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLuint *" />
       </function>
-      <function name="ProgramUniform1fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform1fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniform2fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform2fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniform3fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform3fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniform4fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform4fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniformMatrix2fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix2fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniformMatrix3fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix3fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniformMatrix4fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix4fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniformMatrix2x3fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix2x3fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniformMatrix3x2fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix3x2fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniformMatrix2x4fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix2x4fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniformMatrix4x2fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix4x2fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniformMatrix3x4fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix3x4fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ProgramUniformMatrix4x3fv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix4x3fv" es2="3.1">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLfloat *" />
       </function>
-      <function name="ValidateProgramPipeline" offset="assign" static_dispatch="false">
+      <function name="ValidateProgramPipeline" es2="3.1">
          <param name="pipeline" type="GLuint" />
       </function>
-      <function name="GetProgramPipelineInfoLog" offset="assign" static_dispatch="false">
+      <function name="GetProgramPipelineInfoLog" es2="3.1">
          <param name="pipeline" type="GLuint" />
          <param name="bufSize" type="GLsizei" />
          <param name="length" type="GLsizei *" />
          <param name="infoLog" type="GLchar *" />
       </function>
 
-      <function name="ProgramUniform1d" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform1d">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLdouble" />
       </function>
-      <function name="ProgramUniform2d" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform2d">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLdouble" />
          <param name="y" type="GLdouble" />
       </function>
-      <function name="ProgramUniform3d" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform3d">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLdouble" />
          <param name="y" type="GLdouble" />
          <param name="z" type="GLdouble" />
       </function>
-      <function name="ProgramUniform4d" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform4d">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="x" type="GLdouble" />
@@ -308,88 +308,88 @@
          <param name="z" type="GLdouble" />
          <param name="w" type="GLdouble" />
       </function>
-      <function name="ProgramUniformMatrix2x3dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix2x3dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniformMatrix3x2dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix3x2dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniformMatrix2x4dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix2x4dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniformMatrix4x2dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix4x2dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniformMatrix3x4dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix3x4dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniformMatrix4x3dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix4x3dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniformMatrix2dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix2dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniformMatrix3dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix3dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniformMatrix4dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniformMatrix4dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="transpose" type="GLboolean" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniform1dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform1dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniform2dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform2dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniform3dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform3dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
          <param name="value" type="const GLdouble *" />
       </function>
-      <function name="ProgramUniform4dv" offset="assign" static_dispatch="false">
+      <function name="ProgramUniform4dv">
          <param name="program" type="GLuint" />
          <param name="location" type="GLint" />
          <param name="count" type="GLsizei" />
diff --git a/src/mapi/glapi/gen/ARB_shader_atomic_counters.xml b/src/mapi/glapi/gen/ARB_shader_atomic_counters.xml
index f3b74e9..0b0b60f 100644
--- a/src/mapi/glapi/gen/ARB_shader_atomic_counters.xml
+++ b/src/mapi/glapi/gen/ARB_shader_atomic_counters.xml
@@ -35,7 +35,7 @@
 <enum name="UNSIGNED_INT_ATOMIC_COUNTER" value="0x92DB"/>
 <enum name="MAX_ATOMIC_COUNTER_BUFFER_BINDINGS" value="0x92DC"/>
 
-<function name="GetActiveAtomicCounterBufferiv" offset="assign">
+<function name="GetActiveAtomicCounterBufferiv">
     <param name="program" type="GLuint" />
     <param name="bufferIndex" type="GLuint" />
     <param name="pname" type="GLenum" />
diff --git a/src/mapi/glapi/gen/ARB_shader_image_load_store.xml b/src/mapi/glapi/gen/ARB_shader_image_load_store.xml
index 7ccfca4..178e930 100644
--- a/src/mapi/glapi/gen/ARB_shader_image_load_store.xml
+++ b/src/mapi/glapi/gen/ARB_shader_image_load_store.xml
@@ -70,7 +70,7 @@
 <enum name="MAX_FRAGMENT_IMAGE_UNIFORMS" value="0x90CE"/>
 <enum name="MAX_COMBINED_IMAGE_UNIFORMS" value="0x90CF"/>
 
-<function name="BindImageTexture" offset="assign">
+<function name="BindImageTexture" es2="3.1">
   <param name="unit" type="GLuint"/>
   <param name="texture" type="GLuint"/>
   <param name="level" type="GLint"/>
@@ -80,7 +80,7 @@
   <param name="format" type="GLenum"/>
 </function>
 
-<function name="MemoryBarrier" offset="assign">
+<function name="MemoryBarrier" es2="3.1">
   <param name="barriers" type="GLbitfield"/>
 </function>
 
diff --git a/src/mapi/glapi/gen/ARB_sync.xml b/src/mapi/glapi/gen/ARB_sync.xml
index 58f1639..d8a1c34 100644
--- a/src/mapi/glapi/gen/ARB_sync.xml
+++ b/src/mapi/glapi/gen/ARB_sync.xml
@@ -39,40 +39,40 @@
     -->
 
 
-    <function name="FenceSync" offset="assign" es2="3.0">
+    <function name="FenceSync" es2="3.0">
         <param name="condition" type="GLenum"/>
         <param name="flags" type="GLbitfield"/>
         <return type="GLsync"/>
     </function>
 
-    <function name="IsSync" offset="assign" es2="3.0">
+    <function name="IsSync" es2="3.0">
         <param name="sync" type="GLsync"/>
 	<return type="GLboolean"/>
     </function>
 
-    <function name="DeleteSync" offset="assign" es2="3.0">
+    <function name="DeleteSync" es2="3.0">
         <param name="sync" type="GLsync"/>
     </function>
 
-    <function name="ClientWaitSync" offset="assign" es2="3.0">
+    <function name="ClientWaitSync" es2="3.0">
         <param name="sync" type="GLsync"/>
         <param name="flags" type="GLbitfield"/>
 	<param name="timeout" type="GLuint64"/>
         <return type="GLenum"/>
     </function>
 
-    <function name="WaitSync" offset="assign" es2="3.0">
+    <function name="WaitSync" es2="3.0">
         <param name="sync" type="GLsync"/>
         <param name="flags" type="GLbitfield"/>
 	<param name="timeout" type="GLuint64"/>
     </function>
 
-    <function name="GetInteger64v" offset="assign" es2="3.0">
+    <function name="GetInteger64v" es2="3.0">
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint64 *" output="true" variable_param="pname"/>
     </function>
 
-    <function name="GetSynciv" offset="assign" es2="3.0">
+    <function name="GetSynciv" es2="3.0">
         <param name="sync" type="GLsync"/>
         <param name="pname" type="GLenum"/>
         <param name="bufSize" type="GLsizei"/>
diff --git a/src/mapi/glapi/gen/ARB_texture_buffer_range.xml b/src/mapi/glapi/gen/ARB_texture_buffer_range.xml
index 2176c08..36bcc49 100644
--- a/src/mapi/glapi/gen/ARB_texture_buffer_range.xml
+++ b/src/mapi/glapi/gen/ARB_texture_buffer_range.xml
@@ -9,7 +9,7 @@
     <enum name="TEXTURE_BUFFER_SIZE"                    value="0x919E"/>
     <enum name="TEXTURE_BUFFER_OFFSET_ALIGNMENT"        value="0x919F"/>
 
-    <function name="TexBufferRange" offset="assign">
+    <function name="TexBufferRange">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="buffer" type="GLuint"/>
diff --git a/src/mapi/glapi/gen/ARB_texture_multisample.xml b/src/mapi/glapi/gen/ARB_texture_multisample.xml
index 1f65a8b..595e1c7 100644
--- a/src/mapi/glapi/gen/ARB_texture_multisample.xml
+++ b/src/mapi/glapi/gen/ARB_texture_multisample.xml
@@ -34,7 +34,7 @@
    <enum name="INT_SAMPLER_2D_MULTISAMPLE_ARRAY"            value="0x910C"/>
    <enum name="UNSIGNED_INT_SAMPLER_2D_MULTISAMPLE_ARRAY"   value="0x910D"/>
 
-   <function name="TexImage2DMultisample" offset="assign">
+   <function name="TexImage2DMultisample">
       <param name="target" type="GLenum"/>
       <param name="samples" type="GLsizei"/>
       <param name="internalformat" type="GLenum"/>
@@ -43,7 +43,7 @@
       <param name="fixedsamplelocations" type="GLboolean"/>
    </function>
 
-   <function name="TexImage3DMultisample" offset="assign">
+   <function name="TexImage3DMultisample">
       <param name="target" type="GLenum"/>
       <param name="samples" type="GLsizei"/>
       <param name="internalformat" type="GLenum"/>
@@ -53,13 +53,13 @@
       <param name="fixedsamplelocations" type="GLboolean"/>
    </function>
 
-   <function name="GetMultisamplefv" offset="assign">
+   <function name="GetMultisamplefv" es2="3.1">
       <param name="pname" type="GLenum"/>
       <param name="index" type="GLuint"/>
       <param name="val" type="GLfloat *"/>
    </function>
 
-   <function name="SampleMaski" offset="assign">
+   <function name="SampleMaski" es2="3.1">
       <param name="index" type="GLuint"/>
       <param name="mask" type="GLbitfield"/>
    </function>
diff --git a/src/mapi/glapi/gen/ARB_texture_storage.xml b/src/mapi/glapi/gen/ARB_texture_storage.xml
index 1d63e7c..7df3942 100644
--- a/src/mapi/glapi/gen/ARB_texture_storage.xml
+++ b/src/mapi/glapi/gen/ARB_texture_storage.xml
@@ -10,14 +10,14 @@
 
   <enum name="TEXTURE_IMMUTABLE_FORMAT" value="0x912F"/>
 
-  <function name="TexStorage1D" offset="assign">
+  <function name="TexStorage1D">
     <param name="target" type="GLenum"/>
     <param name="levels" type="GLsizei"/>
     <param name="internalFormat" type="GLenum"/>
     <param name="width" type="GLsizei"/>
   </function>
 
-  <function name="TexStorage2D" offset="assign" es2="3.0">
+  <function name="TexStorage2D" es2="3.0">
     <param name="target" type="GLenum"/>
     <param name="levels" type="GLsizei"/>
     <param name="internalFormat" type="GLenum"/>
@@ -25,7 +25,7 @@
     <param name="height" type="GLsizei"/>
   </function>
 
-  <function name="TexStorage3D" offset="assign" es2="3.0">
+  <function name="TexStorage3D" es2="3.0">
     <param name="target" type="GLenum"/>
     <param name="levels" type="GLsizei"/>
     <param name="internalFormat" type="GLenum"/>
@@ -34,7 +34,7 @@
     <param name="depth" type="GLsizei"/>
   </function>
 
-  <function name="TextureStorage1DEXT" offset="assign">
+  <function name="TextureStorage1DEXT">
     <param name="texture" type="GLuint"/>
     <param name="target" type="GLenum"/>
     <param name="levels" type="GLsizei"/>
@@ -42,7 +42,7 @@
     <param name="width" type="GLsizei"/>
   </function>
 
-  <function name="TextureStorage2DEXT" offset="assign">
+  <function name="TextureStorage2DEXT">
     <param name="texture" type="GLuint"/>
     <param name="target" type="GLenum"/>
     <param name="levels" type="GLsizei"/>
@@ -51,7 +51,7 @@
     <param name="height" type="GLsizei"/>
   </function>
 
-  <function name="TextureStorage3DEXT" offset="assign">
+  <function name="TextureStorage3DEXT">
     <param name="texture" type="GLuint"/>
     <param name="target" type="GLenum"/>
     <param name="levels" type="GLsizei"/>
diff --git a/src/mapi/glapi/gen/ARB_texture_storage_multisample.xml b/src/mapi/glapi/gen/ARB_texture_storage_multisample.xml
index 0f9d323..6ed8f1a 100644
--- a/src/mapi/glapi/gen/ARB_texture_storage_multisample.xml
+++ b/src/mapi/glapi/gen/ARB_texture_storage_multisample.xml
@@ -7,7 +7,7 @@
 
 <category name="GL_ARB_texture_storage_multisample" number="141">
 
-   <function name="TexStorage2DMultisample" offset="assign">
+   <function name="TexStorage2DMultisample" es2="3.1">
       <param name="target" type="GLenum"/>
       <param name="samples" type="GLsizei"/>
       <param name="internalformat" type="GLenum"/>
@@ -16,7 +16,7 @@
       <param name="fixedsamplelocations" type="GLboolean"/>
    </function>
 
-   <function name="TexStorage3DMultisample" offset="assign">
+   <function name="TexStorage3DMultisample">
       <param name="target" type="GLenum"/>
       <param name="samples" type="GLsizei"/>
       <param name="internalformat" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/ARB_texture_view.xml b/src/mapi/glapi/gen/ARB_texture_view.xml
index 3e6b8c9..4215fc5 100644
--- a/src/mapi/glapi/gen/ARB_texture_view.xml
+++ b/src/mapi/glapi/gen/ARB_texture_view.xml
@@ -7,7 +7,7 @@
 
 <category name="GL_ARB_texture_view" number="124">
 
-   <function name="TextureView" offset="assign">
+   <function name="TextureView">
       <param name="texture" type="GLuint"/>
       <param name="target" type="GLenum"/>
       <param name="origtexture" type="GLuint"/>
diff --git a/src/mapi/glapi/gen/ARB_uniform_buffer_object.xml b/src/mapi/glapi/gen/ARB_uniform_buffer_object.xml
index 11aacb0..cf86bbb 100644
--- a/src/mapi/glapi/gen/ARB_uniform_buffer_object.xml
+++ b/src/mapi/glapi/gen/ARB_uniform_buffer_object.xml
@@ -39,14 +39,14 @@
 <enum name="UNIFORM_BLOCK_REFERENCED_BY_FRAGMENT_SHADER" value="0x8A46" />
 <enum name="INVALID_INDEX" value="0xFFFFFFFF" />
 
-<function name="GetUniformIndices" offset="assign" es2="3.0">
+<function name="GetUniformIndices" es2="3.0">
     <param name="program" type="GLuint" />
     <param name="uniformCount" type="GLsizei" />
     <param name="uniformNames" type="const GLchar * const *" />
     <param name="uniformIndices" type="GLuint *" />
 </function>
 
-<function name="GetActiveUniformsiv" offset="assign" es2="3.0">
+<function name="GetActiveUniformsiv" es2="3.0">
     <param name="program" type="GLuint" />
     <param name="uniformCount" type="GLsizei" />
     <param name="uniformIndices" type="const GLuint *" />
@@ -54,7 +54,7 @@
     <param name="params" type="GLint *" />
 </function>
 
-<function name="GetActiveUniformName" offset="assign">
+<function name="GetActiveUniformName">
     <param name="program" type="GLuint" />
     <param name="uniformIndex" type="GLuint" />
     <param name="bufSize" type="GLsizei" />
@@ -62,20 +62,20 @@
     <param name="uniformName" type="GLchar *" />
 </function>
 
-<function name="GetUniformBlockIndex" offset="assign" es2="3.0">
+<function name="GetUniformBlockIndex" es2="3.0">
     <return type="GLuint"/>
     <param name="program" type="GLuint" />
     <param name="uniformBlockName" type="const GLchar *" />
 </function>
 
-<function name="GetActiveUniformBlockiv" offset="assign" es2="3.0">
+<function name="GetActiveUniformBlockiv" es2="3.0">
     <param name="program" type="GLuint" />
     <param name="uniformBlockIndex" type="GLuint" />
     <param name="pname" type="GLenum" />
     <param name="params" type="GLint *" />
 </function>
 
-<function name="GetActiveUniformBlockName" offset="assign" es2="3.0">
+<function name="GetActiveUniformBlockName" es2="3.0">
     <param name="program" type="GLuint" />
     <param name="uniformBlockIndex" type="GLuint" />
     <param name="bufSize" type="GLsizei" />
@@ -86,7 +86,7 @@
 <!-- Duplicated with GL3x.xml: BindBufferRange, BindBufferBase,
      GetIntegeri_v -->
 
-<function name="UniformBlockBinding" offset="assign" es2="3.0">
+<function name="UniformBlockBinding" es2="3.0">
     <param name="program" type="GLuint" />
     <param name="uniformBlockIndex" type="GLuint" />
     <param name="uniformBlockBinding" type="GLuint" />
diff --git a/src/mapi/glapi/gen/ARB_vertex_array_object.xml b/src/mapi/glapi/gen/ARB_vertex_array_object.xml
index f2277d2..4a392db 100644
--- a/src/mapi/glapi/gen/ARB_vertex_array_object.xml
+++ b/src/mapi/glapi/gen/ARB_vertex_array_object.xml
@@ -10,21 +10,21 @@
 
     <enum name="VERTEX_ARRAY_BINDING" value="0x85B5"/>
 
-    <function name="BindVertexArray" offset="assign" es2="3.0">
+    <function name="BindVertexArray" es2="3.0">
         <param name="array" type="GLuint"/>
     </function>
 
-    <function name="DeleteVertexArrays" es2="3.0" offset="assign">
+    <function name="DeleteVertexArrays" es2="3.0">
         <param name="n" type="GLsizei"/>
         <param name="arrays" type="const GLuint *" count="n"/>
     </function>
 
-    <function name="GenVertexArrays" offset="assign" es2="3.0">
+    <function name="GenVertexArrays" es2="3.0">
         <param name="n" type="GLsizei"/>
         <param name="arrays" type="GLuint *"/>
     </function>
 
-    <function name="IsVertexArray" es2="3.0" offset="assign">
+    <function name="IsVertexArray" es2="3.0">
         <param name="array" type="GLuint"/>
         <return type="GLboolean"/>
     </function>
diff --git a/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml b/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml
index fc49f84..211642f 100644
--- a/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml
+++ b/src/mapi/glapi/gen/ARB_vertex_attrib_64bit.xml
@@ -5,25 +5,25 @@
 
 <category name="GL_ARB_vertex_attrib_64bit" number="99">
 
-    <function name="VertexAttribL1d" offset="assign">
+    <function name="VertexAttribL1d">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
     </function>
 
-    <function name="VertexAttribL2d" offset="assign">
+    <function name="VertexAttribL2d">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
     </function>
 
-    <function name="VertexAttribL3d" offset="assign">
+    <function name="VertexAttribL3d">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
     </function>
 
-    <function name="VertexAttribL4d" offset="assign">
+    <function name="VertexAttribL4d">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
@@ -31,27 +31,27 @@
         <param name="w" type="GLdouble"/>
     </function>
 
-    <function name="VertexAttribL1dv" offset="assign">
+    <function name="VertexAttribL1dv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="VertexAttribL2dv" offset="assign">
+    <function name="VertexAttribL2dv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="VertexAttribL3dv" offset="assign">
+    <function name="VertexAttribL3dv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="VertexAttribL4dv" offset="assign">
+    <function name="VertexAttribL4dv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="VertexAttribLPointer" offset="assign">
+    <function name="VertexAttribLPointer">
         <param name="index" type="GLuint"/>
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
@@ -59,7 +59,7 @@
         <param name="pointer" type="const GLvoid *"/>
     </function>
 
-    <function name="GetVertexAttribLdv" offset="assign">
+    <function name="GetVertexAttribLdv">
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLdouble *"/>
diff --git a/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml b/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml
index 7e62688..ba9ca57 100644
--- a/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml
+++ b/src/mapi/glapi/gen/ARB_vertex_attrib_binding.xml
@@ -7,14 +7,14 @@
 
 <category name="GL_ARB_vertex_attrib_binding" number="125">
 
-    <function name="BindVertexBuffer" offset="assign">
+    <function name="BindVertexBuffer" es2="3.1">
         <param name="bindingindex" type="GLuint"/>
         <param name="buffer" type="GLuint"/>
         <param name="offset" type="GLintptr"/>
         <param name="stride" type="GLsizei"/>
     </function>
 
-    <function name="VertexAttribFormat" offset="assign">
+    <function name="VertexAttribFormat" es2="3.1">
         <param name="attribindex" type="GLuint"/>
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
@@ -22,26 +22,26 @@
         <param name="relativeoffset" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribIFormat" offset="assign">
+    <function name="VertexAttribIFormat" es2="3.1">
         <param name="attribindex" type="GLuint"/>
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="relativeoffset" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribLFormat" offset="assign">
+    <function name="VertexAttribLFormat">
         <param name="attribindex" type="GLuint"/>
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="relativeoffset" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribBinding" offset="assign">
+    <function name="VertexAttribBinding" es2="3.1">
         <param name="attribindex" type="GLuint"/>
         <param name="bindingindex" type="GLuint"/>
     </function>
 
-    <function name="VertexBindingDivisor" offset="assign">
+    <function name="VertexBindingDivisor" es2="3.1">
         <param name="attribindex" type="GLuint"/>
         <param name="divisor" type="GLuint"/>
     </function>
diff --git a/src/mapi/glapi/gen/ARB_vertex_type_2_10_10_10_rev.xml b/src/mapi/glapi/gen/ARB_vertex_type_2_10_10_10_rev.xml
index 6c6090c..92ec6e1 100644
--- a/src/mapi/glapi/gen/ARB_vertex_type_2_10_10_10_rev.xml
+++ b/src/mapi/glapi/gen/ARB_vertex_type_2_10_10_10_rev.xml
@@ -7,244 +7,214 @@
 
     <enum name="INT_2_10_10_10_REV"                value = "0x8D9F"/>
 
-    <function name="VertexP2ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexP2ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="value" type="GLuint"/>
     </function>
 
-    <function name="VertexP3ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexP3ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="value" type="GLuint"/>
     </function>
 
-    <function name="VertexP4ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexP4ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="value" type="GLuint"/>
     </function>
 
-    <function name="VertexP2uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexP2uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="value" type="const GLuint *"/>
     </function>
 
-    <function name="VertexP3uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexP3uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="value" type="const GLuint *"/>
     </function>
 
-    <function name="VertexP4uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexP4uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="value" type="const GLuint *"/>
     </function>
 
-    <function name="TexCoordP1ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoordP1ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="GLuint"/>
     </function>
 
-    <function name="TexCoordP2ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoordP2ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="GLuint"/>
     </function>
 
-    <function name="TexCoordP3ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoordP3ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="GLuint"/>
     </function>
 
-    <function name="TexCoordP4ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoordP4ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="GLuint"/>
     </function>
 
-    <function name="TexCoordP1uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoordP1uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="const GLuint *"/>
     </function>
 
-    <function name="TexCoordP2uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoordP2uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="const GLuint *"/>
     </function>
 
-    <function name="TexCoordP3uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoordP3uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="const GLuint *"/>
     </function>
 
-    <function name="TexCoordP4uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoordP4uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="const GLuint *"/>
     </function>
 
-    <function name="MultiTexCoordP1ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="MultiTexCoordP1ui" deprecated="3.1" exec="dynamic">
         <param name="texture" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="coords" type="GLuint"/>
     </function>
 
-    <function name="MultiTexCoordP2ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="MultiTexCoordP2ui" deprecated="3.1" exec="dynamic">
         <param name="texture" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="coords" type="GLuint"/>
     </function>
 
-    <function name="MultiTexCoordP3ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="MultiTexCoordP3ui" deprecated="3.1" exec="dynamic">
         <param name="texture" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="coords" type="GLuint"/>
     </function>
 
-    <function name="MultiTexCoordP4ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="MultiTexCoordP4ui" deprecated="3.1" exec="dynamic">
         <param name="texture" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="coords" type="GLuint"/>
     </function>
 
-    <function name="MultiTexCoordP1uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="MultiTexCoordP1uiv" deprecated="3.1" exec="dynamic">
         <param name="texture" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="coords" type="const GLuint *"/>
     </function>
 
-    <function name="MultiTexCoordP2uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="MultiTexCoordP2uiv" deprecated="3.1" exec="dynamic">
         <param name="texture" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="coords" type="const GLuint *"/>
     </function>
 
-    <function name="MultiTexCoordP3uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="MultiTexCoordP3uiv" deprecated="3.1" exec="dynamic">
         <param name="texture" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="coords" type="const GLuint *"/>
     </function>
 
-    <function name="MultiTexCoordP4uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="MultiTexCoordP4uiv" deprecated="3.1" exec="dynamic">
         <param name="texture" type="GLenum"/>
         <param name="type" type="GLenum"/>
         <param name="coords" type="const GLuint *"/>
     </function>
 
-    <function name="NormalP3ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="NormalP3ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="GLuint"/>
     </function>
 
-    <function name="NormalP3uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="NormalP3uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="coords" type="const GLuint *"/>
     </function>
 
-    <function name="ColorP3ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="ColorP3ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="color" type="GLuint"/>
     </function>
 
-    <function name="ColorP4ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="ColorP4ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="color" type="GLuint"/>
     </function>
 
-    <function name="ColorP3uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="ColorP3uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="color" type="const GLuint *"/>
     </function>
 
-    <function name="ColorP4uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="ColorP4uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="color" type="const GLuint *"/>
     </function>
 
-    <function name="SecondaryColorP3ui" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="SecondaryColorP3ui" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="color" type="GLuint"/>
     </function>
 
-    <function name="SecondaryColorP3uiv" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="SecondaryColorP3uiv" deprecated="3.1" exec="dynamic">
         <param name="type" type="GLenum"/>
         <param name="color" type="const GLuint *"/>
     </function>
 
-    <function name="VertexAttribP1ui" offset="assign" exec="dynamic">
+    <function name="VertexAttribP1ui" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="type" type="GLenum"/>
         <param name="normalized" type="GLboolean"/>
         <param name="value" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribP2ui" offset="assign" exec="dynamic">
+    <function name="VertexAttribP2ui" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="type" type="GLenum"/>
         <param name="normalized" type="GLboolean"/>
         <param name="value" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribP3ui" offset="assign" exec="dynamic">
+    <function name="VertexAttribP3ui" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="type" type="GLenum"/>
         <param name="normalized" type="GLboolean"/>
         <param name="value" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribP4ui" offset="assign" exec="dynamic">
+    <function name="VertexAttribP4ui" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="type" type="GLenum"/>
         <param name="normalized" type="GLboolean"/>
         <param name="value" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribP1uiv" offset="assign" exec="dynamic">
+    <function name="VertexAttribP1uiv" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="type" type="GLenum"/>
         <param name="normalized" type="GLboolean"/>
         <param name="value" type="const GLuint *"/>
     </function>
 
-    <function name="VertexAttribP2uiv" offset="assign" exec="dynamic">
+    <function name="VertexAttribP2uiv" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="type" type="GLenum"/>
         <param name="normalized" type="GLboolean"/>
         <param name="value" type="const GLuint *"/>
     </function>
 
-    <function name="VertexAttribP3uiv" offset="assign" exec="dynamic">
+    <function name="VertexAttribP3uiv" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="type" type="GLenum"/>
         <param name="normalized" type="GLboolean"/>
         <param name="value" type="const GLuint *"/>
     </function>
 
-    <function name="VertexAttribP4uiv" offset="assign" exec="dynamic">
+    <function name="VertexAttribP4uiv" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="type" type="GLenum"/>
         <param name="normalized" type="GLboolean"/>
diff --git a/src/mapi/glapi/gen/ARB_viewport_array.xml b/src/mapi/glapi/gen/ARB_viewport_array.xml
index e1c6c2d..b20cf61 100644
--- a/src/mapi/glapi/gen/ARB_viewport_array.xml
+++ b/src/mapi/glapi/gen/ARB_viewport_array.xml
@@ -21,54 +21,54 @@
     <enum name="PROVOKING_VERTEX" value="0x8E4F"/>
     <enum name="UNDEFINED_VERTEX" value="0x8260"/>
 
-    <function name="ViewportArrayv" offset="assign">
+    <function name="ViewportArrayv">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="v" type="const GLfloat *"/>
     </function>
-    <function name="ViewportIndexedf" offset="assign">
+    <function name="ViewportIndexedf">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="w" type="GLfloat"/>
         <param name="h" type="GLfloat"/>
     </function>
-    <function name="ViewportIndexedfv" offset="assign">
+    <function name="ViewportIndexedfv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *"/>
     </function>
-    <function name="ScissorArrayv" offset="assign">
+    <function name="ScissorArrayv">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="v" type="const int *"/>
     </function>
-    <function name="ScissorIndexed" offset="assign">
+    <function name="ScissorIndexed">
         <param name="index" type="GLuint"/>
         <param name="left" type="GLint"/>
         <param name="bottom" type="GLint"/>
         <param name="width" type="GLsizei"/>
         <param name="height" type="GLsizei"/>
     </function>
-    <function name="ScissorIndexedv" offset="assign">
+    <function name="ScissorIndexedv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLint *"/>
     </function>
-    <function name="DepthRangeArrayv" offset="assign">
+    <function name="DepthRangeArrayv">
         <param name="first" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="v" type="const GLclampd *"/>
     </function>
-    <function name="DepthRangeIndexed" offset="assign">
+    <function name="DepthRangeIndexed">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLclampd"/>
         <param name="f" type="GLclampd"/>
     </function>
-    <function name="GetFloati_v" offset="assign">
+    <function name="GetFloati_v">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="data" type="GLfloat *"/>
     </function>
-    <function name="GetDoublei_v" offset="assign">
+    <function name="GetDoublei_v">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="data" type="GLdouble *"/>
diff --git a/src/mapi/glapi/gen/EXT_framebuffer_object.xml b/src/mapi/glapi/gen/EXT_framebuffer_object.xml
index 2cf75bc..9ae0291 100644
--- a/src/mapi/glapi/gen/EXT_framebuffer_object.xml
+++ b/src/mapi/glapi/gen/EXT_framebuffer_object.xml
@@ -78,7 +78,7 @@
 	<return type="GLboolean"/>
     </function>
 
-    <function name="BindRenderbufferEXT" offset="assign" deprecated="3.1">
+    <function name="BindRenderbufferEXT" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="renderbuffer" type="GLuint"/>
         <glx rop="4316"/>
@@ -112,7 +112,7 @@
 	<return type="GLboolean"/>
     </function>
 
-    <function name="BindFramebufferEXT" offset="assign" deprecated="3.1">
+    <function name="BindFramebufferEXT" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="framebuffer" type="GLuint"/>
         <glx rop="4319"/>
@@ -186,7 +186,7 @@
     <enum name="READ_FRAMEBUFFER_BINDING_EXT" count="1" value="0x8CAA">
         <size name="Get" mode="get"/>
     </enum>
-    <function name="BlitFramebufferEXT" static_dispatch="false" alias="BlitFramebuffer">
+    <function name="BlitFramebufferEXT" alias="BlitFramebuffer">
         <param name="srcX0" type="GLint"/>
         <param name="srcY0" type="GLint"/>
         <param name="srcX1" type="GLint"/>
diff --git a/src/mapi/glapi/gen/EXT_gpu_shader4.xml b/src/mapi/glapi/gen/EXT_gpu_shader4.xml
index d204c3f..b1f7eae 100644
--- a/src/mapi/glapi/gen/EXT_gpu_shader4.xml
+++ b/src/mapi/glapi/gen/EXT_gpu_shader4.xml
@@ -44,25 +44,25 @@
     </enum>
 
 
-    <function name="VertexAttribI1iEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI1iEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLint"/>
     </function>
 
-    <function name="VertexAttribI2iEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI2iEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
     </function>
 
-    <function name="VertexAttribI3iEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI3iEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="z" type="GLint"/>
     </function>
 
-    <function name="VertexAttribI4iEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI4iEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
@@ -70,25 +70,25 @@
         <param name="w" type="GLint"/>
     </function>
 
-    <function name="VertexAttribI1uiEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI1uiEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribI2uiEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI2uiEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLuint"/>
         <param name="y" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribI3uiEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI3uiEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLuint"/>
         <param name="y" type="GLuint"/>
         <param name="z" type="GLuint"/>
     </function>
 
-    <function name="VertexAttribI4uiEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI4uiEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLuint"/>
         <param name="y" type="GLuint"/>
@@ -101,17 +101,17 @@
         <param name="v" type="const GLint *"/>
     </function>
 
-    <function name="VertexAttribI2ivEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI2ivEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLint *"/>
     </function>
 
-    <function name="VertexAttribI3ivEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI3ivEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLint *"/>
     </function>
 
-    <function name="VertexAttribI4ivEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI4ivEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLint *"/>
     </function>
@@ -121,17 +121,17 @@
         <param name="v" type="const GLuint *"/>
     </function>
 
-    <function name="VertexAttribI2uivEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI2uivEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLuint *"/>
     </function>
 
-    <function name="VertexAttribI3uivEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI3uivEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLuint *"/>
     </function>
 
-    <function name="VertexAttribI4uivEXT" offset="assign" exec="dynamic">
+    <function name="VertexAttribI4uivEXT" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLuint *"/>
     </function>
diff --git a/src/mapi/glapi/gen/EXT_provoking_vertex.xml b/src/mapi/glapi/gen/EXT_provoking_vertex.xml
index eb87209..3d1ae77 100644
--- a/src/mapi/glapi/gen/EXT_provoking_vertex.xml
+++ b/src/mapi/glapi/gen/EXT_provoking_vertex.xml
@@ -26,7 +26,7 @@
     <enum name="PROVOKING_VERTEX"                         value="0x8E4F"/>
     <enum name="QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION" value="0x8E4C"/>
 
-    <function name="ProvokingVertex" offset="assign">
+    <function name="ProvokingVertex">
         <param name="mode" type="GLenum"/>
     </function>
 
diff --git a/src/mapi/glapi/gen/EXT_separate_shader_objects.xml b/src/mapi/glapi/gen/EXT_separate_shader_objects.xml
index c6163a1..1fa699f 100644
--- a/src/mapi/glapi/gen/EXT_separate_shader_objects.xml
+++ b/src/mapi/glapi/gen/EXT_separate_shader_objects.xml
@@ -8,16 +8,16 @@
     <!-- Alias of CURRENT_PROGRAM -->
 <!--    <enum name="ACTIVE_PROGRAM_EXT"               value="0x8B8D"/> -->
 
-    <function name="UseShaderProgramEXT" deprecated="3.1" offset="assign" exec="skip">
+    <function name="UseShaderProgramEXT" deprecated="3.1" exec="skip">
         <param name="type" type="GLenum"/>
 	<param name="program" type="GLuint"/>
     </function>
 
-    <function name="ActiveProgramEXT" deprecated="3.1" offset="assign" exec="skip">
+    <function name="ActiveProgramEXT" deprecated="3.1" exec="skip">
 	<param name="program" type="GLuint"/>
     </function>
 
-    <function name="CreateShaderProgramEXT" deprecated="3.1" offset="assign" exec="skip">
+    <function name="CreateShaderProgramEXT" deprecated="3.1" exec="skip">
         <param name="type" type="GLenum"/>
         <param name="string" type="const GLchar *"/>
         <return type="GLuint"/>
@@ -31,65 +31,65 @@
     <enum name="ALL_SHADER_BITS_EXT"                          value="0xFFFFFFFF"/>
     <enum name="PROGRAM_SEPARABLE_EXT"                        value="0x8258"/>
 
-    <function name="UseProgramStagesEXT" alias="UseProgramStages" static_dispatch="false" es2="2.0">
+    <function name="UseProgramStagesEXT" alias="UseProgramStages" es2="2.0">
         <param name="pipeline" type="GLuint"/>
         <param name="stages" type="GLbitfield"/>
         <param name="program" type="GLuint"/>
     </function>
-    <function name="ActiveShaderProgramEXT" alias="ActiveShaderProgram" static_dispatch="false" es2="2.0">
+    <function name="ActiveShaderProgramEXT" alias="ActiveShaderProgram" es2="2.0">
         <param name="pipeline" type="GLuint"/>
         <param name="program" type="GLuint"/>
     </function>
-    <function name="CreateShaderProgramvEXT" alias="CreateShaderProgramv" static_dispatch="false" es2="2.0">
+    <function name="CreateShaderProgramvEXT" alias="CreateShaderProgramv" es2="2.0">
         <param name="type" type="GLenum"/>
         <param name="count" type="GLsizei"/>
         <param name="strings" type="const GLchar * const *"/>
         <return type="GLuint"/>
     </function>
-    <function name="BindProgramPipelineEXT" alias="BindProgramPipeline" static_dispatch="false" es2="2.0">
+    <function name="BindProgramPipelineEXT" alias="BindProgramPipeline" es2="2.0">
         <param name="pipeline" type="GLuint"/>
     </function>
-    <function name="DeleteProgramPipelinesEXT" alias="DeleteProgramPipelines" static_dispatch="false" es2="2.0">
+    <function name="DeleteProgramPipelinesEXT" alias="DeleteProgramPipelines" es2="2.0">
         <param name="n" type="GLsizei"/>
         <param name="pipelines" type="const GLuint *"/>
     </function>
-    <function name="GenProgramPipelinesEXT" alias="GenProgramPipelines" static_dispatch="false" es2="2.0">
+    <function name="GenProgramPipelinesEXT" alias="GenProgramPipelines" es2="2.0">
         <param name="n" type="GLsizei"/>
         <param name="pipelines" type="GLuint *"/>
     </function>
-    <function name="IsProgramPipelineEXT" alias="IsProgramPipeline" static_dispatch="false" es2="2.0">
+    <function name="IsProgramPipelineEXT" alias="IsProgramPipeline" es2="2.0">
         <param name="pipeline" type="GLuint"/>
         <return type="GLboolean"/>
     </function>
-    <function name="ProgramParameteriEXT" alias="ProgramParameteri" static_dispatch="false" es2="2.0">
+    <function name="ProgramParameteriEXT" alias="ProgramParameteri" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="value" type="GLint"/>
     </function>
-    <function name="GetProgramPipelineivEXT" alias="GetProgramPipelineiv" static_dispatch="false" es2="2.0">
+    <function name="GetProgramPipelineivEXT" alias="GetProgramPipelineiv" es2="2.0">
         <param name="pipeline" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *"/>
     </function>
-    <function name="ProgramUniform1iEXT" alias="ProgramUniform1i" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform1iEXT" alias="ProgramUniform1i" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLint"/>
     </function>
-    <function name="ProgramUniform2iEXT" alias="ProgramUniform2i" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform2iEXT" alias="ProgramUniform2i" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
     </function>
-    <function name="ProgramUniform3iEXT" alias="ProgramUniform3i" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform3iEXT" alias="ProgramUniform3i" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="z" type="GLint"/>
     </function>
-    <function name="ProgramUniform4iEXT" alias="ProgramUniform4i" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform4iEXT" alias="ProgramUniform4i" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLint"/>
@@ -97,25 +97,25 @@
         <param name="z" type="GLint"/>
         <param name="w" type="GLint"/>
     </function>
-    <function name="ProgramUniform1uiEXT" alias="ProgramUniform1ui" static_dispatch="false" es2="3.0">
+    <function name="ProgramUniform1uiEXT" alias="ProgramUniform1ui" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLuint"/>
     </function>
-    <function name="ProgramUniform2uiEXT" alias="ProgramUniform2ui" static_dispatch="false" es2="3.0">
+    <function name="ProgramUniform2uiEXT" alias="ProgramUniform2ui" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLuint"/>
         <param name="y" type="GLuint"/>
     </function>
-    <function name="ProgramUniform3uiEXT" alias="ProgramUniform3ui" static_dispatch="false" es2="3.0">
+    <function name="ProgramUniform3uiEXT" alias="ProgramUniform3ui" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLuint"/>
         <param name="y" type="GLuint"/>
         <param name="z" type="GLuint"/>
     </function>
-    <function name="ProgramUniform4uiEXT" alias="ProgramUniform4ui" static_dispatch="false" es2="3.0">
+    <function name="ProgramUniform4uiEXT" alias="ProgramUniform4ui" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLuint"/>
@@ -123,25 +123,25 @@
         <param name="z" type="GLuint"/>
         <param name="w" type="GLuint"/>
     </function>
-    <function name="ProgramUniform1fEXT" alias="ProgramUniform1f" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform1fEXT" alias="ProgramUniform1f" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLfloat"/>
     </function>
-    <function name="ProgramUniform2fEXT" alias="ProgramUniform2f" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform2fEXT" alias="ProgramUniform2f" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
     </function>
-    <function name="ProgramUniform3fEXT" alias="ProgramUniform3f" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform3fEXT" alias="ProgramUniform3f" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
     </function>
-    <function name="ProgramUniform4fEXT" alias="ProgramUniform4f" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform4fEXT" alias="ProgramUniform4f" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="x" type="GLfloat"/>
@@ -149,145 +149,145 @@
         <param name="z" type="GLfloat"/>
         <param name="w" type="GLfloat"/>
     </function>
-    <function name="ProgramUniform1ivEXT" alias="ProgramUniform1iv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform1ivEXT" alias="ProgramUniform1iv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLint *"/>
     </function>
-    <function name="ProgramUniform2ivEXT" alias="ProgramUniform2iv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform2ivEXT" alias="ProgramUniform2iv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLint *"/>
     </function>
-    <function name="ProgramUniform3ivEXT" alias="ProgramUniform3iv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform3ivEXT" alias="ProgramUniform3iv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLint *"/>
     </function>
-    <function name="ProgramUniform4ivEXT" alias="ProgramUniform4iv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform4ivEXT" alias="ProgramUniform4iv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLint *"/>
     </function>
-    <function name="ProgramUniform1uivEXT" alias="ProgramUniform1uiv" static_dispatch="false" es2="3.0">
+    <function name="ProgramUniform1uivEXT" alias="ProgramUniform1uiv" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLuint *"/>
     </function>
-    <function name="ProgramUniform2uivEXT" alias="ProgramUniform2uiv" static_dispatch="false" es2="3.0">
+    <function name="ProgramUniform2uivEXT" alias="ProgramUniform2uiv" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLuint *"/>
     </function>
-    <function name="ProgramUniform3uivEXT" alias="ProgramUniform3uiv" static_dispatch="false" es2="3.0">
+    <function name="ProgramUniform3uivEXT" alias="ProgramUniform3uiv" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLuint *"/>
     </function>
-    <function name="ProgramUniform4uivEXT" alias="ProgramUniform4uiv" static_dispatch="false" es2="3.0">
+    <function name="ProgramUniform4uivEXT" alias="ProgramUniform4uiv" es2="3.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLuint *"/>
     </function>
-    <function name="ProgramUniform1fvEXT" alias="ProgramUniform1fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform1fvEXT" alias="ProgramUniform1fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniform2fvEXT" alias="ProgramUniform2fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform2fvEXT" alias="ProgramUniform2fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniform3fvEXT" alias="ProgramUniform3fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform3fvEXT" alias="ProgramUniform3fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniform4fvEXT" alias="ProgramUniform4fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniform4fvEXT" alias="ProgramUniform4fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniformMatrix2fvEXT" alias="ProgramUniformMatrix2fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniformMatrix2fvEXT" alias="ProgramUniformMatrix2fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniformMatrix3fvEXT" alias="ProgramUniformMatrix3fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniformMatrix3fvEXT" alias="ProgramUniformMatrix3fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniformMatrix4fvEXT" alias="ProgramUniformMatrix4fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniformMatrix4fvEXT" alias="ProgramUniformMatrix4fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniformMatrix2x3fvEXT" alias="ProgramUniformMatrix2x3fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniformMatrix2x3fvEXT" alias="ProgramUniformMatrix2x3fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniformMatrix3x2fvEXT" alias="ProgramUniformMatrix3x2fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniformMatrix3x2fvEXT" alias="ProgramUniformMatrix3x2fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniformMatrix2x4fvEXT" alias="ProgramUniformMatrix2x4fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniformMatrix2x4fvEXT" alias="ProgramUniformMatrix2x4fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniformMatrix4x2fvEXT" alias="ProgramUniformMatrix4x2fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniformMatrix4x2fvEXT" alias="ProgramUniformMatrix4x2fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniformMatrix3x4fvEXT" alias="ProgramUniformMatrix3x4fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniformMatrix3x4fvEXT" alias="ProgramUniformMatrix3x4fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ProgramUniformMatrix4x3fvEXT" alias="ProgramUniformMatrix4x3fv" static_dispatch="false" es2="2.0">
+    <function name="ProgramUniformMatrix4x3fvEXT" alias="ProgramUniformMatrix4x3fv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *"/>
     </function>
-    <function name="ValidateProgramPipelineEXT" alias="ValidateProgramPipeline" static_dispatch="false" es2="2.0">
+    <function name="ValidateProgramPipelineEXT" alias="ValidateProgramPipeline" es2="2.0">
         <param name="pipeline" type="GLuint"/>
     </function>
-    <function name="GetProgramPipelineInfoLogEXT" alias="GetProgramPipelineInfoLog" static_dispatch="false" es2="2.0">
+    <function name="GetProgramPipelineInfoLogEXT" alias="GetProgramPipelineInfoLog" es2="2.0">
         <param name="pipeline" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="length" type="GLsizei *"/>
diff --git a/src/mapi/glapi/gen/EXT_texture_integer.xml b/src/mapi/glapi/gen/EXT_texture_integer.xml
index aca21f4..5f9ea29 100644
--- a/src/mapi/glapi/gen/EXT_texture_integer.xml
+++ b/src/mapi/glapi/gen/EXT_texture_integer.xml
@@ -55,14 +55,14 @@
     <enum name="LUMINANCE_INTEGER_EXT"                  value="0x8D9C"/>
     <enum name="LUMINANCE_ALPHA_INTEGER_EXT"            value="0x8D9D"/>
 
-    <function name="ClearColorIiEXT" offset="assign">
+    <function name="ClearColorIiEXT">
         <param name="r" type="GLint"/>
         <param name="g" type="GLint"/>
         <param name="b" type="GLint"/>
         <param name="a" type="GLint"/>
     </function>
 
-    <function name="ClearColorIuiEXT" offset="assign">
+    <function name="ClearColorIuiEXT">
         <param name="r" type="GLuint"/>
         <param name="g" type="GLuint"/>
         <param name="b" type="GLuint"/>
diff --git a/src/mapi/glapi/gen/EXT_transform_feedback.xml b/src/mapi/glapi/gen/EXT_transform_feedback.xml
index 65259fc..2aa26ad 100644
--- a/src/mapi/glapi/gen/EXT_transform_feedback.xml
+++ b/src/mapi/glapi/gen/EXT_transform_feedback.xml
@@ -32,7 +32,7 @@
     <param name="size" type="GLsizeiptr"/>
   </function>
 
-  <function name="BindBufferOffsetEXT" offset="assign">
+  <function name="BindBufferOffsetEXT">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
     <param name="buffer" type="GLuint"/>
@@ -82,33 +82,33 @@
   <enum name="TRANSFORM_FEEDBACK_BUFFER_ACTIVE"    value="0x8E24"/>
   <enum name="TRANSFORM_FEEDBACK_BINDING"          value="0x8E25"/>
 
-  <function name="BindTransformFeedback" offset="assign" es2="3.0">
+  <function name="BindTransformFeedback" es2="3.0">
     <param name="target" type="GLenum"/>
     <param name="id" type="GLuint"/>
   </function>
 
-  <function name="DeleteTransformFeedbacks" offset="assign" es2="3.0">
+  <function name="DeleteTransformFeedbacks" es2="3.0">
     <param name="n" type="GLsizei"/>
     <param name="ids" type="const GLuint *"/>
   </function>
 
-  <function name="GenTransformFeedbacks" offset="assign" es2="3.0">
+  <function name="GenTransformFeedbacks" es2="3.0">
     <param name="n" type="GLsizei"/>
     <param name="ids" type="GLuint *"/>
   </function>
 
-  <function name="IsTransformFeedback" offset="assign" es2="3.0">
+  <function name="IsTransformFeedback" es2="3.0">
     <param name="id" type="GLuint"/>
     <return type="GLboolean"/>
   </function>
 
-  <function name="PauseTransformFeedback" offset="assign" es2="3.0">
+  <function name="PauseTransformFeedback" es2="3.0">
   </function>
 
-  <function name="ResumeTransformFeedback" offset="assign" es2="3.0">
+  <function name="ResumeTransformFeedback" es2="3.0">
   </function>
 
-  <function name="DrawTransformFeedback" offset="assign" exec="dynamic">
+  <function name="DrawTransformFeedback" exec="dynamic">
     <param name="mode" type="GLenum"/>
     <param name="id" type="GLuint"/>
   </function>
diff --git a/src/mapi/glapi/gen/GL3x.xml b/src/mapi/glapi/gen/GL3x.xml
index 5078f7b..7919d65 100644
--- a/src/mapi/glapi/gen/GL3x.xml
+++ b/src/mapi/glapi/gen/GL3x.xml
@@ -119,38 +119,38 @@
 
   <!-- These functions are unique to GL3 -->
 
-  <function name="ClearBufferiv" offset="assign" es2="3.0">
+  <function name="ClearBufferiv" es2="3.0">
     <param name="buffer" type="GLenum"/>
     <param name="drawbuffer" type="GLint"/>
     <param name="value" type="const GLint *"/>
   </function>
 
-  <function name="ClearBufferuiv" offset="assign" es2="3.0">
+  <function name="ClearBufferuiv" es2="3.0">
     <param name="buffer" type="GLenum"/>
     <param name="drawbuffer" type="GLint"/>
     <param name="value" type="const GLuint *"/>
   </function>
 
-  <function name="ClearBufferfv" offset="assign" es2="3.0">
+  <function name="ClearBufferfv" es2="3.0">
     <param name="buffer" type="GLenum"/>
     <param name="drawbuffer" type="GLint"/>
     <param name="value" type="const GLfloat *"/>
   </function>
 
-  <function name="ClearBufferfi" offset="assign" es2="3.0">
+  <function name="ClearBufferfi" es2="3.0">
     <param name="buffer" type="GLenum"/>
     <param name="drawbuffer" type="GLint"/>
     <param name="depth" type="GLfloat"/>
     <param name="stencil" type="GLint"/>
   </function>
 
-  <function name="GetStringi" offset="assign" es2="3.0">
+  <function name="GetStringi" es2="3.0">
     <param name="name" type="GLenum"/>
     <param name="index" type="GLuint"/>
       <return type="const GLubyte *"/>
   </function>
 
-  <function name="ClampColor" offset="assign">
+  <function name="ClampColor">
     <param name="target" type="GLenum"/>
     <param name="clamp" type="GLenum"/>
     <glx rop="234"/>
@@ -158,7 +158,7 @@
 
   <!-- These functions alias ones form GL_ARB_draw_buffers2 -->
 
-  <function name="ColorMaski" offset="assign">
+  <function name="ColorMaski">
     <param name="buf" type="GLuint"/>
     <param name="r" type="GLboolean"/>
     <param name="g" type="GLboolean"/>
@@ -166,29 +166,29 @@
     <param name="a" type="GLboolean"/>
   </function>
 
-  <function name="GetBooleani_v" offset="assign">
+  <function name="GetBooleani_v" es2="3.1">
     <param name="value" type="GLenum"/>
     <param name="index" type="GLuint"/>
     <param name="data" type="GLboolean *"/>
   </function>
 
-  <function name="GetIntegeri_v" es2="3.0" offset="assign">
+  <function name="GetIntegeri_v" es2="3.0">
     <param name="value" type="GLenum"/>
     <param name="index" type="GLuint"/>
     <param name="data" type="GLint *"/>
   </function>
 
-  <function name="Enablei" offset="assign">
+  <function name="Enablei">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
   </function>
 
-  <function name="Disablei" offset="assign">
+  <function name="Disablei">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
   </function>
 
-  <function name="IsEnabledi" offset="assign">
+  <function name="IsEnabledi">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
       <return type="GLboolean"/>
@@ -196,26 +196,26 @@
 
   <!-- These functions alias ones form GL_EXT_transform_feedback -->
 
-  <function name="GetFragDataLocation" es2="3.0" offset="assign">
+  <function name="GetFragDataLocation" es2="3.0">
     <param name="program" type="GLuint"/>
     <param name="name" type="const GLchar *"/>
       <return type="GLint"/>
   </function>
 
-  <function name="BindFragDataLocation" offset="assign">
+  <function name="BindFragDataLocation">
     <param name="program" type="GLuint"/>
     <param name="colorNumber" type="GLuint"/>
     <param name="name" type="const GLchar *"/>
   </function>
 
-  <function name="BeginTransformFeedback" es2="3.0" offset="assign">
+  <function name="BeginTransformFeedback" es2="3.0">
     <param name="mode" type="GLenum"/>
   </function>
 
-  <function name="EndTransformFeedback" es2="3.0" offset="assign">
+  <function name="EndTransformFeedback" es2="3.0">
   </function>
 
-  <function name="BindBufferRange" es2="3.0" offset="assign">
+  <function name="BindBufferRange" es2="3.0">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
     <param name="buffer" type="GLuint"/>
@@ -223,20 +223,20 @@
     <param name="size" type="GLsizeiptr"/>
   </function>
 
-  <function name="BindBufferBase" es2="3.0" offset="assign">
+  <function name="BindBufferBase" es2="3.0">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
     <param name="buffer" type="GLuint"/>
   </function>
 
-  <function name="TransformFeedbackVaryings" es2="3.0" offset="assign">
+  <function name="TransformFeedbackVaryings" es2="3.0">
     <param name="program" type="GLuint"/>
     <param name="count" type="GLsizei"/>
     <param name="varyings" type="const GLchar * const *"/>
     <param name="bufferMode" type="GLenum"/>
   </function>
 
-  <function name="GetTransformFeedbackVarying" es2="3.0" offset="assign">
+  <function name="GetTransformFeedbackVarying" es2="3.0">
     <param name="program" type="GLuint"/>
     <param name="index" type="GLuint"/>
     <param name="bufSize" type="GLsizei"/>
@@ -248,17 +248,17 @@
 
   <!-- These functions alias ones from GL_NV_conditional_render -->
 
-  <function name="BeginConditionalRender" offset="assign">
+  <function name="BeginConditionalRender">
     <param name="query" type="GLuint"/>
     <param name="mode" type="GLenum"/>
   </function>
 
-  <function name="EndConditionalRender" offset="assign">
+  <function name="EndConditionalRender">
   </function>
 
   <!-- These functions alias ones from GL_EXT_gpu_shader4 -->
 
-  <function name="VertexAttribIPointer" es2="3.0" offset="assign">
+  <function name="VertexAttribIPointer" es2="3.0">
     <param name="index" type="GLuint"/>
     <param name="size" type="GLint"/>
     <param name="type" type="GLenum"/>
@@ -266,13 +266,13 @@
     <param name="pointer" type="const GLvoid *"/>
   </function>
 
-  <function name="GetVertexAttribIiv" es2="3.0" offset="assign">
+  <function name="GetVertexAttribIiv" es2="3.0">
     <param name="index" type="GLuint"/>
     <param name="pname" type="GLenum"/>
     <param name="params" type="GLint *"/>
   </function>
 
-  <function name="GetVertexAttribIuiv" es2="3.0" offset="assign">
+  <function name="GetVertexAttribIuiv" es2="3.0">
     <param name="index" type="GLuint"/>
     <param name="pname" type="GLenum"/>
     <param name="params" type="GLuint *"/>
@@ -330,7 +330,7 @@
     <param name="w" type="GLuint"/>
   </function>
 
-  <function name="VertexAttribI1iv" offset="assign">
+  <function name="VertexAttribI1iv">
     <param name="index" type="GLuint"/>
     <param name="v" type="const GLint *"/>
   </function>
@@ -350,7 +350,7 @@
     <param name="v" type="const GLint *"/>
   </function>
 
-  <function name="VertexAttribI1uiv" offset="assign">
+  <function name="VertexAttribI1uiv">
     <param name="index" type="GLuint"/>
     <param name="v" type="const GLuint *"/>
   </function>
@@ -370,51 +370,51 @@
     <param name="v" type="const GLuint *"/>
   </function>
 
-  <function name="VertexAttribI4bv" offset="assign">
+  <function name="VertexAttribI4bv">
     <param name="index" type="GLuint"/>
     <param name="v" type="const GLbyte *"/>
   </function>
 
-  <function name="VertexAttribI4sv" offset="assign">
+  <function name="VertexAttribI4sv">
     <param name="index" type="GLuint"/>
     <param name="v" type="const GLshort *"/>
   </function>
 
-  <function name="VertexAttribI4ubv" offset="assign">
+  <function name="VertexAttribI4ubv">
     <param name="index" type="GLuint"/>
     <param name="v" type="const GLubyte *"/>
   </function>
 
-  <function name="VertexAttribI4usv" offset="assign">
+  <function name="VertexAttribI4usv">
     <param name="index" type="GLuint"/>
     <param name="v" type="const GLushort *"/>
   </function>
 
-  <function name="GetUniformuiv" es2="3.0" offset="assign">
+  <function name="GetUniformuiv" es2="3.0">
     <param name="program" type="GLuint"/>
     <param name="location" type="GLint"/>
     <param name="params" type="GLuint *"/>
   </function>
 
-  <function name="Uniform1ui" es2="3.0" offset="assign">
+  <function name="Uniform1ui" es2="3.0">
     <param name="location" type="GLint"/>
     <param name="x" type="GLuint"/>
   </function>
 
-  <function name="Uniform2ui" es2="3.0" offset="assign">
+  <function name="Uniform2ui" es2="3.0">
     <param name="location" type="GLint"/>
     <param name="x" type="GLuint"/>
     <param name="y" type="GLuint"/>
   </function>
 
-  <function name="Uniform3ui" es2="3.0" offset="assign">
+  <function name="Uniform3ui" es2="3.0">
     <param name="location" type="GLint"/>
     <param name="x" type="GLuint"/>
     <param name="y" type="GLuint"/>
     <param name="z" type="GLuint"/>
   </function>
 
-  <function name="Uniform4ui" es2="3.0" offset="assign">
+  <function name="Uniform4ui" es2="3.0">
     <param name="location" type="GLint"/>
     <param name="x" type="GLuint"/>
     <param name="y" type="GLuint"/>
@@ -422,25 +422,25 @@
     <param name="w" type="GLuint"/>
   </function>
 
-  <function name="Uniform1uiv" es2="3.0" offset="assign">
+  <function name="Uniform1uiv" es2="3.0">
     <param name="location" type="GLint"/>
     <param name="count" type="GLsizei" counter="true"/>
     <param name="value" type="const GLuint *" count="count"/>
   </function>
 
-  <function name="Uniform2uiv" es2="3.0" offset="assign">
+  <function name="Uniform2uiv" es2="3.0">
     <param name="location" type="GLint"/>
     <param name="count" type="GLsizei" counter="true"/>
     <param name="value" type="const GLuint *" count="count" count_scale="2"/>
   </function>
 
-  <function name="Uniform3uiv" es2="3.0" offset="assign">
+  <function name="Uniform3uiv" es2="3.0">
     <param name="location" type="GLint"/>
     <param name="count" type="GLsizei" counter="true"/>
     <param name="value" type="const GLuint *" count="count" count_scale="3"/>
   </function>
 
-  <function name="Uniform4uiv" es2="3.0" offset="assign">
+  <function name="Uniform4uiv" es2="3.0">
     <param name="location" type="GLint"/>
     <param name="count" type="GLsizei" counter="true"/>
     <param name="value" type="const GLuint *" count="count" count_scale="4"/>
@@ -448,25 +448,25 @@
 
   <!-- These functions alias ones from GL_EXT_texture_integer -->
 
-  <function name="TexParameterIiv" offset="assign">
+  <function name="TexParameterIiv">
     <param name="target" type="GLenum"/>
     <param name="pname" type="GLenum"/>
     <param name="params" type="const GLint *"/>
   </function>
 
-  <function name="TexParameterIuiv" offset="assign">
+  <function name="TexParameterIuiv">
     <param name="target" type="GLenum"/>
     <param name="pname" type="GLenum"/>
     <param name="params" type="const GLuint *"/>
   </function>
 
-  <function name="GetTexParameterIiv" offset="assign">
+  <function name="GetTexParameterIiv">
     <param name="target" type="GLenum"/>
     <param name="pname" type="GLenum"/>
     <param name="params" type="GLint *"/>
   </function>
 
-  <function name="GetTexParameterIuiv" offset="assign">
+  <function name="GetTexParameterIuiv">
     <param name="target" type="GLenum"/>
     <param name="pname" type="GLenum"/>
     <param name="params" type="GLuint *"/>
@@ -557,13 +557,13 @@
     <param name="primcount" type="GLsizei"/>
   </function>
 
-  <function name="TexBuffer" offset="assign">
+  <function name="TexBuffer">
     <param name="target" type="GLenum"/>
     <param name="internalFormat" type="GLenum"/>
     <param name="buffer" type="GLuint"/>
   </function>
 
-  <function name="PrimitiveRestartIndex" offset="assign">
+  <function name="PrimitiveRestartIndex">
     <param name="index" type="GLuint"/>
   </function>
 
@@ -595,19 +595,19 @@
   <enum name="MAX_FRAGMENT_INPUT_COMPONENTS"        value="0x9125"/>
   <enum name="CONTEXT_PROFILE_MASK"                 value="0x9126"/>
 
-  <function name="GetInteger64i_v" offset="assign" es2="3.0">
+  <function name="GetInteger64i_v" es2="3.0">
     <param name="cap" type="GLenum"/>
     <param name="index" type="GLuint"/>
     <param name="data" type="GLint64 *"/>
   </function>
 
-  <function name="GetBufferParameteri64v" offset="assign" es2="3.0">
+  <function name="GetBufferParameteri64v" es2="3.0">
     <param name="target" type="GLenum"/>
     <param name="pname" type="GLenum"/>
     <param name="params" type="GLint64 *"/>
   </function>
 
-  <function name="FramebufferTexture" offset="assign">
+  <function name="FramebufferTexture">
     <param name="target" type="GLenum"/>
     <param name="attachment" type="GLenum"/>
     <param name="texture" type="GLuint"/>
@@ -625,7 +625,7 @@
   <enum name="TEXTURE_SWIZZLE_A"                value="0x8E45"/>
   <enum name="TEXTURE_SWIZZLE_RGBA"             value="0x8E46"/>
 
-  <function name="VertexAttribDivisor" offset="assign" es2="3.0">
+  <function name="VertexAttribDivisor" es2="3.0">
     <param name="index" type="GLuint"/>
     <param name="divisor" type="GLuint"/>
   </function>
diff --git a/src/mapi/glapi/gen/GL4x.xml b/src/mapi/glapi/gen/GL4x.xml
index 848316e..94ddfb7 100644
--- a/src/mapi/glapi/gen/GL4x.xml
+++ b/src/mapi/glapi/gen/GL4x.xml
@@ -9,17 +9,17 @@
   <enum name="SAMPLE_SHADING"                          value="0x8C36"/>
   <enum name="MIN_SAMPLE_SHADING_VALUE"                value="0x8C37"/>
 
-  <function name="MinSampleShading" offset="assign">
+  <function name="MinSampleShading">
     <param name="value" type="GLfloat"/>
   </function>
 
-  <function name="BlendFunci" static_dispatch="false" alias="BlendFunciARB">
+  <function name="BlendFunci" alias="BlendFunciARB">
     <param name="buf" type="GLuint"/>
     <param name="sfactor" type="GLenum"/>
     <param name="dfactor" type="GLenum"/>
   </function>
 
-  <function name="BlendFuncSeparatei" static_dispatch="false" alias="BlendFuncSeparateiARB">
+  <function name="BlendFuncSeparatei" alias="BlendFuncSeparateiARB">
     <param name="buf" type="GLuint"/>
     <param name="sfactorRGB" type="GLenum"/>
     <param name="dfactorRGB" type="GLenum"/>
@@ -27,12 +27,12 @@
     <param name="dfactorAlpha" type="GLenum"/>
   </function>
 
-  <function name="BlendEquationi" static_dispatch="false" alias="BlendEquationiARB">
+  <function name="BlendEquationi" alias="BlendEquationiARB">
     <param name="buf" type="GLuint"/>
     <param name="mode" type="GLenum"/>
   </function>
 
-  <function name="BlendEquationSeparatei" static_dispatch="false" alias="BlendEquationSeparateiARB" >
+  <function name="BlendEquationSeparatei" alias="BlendEquationSeparateiARB" >
     <param name="buf" type="GLuint"/>
     <param name="modeRGB" type="GLenum"/>
     <param name="modeA" type="GLenum"/>
diff --git a/src/mapi/glapi/gen/INTEL_performance_query.xml b/src/mapi/glapi/gen/INTEL_performance_query.xml
index 25cd181..9573cb1 100644
--- a/src/mapi/glapi/gen/INTEL_performance_query.xml
+++ b/src/mapi/glapi/gen/INTEL_performance_query.xml
@@ -5,21 +5,21 @@
 
 <category name="GL_INTEL_performance_query" number="443">
 
-  <function name="GetFirstPerfQueryIdINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="GetFirstPerfQueryIdINTEL" es2="2.0">
     <param name="queryId" type="GLuint *"/>
   </function>
 
-  <function name="GetNextPerfQueryIdINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="GetNextPerfQueryIdINTEL" es2="2.0">
     <param name="queryId" type="GLuint"/>
     <param name="nextQueryId" type="GLuint *"/>
   </function>
 
-  <function name="GetPerfQueryIdByNameINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="GetPerfQueryIdByNameINTEL" es2="2.0">
     <param name="queryName" type="GLchar *"/>
     <param name="queryId" type="GLuint *"/>
   </function>
 
-  <function name="GetPerfQueryInfoINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="GetPerfQueryInfoINTEL" es2="2.0">
     <param name="queryId" type="GLuint"/>
     <param name="queryNameLength" type="GLuint"/>
     <param name="queryName" type="GLchar *"/>
@@ -29,7 +29,7 @@
     <param name="capsMask" type="GLuint *"/>
   </function>
 
-  <function name="GetPerfCounterInfoINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="GetPerfCounterInfoINTEL" es2="2.0">
     <param name="queryId" type="GLuint"/>
     <param name="counterId" type="GLuint"/>
     <param name="counterNameLength" type="GLuint"/>
@@ -43,24 +43,24 @@
     <param name="rawCounterMaxValue" type="GLuint64 *"/>
   </function>
 
-  <function name="CreatePerfQueryINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="CreatePerfQueryINTEL" es2="2.0">
     <param name="queryId" type="GLuint"/>
     <param name="queryHandle" type="GLuint *"/>
   </function>
 
-  <function name="DeletePerfQueryINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="DeletePerfQueryINTEL" es2="2.0">
     <param name="queryHandle" type="GLuint"/>
   </function>
 
-  <function name="BeginPerfQueryINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="BeginPerfQueryINTEL" es2="2.0">
     <param name="queryHandle" type="GLuint"/>
   </function>
 
-  <function name="EndPerfQueryINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="EndPerfQueryINTEL" es2="2.0">
     <param name="queryHandle" type="GLuint"/>
   </function>
 
-  <function name="GetPerfQueryDataINTEL" offset="assign" static_dispatch="false" es2="2.0">
+  <function name="GetPerfQueryDataINTEL" es2="2.0">
     <param name="queryHandle" type="GLuint"/>
     <param name="flags" type="GLuint"/>
     <param name="dataSize" type="GLsizei"/>
diff --git a/src/mapi/glapi/gen/KHR_debug.xml b/src/mapi/glapi/gen/KHR_debug.xml
index 48f7fa7..77956d6 100644
--- a/src/mapi/glapi/gen/KHR_debug.xml
+++ b/src/mapi/glapi/gen/KHR_debug.xml
@@ -73,7 +73,7 @@
   <!-- Compatibility Profile -->
   <enum name="DISPLAY_LIST"                               value="0x82E7"/>
 
-  <function name="DebugMessageControl" offset="assign">
+  <function name="DebugMessageControl">
     <param name="source" type="GLenum"/>
     <param name="type" type="GLenum"/>
     <param name="severity" type="GLenum"/>
@@ -82,7 +82,7 @@
     <param name="enabled" type="GLboolean"/>
   </function>
 
-  <function name="DebugMessageInsert" offset="assign">
+  <function name="DebugMessageInsert">
     <param name="source" type="GLenum"/>
     <param name="type" type="GLenum"/>
     <param name="id" type="GLuint"/>
@@ -91,12 +91,12 @@
     <param name="buf" type="const GLchar *"/>
   </function>
 
-  <function name="DebugMessageCallback" offset="assign">
+  <function name="DebugMessageCallback">
     <param name="callback" type="GLDEBUGPROC"/>
     <param name="userParam" type="const GLvoid *"/>
   </function>
 
-  <function name="GetDebugMessageLog" offset="assign">
+  <function name="GetDebugMessageLog">
     <return type="GLuint"/>
     <param name="count" type="GLuint"/>
     <param name="bufsize" type="GLsizei"/>
@@ -108,23 +108,23 @@
     <param name="messageLog" type="GLchar *" output="true"/>
   </function>
 
-  <function name="PushDebugGroup" offset="assign">
+  <function name="PushDebugGroup">
     <param name="source" type="GLenum"/>
     <param name="id" type="GLuint"/>
     <param name="length" type="GLsizei"/>
     <param name="message" type="const GLchar *"/>
   </function>
 
-  <function name="PopDebugGroup" offset="assign"/>
+  <function name="PopDebugGroup" />
 
-  <function name="ObjectLabel" offset="assign">
+  <function name="ObjectLabel">
     <param name="identifier" type="GLenum"/>
     <param name="name" type="GLuint"/>
     <param name="length" type="GLsizei"/>
     <param name="label" type="const GLchar *"/>
   </function>
 
-  <function name="GetObjectLabel" offset="assign">
+  <function name="GetObjectLabel">
     <param name="identifier" type="GLenum"/>
     <param name="name" type="GLuint"/>
     <param name="bufSize" type="GLsizei"/>
@@ -132,13 +132,13 @@
     <param name="label" type="GLchar *"/>
   </function>
 
-  <function name="ObjectPtrLabel" offset="assign">
+  <function name="ObjectPtrLabel">
     <param name="ptr" type="const GLvoid *"/>
     <param name="length" type="GLsizei"/>
     <param name="label" type="const GLchar *"/>
   </function>
 
-  <function name="GetObjectPtrLabel" offset="assign">
+  <function name="GetObjectPtrLabel">
     <param name="ptr" type="const GLvoid *"/>
     <param name="bufSize" type="GLsizei"/>
     <param name="length" type="GLsizei *"/>
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index c8d4174..5b163b0 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -61,6 +61,7 @@ EXTRA_DIST= \
 	$(MESA_GLAPI_DIR)/glapi_x86-64.S \
 	$(MESA_GLAPI_DIR)/glapi_sparc.S \
 	$(COMMON_GLX) \
+	apiexec.py \
 	gl_apitemp.py \
 	gl_enums.py \
 	gl_genexec.py \
@@ -75,6 +76,7 @@ EXTRA_DIST= \
 	glX_proto_size.py \
 	glX_server_table.py \
 	remap_helper.py \
+	static_data.py \
 	SConscript \
 	gl_API.dtd
 
@@ -129,6 +131,7 @@ API_XML = \
 	ARB_draw_instanced.xml \
 	ARB_ES2_compatibility.xml \
 	ARB_ES3_compatibility.xml \
+	ARB_framebuffer_no_attachments.xml \
 	ARB_framebuffer_object.xml \
 	ARB_geometry_shader4.xml \
 	ARB_get_program_binary.xml \
@@ -140,6 +143,7 @@ API_XML = \
 	ARB_map_buffer_range.xml \
 	ARB_multi_bind.xml \
 	ARB_pipeline_statistics_query.xml \
+	ARB_program_interface_query.xml \
 	ARB_robustness.xml \
 	ARB_sample_shading.xml \
 	ARB_sampler_objects.xml \
@@ -197,6 +201,7 @@ COMMON = $(API_XML) \
 	gl_XML.py \
 	glX_XML.py \
 	license.py \
+	static_data.py \
 	typeexpr.py
 
 COMMON_GLX = $(COMMON) glX_API.xml glX_XML.py glX_proto_common.py
@@ -264,7 +269,7 @@ $(MESA_GLAPI_DIR)/glapi_sparc.S: gl_SPARC_asm.py $(COMMON)
 $(MESA_DIR)/main/enums.c: gl_enums.py $(COMMON)
 	$(PYTHON_GEN) $< -f $(srcdir)/gl_and_es_API.xml > $@
 
-$(MESA_DIR)/main/api_exec.c: gl_genexec.py $(COMMON)
+$(MESA_DIR)/main/api_exec.c: gl_genexec.py apiexec.py $(COMMON)
 	$(PYTHON_GEN) $< -f $(srcdir)/gl_and_es_API.xml > $@
 
 $(MESA_DIR)/main/dispatch.h: gl_table.py $(COMMON)
@@ -287,7 +292,7 @@ $(MESA_GLX_DIR)/indirect_init.c: glX_proto_send.py $(COMMON_GLX)
 
 $(MESA_GLX_DIR)/indirect_size.h $(XORG_GLX_DIR)/indirect_size.h: glX_proto_size.py $(COMMON_GLX)
 	$(PYTHON_GEN) $< -f $(srcdir)/gl_API.xml -m size_h --only-set \
-	    -h _INDIRECT_SIZE_H_ \
+	    --header-tag _INDIRECT_SIZE_H_ \
 	  | $(INDENT) $(INDENT_FLAGS) > $@
 
 $(MESA_GLX_DIR)/indirect_size.c: glX_proto_size.py $(COMMON_GLX)
diff --git a/src/mapi/glapi/gen/NV_primitive_restart.xml b/src/mapi/glapi/gen/NV_primitive_restart.xml
index 39edafe..2326652 100644
--- a/src/mapi/glapi/gen/NV_primitive_restart.xml
+++ b/src/mapi/glapi/gen/NV_primitive_restart.xml
@@ -11,9 +11,7 @@
     <enum name="PRIMITIVE_RESTART_NV"        value="0x8558"/>
     <enum name="PRIMITIVE_RESTART_INDEX_NV"  value="0x8559"/>
 
-    <function name="PrimitiveRestartNV" offset="assign" deprecated="3.1"
-              exec="dynamic">
-    </function>
+    <function name="PrimitiveRestartNV" deprecated="3.1" exec="dynamic"/>
 
     <function name="PrimitiveRestartIndexNV" alias="PrimitiveRestartIndex">
 	<param name="index" type="GLuint"/>
diff --git a/src/mapi/glapi/gen/NV_texture_barrier.xml b/src/mapi/glapi/gen/NV_texture_barrier.xml
index 52b1a3c..b4c361c 100644
--- a/src/mapi/glapi/gen/NV_texture_barrier.xml
+++ b/src/mapi/glapi/gen/NV_texture_barrier.xml
@@ -7,7 +7,7 @@
 <OpenGLAPI>
 
 <category name="GL_NV_texture_barrier" number="381">
-    <function name="TextureBarrierNV" offset="assign" />
+    <function name="TextureBarrierNV"/>
 </category>
 
 </OpenGLAPI>
diff --git a/src/mapi/glapi/gen/NV_vdpau_interop.xml b/src/mapi/glapi/gen/NV_vdpau_interop.xml
index 0b19e1a..ceef7bd 100644
--- a/src/mapi/glapi/gen/NV_vdpau_interop.xml
+++ b/src/mapi/glapi/gen/NV_vdpau_interop.xml
@@ -5,14 +5,14 @@
 
 <category name="GL_NV_vdpau_interop" number="396">
 
-    <function name="VDPAUInitNV" offset="assign">
+    <function name="VDPAUInitNV">
 	<param name="vdpDevice" type="const GLvoid *"/>
 	<param name="getProcAddress" type="const GLvoid *"/>
     </function>
 
-    <function name="VDPAUFiniNV" offset="assign"/>
+    <function name="VDPAUFiniNV"/>
 
-    <function name="VDPAURegisterVideoSurfaceNV" offset="assign">
+    <function name="VDPAURegisterVideoSurfaceNV">
         <return type="GLintptr"/>
 	<param name="vdpSurface" type="const GLvoid *"/>
 	<param name="target" type="GLenum"/>
@@ -20,7 +20,7 @@
 	<param name="textureNames" type="const GLuint *"/>
     </function>
 
-    <function name="VDPAURegisterOutputSurfaceNV" offset="assign">
+    <function name="VDPAURegisterOutputSurfaceNV">
         <return type="GLintptr"/>
 	<param name="vdpSurface" type="const GLvoid *"/>
 	<param name="target" type="GLenum"/>
@@ -28,16 +28,16 @@
 	<param name="textureNames" type="const GLuint *"/>
     </function>
 
-    <function name="VDPAUIsSurfaceNV" offset="assign">
+    <function name="VDPAUIsSurfaceNV">
         <return type="GLboolean"/>
 	<param name="surface" type="GLintptr"/>
     </function>
 
-    <function name="VDPAUUnregisterSurfaceNV" offset="assign">
+    <function name="VDPAUUnregisterSurfaceNV">
 	<param name="surface" type="GLintptr"/>
     </function>
 
-    <function name="VDPAUGetSurfaceivNV" offset="assign">
+    <function name="VDPAUGetSurfaceivNV">
 	<param name="surface" type="GLintptr"/>
 	<param name="pname" type="GLenum"/>
 	<param name="bufSize" type="GLsizei"/>
@@ -45,17 +45,17 @@
 	<param name="values" type="GLint *"/>
     </function>
 
-    <function name="VDPAUSurfaceAccessNV" offset="assign">
+    <function name="VDPAUSurfaceAccessNV">
 	<param name="surface" type="GLintptr"/>
 	<param name="access" type="GLenum"/>
     </function>
 
-    <function name="VDPAUMapSurfacesNV" offset="assign">
+    <function name="VDPAUMapSurfacesNV">
 	<param name="numSurfaces" type="GLsizei"/>
 	<param name="surfaces" type="const GLintptr *"/>
     </function>
 
-    <function name="VDPAUUnmapSurfacesNV" offset="assign">
+    <function name="VDPAUUnmapSurfacesNV">
 	<param name="numSurfaces" type="GLsizei"/>
 	<param name="surfaces" type="const GLintptr *"/>
     </function>
diff --git a/src/mapi/glapi/gen/OES_EGL_image.xml b/src/mapi/glapi/gen/OES_EGL_image.xml
index a995cad..c483e91 100644
--- a/src/mapi/glapi/gen/OES_EGL_image.xml
+++ b/src/mapi/glapi/gen/OES_EGL_image.xml
@@ -5,14 +5,12 @@
 
 <category name="GL_OES_EGL_image">
 
-    <function name="EGLImageTargetTexture2DOES" offset="assign" es1="1.0"
-              es2="2.0">
+    <function name="EGLImageTargetTexture2DOES" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="writeOffset" type="GLvoid *"/>
     </function>
 
-    <function name="EGLImageTargetRenderbufferStorageOES" offset="assign"
-              es1="1.0" es2="2.0">
+    <function name="EGLImageTargetRenderbufferStorageOES" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="writeOffset" type="GLvoid *"/>
     </function>
diff --git a/src/mapi/glapi/gen/OES_fixed_point.xml b/src/mapi/glapi/gen/OES_fixed_point.xml
index d62d6e2..edd0acd 100644
--- a/src/mapi/glapi/gen/OES_fixed_point.xml
+++ b/src/mapi/glapi/gen/OES_fixed_point.xml
@@ -13,52 +13,45 @@
     <type name="clampx"  size="4"                                    />
 
     <!-- OpenGL ES 1.0 -->
-    <function name="AlphaFuncxOES" static_dispatch="false"
-              es1="1.0" alias="AlphaFuncx">
+    <function name="AlphaFuncxOES" es1="1.0" alias="AlphaFuncx">
         <param name="func" type="GLenum"/>
         <param name="ref" type="GLclampx"/>
     </function>
 
-    <function name="ClearColorxOES" static_dispatch="false"
-              es1="1.0" alias="ClearColorx">
+    <function name="ClearColorxOES" es1="1.0" alias="ClearColorx">
         <param name="red" type="GLclampx"/>
         <param name="green" type="GLclampx"/>
         <param name="blue" type="GLclampx"/>
         <param name="alpha" type="GLclampx"/>
     </function>
 
-    <function name="ClearDepthxOES" static_dispatch="false"
-              es1="1.0" alias="ClearDepthx">
+    <function name="ClearDepthxOES" es1="1.0" alias="ClearDepthx">
         <param name="depth" type="GLclampx"/>
     </function>
 
-    <function name="Color4xOES" static_dispatch="false"
-              es1="1.0" alias="Color4x">
+    <function name="Color4xOES" es1="1.0" alias="Color4x">
         <param name="red" type="GLfixed"/>
         <param name="green" type="GLfixed"/>
         <param name="blue" type="GLfixed"/>
         <param name="alpha" type="GLfixed"/>
     </function>
 
-    <function name="DepthRangexOES" static_dispatch="false"
-              es1="1.0" alias="DepthRangex">
+    <function name="DepthRangexOES" es1="1.0" alias="DepthRangex">
         <param name="zNear" type="GLclampx"/>
         <param name="zFar" type="GLclampx"/>
     </function>
 
-    <function name="FogxOES" static_dispatch="false" es1="1.0" alias="Fogx">
+    <function name="FogxOES" es1="1.0" alias="Fogx">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="FogxvOES" static_dispatch="false"
-              es1="1.0" alias="Fogxv">
+    <function name="FogxvOES" es1="1.0" alias="Fogxv">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *"/>
     </function>
 
-    <function name="FrustumxOES" static_dispatch="false"
-              es1="1.0" alias="Frustumx">
+    <function name="FrustumxOES" es1="1.0" alias="Frustumx">
         <param name="left" type="GLfixed"/>
         <param name="right" type="GLfixed"/>
         <param name="bottom" type="GLfixed"/>
@@ -67,63 +60,53 @@
         <param name="zFar" type="GLfixed"/>
     </function>
 
-    <function name="LightModelxOES" static_dispatch="false"
-              es1="1.0" alias="LightModelx">
+    <function name="LightModelxOES" es1="1.0" alias="LightModelx">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="LightModelxvOES" static_dispatch="false"
-              es1="1.0" alias="LightModelxv">
+    <function name="LightModelxvOES" es1="1.0" alias="LightModelxv">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *"/>
     </function>
 
-    <function name="LightxOES" static_dispatch="false"
-              es1="1.0" alias="Lightx">
+    <function name="LightxOES" es1="1.0" alias="Lightx">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="LightxvOES" static_dispatch="false"
-              es1="1.0" alias="Lightxv">
+    <function name="LightxvOES" es1="1.0" alias="Lightxv">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *"/>
     </function>
 
-    <function name="LineWidthxOES" static_dispatch="false"
-              es1="1.0" alias="LineWidthx">
+    <function name="LineWidthxOES" es1="1.0" alias="LineWidthx">
         <param name="width" type="GLfixed"/>
     </function>
 
-    <function name="LoadMatrixxOES" static_dispatch="false"
-              es1="1.0" alias="LoadMatrixx">
+    <function name="LoadMatrixxOES" es1="1.0" alias="LoadMatrixx">
         <param name="m" type="const GLfixed *"/>
     </function>
 
-    <function name="MaterialxOES" static_dispatch="false"
-              es1="1.0" alias="Materialx">
+    <function name="MaterialxOES" es1="1.0" alias="Materialx">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="MaterialxvOES" static_dispatch="false"
-              es1="1.0" alias="Materialxv">
+    <function name="MaterialxvOES" es1="1.0" alias="Materialxv">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *"/>
     </function>
 
-    <function name="MultMatrixxOES" static_dispatch="false"
-              es1="1.0" alias="MultMatrixx">
+    <function name="MultMatrixxOES" es1="1.0" alias="MultMatrixx">
         <param name="m" type="const GLfixed *"/>
     </function>
 
-    <function name="MultiTexCoord4xOES" static_dispatch="false"
-              es1="1.0" alias="MultiTexCoord4x">
+    <function name="MultiTexCoord4xOES" es1="1.0" alias="MultiTexCoord4x">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLfixed"/>
         <param name="t" type="GLfixed"/>
@@ -131,15 +114,13 @@
         <param name="q" type="GLfixed"/>
     </function>
 
-    <function name="Normal3xOES" static_dispatch="false"
-              es1="1.0" alias="Normal3x">
+    <function name="Normal3xOES" es1="1.0" alias="Normal3x">
         <param name="nx" type="GLfixed"/>
         <param name="ny" type="GLfixed"/>
         <param name="nz" type="GLfixed"/>
     </function>
 
-    <function name="OrthoxOES" static_dispatch="false"
-              es1="1.0" alias="Orthox">
+    <function name="OrthoxOES" es1="1.0" alias="Orthox">
         <param name="left" type="GLfixed"/>
         <param name="right" type="GLfixed"/>
         <param name="bottom" type="GLfixed"/>
@@ -148,149 +129,129 @@
         <param name="zFar" type="GLfixed"/>
     </function>
 
-    <function name="PointSizexOES" static_dispatch="false"
-              es1="1.0" alias="PointSizex">
+    <function name="PointSizexOES" es1="1.0" alias="PointSizex">
         <param name="size" type="GLfixed"/>
     </function>
 
-    <function name="PolygonOffsetxOES" static_dispatch="false"
-              es1="1.0" alias="PolygonOffsetx">
+    <function name="PolygonOffsetxOES" es1="1.0" alias="PolygonOffsetx">
         <param name="factor" type="GLfixed"/>
         <param name="units" type="GLfixed"/>
     </function>
 
-    <function name="RotatexOES" static_dispatch="false"
-              es1="1.0" alias="Rotatex">
+    <function name="RotatexOES" es1="1.0" alias="Rotatex">
         <param name="angle" type="GLfixed"/>
         <param name="x" type="GLfixed"/>
         <param name="y" type="GLfixed"/>
         <param name="z" type="GLfixed"/>
     </function>
 
-    <function name="SampleCoveragexOES" static_dispatch="false"
-              es1="1.0" alias="SampleCoveragex">
+    <function name="SampleCoveragexOES" es1="1.0" alias="SampleCoveragex">
         <param name="value" type="GLclampx"/>
         <param name="invert" type="GLboolean"/>
     </function>
 
-    <function name="ScalexOES" static_dispatch="false"
-              es1="1.0" alias="Scalex">
+    <function name="ScalexOES" es1="1.0" alias="Scalex">
         <param name="x" type="GLfixed"/>
         <param name="y" type="GLfixed"/>
         <param name="z" type="GLfixed"/>
     </function>
 
-    <function name="TexEnvxOES" static_dispatch="false"
-              es1="1.0" alias="TexEnvx">
+    <function name="TexEnvxOES" es1="1.0" alias="TexEnvx">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="TexEnvxvOES" static_dispatch="false"
-              es1="1.0" alias="TexEnvxv">
+    <function name="TexEnvxvOES" es1="1.0" alias="TexEnvxv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *"/>
     </function>
 
-    <function name="TexParameterxOES" static_dispatch="false"
-              es1="1.0" alias="TexParameterx">
+    <function name="TexParameterxOES" es1="1.0" alias="TexParameterx">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="TranslatexOES" static_dispatch="false"
-              es1="1.0" alias="Translatex">
+    <function name="TranslatexOES" es1="1.0" alias="Translatex">
         <param name="x" type="GLfixed"/>
         <param name="y" type="GLfixed"/>
         <param name="z" type="GLfixed"/>
     </function>
 
     <!-- OpenGL ES 1.1 -->
-    <function name="ClipPlanexOES" static_dispatch="false"
-              es1="1.0" alias="ClipPlanex">
+    <function name="ClipPlanexOES" es1="1.0" alias="ClipPlanex">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="const GLfixed *"/>
     </function>
 
-    <function name="GetClipPlanexOES" static_dispatch="false"
+    <function name="GetClipPlanexOES"
               es1="1.0" desktop="false" alias="GetClipPlanex">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="GLfixed *"/>
     </function>
 
-    <function name="GetFixedvOES" static_dispatch="false"
-              es1="1.0" alias="GetFixedv">
+    <function name="GetFixedvOES" es1="1.0" alias="GetFixedv">
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *"/>
     </function>
 
-    <function name="GetLightxvOES" static_dispatch="false"
-              es1="1.0" alias="GetLightxv">
+    <function name="GetLightxvOES" es1="1.0" alias="GetLightxv">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *"/>
     </function>
 
-    <function name="GetMaterialxvOES" static_dispatch="false"
-              es1="1.0" alias="GetMaterialxv">
+    <function name="GetMaterialxvOES" es1="1.0" alias="GetMaterialxv">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *"/>
     </function>
 
-    <function name="GetTexEnvxvOES" static_dispatch="false"
-              es1="1.0" alias="GetTexEnvxv">
+    <function name="GetTexEnvxvOES" es1="1.0" alias="GetTexEnvxv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *"/>
     </function>
 
-    <function name="GetTexParameterxvOES"
-              static_dispatch="false" es1="1.0" alias="GetTexParameterxv">
+    <function name="GetTexParameterxvOES" es1="1.0" alias="GetTexParameterxv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *"/>
     </function>
 
-    <function name="PointParameterxOES" static_dispatch="false"
-              es1="1.0" alias="PointParameterx">
+    <function name="PointParameterxOES" es1="1.0" alias="PointParameterx">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="PointParameterxvOES"
-              static_dispatch="false" es1="1.0" alias="PointParameterxv">
+    <function name="PointParameterxvOES" es1="1.0" alias="PointParameterxv">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *"/>
     </function>
 
-    <function name="TexParameterxvOES" static_dispatch="false"
-              es1="1.0" alias="TexParameterxv">
+    <function name="TexParameterxvOES" es1="1.0" alias="TexParameterxv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *"/>
     </function>
 
     <!-- texgen -->
-    <function name="GetTexGenxvOES" offset="assign" static_dispatch="false"
+    <function name="GetTexGenxvOES"
               es1="1.0" desktop="false">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *" output="true" variable_param="pname"/>
     </function>
 
-    <function name="TexGenxOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="TexGenxOES" es1="1.0" desktop="false">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
     </function>
 
-    <function name="TexGenxvOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="TexGenxvOES" es1="1.0" desktop="false">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *" variable_param="pname"/>
diff --git a/src/mapi/glapi/gen/OES_single_precision.xml b/src/mapi/glapi/gen/OES_single_precision.xml
index 8346b64..c679504 100644
--- a/src/mapi/glapi/gen/OES_single_precision.xml
+++ b/src/mapi/glapi/gen/OES_single_precision.xml
@@ -6,31 +6,27 @@
 <OpenGLAPI>
 
 <category name="GL_OES_single_precision" number="293">
-    <function name="ClearDepthfOES" alias="ClearDepthf" static_dispatch="false"
-              es1="1.0">
+    <function name="ClearDepthfOES" alias="ClearDepthf" es1="1.0">
         <param name="depth" type="GLclampf"/>
     </function>
 
-    <function name="ClipPlanefOES" static_dispatch="false"
-              es1="1.0" alias="ClipPlanef">
+    <function name="ClipPlanefOES" es1="1.0" alias="ClipPlanef">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="const GLfloat *"/>
     </function>
 
-    <function name="DepthRangefOES" alias="DepthRangef" static_dispatch="false"
-              es1="1.0">
+    <function name="DepthRangefOES" alias="DepthRangef" es1="1.0">
         <param name="zNear" type="GLclampf"/>
         <param name="zFar" type="GLclampf"/>
     </function>
 
-    <function name="GetClipPlanefOES" static_dispatch="false"
+    <function name="GetClipPlanefOES"
               es1="1.0" desktop="false" alias="GetClipPlanef">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="GLfloat *"/>
     </function>
 
-    <function name="FrustumfOES" static_dispatch="false"
-              es1="1.0" alias="Frustumf">
+    <function name="FrustumfOES" es1="1.0" alias="Frustumf">
         <param name="left" type="GLfloat"/>
         <param name="right" type="GLfloat"/>
         <param name="bottom" type="GLfloat"/>
@@ -39,8 +35,7 @@
         <param name="zFar" type="GLfloat"/>
     </function>
 
-    <function name="OrthofOES" static_dispatch="false"
-              es1="1.0" alias="Orthof">
+    <function name="OrthofOES" es1="1.0" alias="Orthof">
         <param name="left" type="GLfloat"/>
         <param name="right" type="GLfloat"/>
         <param name="bottom" type="GLfloat"/>
diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
new file mode 100644
index 0000000..b623b44
--- /dev/null
+++ b/src/mapi/glapi/gen/apiexec.py
@@ -0,0 +1,245 @@
+# Copyright (C) 2015 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+class exec_info():
+    """Information relating GL APIs to a function.
+
+    Each of the four attributes of this class, compatibility, core, es1, and
+    es2, specifies the minimum API version where a function can possibly exist
+    in Mesa.  The version is specified as an integer of (real GL version *
+    10).  For example, glCreateProgram was added in OpenGL 2.0, so
+    compatibility=20 and core=31.
+
+    If the attribute is None, then it cannot be supported by that
+    API.  For example, glNewList was removed from core profiles, so
+    compatibility=10 and core=None.
+
+    Each of the attributes that is not None must have a valid value.  The
+    valid ranges are:
+
+        compatibility: [10, 30]
+        core: [31, )
+        es1: [10, 11]
+        es2: [20, )
+
+    These ranges are enforced by assert statements in the constructor.
+    """
+    def __init__(self, compatibility=None, core=None, es1=None, es2=None):
+        if compatibility is not None:
+            assert isinstance(compatibility, int)
+            assert compatibility >= 10
+            assert compatibility <= 30
+
+        if core is not None:
+            assert isinstance(core, int)
+            assert core >= 31
+
+        if es1 is not None:
+            assert isinstance(es1, int)
+            assert es1 == 10 or es1 == 11
+
+        if es2 is not None:
+            assert isinstance(es2, int)
+            assert es2 >= 20
+
+        self.compatibility = compatibility
+        self.core = core
+        self.es1 = es1
+        self.es2 = es2
+
+functions = {
+    # OpenGL 3.1 / GL_ARB_texture_buffer_object.  Mesa only exposes this
+    # extension with core profile.
+    "TexBuffer": exec_info(core=31),
+
+    # OpenGL 3.2 / GL_ARB_geometry_shader4.  Mesa does not support
+    # GL_ARB_geometry_shader4, so OpenGL 3.2 is required.
+    "FramebufferTexture": exec_info(core=32),
+
+    # OpenGL 4.0 / GL_ARB_gpu_shader_fp64.  The extension spec says:
+    #
+    #     "OpenGL 3.2 and GLSL 1.50 are required."
+    "Uniform1d": exec_info(core=32),
+    "Uniform2d": exec_info(core=32),
+    "Uniform3d": exec_info(core=32),
+    "Uniform4d": exec_info(core=32),
+    "Uniform1dv": exec_info(core=32),
+    "Uniform2dv": exec_info(core=32),
+    "Uniform3dv": exec_info(core=32),
+    "Uniform4dv": exec_info(core=32),
+    "UniformMatrix2dv": exec_info(core=32),
+    "UniformMatrix3dv": exec_info(core=32),
+    "UniformMatrix4dv": exec_info(core=32),
+    "UniformMatrix2x3dv": exec_info(core=32),
+    "UniformMatrix2x4dv": exec_info(core=32),
+    "UniformMatrix3x2dv": exec_info(core=32),
+    "UniformMatrix3x4dv": exec_info(core=32),
+    "UniformMatrix4x2dv": exec_info(core=32),
+    "UniformMatrix4x3dv": exec_info(core=32),
+    "GetUniformdv": exec_info(core=32),
+
+    # OpenGL 4.1 / GL_ARB_vertex_attrib_64bit.  The extension spec says:
+    #
+    #     "OpenGL 3.0 and GLSL 1.30 are required.
+    #
+    #     ARB_gpu_shader_fp64 (or equivalent functionality) is required."
+    #
+    # For Mesa this effectively means OpenGL 3.2 is required.  It seems
+    # unlikely that Mesa will ever get support for any of the NV extensions
+    # that add "equivalent functionality."
+    "VertexAttribL1d": exec_info(core=32),
+    "VertexAttribL2d": exec_info(core=32),
+    "VertexAttribL3d": exec_info(core=32),
+    "VertexAttribL4d": exec_info(core=32),
+    "VertexAttribL1dv": exec_info(core=32),
+    "VertexAttribL2dv": exec_info(core=32),
+    "VertexAttribL3dv": exec_info(core=32),
+    "VertexAttribL4dv": exec_info(core=32),
+    "VertexAttribLPointer": exec_info(core=32),
+    "GetVertexAttribLdv": exec_info(core=32),
+
+    # OpenGL 4.1 / GL_ARB_viewport_array.  The extension spec says:
+    #
+    #     "OpenGL 3.2 or the EXT_geometry_shader4 or ARB_geometry_shader4
+    #     extensions are required."
+    #
+    # Mesa does not support either of the geometry shader extensions, so
+    # OpenGL 3.2 is required.
+    "ViewportArrayv": exec_info(core=32),
+    "ViewportIndexedf": exec_info(core=32),
+    "ViewportIndexedfv": exec_info(core=32),
+    "ScissorArrayv": exec_info(core=32),
+    "ScissorIndexed": exec_info(core=32),
+    "ScissorIndexedv": exec_info(core=32),
+    "DepthRangeArrayv": exec_info(core=32),
+    "DepthRangeIndexed": exec_info(core=32),
+    # GetFloati_v also GL_ARB_shader_atomic_counters
+    # GetDoublei_v also GL_ARB_shader_atomic_counters
+
+    # OpenGL 4.3 / GL_ARB_texture_buffer_range.  Mesa can expose the extension
+    # with OpenGL 3.1.
+    "TexBufferRange": exec_info(core=31),
+
+    # OpenGL 4.3 / GL_ARB_framebuffer_no_attachments.  Mesa can expose the
+    # extension with OpenGL 3.0.
+    "FramebufferParameteri": exec_info(compatibility=30, core=31),
+    "GetFramebufferParameteriv": exec_info(compatibility=30, core=31),
+
+    # OpenGL 4.5 / GL_ARB_direct_state_access.   Mesa can expose the extension
+    # with core profile.
+    "CreateTransformFeedbacks": exec_info(core=31),
+    "TransformFeedbackBufferBase": exec_info(core=31),
+    "TransformFeedbackBufferRange": exec_info(core=31),
+    "GetTransformFeedbackiv": exec_info(core=31),
+    "GetTransformFeedbacki_v": exec_info(core=31),
+    "GetTransformFeedbacki64_v": exec_info(core=31),
+    "CreateBuffers": exec_info(core=31),
+    "NamedBufferStorage": exec_info(core=31),
+    "NamedBufferData": exec_info(core=31),
+    "NamedBufferSubData": exec_info(core=31),
+    "CopyNamedBufferSubData": exec_info(core=31),
+    "ClearNamedBufferData": exec_info(core=31),
+    "ClearNamedBufferSubData": exec_info(core=31),
+    "MapNamedBuffer": exec_info(core=31),
+    "MapNamedBufferRange": exec_info(core=31),
+    "UnmapNamedBuffer": exec_info(core=31),
+    "FlushMappedNamedBufferRange": exec_info(core=31),
+    "GetNamedBufferParameteriv": exec_info(core=31),
+    "GetNamedBufferParameteri64v": exec_info(core=31),
+    "GetNamedBufferPointerv": exec_info(core=31),
+    "GetNamedBufferSubData": exec_info(core=31),
+    "CreateFramebuffers": exec_info(core=31),
+    "NamedFramebufferRenderbuffer": exec_info(core=31),
+    "NamedFramebufferParameteri": exec_info(core=31),
+    "NamedFramebufferTexture": exec_info(core=31),
+    "NamedFramebufferTextureLayer": exec_info(core=31),
+    "NamedFramebufferDrawBuffer": exec_info(core=31),
+    "NamedFramebufferDrawBuffers": exec_info(core=31),
+    "NamedFramebufferReadBuffer": exec_info(core=31),
+    "InvalidateNamedFramebufferData": exec_info(core=31),
+    "InvalidateNamedFramebufferSubData": exec_info(core=31),
+    "ClearNamedFramebufferiv": exec_info(core=31),
+    "ClearNamedFramebufferuiv": exec_info(core=31),
+    "ClearNamedFramebufferfv": exec_info(core=31),
+    "ClearNamedFramebufferfi": exec_info(core=31),
+    "BlitNamedFramebuffer": exec_info(core=31),
+    "CheckNamedFramebufferStatus": exec_info(core=31),
+    "GetNamedFramebufferParameteriv": exec_info(core=31),
+    "GetNamedFramebufferAttachmentParameteriv": exec_info(core=31),
+    "CreateRenderbuffers": exec_info(core=31),
+    "NamedRenderbufferStorage": exec_info(core=31),
+    "NamedRenderbufferStorageMultisample": exec_info(core=31),
+    "GetNamedRenderbufferParameteriv": exec_info(core=31),
+    "CreateTextures": exec_info(core=31),
+    "TextureBuffer": exec_info(core=31),
+    "TextureBufferRange": exec_info(core=31),
+    "TextureStorage1D": exec_info(core=31),
+    "TextureStorage2D": exec_info(core=31),
+    "TextureStorage3D": exec_info(core=31),
+    "TextureStorage2DMultisample": exec_info(core=31),
+    "TextureStorage3DMultisample": exec_info(core=31),
+    "TextureSubImage1D": exec_info(core=31),
+    "TextureSubImage2D": exec_info(core=31),
+    "TextureSubImage3D": exec_info(core=31),
+    "CompressedTextureSubImage1D": exec_info(core=31),
+    "CompressedTextureSubImage2D": exec_info(core=31),
+    "CompressedTextureSubImage3D": exec_info(core=31),
+    "CopyTextureSubImage1D": exec_info(core=31),
+    "CopyTextureSubImage2D": exec_info(core=31),
+    "CopyTextureSubImage3D": exec_info(core=31),
+    "TextureParameterf": exec_info(core=31),
+    "TextureParameterfv": exec_info(core=31),
+    "TextureParameteri": exec_info(core=31),
+    "TextureParameterIiv": exec_info(core=31),
+    "TextureParameterIuiv": exec_info(core=31),
+    "TextureParameteriv": exec_info(core=31),
+    "GenerateTextureMipmap": exec_info(core=31),
+    "BindTextureUnit": exec_info(core=31),
+    "GetTextureImage": exec_info(core=31),
+    "GetCompressedTextureImage": exec_info(core=31),
+    "GetTextureLevelParameterfv": exec_info(core=31),
+    "GetTextureLevelParameteriv": exec_info(core=31),
+    "GetTextureParameterfv": exec_info(core=31),
+    "GetTextureParameterIiv": exec_info(core=31),
+    "GetTextureParameterIuiv": exec_info(core=31),
+    "GetTextureParameteriv": exec_info(core=31),
+    "CreateVertexArrays": exec_info(core=31),
+    "DisableVertexArrayAttrib": exec_info(core=31),
+    "EnableVertexArrayAttrib": exec_info(core=31),
+    "VertexArrayElementBuffer": exec_info(core=31),
+    "VertexArrayVertexBuffer": exec_info(core=31),
+    "VertexArrayVertexBuffers": exec_info(core=31),
+    "VertexArrayAttribFormat": exec_info(core=31),
+    "VertexArrayAttribIFormat": exec_info(core=31),
+    "VertexArrayAttribLFormat": exec_info(core=31),
+    "VertexArrayAttribBinding": exec_info(core=31),
+    "VertexArrayBindingDivisor": exec_info(core=31),
+    "GetVertexArrayiv": exec_info(core=31),
+    "GetVertexArrayIndexediv": exec_info(core=31),
+    "GetVertexArrayIndexed64iv": exec_info(core=31),
+    "CreateSamplers": exec_info(core=31),
+    "CreateProgramPipelines": exec_info(core=31),
+    "CreateQueries": exec_info(core=31),
+    "GetQueryBufferObjectiv": exec_info(core=31),
+    "GetQueryBufferObjectuiv": exec_info(core=31),
+    "GetQueryBufferObjecti64v": exec_info(core=31),
+    "GetQueryBufferObjectui64v": exec_info(core=31),
+}
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 3a2adeb..642e3b3 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -11,7 +11,7 @@
     <enum name="BLEND_EQUATION_ALPHA_OES"                 value="0x883D"/>
 
     <function name="BlendEquationSeparateOES" alias="BlendEquationSeparate"
-              static_dispatch="false" es1="1.0">
+	      es1="1.0">
         <param name="modeRGB" type="GLenum"/>
         <param name="modeA" type="GLenum"/>
     </function>
@@ -24,8 +24,7 @@
     <enum name="BLEND_DST_ALPHA_OES"                      value="0x80CA"/>
     <enum name="BLEND_SRC_ALPHA_OES"                      value="0x80CB"/>
 
-    <function name="BlendFuncSeparateOES" alias="BlendFuncSeparate"
-              static_dispatch="false" es1="1.0">
+    <function name="BlendFuncSeparateOES" alias="BlendFuncSeparate" es1="1.0">
         <param name="sfactorRGB" type="GLenum"/>
         <param name="dfactorRGB" type="GLenum"/>
         <param name="sfactorAlpha" type="GLenum"/>
@@ -40,8 +39,7 @@
     <enum name="FUNC_SUBTRACT_OES"                        value="0x800A"/>
     <enum name="FUNC_REVERSE_SUBTRACT_OES"                value="0x800B"/>
 
-    <function name="BlendEquationOES" alias="BlendEquation"
-              static_dispatch="false" es1="1.0">
+    <function name="BlendEquationOES" alias="BlendEquation" es1="1.0">
         <param name="mode" type="GLenum"/>
     </function>
 </category>
@@ -72,8 +70,7 @@
 <category name="GL_OES_draw_texture" number="7">
     <enum name="TEXTURE_CROP_RECT_OES"                    value="0x8B9D"/>
 
-    <function name="DrawTexiOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="DrawTexiOES" es1="1.0" desktop="false">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="z" type="GLint"/>
@@ -81,13 +78,11 @@
         <param name="height" type="GLint"/>
     </function>
 
-    <function name="DrawTexivOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="DrawTexivOES" es1="1.0" desktop="false">
         <param name="coords" type="const GLint *" count="5"/>
     </function>
 
-    <function name="DrawTexfOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="DrawTexfOES" es1="1.0" desktop="false">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
@@ -95,13 +90,11 @@
         <param name="height" type="GLfloat"/>
     </function>
 
-    <function name="DrawTexfvOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="DrawTexfvOES" es1="1.0" desktop="false">
         <param name="coords" type="const GLfloat *" count="5"/>
     </function>
 
-    <function name="DrawTexsOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="DrawTexsOES" es1="1.0" desktop="false">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <param name="z" type="GLshort"/>
@@ -109,13 +102,11 @@
         <param name="height" type="GLshort"/>
     </function>
 
-    <function name="DrawTexsvOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="DrawTexsvOES" es1="1.0" desktop="false">
         <param name="coords" type="const GLshort *" count="5"/>
     </function>
 
-    <function name="DrawTexxOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="DrawTexxOES" es1="1.0" desktop="false">
         <param name="x" type="GLfixed"/>
         <param name="y" type="GLfixed"/>
         <param name="z" type="GLfixed"/>
@@ -123,8 +114,7 @@
         <param name="height" type="GLfixed"/>
     </function>
 
-    <function name="DrawTexxvOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="DrawTexxvOES" es1="1.0" desktop="false">
         <param name="coords" type="const GLfixed *" count="5"/>
     </function>
 
@@ -177,40 +167,35 @@
     <enum name="RENDERBUFFER_STENCIL_SIZE_OES"            value="0x8D55"/>
     <enum name="RGB565_OES"                               value="0x8D62"/>
 
-    <function name="BindFramebufferOES" alias="BindFramebuffer"
-              static_dispatch="false" es1="1.0">
+    <function name="BindFramebufferOES" alias="BindFramebuffer" es1="1.0">
         <param name="target" type="GLenum"/>
         <param name="framebuffer" type="GLuint"/>
     </function>
 
-    <function name="BindRenderbufferOES" alias="BindRenderbuffer"
-              static_dispatch="false" es1="1.0">
+    <function name="BindRenderbufferOES" alias="BindRenderbuffer" es1="1.0">
         <param name="target" type="GLenum"/>
         <param name="renderbuffer" type="GLuint"/>
     </function>
 
     <function name="CheckFramebufferStatusOES"
-              alias="CheckFramebufferStatus" static_dispatch="false"
-              es1="1.0">
+              alias="CheckFramebufferStatus" es1="1.0">
         <param name="target" type="GLenum"/>
 	<return type="GLenum"/>
     </function>
 
-    <function name="DeleteFramebuffersOES" alias="DeleteFramebuffers"
-              static_dispatch="false" es1="1.0">
+    <function name="DeleteFramebuffersOES" alias="DeleteFramebuffers" es1="1.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="framebuffers" type="const GLuint *" count="n"/>
     </function>
 
     <function name="DeleteRenderbuffersOES" alias="DeleteRenderbuffers"
-              static_dispatch="false" es1="1.0">
+	      es1="1.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="renderbuffers" type="const GLuint *" count="n"/>
     </function>
 
     <function name="FramebufferRenderbufferOES"
-              alias="FramebufferRenderbuffer" static_dispatch="false"
-              es1="1.0">
+              alias="FramebufferRenderbuffer" es1="1.0">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="renderbuffertarget" type="GLenum"/>
@@ -218,7 +203,7 @@
     </function>
 
     <function name="FramebufferTexture2DOES" alias="FramebufferTexture2D"
-              static_dispatch="false" es1="1.0">
+	      es1="1.0">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="textarget" type="GLenum"/>
@@ -226,26 +211,22 @@
         <param name="level" type="GLint"/>
     </function>
 
-    <function name="GenerateMipmapOES" alias="GenerateMipmap"
-              static_dispatch="false" es1="1.0">
+    <function name="GenerateMipmapOES" alias="GenerateMipmap" es1="1.0">
         <param name="target" type="GLenum"/>
     </function>
 
-    <function name="GenFramebuffersOES" alias="GenFramebuffers"
-              static_dispatch="false" es1="1.0">
+    <function name="GenFramebuffersOES" alias="GenFramebuffers" es1="1.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="framebuffers" type="GLuint *" count="n" output="true"/>
     </function>
 
-    <function name="GenRenderbuffersOES" alias="GenRenderbuffers"
-              static_dispatch="false" es1="1.0">
+    <function name="GenRenderbuffersOES" alias="GenRenderbuffers" es1="1.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="renderbuffers" type="GLuint *" count="n" output="true"/>
     </function>
 
     <function name="GetFramebufferAttachmentParameterivOES"
-              alias="GetFramebufferAttachmentParameteriv"
-              static_dispatch="false" es1="1.0">
+              alias="GetFramebufferAttachmentParameteriv" es1="1.0">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="pname" type="GLenum"/>
@@ -253,27 +234,24 @@
     </function>
 
     <function name="GetRenderbufferParameterivOES"
-              alias="GetRenderbufferParameteriv" static_dispatch="false"
-              es1="1.0">
+              alias="GetRenderbufferParameteriv" es1="1.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true"/>
     </function>
 
-    <function name="IsFramebufferOES" alias="IsFramebuffer"
-              static_dispatch="false" es1="1.0">
+    <function name="IsFramebufferOES" alias="IsFramebuffer" es1="1.0">
         <param name="framebuffer" type="GLuint"/>
 	<return type="GLboolean"/>
     </function>
 
-    <function name="IsRenderbufferOES" alias="IsRenderbuffer"
-              static_dispatch="false" es1="1.0">
+    <function name="IsRenderbufferOES" alias="IsRenderbuffer" es1="1.0">
         <param name="renderbuffer" type="GLuint"/>
 	<return type="GLboolean"/>
     </function>
 
     <function name="RenderbufferStorageOES" alias="RenderbufferStorage"
-              static_dispatch="false" es1="1.0">
+	      es1="1.0">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -308,25 +286,23 @@
     <enum name="MATRIX_INDEX_ARRAY_BUFFER_BINDING_OES"    value="0x8B9E"/>
 
     <function name="CurrentPaletteMatrixOES" alias="CurrentPaletteMatrixARB"
-              static_dispatch="false" exec="skip">
+	      exec="skip">
         <param name="matrixpaletteindex" type="GLuint"/>
     </function>
 
     <!-- no offset -->
-    <function name="LoadPaletteFromModelViewMatrixOES" static_dispatch="false"
-              exec="skip">
+    <function name="LoadPaletteFromModelViewMatrixOES" exec="skip">
     </function>
 
     <function name="MatrixIndexPointerOES" alias="MatrixIndexPointerARB"
-              static_dispatch="false" exec="skip">
+	      exec="skip">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
     </function>
 
-    <function name="WeightPointerOES" alias="WeightPointerARB"
-              static_dispatch="false" exec="skip">
+    <function name="WeightPointerOES" alias="WeightPointerARB" exec="skip">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -342,8 +318,7 @@
     <enum name="POINT_SIZE_ARRAY_OES"                     value="0x8B9C"/>
     <enum name="POINT_SIZE_ARRAY_BUFFER_BINDING_OES"	  value="0x8B9F"/>
 
-    <function name="PointSizePointerOES" offset="assign"
-              static_dispatch="true" es1="1.0" desktop="false">
+    <function name="PointSizePointerOES" es1="1.0" desktop="false">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
@@ -358,8 +333,7 @@
 
 <!-- optional for es1.0 -->
 <category name="GL_OES_query_matrix" number="16">
-    <function name="QueryMatrixxOES" offset="assign" static_dispatch="false"
-              es1="1.0" desktop="false">
+    <function name="QueryMatrixxOES" es1="1.0" desktop="false">
         <param name="mantissa" type="GLfixed *" count="16" />
         <param name="exponent" type="GLint *" count="16" />
 	<return type="GLbitfield"/>
@@ -388,43 +362,37 @@
     <enum name="MAX_CUBE_MAP_TEXTURE_SIZE_OES"            value="0x851C"/>
     <enum name="TEXTURE_GEN_STR_OES"                      value="0x8D60"/>
 
-    <function name="GetTexGenfvOES" alias="GetTexGenfv" static_dispatch="false"
-              es1="1.0">
+    <function name="GetTexGenfvOES" alias="GetTexGenfv" es1="1.0">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
     </function>
 
-    <function name="GetTexGenivOES" alias="GetTexGeniv" static_dispatch="false"
-              es1="1.0">
+    <function name="GetTexGenivOES" alias="GetTexGeniv" es1="1.0">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
     </function>
 
-    <function name="TexGenfOES" alias="TexGenf" static_dispatch="false"
-              es1="1.0">
+    <function name="TexGenfOES" alias="TexGenf" es1="1.0">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
     </function>
 
-    <function name="TexGenfvOES" alias="TexGenfv" static_dispatch="false"
-              es1="1.0">
+    <function name="TexGenfvOES" alias="TexGenfv" es1="1.0">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
     </function>
 
-    <function name="TexGeniOES" alias="TexGeni" static_dispatch="false"
-              es1="1.0">
+    <function name="TexGeniOES" alias="TexGeni" es1="1.0">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
     </function>
 
-    <function name="TexGenivOES" alias="TexGeniv" static_dispatch="false"
-              es1="1.0">
+    <function name="TexGenivOES" alias="TexGeniv" es1="1.0">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
@@ -460,21 +428,19 @@
     <enum name="BUFFER_MAP_POINTER_OES"                   value="0x88BD"/>
 
     <function name="GetBufferPointervOES" alias="GetBufferPointerv"
-              static_dispatch="false" es1="1.0" es2="2.0">
+	      es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLvoid **"/>
     </function>
 
-    <function name="MapBufferOES" alias="MapBuffer" static_dispatch="false"
-              es1="1.0" es2="2.0">
+    <function name="MapBufferOES" alias="MapBuffer" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="access" type="GLenum"/>
 	<return type="GLvoid *"/>
     </function>
 
-    <function name="UnmapBufferOES" alias="UnmapBuffer"
-              static_dispatch="false" es1="1.0" es2="2.0">
+    <function name="UnmapBufferOES" alias="UnmapBuffer" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
 	<return type="GLboolean"/>
     </function>
@@ -506,7 +472,7 @@
     <enum name="FRAMEBUFFER_ATTACHMENT_TEXTURE_3D_ZOFFSET_OES" value="0x8CD4"/>
 
     <function name="CompressedTexImage3DOES" alias="CompressedTexImage3D"
-              static_dispatch="false" es2="2.0">
+	      es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -519,8 +485,7 @@
     </function>
 
     <function name="CompressedTexSubImage3DOES"
-              alias="CompressedTexSubImage3D" static_dispatch="false"
-              es2="2.0">
+              alias="CompressedTexSubImage3D" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -534,8 +499,7 @@
         <param name="data" type="const GLvoid *" count="imageSize"/>
     </function>
 
-    <function name="CopyTexSubImage3DOES" alias="CopyTexSubImage3D"
-              static_dispatch="false" es2="2.0">
+    <function name="CopyTexSubImage3DOES" alias="CopyTexSubImage3D" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -548,7 +512,7 @@
     </function>
 
     <function name="FramebufferTexture3DOES" alias="FramebufferTexture3D"
-              static_dispatch="false" es2="2.0">
+	      es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="attachment" type="GLenum"/>
         <param name="textarget" type="GLenum"/>
@@ -557,8 +521,7 @@
         <param name="zoffset" type="GLint"/>
     </function>
 
-    <function name="TexImage3DOES" alias="TexImage3D" static_dispatch="false"
-              es2="2.0">
+    <function name="TexImage3DOES" alias="TexImage3D" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -571,8 +534,7 @@
         <param name="pixels" type="const GLvoid *" img_width="width" img_height="height" img_depth="depth" img_format="format" img_type="type" img_target="target" img_null_flag="true" img_pad_dimensions="true"/>
     </function>
 
-    <function name="TexSubImage3DOES" alias="TexSubImage3D"
-              static_dispatch="false" es2="2.0">
+    <function name="TexSubImage3DOES" alias="TexSubImage3D" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -630,8 +592,7 @@
     <enum name="NUM_PROGRAM_BINARY_FORMATS_OES"           value="0x87FE"/>
     <enum name="PROGRAM_BINARY_FORMATS_OES"               value="0x87FF"/>
 
-    <function name="GetProgramBinaryOES" alias="GetProgramBinary"
-              static_dispatch="false" es2="2.0">
+    <function name="GetProgramBinaryOES" alias="GetProgramBinary" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="length" type="GLsizei *"/>
@@ -639,8 +600,7 @@
         <param name="binary" type="GLvoid *"/>
     </function>
 
-    <function name="ProgramBinaryOES" alias="ProgramBinary"
-              static_dispatch="false" es2="2.0">
+    <function name="ProgramBinaryOES" alias="ProgramBinary" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="binaryFormat" type="GLenum"/>
         <param name="binary" type="const GLvoid *"/>
@@ -666,8 +626,7 @@
 <!-- 64. GL_EXT_discard_framebuffer -->
 
 <category name="GL_EXT_discard_framebuffer" number="64">
-    <function name="DiscardFramebufferEXT" es1="1.0" es2="2.0"
-              offset="assign" static_dispatch="false" desktop="false">
+    <function name="DiscardFramebufferEXT" es1="1.0" es2="2.0" desktop="false">
         <param name="target" type="GLenum"/>
         <param name="numAttachments" type="GLsizei"/>
         <param name="attachments" type="const GLenum *" count="numAttachments"/>
@@ -689,25 +648,21 @@
 
 <!-- 71. GL_OES_vertex_array_object -->
 <category name="GL_OES_vertex_array_object" number="71">
-    <function name="BindVertexArrayOES" alias="BindVertexArray"
-              static_dispatch="false" es2="2.0">
+    <function name="BindVertexArrayOES" alias="BindVertexArray" es2="2.0">
         <param name="array" type="GLuint"/>
     </function>
 
-    <function name="DeleteVertexArraysOES" alias="DeleteVertexArrays"
-              static_dispatch="false" es2="2.0">
+    <function name="DeleteVertexArraysOES" alias="DeleteVertexArrays" es2="2.0">
         <param name="n" type="GLsizei"/>
         <param name="arrays" type="const GLuint *" count="n"/>
     </function>
 
-    <function name="GenVertexArraysOES" alias="GenVertexArrays"
-              static_dispatch="false" es2="2.0">
+    <function name="GenVertexArraysOES" alias="GenVertexArrays" es2="2.0">
         <param name="n" type="GLsizei"/>
         <param name="arrays" type="GLuint *" output="true" count="n"/>
     </function>
 
-    <function name="IsVertexArrayOES" alias="IsVertexArray"
-              static_dispatch="false" es2="2.0">
+    <function name="IsVertexArrayOES" alias="IsVertexArray" es2="2.0">
         <param name="array" type="GLuint"/>
         <return type="GLboolean"/>
     </function>
@@ -782,8 +737,7 @@
         <size name="Get" mode="get"/>
     </enum>
 
-    <function name="DrawBuffersNV" alias="DrawBuffers"
-              static_dispatch="false" es2="2.0">
+    <function name="DrawBuffersNV" alias="DrawBuffers" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="bufs" type="const GLenum *" count="n"/>
     </function>
@@ -791,8 +745,7 @@
 
 <!-- 93. GL_NV_read_buffer -->
 <category name="GL_NV_read_buffer">
-    <function name="ReadBufferNV" alias="ReadBuffer"
-              static_dispatch="false" es2="2.0">
+    <function name="ReadBufferNV" alias="ReadBuffer" es2="2.0">
         <param name="mode" type="GLenum"/>
     </function>
 </category>
@@ -821,7 +774,7 @@
     <enum name="MAP_UNSYNCHRONIZED_BIT_EXT"               value="0x0020"/>
 
     <function name="MapBufferRangeEXT" alias="MapBufferRange"
-              static_dispatch="false" es1="1.0" es2="2.0">
+	      es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="size" type="GLsizeiptr"/>
@@ -830,7 +783,7 @@
     </function>
 
     <function name="FlushMappedBufferRangeEXT" alias="FlushMappedBufferRange"
-              static_dispatch="false" es1="1.0" es2="2.0">
+	      es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="length" type="GLsizeiptr"/>
@@ -839,8 +792,7 @@
 
 <!-- 151. GL_EXT_draw_buffers -->
 <category name="GL_EXT_draw_buffers" number="151">
-    <function name="DrawBuffersEXT" alias="DrawBuffers"
-              static_dispatch="false" es2="2.0">
+    <function name="DrawBuffersEXT" alias="DrawBuffers" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="bufs" type="const GLenum *" count="n"/>
     </function>
diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py
index d076409..da468dc 100644
--- a/src/mapi/glapi/gen/glX_proto_recv.py
+++ b/src/mapi/glapi/gen/glX_proto_recv.py
@@ -25,8 +25,10 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+import string
+
 import gl_XML, glX_XML, glX_proto_common, license
-import sys, getopt, string
 
 
 class PrintGlxDispatch_h(gl_XML.gl_print_base):
@@ -524,31 +526,39 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
         return
 
 
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
-
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "f:m:s")
-    except Exception,e:
-        show_usage()
-
-    mode = "dispatch_c"
-    do_swap = 0
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
-        elif arg == "-m":
-            mode = val
-        elif arg == "-s":
-            do_swap = 1
-
-    if mode == "dispatch_c":
-        printer = PrintGlxDispatchFunctions(do_swap)
-    elif mode == "dispatch_h":
+def _parser():
+    """Parse any arguments passed and return a namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f',
+                        dest='filename',
+                        default='gl_API.xml',
+                        help='an xml file describing an OpenGL API')
+    parser.add_argument('-m',
+                        dest='mode',
+                        default='dispatch_c',
+                        choices=['dispatch_c', 'dispatch_h'],
+                        help='what file to generate')
+    parser.add_argument('-s',
+                        dest='swap',
+                        action='store_true',
+                        help='emit swap in GlXDispatchFunctions')
+    return parser.parse_args()
+
+
+def main():
+    """Build the printer selected by -m and print the parsed API with it."""
+    args = _parser()
+    # -m is stored under dest='mode'; its choices limit it to the two cases below.
+    if args.mode == "dispatch_c":
+        printer = PrintGlxDispatchFunctions(args.swap)
+    elif args.mode == "dispatch_h":
         printer = PrintGlxDispatch_h()
-    else:
-        show_usage()
 
-    api = gl_XML.parse_GL_API( file_name, glX_proto_common.glx_proto_item_factory() )
+    api = gl_XML.parse_GL_API(
+        args.filename, glX_proto_common.glx_proto_item_factory())
+
+    printer.Print(api)
 
-    printer.Print( api )
+
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/glX_proto_send.py b/src/mapi/glapi/gen/glX_proto_send.py
index b93989f..2b33030 100644
--- a/src/mapi/glapi/gen/glX_proto_send.py
+++ b/src/mapi/glapi/gen/glX_proto_send.py
@@ -2,6 +2,7 @@
 
 # (C) Copyright IBM Corporation 2004, 2005
 # All Rights Reserved.
+# Copyright (c) 2015 Intel Corporation
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -26,8 +27,10 @@
 #    Ian Romanick <idr@us.ibm.com>
 #    Jeremy Kolb <jkolb@brandeis.edu>
 
+import argparse
+
 import gl_XML, glX_XML, glX_proto_common, license
-import sys, getopt, copy, string
+import copy, string
 
 def convertStringForXCB(str):
     tmp = ""
@@ -1085,42 +1088,41 @@ extern _X_HIDDEN NOINLINE FASTCALL GLubyte * __glXSetupVendorRequest(
         print '#endif'
 
 
-def show_usage():
-    print "Usage: %s [-f input_file_name] [-m output_mode] [-d]" % sys.argv[0]
-    print "    -m output_mode   Output mode can be one of 'proto', 'init_c' or 'init_h'."
-    print "    -d               Enable extra debug information in the generated code."
-    sys.exit(1)
-
-
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
-
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "f:m:d")
-    except Exception,e:
-        show_usage()
-
-    debug = 0
-    mode = "proto"
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
-        elif arg == "-m":
-            mode = val
-        elif arg == "-d":
-            debug = 1
-
-    if mode == "proto":
+def _parser():
+    """Parse input and return a parsed namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f',
+                        default='gl_API.xml',
+                        dest='filename',
+                        help='An XML file describing an API')
+    parser.add_argument('-m',
+                        required=True,  # NOTE(review): getopt version defaulted to "proto"; confirm every caller passes -m
+                        dest='mode',
+                        choices=frozenset(['proto', 'init_c', 'init_h']),
+                        help='which file to generate')
+    parser.add_argument('-d',
+                        action='store_true',
+                        dest='debug',
+                        help='turn debug mode on.')
+    return parser.parse_args()
+
+
+def main():
+    """Main function."""
+    args = _parser()
+
+    if args.mode == "proto":
         printer = PrintGlxProtoStubs()
-    elif mode == "init_c":
+    elif args.mode == "init_c":
         printer = PrintGlxProtoInit_c()
-    elif mode == "init_h":
+    elif args.mode == "init_h":
         printer = PrintGlxProtoInit_h()
-    else:
-        show_usage()
 
-
-    printer.debug = debug
-    api = gl_XML.parse_GL_API( file_name, glX_XML.glx_item_factory() )
+    printer.debug = args.debug
+    api = gl_XML.parse_GL_API(args.filename, glX_XML.glx_item_factory())
 
     printer.Print( api )
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/glX_proto_size.py b/src/mapi/glapi/gen/glX_proto_size.py
index 4737fbf..75fc26f 100644
--- a/src/mapi/glapi/gen/glX_proto_size.py
+++ b/src/mapi/glapi/gen/glX_proto_size.py
@@ -25,9 +25,11 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+import sys, string
+
 import gl_XML, glX_XML
 import license
-import sys, getopt, copy, string
 
 
 class glx_enum_function(object):
@@ -650,54 +652,57 @@ class PrintGlxReqSize_c(PrintGlxReqSize_common):
         return alias
 
 
-def show_usage():
-    print "Usage: %s [-f input_file_name] -m output_mode [--only-get | --only-set] [--get-alias-set]" % sys.argv[0]
-    print "    -m output_mode   Output mode can be one of 'size_c' or 'size_h'."
-    print "    --only-get       Only emit 'get'-type functions."
-    print "    --only-set       Only emit 'set'-type functions."
-    print ""
-    print "By default, both 'get' and 'set'-type functions are emitted."
-    sys.exit(1)
-
-
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
-
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "f:m:h:", ["only-get", "only-set", "header-tag"])
-    except Exception,e:
-        show_usage()
-
-    mode = None
-    header_tag = None
-    which_functions = PrintGlxSizeStubs_common.do_get | PrintGlxSizeStubs_common.do_set
-
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
-        elif arg == "-m":
-            mode = val
-        elif arg == "--only-get":
-            which_functions = PrintGlxSizeStubs_common.do_get
-        elif arg == "--only-set":
-            which_functions = PrintGlxSizeStubs_common.do_set
-        elif (arg == '-h') or (arg == "--header-tag"):
-            header_tag = val
-
-    if mode == "size_c":
-        printer = PrintGlxSizeStubs_c( which_functions )
-    elif mode == "size_h":
-        printer = PrintGlxSizeStubs_h( which_functions )
-        if header_tag:
-            printer.header_tag = header_tag
-    elif mode == "reqsize_c":
+def _parser():
+    """Parse the command-line arguments and return the argparse namespace."""
+    parser = argparse.ArgumentParser()
+    parser.set_defaults(which_functions=(PrintGlxSizeStubs_common.do_get |
+                                         PrintGlxSizeStubs_common.do_set))
+    parser.add_argument('-f',
+                        dest='filename',
+                        default='gl_API.xml',
+                        help='an XML file describing an OpenGL API.')
+    parser.add_argument('-m',
+                        dest='mode',
+                        required=True,  # main() has no fallback printer when the mode is missing
+                        choices=['size_c', 'size_h', 'reqsize_c', 'reqsize_h'],
+                        help='Which file to generate')
+    getset = parser.add_mutually_exclusive_group()
+    getset.add_argument('--only-get',
+                        dest='which_functions',
+                        action='store_const',
+                        const=PrintGlxSizeStubs_common.do_get,
+                        help='only emit "get-type" functions')
+    getset.add_argument('--only-set',
+                        dest='which_functions',
+                        action='store_const',
+                        const=PrintGlxSizeStubs_common.do_set,
+                        help='only emit "set-type" functions')
+    parser.add_argument('--header-tag',
+                        dest='header_tag',
+                        default=None,
+                        help='set header tag value')
+    return parser.parse_args()
+
+
+def main():
+    """Main function."""
+    args = _parser()
+
+    if args.mode == "size_c":
+        printer = PrintGlxSizeStubs_c(args.which_functions)
+    elif args.mode == "size_h":
+        printer = PrintGlxSizeStubs_h(args.which_functions)
+        if args.header_tag is not None:
+            printer.header_tag = args.header_tag
+    elif args.mode == "reqsize_c":
         printer = PrintGlxReqSize_c()
-    elif mode == "reqsize_h":
+    elif args.mode == "reqsize_h":
         printer = PrintGlxReqSize_h()
-    else:
-        show_usage()
 
-    api = gl_XML.parse_GL_API( file_name, glX_XML.glx_item_factory() )
+    api = gl_XML.parse_GL_API(args.filename, glX_XML.glx_item_factory())
+
+    printer.Print(api)
 
 
-    printer.Print( api )
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/glX_server_table.py b/src/mapi/glapi/gen/glX_server_table.py
index 47aa111..2d21f4e 100644
--- a/src/mapi/glapi/gen/glX_server_table.py
+++ b/src/mapi/glapi/gen/glX_server_table.py
@@ -25,8 +25,9 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+
 import gl_XML, glX_XML, glX_proto_common, license
-import sys, getopt
 
 
 def log2(value):
@@ -383,28 +384,19 @@ class PrintGlxDispatchTables(glX_proto_common.glx_print_proto):
         return
 
 
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
-
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "f:m")
-    except Exception,e:
-        show_usage()
-
-    mode = "table_c"
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
-        elif arg == "-m":
-            mode = val
+def _parser():
+    """Parse arguments and return namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f',
+                        dest='filename',
+                        default='gl_API.xml',
+                        help='An XML file describing an API.')
+    return parser.parse_args()
 
-    if mode == "table_c":
-        printer = PrintGlxDispatchTables()
-    else:
-        show_usage()
-
-
-    api = gl_XML.parse_GL_API( file_name, glX_XML.glx_item_factory() )
 
+if __name__ == '__main__':
+    args = _parser()
+    printer = PrintGlxDispatchTables()
+    api = gl_XML.parse_GL_API(args.filename, glX_XML.glx_item_factory())
 
-    printer.Print( api )
+    printer.Print(api)
diff --git a/src/mapi/glapi/gen/gl_API.dtd b/src/mapi/glapi/gen/gl_API.dtd
index ab321fa..bdc62f1 100644
--- a/src/mapi/glapi/gen/gl_API.dtd
+++ b/src/mapi/glapi/gen/gl_API.dtd
@@ -33,8 +33,6 @@
                    value               NMTOKEN #REQUIRED>
 <!ATTLIST function name                NMTOKEN #REQUIRED
                    alias               NMTOKEN #IMPLIED
-                   offset              CDATA   #IMPLIED
-                   static_dispatch     (true | false) "true"
                    vectorequiv         NMTOKEN #IMPLIED
                    es1                 CDATA   "none"
                    es2                 CDATA   "none"
diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml
index d156598..2f33075 100644
--- a/src/mapi/glapi/gen/gl_API.xml
+++ b/src/mapi/glapi/gen/gl_API.xml
@@ -1109,51 +1109,51 @@
     <type name="DEBUGPROCARB" size="4" pointer="true"/>
     <type name="DEBUGPROC" size="4" pointer="true"/>
 
-    <function name="NewList" offset="0" deprecated="3.1">
+    <function name="NewList" deprecated="3.1">
         <param name="list" type="GLuint"/>
         <param name="mode" type="GLenum"/>
         <glx sop="101"/>
     </function>
 
-    <function name="EndList" offset="1" deprecated="3.1">
+    <function name="EndList" deprecated="3.1">
         <glx sop="102"/>
     </function>
 
-    <function name="CallList" offset="2" deprecated="3.1">
+    <function name="CallList" deprecated="3.1">
         <param name="list" type="GLuint"/>
         <glx rop="1"/>
     </function>
 
-    <function name="CallLists" offset="3" deprecated="3.1">
+    <function name="CallLists" deprecated="3.1">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="type" type="GLenum"/>
         <param name="lists" type="const GLvoid *" variable_param="type" count="n"/>
         <glx rop="2" large="true"/>
     </function>
 
-    <function name="DeleteLists" offset="4" deprecated="3.1">
+    <function name="DeleteLists" deprecated="3.1">
         <param name="list" type="GLuint"/>
         <param name="range" type="GLsizei"/>
         <glx sop="103"/>
     </function>
 
-    <function name="GenLists" offset="5" deprecated="3.1">
+    <function name="GenLists" deprecated="3.1">
         <param name="range" type="GLsizei"/>
         <return type="GLuint"/>
         <glx sop="104"/>
     </function>
 
-    <function name="ListBase" offset="6" deprecated="3.1">
+    <function name="ListBase" deprecated="3.1">
         <param name="base" type="GLuint"/>
         <glx rop="3"/>
     </function>
 
-    <function name="Begin" offset="7" deprecated="3.1" exec="dynamic">
+    <function name="Begin" deprecated="3.1" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <glx rop="4"/>
     </function>
 
-    <function name="Bitmap" offset="8" deprecated="3.1">
+    <function name="Bitmap" deprecated="3.1">
         <param name="width" type="GLsizei"/>
         <param name="height" type="GLsizei"/>
         <param name="xorig" type="GLfloat"/>
@@ -1164,129 +1164,120 @@
         <glx rop="5" large="true"/>
     </function>
 
-    <function name="Color3b" offset="9" vectorequiv="Color3bv"
-              deprecated="3.1">
+    <function name="Color3b" vectorequiv="Color3bv" deprecated="3.1">
         <param name="red" type="GLbyte"/>
         <param name="green" type="GLbyte"/>
         <param name="blue" type="GLbyte"/>
     </function>
 
-    <function name="Color3bv" offset="10" deprecated="3.1">
+    <function name="Color3bv" deprecated="3.1">
         <param name="v" type="const GLbyte *" count="3"/>
         <glx rop="6"/>
     </function>
 
-    <function name="Color3d" offset="11" vectorequiv="Color3dv"
-              deprecated="3.1">
+    <function name="Color3d" vectorequiv="Color3dv" deprecated="3.1">
         <param name="red" type="GLdouble"/>
         <param name="green" type="GLdouble"/>
         <param name="blue" type="GLdouble"/>
     </function>
 
-    <function name="Color3dv" offset="12" deprecated="3.1">
+    <function name="Color3dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="3"/>
         <glx rop="7"/>
     </function>
 
-    <function name="Color3f" offset="13" vectorequiv="Color3fv"
-              deprecated="3.1" exec="dynamic">
+    <function name="Color3f" vectorequiv="Color3fv"
+	      deprecated="3.1" exec="dynamic">
         <param name="red" type="GLfloat"/>
         <param name="green" type="GLfloat"/>
         <param name="blue" type="GLfloat"/>
     </function>
 
-    <function name="Color3fv" offset="14" deprecated="3.1" exec="dynamic">
+    <function name="Color3fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="8"/>
     </function>
 
-    <function name="Color3i" offset="15" vectorequiv="Color3iv"
-              deprecated="3.1">
+    <function name="Color3i" vectorequiv="Color3iv" deprecated="3.1">
         <param name="red" type="GLint"/>
         <param name="green" type="GLint"/>
         <param name="blue" type="GLint"/>
     </function>
 
-    <function name="Color3iv" offset="16" deprecated="3.1">
+    <function name="Color3iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="3"/>
         <glx rop="9"/>
     </function>
 
-    <function name="Color3s" offset="17" vectorequiv="Color3sv"
-              deprecated="3.1">
+    <function name="Color3s" vectorequiv="Color3sv" deprecated="3.1">
         <param name="red" type="GLshort"/>
         <param name="green" type="GLshort"/>
         <param name="blue" type="GLshort"/>
     </function>
 
-    <function name="Color3sv" offset="18" deprecated="3.1">
+    <function name="Color3sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="3"/>
         <glx rop="10"/>
     </function>
 
-    <function name="Color3ub" offset="19" vectorequiv="Color3ubv"
-              deprecated="3.1">
+    <function name="Color3ub" vectorequiv="Color3ubv" deprecated="3.1">
         <param name="red" type="GLubyte"/>
         <param name="green" type="GLubyte"/>
         <param name="blue" type="GLubyte"/>
     </function>
 
-    <function name="Color3ubv" offset="20" deprecated="3.1">
+    <function name="Color3ubv" deprecated="3.1">
         <param name="v" type="const GLubyte *" count="3"/>
         <glx rop="11"/>
     </function>
 
-    <function name="Color3ui" offset="21" vectorequiv="Color3uiv"
-              deprecated="3.1">
+    <function name="Color3ui" vectorequiv="Color3uiv" deprecated="3.1">
         <param name="red" type="GLuint"/>
         <param name="green" type="GLuint"/>
         <param name="blue" type="GLuint"/>
     </function>
 
-    <function name="Color3uiv" offset="22" deprecated="3.1">
+    <function name="Color3uiv" deprecated="3.1">
         <param name="v" type="const GLuint *" count="3"/>
         <glx rop="12"/>
     </function>
 
-    <function name="Color3us" offset="23" vectorequiv="Color3usv"
-              deprecated="3.1">
+    <function name="Color3us" vectorequiv="Color3usv" deprecated="3.1">
         <param name="red" type="GLushort"/>
         <param name="green" type="GLushort"/>
         <param name="blue" type="GLushort"/>
     </function>
 
-    <function name="Color3usv" offset="24" deprecated="3.1">
+    <function name="Color3usv" deprecated="3.1">
         <param name="v" type="const GLushort *" count="3"/>
         <glx rop="13"/>
     </function>
 
-    <function name="Color4b" offset="25" vectorequiv="Color4bv"
-              deprecated="3.1">
+    <function name="Color4b" vectorequiv="Color4bv" deprecated="3.1">
         <param name="red" type="GLbyte"/>
         <param name="green" type="GLbyte"/>
         <param name="blue" type="GLbyte"/>
         <param name="alpha" type="GLbyte"/>
     </function>
 
-    <function name="Color4bv" offset="26" deprecated="3.1">
+    <function name="Color4bv" deprecated="3.1">
         <param name="v" type="const GLbyte *" count="4"/>
         <glx rop="14"/>
     </function>
 
-    <function name="Color4d" offset="27" vectorequiv="Color4dv"
-              deprecated="3.1">
+    <function name="Color4d" vectorequiv="Color4dv" deprecated="3.1">
         <param name="red" type="GLdouble"/>
         <param name="green" type="GLdouble"/>
         <param name="blue" type="GLdouble"/>
         <param name="alpha" type="GLdouble"/>
     </function>
 
-    <function name="Color4dv" offset="28" deprecated="3.1">
+    <function name="Color4dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="4"/>
         <glx rop="15"/>
     </function>
 
-    <function name="Color4f" offset="29" vectorequiv="Color4fv" es1="1.0"
+    <function name="Color4f" vectorequiv="Color4fv" es1="1.0"
               deprecated="3.1" exec="dynamic">
         <param name="red" type="GLfloat"/>
         <param name="green" type="GLfloat"/>
@@ -1294,38 +1285,36 @@
         <param name="alpha" type="GLfloat"/>
     </function>
 
-    <function name="Color4fv" offset="30" deprecated="3.1" exec="dynamic">
+    <function name="Color4fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="4"/>
         <glx rop="16"/>
     </function>
 
-    <function name="Color4i" offset="31" vectorequiv="Color4iv"
-              deprecated="3.1">
+    <function name="Color4i" vectorequiv="Color4iv" deprecated="3.1">
         <param name="red" type="GLint"/>
         <param name="green" type="GLint"/>
         <param name="blue" type="GLint"/>
         <param name="alpha" type="GLint"/>
     </function>
 
-    <function name="Color4iv" offset="32" deprecated="3.1">
+    <function name="Color4iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="4"/>
         <glx rop="17"/>
     </function>
 
-    <function name="Color4s" offset="33" vectorequiv="Color4sv"
-              deprecated="3.1">
+    <function name="Color4s" vectorequiv="Color4sv" deprecated="3.1">
         <param name="red" type="GLshort"/>
         <param name="green" type="GLshort"/>
         <param name="blue" type="GLshort"/>
         <param name="alpha" type="GLshort"/>
     </function>
 
-    <function name="Color4sv" offset="34" deprecated="3.1">
+    <function name="Color4sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="4"/>
         <glx rop="18"/>
     </function>
 
-    <function name="Color4ub" offset="35" vectorequiv="Color4ubv" es1="1.1"
+    <function name="Color4ub" vectorequiv="Color4ubv" es1="1.1"
               deprecated="3.1">
         <param name="red" type="GLubyte"/>
         <param name="green" type="GLubyte"/>
@@ -1333,494 +1322,462 @@
         <param name="alpha" type="GLubyte"/>
     </function>
 
-    <function name="Color4ubv" offset="36" deprecated="3.1">
+    <function name="Color4ubv" deprecated="3.1">
         <param name="v" type="const GLubyte *" count="4"/>
         <glx rop="19"/>
     </function>
 
-    <function name="Color4ui" offset="37" vectorequiv="Color4uiv"
-              deprecated="3.1">
+    <function name="Color4ui" vectorequiv="Color4uiv" deprecated="3.1">
         <param name="red" type="GLuint"/>
         <param name="green" type="GLuint"/>
         <param name="blue" type="GLuint"/>
         <param name="alpha" type="GLuint"/>
     </function>
 
-    <function name="Color4uiv" offset="38" deprecated="3.1">
+    <function name="Color4uiv" deprecated="3.1">
         <param name="v" type="const GLuint *" count="4"/>
         <glx rop="20"/>
     </function>
 
-    <function name="Color4us" offset="39" vectorequiv="Color4usv"
-              deprecated="3.1">
+    <function name="Color4us" vectorequiv="Color4usv" deprecated="3.1">
         <param name="red" type="GLushort"/>
         <param name="green" type="GLushort"/>
         <param name="blue" type="GLushort"/>
         <param name="alpha" type="GLushort"/>
     </function>
 
-    <function name="Color4usv" offset="40" deprecated="3.1">
+    <function name="Color4usv" deprecated="3.1">
         <param name="v" type="const GLushort *" count="4"/>
         <glx rop="21"/>
     </function>
 
-    <function name="EdgeFlag" offset="41" vectorequiv="EdgeFlagv"
+    <function name="EdgeFlag" vectorequiv="EdgeFlagv"
               deprecated="3.1" exec="dynamic">
         <param name="flag" type="GLboolean"/>
     </function>
 
-    <function name="EdgeFlagv" offset="42" deprecated="3.1">
+    <function name="EdgeFlagv" deprecated="3.1">
         <param name="flag" type="const GLboolean *" count="1"/>
         <glx rop="22"/>
     </function>
 
-    <function name="End" offset="43" deprecated="3.1" exec="dynamic">
+    <function name="End" deprecated="3.1" exec="dynamic">
         <glx rop="23"/>
     </function>
 
-    <function name="Indexd" offset="44" vectorequiv="Indexdv" deprecated="3.1">
+    <function name="Indexd" vectorequiv="Indexdv" deprecated="3.1">
         <param name="c" type="GLdouble"/>
     </function>
 
-    <function name="Indexdv" offset="45" deprecated="3.1">
+    <function name="Indexdv" deprecated="3.1">
         <param name="c" type="const GLdouble *" count="1"/>
         <glx rop="24"/>
     </function>
 
-    <function name="Indexf" offset="46" vectorequiv="Indexfv" deprecated="3.1"
+    <function name="Indexf" vectorequiv="Indexfv" deprecated="3.1"
               exec="dynamic">
         <param name="c" type="GLfloat"/>
     </function>
 
-    <function name="Indexfv" offset="47" deprecated="3.1" exec="dynamic">
+    <function name="Indexfv" deprecated="3.1" exec="dynamic">
         <param name="c" type="const GLfloat *" count="1"/>
         <glx rop="25"/>
     </function>
 
-    <function name="Indexi" offset="48" vectorequiv="Indexiv" deprecated="3.1">
+    <function name="Indexi" vectorequiv="Indexiv" deprecated="3.1">
         <param name="c" type="GLint"/>
     </function>
 
-    <function name="Indexiv" offset="49" deprecated="3.1">
+    <function name="Indexiv" deprecated="3.1">
         <param name="c" type="const GLint *" count="1"/>
         <glx rop="26"/>
     </function>
 
-    <function name="Indexs" offset="50" vectorequiv="Indexsv" deprecated="3.1">
+    <function name="Indexs" vectorequiv="Indexsv" deprecated="3.1">
         <param name="c" type="GLshort"/>
     </function>
 
-    <function name="Indexsv" offset="51" deprecated="3.1">
+    <function name="Indexsv" deprecated="3.1">
         <param name="c" type="const GLshort *" count="1"/>
         <glx rop="27"/>
     </function>
 
-    <function name="Normal3b" offset="52" vectorequiv="Normal3bv"
-              deprecated="3.1">
+    <function name="Normal3b" vectorequiv="Normal3bv" deprecated="3.1">
         <param name="nx" type="GLbyte"/>
         <param name="ny" type="GLbyte"/>
         <param name="nz" type="GLbyte"/>
     </function>
 
-    <function name="Normal3bv" offset="53" deprecated="3.1">
+    <function name="Normal3bv" deprecated="3.1">
         <param name="v" type="const GLbyte *" count="3"/>
         <glx rop="28"/>
     </function>
 
-    <function name="Normal3d" offset="54" vectorequiv="Normal3dv"
-              deprecated="3.1">
+    <function name="Normal3d" vectorequiv="Normal3dv" deprecated="3.1">
         <param name="nx" type="GLdouble"/>
         <param name="ny" type="GLdouble"/>
         <param name="nz" type="GLdouble"/>
     </function>
 
-    <function name="Normal3dv" offset="55" deprecated="3.1">
+    <function name="Normal3dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="3"/>
         <glx rop="29"/>
     </function>
 
-    <function name="Normal3f" offset="56" vectorequiv="Normal3fv" es1="1.0"
+    <function name="Normal3f" vectorequiv="Normal3fv" es1="1.0"
               deprecated="3.1" exec="dynamic">
         <param name="nx" type="GLfloat"/>
         <param name="ny" type="GLfloat"/>
         <param name="nz" type="GLfloat"/>
     </function>
 
-    <function name="Normal3fv" offset="57" deprecated="3.1" exec="dynamic">
+    <function name="Normal3fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="30"/>
     </function>
 
-    <function name="Normal3i" offset="58" vectorequiv="Normal3iv"
-              deprecated="3.1">
+    <function name="Normal3i" vectorequiv="Normal3iv" deprecated="3.1">
         <param name="nx" type="GLint"/>
         <param name="ny" type="GLint"/>
         <param name="nz" type="GLint"/>
     </function>
 
-    <function name="Normal3iv" offset="59" deprecated="3.1">
+    <function name="Normal3iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="3"/>
         <glx rop="31"/>
     </function>
 
-    <function name="Normal3s" offset="60" vectorequiv="Normal3sv"
-              deprecated="3.1">
+    <function name="Normal3s" vectorequiv="Normal3sv" deprecated="3.1">
         <param name="nx" type="GLshort"/>
         <param name="ny" type="GLshort"/>
         <param name="nz" type="GLshort"/>
     </function>
 
-    <function name="Normal3sv" offset="61" deprecated="3.1">
+    <function name="Normal3sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="3"/>
         <glx rop="32"/>
     </function>
 
-    <function name="RasterPos2d" offset="62" vectorequiv="RasterPos2dv"
-              deprecated="3.1">
+    <function name="RasterPos2d" vectorequiv="RasterPos2dv" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
     </function>
 
-    <function name="RasterPos2dv" offset="63" deprecated="3.1">
+    <function name="RasterPos2dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="2"/>
         <glx rop="33"/>
     </function>
 
-    <function name="RasterPos2f" offset="64" vectorequiv="RasterPos2fv"
-              deprecated="3.1">
+    <function name="RasterPos2f" vectorequiv="RasterPos2fv" deprecated="3.1">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
     </function>
 
-    <function name="RasterPos2fv" offset="65" deprecated="3.1">
+    <function name="RasterPos2fv" deprecated="3.1">
         <param name="v" type="const GLfloat *" count="2"/>
         <glx rop="34"/>
     </function>
 
-    <function name="RasterPos2i" offset="66" vectorequiv="RasterPos2iv"
-              deprecated="3.1">
+    <function name="RasterPos2i" vectorequiv="RasterPos2iv" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
     </function>
 
-    <function name="RasterPos2iv" offset="67" deprecated="3.1">
+    <function name="RasterPos2iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="2"/>
         <glx rop="35"/>
     </function>
 
-    <function name="RasterPos2s" offset="68" vectorequiv="RasterPos2sv"
-              deprecated="3.1">
+    <function name="RasterPos2s" vectorequiv="RasterPos2sv" deprecated="3.1">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
     </function>
 
-    <function name="RasterPos2sv" offset="69" deprecated="3.1">
+    <function name="RasterPos2sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="2"/>
         <glx rop="36"/>
     </function>
 
-    <function name="RasterPos3d" offset="70" vectorequiv="RasterPos3dv"
-              deprecated="3.1">
+    <function name="RasterPos3d" vectorequiv="RasterPos3dv" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
     </function>
 
-    <function name="RasterPos3dv" offset="71" deprecated="3.1">
+    <function name="RasterPos3dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="3"/>
         <glx rop="37"/>
     </function>
 
-    <function name="RasterPos3f" offset="72" vectorequiv="RasterPos3fv"
-              deprecated="3.1">
+    <function name="RasterPos3f" vectorequiv="RasterPos3fv" deprecated="3.1">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
     </function>
 
-    <function name="RasterPos3fv" offset="73" deprecated="3.1">
+    <function name="RasterPos3fv" deprecated="3.1">
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="38"/>
     </function>
 
-    <function name="RasterPos3i" offset="74" vectorequiv="RasterPos3iv"
-              deprecated="3.1">
+    <function name="RasterPos3i" vectorequiv="RasterPos3iv" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="z" type="GLint"/>
     </function>
 
-    <function name="RasterPos3iv" offset="75" deprecated="3.1">
+    <function name="RasterPos3iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="3"/>
         <glx rop="39"/>
     </function>
 
-    <function name="RasterPos3s" offset="76" vectorequiv="RasterPos3sv"
-              deprecated="3.1">
+    <function name="RasterPos3s" vectorequiv="RasterPos3sv" deprecated="3.1">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <param name="z" type="GLshort"/>
     </function>
 
-    <function name="RasterPos3sv" offset="77" deprecated="3.1">
+    <function name="RasterPos3sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="3"/>
         <glx rop="40"/>
     </function>
 
-    <function name="RasterPos4d" offset="78" vectorequiv="RasterPos4dv"
-              deprecated="3.1">
+    <function name="RasterPos4d" vectorequiv="RasterPos4dv" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
         <param name="w" type="GLdouble"/>
     </function>
 
-    <function name="RasterPos4dv" offset="79" deprecated="3.1">
+    <function name="RasterPos4dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="4"/>
         <glx rop="41"/>
     </function>
 
-    <function name="RasterPos4f" offset="80" vectorequiv="RasterPos4fv"
-              deprecated="3.1">
+    <function name="RasterPos4f" vectorequiv="RasterPos4fv" deprecated="3.1">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
         <param name="w" type="GLfloat"/>
     </function>
 
-    <function name="RasterPos4fv" offset="81" deprecated="3.1">
+    <function name="RasterPos4fv" deprecated="3.1">
         <param name="v" type="const GLfloat *" count="4"/>
         <glx rop="42"/>
     </function>
 
-    <function name="RasterPos4i" offset="82" vectorequiv="RasterPos4iv"
-              deprecated="3.1">
+    <function name="RasterPos4i" vectorequiv="RasterPos4iv" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="z" type="GLint"/>
         <param name="w" type="GLint"/>
     </function>
 
-    <function name="RasterPos4iv" offset="83" deprecated="3.1">
+    <function name="RasterPos4iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="4"/>
         <glx rop="43"/>
     </function>
 
-    <function name="RasterPos4s" offset="84" vectorequiv="RasterPos4sv"
-              deprecated="3.1">
+    <function name="RasterPos4s" vectorequiv="RasterPos4sv" deprecated="3.1">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <param name="z" type="GLshort"/>
         <param name="w" type="GLshort"/>
     </function>
 
-    <function name="RasterPos4sv" offset="85" deprecated="3.1">
+    <function name="RasterPos4sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="4"/>
         <glx rop="44"/>
     </function>
 
-    <function name="Rectd" offset="86" vectorequiv="Rectdv" deprecated="3.1">
+    <function name="Rectd" vectorequiv="Rectdv" deprecated="3.1">
         <param name="x1" type="GLdouble"/>
         <param name="y1" type="GLdouble"/>
         <param name="x2" type="GLdouble"/>
         <param name="y2" type="GLdouble"/>
     </function>
 
-    <function name="Rectdv" offset="87" deprecated="3.1">
+    <function name="Rectdv" deprecated="3.1">
         <param name="v1" type="const GLdouble *" count="2"/>
         <param name="v2" type="const GLdouble *" count="2"/>
         <glx rop="45"/>
     </function>
 
-    <function name="Rectf" offset="88" vectorequiv="Rectfv" deprecated="3.1"
-              exec="dynamic">
+    <function name="Rectf" vectorequiv="Rectfv" deprecated="3.1" exec="dynamic">
         <param name="x1" type="GLfloat"/>
         <param name="y1" type="GLfloat"/>
         <param name="x2" type="GLfloat"/>
         <param name="y2" type="GLfloat"/>
     </function>
 
-    <function name="Rectfv" offset="89" deprecated="3.1">
+    <function name="Rectfv" deprecated="3.1">
         <param name="v1" type="const GLfloat *" count="2"/>
         <param name="v2" type="const GLfloat *" count="2"/>
         <glx rop="46"/>
     </function>
 
-    <function name="Recti" offset="90" vectorequiv="Rectiv" deprecated="3.1">
+    <function name="Recti" vectorequiv="Rectiv" deprecated="3.1">
         <param name="x1" type="GLint"/>
         <param name="y1" type="GLint"/>
         <param name="x2" type="GLint"/>
         <param name="y2" type="GLint"/>
     </function>
 
-    <function name="Rectiv" offset="91" deprecated="3.1">
+    <function name="Rectiv" deprecated="3.1">
         <param name="v1" type="const GLint *" count="2"/>
         <param name="v2" type="const GLint *" count="2"/>
         <glx rop="47"/>
     </function>
 
-    <function name="Rects" offset="92" vectorequiv="Rectsv" deprecated="3.1">
+    <function name="Rects" vectorequiv="Rectsv" deprecated="3.1">
         <param name="x1" type="GLshort"/>
         <param name="y1" type="GLshort"/>
         <param name="x2" type="GLshort"/>
         <param name="y2" type="GLshort"/>
     </function>
 
-    <function name="Rectsv" offset="93" deprecated="3.1">
+    <function name="Rectsv" deprecated="3.1">
         <param name="v1" type="const GLshort *" count="2"/>
         <param name="v2" type="const GLshort *" count="2"/>
         <glx rop="48"/>
     </function>
 
-    <function name="TexCoord1d" offset="94" vectorequiv="TexCoord1dv"
-              deprecated="3.1">
+    <function name="TexCoord1d" vectorequiv="TexCoord1dv" deprecated="3.1">
         <param name="s" type="GLdouble"/>
     </function>
 
-    <function name="TexCoord1dv" offset="95" deprecated="3.1">
+    <function name="TexCoord1dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="1"/>
         <glx rop="49"/>
     </function>
 
-    <function name="TexCoord1f" offset="96" vectorequiv="TexCoord1fv"
+    <function name="TexCoord1f" vectorequiv="TexCoord1fv"
               deprecated="3.1" exec="dynamic">
         <param name="s" type="GLfloat"/>
     </function>
 
-    <function name="TexCoord1fv" offset="97" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoord1fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="1"/>
         <glx rop="50"/>
     </function>
 
-    <function name="TexCoord1i" offset="98" vectorequiv="TexCoord1iv"
-              deprecated="3.1">
+    <function name="TexCoord1i" vectorequiv="TexCoord1iv" deprecated="3.1">
         <param name="s" type="GLint"/>
     </function>
 
-    <function name="TexCoord1iv" offset="99" deprecated="3.1">
+    <function name="TexCoord1iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="1"/>
         <glx rop="51"/>
     </function>
 
-    <function name="TexCoord1s" offset="100" vectorequiv="TexCoord1sv"
-              deprecated="3.1">
+    <function name="TexCoord1s" vectorequiv="TexCoord1sv" deprecated="3.1">
         <param name="s" type="GLshort"/>
     </function>
 
-    <function name="TexCoord1sv" offset="101" deprecated="3.1">
+    <function name="TexCoord1sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="1"/>
         <glx rop="52"/>
     </function>
 
-    <function name="TexCoord2d" offset="102" vectorequiv="TexCoord2dv"
-              deprecated="3.1">
+    <function name="TexCoord2d" vectorequiv="TexCoord2dv" deprecated="3.1">
         <param name="s" type="GLdouble"/>
         <param name="t" type="GLdouble"/>
     </function>
 
-    <function name="TexCoord2dv" offset="103" deprecated="3.1">
+    <function name="TexCoord2dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="2"/>
         <glx rop="53"/>
     </function>
 
-    <function name="TexCoord2f" offset="104" vectorequiv="TexCoord2fv"
+    <function name="TexCoord2f" vectorequiv="TexCoord2fv"
               deprecated="3.1" exec="dynamic">
         <param name="s" type="GLfloat"/>
         <param name="t" type="GLfloat"/>
     </function>
 
-    <function name="TexCoord2fv" offset="105" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoord2fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="2"/>
         <glx rop="54"/>
     </function>
 
-    <function name="TexCoord2i" offset="106" vectorequiv="TexCoord2iv"
-              deprecated="3.1">
+    <function name="TexCoord2i" vectorequiv="TexCoord2iv" deprecated="3.1">
         <param name="s" type="GLint"/>
         <param name="t" type="GLint"/>
     </function>
 
-    <function name="TexCoord2iv" offset="107" deprecated="3.1">
+    <function name="TexCoord2iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="2"/>
         <glx rop="55"/>
     </function>
 
-    <function name="TexCoord2s" offset="108" vectorequiv="TexCoord2sv"
-              deprecated="3.1">
+    <function name="TexCoord2s" vectorequiv="TexCoord2sv" deprecated="3.1">
         <param name="s" type="GLshort"/>
         <param name="t" type="GLshort"/>
     </function>
 
-    <function name="TexCoord2sv" offset="109" deprecated="3.1">
+    <function name="TexCoord2sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="2"/>
         <glx rop="56"/>
     </function>
 
-    <function name="TexCoord3d" offset="110" vectorequiv="TexCoord3dv"
-              deprecated="3.1">
+    <function name="TexCoord3d" vectorequiv="TexCoord3dv" deprecated="3.1">
         <param name="s" type="GLdouble"/>
         <param name="t" type="GLdouble"/>
         <param name="r" type="GLdouble"/>
     </function>
 
-    <function name="TexCoord3dv" offset="111" deprecated="3.1">
+    <function name="TexCoord3dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="3"/>
         <glx rop="57"/>
     </function>
 
-    <function name="TexCoord3f" offset="112" vectorequiv="TexCoord3fv"
+    <function name="TexCoord3f" vectorequiv="TexCoord3fv"
               deprecated="3.1" exec="dynamic">
         <param name="s" type="GLfloat"/>
         <param name="t" type="GLfloat"/>
         <param name="r" type="GLfloat"/>
     </function>
 
-    <function name="TexCoord3fv" offset="113" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoord3fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="58"/>
     </function>
 
-    <function name="TexCoord3i" offset="114" vectorequiv="TexCoord3iv"
-              deprecated="3.1">
+    <function name="TexCoord3i" vectorequiv="TexCoord3iv" deprecated="3.1">
         <param name="s" type="GLint"/>
         <param name="t" type="GLint"/>
         <param name="r" type="GLint"/>
     </function>
 
-    <function name="TexCoord3iv" offset="115" deprecated="3.1">
+    <function name="TexCoord3iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="3"/>
         <glx rop="59"/>
     </function>
 
-    <function name="TexCoord3s" offset="116" vectorequiv="TexCoord3sv"
-              deprecated="3.1">
+    <function name="TexCoord3s" vectorequiv="TexCoord3sv" deprecated="3.1">
         <param name="s" type="GLshort"/>
         <param name="t" type="GLshort"/>
         <param name="r" type="GLshort"/>
     </function>
 
-    <function name="TexCoord3sv" offset="117" deprecated="3.1">
+    <function name="TexCoord3sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="3"/>
         <glx rop="60"/>
     </function>
 
-    <function name="TexCoord4d" offset="118" vectorequiv="TexCoord4dv"
-              deprecated="3.1">
+    <function name="TexCoord4d" vectorequiv="TexCoord4dv" deprecated="3.1">
         <param name="s" type="GLdouble"/>
         <param name="t" type="GLdouble"/>
         <param name="r" type="GLdouble"/>
         <param name="q" type="GLdouble"/>
     </function>
 
-    <function name="TexCoord4dv" offset="119" deprecated="3.1">
+    <function name="TexCoord4dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="4"/>
         <glx rop="61"/>
     </function>
 
-    <function name="TexCoord4f" offset="120" vectorequiv="TexCoord4fv"
+    <function name="TexCoord4f" vectorequiv="TexCoord4fv"
               deprecated="3.1" exec="dynamic">
         <param name="s" type="GLfloat"/>
         <param name="t" type="GLfloat"/>
@@ -1828,146 +1785,134 @@
         <param name="q" type="GLfloat"/>
     </function>
 
-    <function name="TexCoord4fv" offset="121" deprecated="3.1"
-              exec="dynamic">
+    <function name="TexCoord4fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="4"/>
         <glx rop="62"/>
     </function>
 
-    <function name="TexCoord4i" offset="122" vectorequiv="TexCoord4iv"
-              deprecated="3.1">
+    <function name="TexCoord4i" vectorequiv="TexCoord4iv" deprecated="3.1">
         <param name="s" type="GLint"/>
         <param name="t" type="GLint"/>
         <param name="r" type="GLint"/>
         <param name="q" type="GLint"/>
     </function>
 
-    <function name="TexCoord4iv" offset="123" deprecated="3.1">
+    <function name="TexCoord4iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="4"/>
         <glx rop="63"/>
     </function>
 
-    <function name="TexCoord4s" offset="124" vectorequiv="TexCoord4sv"
-              deprecated="3.1">
+    <function name="TexCoord4s" vectorequiv="TexCoord4sv" deprecated="3.1">
         <param name="s" type="GLshort"/>
         <param name="t" type="GLshort"/>
         <param name="r" type="GLshort"/>
         <param name="q" type="GLshort"/>
     </function>
 
-    <function name="TexCoord4sv" offset="125" deprecated="3.1">
+    <function name="TexCoord4sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="4"/>
         <glx rop="64"/>
     </function>
 
-    <function name="Vertex2d" offset="126" vectorequiv="Vertex2dv"
-              deprecated="3.1">
+    <function name="Vertex2d" vectorequiv="Vertex2dv" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
     </function>
 
-    <function name="Vertex2dv" offset="127" deprecated="3.1">
+    <function name="Vertex2dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="2"/>
         <glx rop="65"/>
     </function>
 
-    <function name="Vertex2f" offset="128" vectorequiv="Vertex2fv"
+    <function name="Vertex2f" vectorequiv="Vertex2fv"
               deprecated="3.1" exec="dynamic">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
     </function>
 
-    <function name="Vertex2fv" offset="129" deprecated="3.1"
-              exec="dynamic">
+    <function name="Vertex2fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="2"/>
         <glx rop="66"/>
     </function>
 
-    <function name="Vertex2i" offset="130" vectorequiv="Vertex2iv"
-              deprecated="3.1">
+    <function name="Vertex2i" vectorequiv="Vertex2iv" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
     </function>
 
-    <function name="Vertex2iv" offset="131" deprecated="3.1">
+    <function name="Vertex2iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="2"/>
         <glx rop="67"/>
     </function>
 
-    <function name="Vertex2s" offset="132" vectorequiv="Vertex2sv"
-              deprecated="3.1">
+    <function name="Vertex2s" vectorequiv="Vertex2sv" deprecated="3.1">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
     </function>
 
-    <function name="Vertex2sv" offset="133" deprecated="3.1">
+    <function name="Vertex2sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="2"/>
         <glx rop="68"/>
     </function>
 
-    <function name="Vertex3d" offset="134" vectorequiv="Vertex3dv"
-              deprecated="3.1">
+    <function name="Vertex3d" vectorequiv="Vertex3dv" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
     </function>
 
-    <function name="Vertex3dv" offset="135" deprecated="3.1">
+    <function name="Vertex3dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="3"/>
         <glx rop="69"/>
     </function>
 
-    <function name="Vertex3f" offset="136" vectorequiv="Vertex3fv"
+    <function name="Vertex3f" vectorequiv="Vertex3fv"
               deprecated="3.1" exec="dynamic">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
     </function>
 
-    <function name="Vertex3fv" offset="137" deprecated="3.1"
-              exec="dynamic">
+    <function name="Vertex3fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="70"/>
     </function>
 
-    <function name="Vertex3i" offset="138" vectorequiv="Vertex3iv"
-              deprecated="3.1">
+    <function name="Vertex3i" vectorequiv="Vertex3iv" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="z" type="GLint"/>
     </function>
 
-    <function name="Vertex3iv" offset="139" deprecated="3.1">
+    <function name="Vertex3iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="3"/>
         <glx rop="71"/>
     </function>
 
-    <function name="Vertex3s" offset="140" vectorequiv="Vertex3sv"
-              deprecated="3.1">
+    <function name="Vertex3s" vectorequiv="Vertex3sv" deprecated="3.1">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <param name="z" type="GLshort"/>
     </function>
 
-    <function name="Vertex3sv" offset="141" deprecated="3.1">
+    <function name="Vertex3sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="3"/>
         <glx rop="72"/>
     </function>
 
-    <function name="Vertex4d" offset="142" vectorequiv="Vertex4dv"
-              deprecated="3.1">
+    <function name="Vertex4d" vectorequiv="Vertex4dv" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
         <param name="w" type="GLdouble"/>
     </function>
 
-    <function name="Vertex4dv" offset="143" deprecated="3.1">
+    <function name="Vertex4dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="4"/>
         <glx rop="73"/>
     </function>
 
-    <function name="Vertex4f" offset="144" vectorequiv="Vertex4fv"
+    <function name="Vertex4f" vectorequiv="Vertex4fv"
               deprecated="3.1" exec="dynamic">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
@@ -1975,199 +1920,195 @@
         <param name="w" type="GLfloat"/>
     </function>
 
-    <function name="Vertex4fv" offset="145" deprecated="3.1"
-              exec="dynamic">
+    <function name="Vertex4fv" deprecated="3.1" exec="dynamic">
         <param name="v" type="const GLfloat *" count="4"/>
         <glx rop="74"/>
     </function>
 
-    <function name="Vertex4i" offset="146" vectorequiv="Vertex4iv"
-              deprecated="3.1">
+    <function name="Vertex4i" vectorequiv="Vertex4iv" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="z" type="GLint"/>
         <param name="w" type="GLint"/>
     </function>
 
-    <function name="Vertex4iv" offset="147" deprecated="3.1">
+    <function name="Vertex4iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="4"/>
         <glx rop="75"/>
     </function>
 
-    <function name="Vertex4s" offset="148" vectorequiv="Vertex4sv"
-              deprecated="3.1">
+    <function name="Vertex4s" vectorequiv="Vertex4sv" deprecated="3.1">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <param name="z" type="GLshort"/>
         <param name="w" type="GLshort"/>
     </function>
 
-    <function name="Vertex4sv" offset="149" deprecated="3.1">
+    <function name="Vertex4sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="4"/>
         <glx rop="76"/>
     </function>
 
-    <function name="ClipPlane" offset="150" deprecated="3.1">
+    <function name="ClipPlane" deprecated="3.1">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="const GLdouble *" count="4"/>
         <glx rop="77"/>
     </function>
 
-    <function name="ColorMaterial" offset="151" deprecated="3.1">
+    <function name="ColorMaterial" deprecated="3.1">
         <param name="face" type="GLenum"/>
         <param name="mode" type="GLenum"/>
         <glx rop="78"/>
     </function>
 
-    <function name="CullFace" offset="152" es1="1.0" es2="2.0">
+    <function name="CullFace" es1="1.0" es2="2.0">
         <param name="mode" type="GLenum"/>
         <glx rop="79"/>
     </function>
 
-    <function name="Fogf" offset="153" es1="1.0" deprecated="3.1">
+    <function name="Fogf" es1="1.0" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx rop="80"/>
     </function>
 
-    <function name="Fogfv" offset="154" es1="1.0" deprecated="3.1">
+    <function name="Fogfv" es1="1.0" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="81"/>
     </function>
 
-    <function name="Fogi" offset="155" deprecated="3.1">
+    <function name="Fogi" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx rop="82"/>
     </function>
 
-    <function name="Fogiv" offset="156" deprecated="3.1">
+    <function name="Fogiv" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="83"/>
     </function>
 
-    <function name="FrontFace" offset="157" es1="1.0" es2="2.0">
+    <function name="FrontFace" es1="1.0" es2="2.0">
         <param name="mode" type="GLenum"/>
         <glx rop="84"/>
     </function>
 
-    <function name="Hint" offset="158" es1="1.0" es2="2.0">
+    <function name="Hint" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="mode" type="GLenum"/>
         <glx rop="85"/>
     </function>
 
-    <function name="Lightf" offset="159" es1="1.0" deprecated="3.1">
+    <function name="Lightf" es1="1.0" deprecated="3.1">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx rop="86"/>
     </function>
 
-    <function name="Lightfv" offset="160" es1="1.0" deprecated="3.1">
+    <function name="Lightfv" es1="1.0" deprecated="3.1">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="87"/>
     </function>
 
-    <function name="Lighti" offset="161" deprecated="3.1">
+    <function name="Lighti" deprecated="3.1">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx rop="88"/>
     </function>
 
-    <function name="Lightiv" offset="162" deprecated="3.1">
+    <function name="Lightiv" deprecated="3.1">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="89"/>
     </function>
 
-    <function name="LightModelf" offset="163" es1="1.0" deprecated="3.1">
+    <function name="LightModelf" es1="1.0" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx rop="90"/>
     </function>
 
-    <function name="LightModelfv" offset="164" es1="1.0" deprecated="3.1">
+    <function name="LightModelfv" es1="1.0" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="91"/>
     </function>
 
-    <function name="LightModeli" offset="165" deprecated="3.1">
+    <function name="LightModeli" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx rop="92"/>
     </function>
 
-    <function name="LightModeliv" offset="166" deprecated="3.1">
+    <function name="LightModeliv" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="93"/>
     </function>
 
-    <function name="LineStipple" offset="167" deprecated="3.1">
+    <function name="LineStipple" deprecated="3.1">
         <param name="factor" type="GLint"/>
         <param name="pattern" type="GLushort"/>
         <glx rop="94"/>
     </function>
 
-    <function name="LineWidth" offset="168" es1="1.0" es2="2.0">
+    <function name="LineWidth" es1="1.0" es2="2.0">
         <param name="width" type="GLfloat"/>
         <glx rop="95"/>
     </function>
 
-    <function name="Materialf" offset="169" es1="1.0" deprecated="3.1">
+    <function name="Materialf" es1="1.0" deprecated="3.1">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx rop="96"/>
     </function>
 
-    <function name="Materialfv" offset="170" es1="1.0" deprecated="3.1"
-              exec="dynamic">
+    <function name="Materialfv" es1="1.0" deprecated="3.1" exec="dynamic">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="97"/>
     </function>
 
-    <function name="Materiali" offset="171" deprecated="3.1">
+    <function name="Materiali" deprecated="3.1">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx rop="98"/>
     </function>
 
-    <function name="Materialiv" offset="172" deprecated="3.1">
+    <function name="Materialiv" deprecated="3.1">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="99"/>
     </function>
 
-    <function name="PointSize" offset="173" es1="1.0">
+    <function name="PointSize" es1="1.0">
         <param name="size" type="GLfloat"/>
         <glx rop="100"/>
     </function>
 
-    <function name="PolygonMode" offset="174">
+    <function name="PolygonMode">
         <param name="face" type="GLenum"/>
         <param name="mode" type="GLenum"/>
         <glx rop="101"/>
     </function>
 
-    <function name="PolygonStipple" offset="175" deprecated="3.1">
+    <function name="PolygonStipple" deprecated="3.1">
         <param name="mask" type="const GLubyte *" img_width="32" img_height="32" img_format="GL_COLOR_INDEX" img_type="GL_BITMAP" img_target="0" img_pad_dimensions="false"/>
         <glx rop="102"/>
     </function>
 
-    <function name="Scissor" offset="176" es1="1.0" es2="2.0">
+    <function name="Scissor" es1="1.0" es2="2.0">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="width" type="GLsizei"/>
@@ -2175,40 +2116,40 @@
         <glx rop="103"/>
     </function>
 
-    <function name="ShadeModel" offset="177" es1="1.0" deprecated="3.1">
+    <function name="ShadeModel" es1="1.0" deprecated="3.1">
         <param name="mode" type="GLenum"/>
         <glx rop="104"/>
     </function>
 
-    <function name="TexParameterf" offset="178" es1="1.0" es2="2.0">
+    <function name="TexParameterf" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx rop="105"/>
     </function>
 
-    <function name="TexParameterfv" offset="179" es1="1.1" es2="2.0">
+    <function name="TexParameterfv" es1="1.1" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="106"/>
     </function>
 
-    <function name="TexParameteri" offset="180" es1="1.1" es2="2.0">
+    <function name="TexParameteri" es1="1.1" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx rop="107"/>
     </function>
 
-    <function name="TexParameteriv" offset="181" es1="1.1" es2="2.0">
+    <function name="TexParameteriv" es1="1.1" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="108"/>
     </function>
 
-    <function name="TexImage1D" offset="182">
+    <function name="TexImage1D">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLint"/>
@@ -2220,7 +2161,7 @@
         <glx rop="109" large="true"/>
     </function>
 
-    <function name="TexImage2D" offset="183" es1="1.0" es2="2.0">
+    <function name="TexImage2D" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLint"/>
@@ -2233,129 +2174,129 @@
         <glx rop="110" large="true"/>
     </function>
 
-    <function name="TexEnvf" offset="184" es1="1.0" deprecated="3.1">
+    <function name="TexEnvf" es1="1.0" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx rop="111"/>
     </function>
 
-    <function name="TexEnvfv" offset="185" es1="1.0" deprecated="3.1">
+    <function name="TexEnvfv" es1="1.0" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="112"/>
     </function>
 
-    <function name="TexEnvi" offset="186" es1="1.1" deprecated="3.1">
+    <function name="TexEnvi" es1="1.1" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx rop="113"/>
     </function>
 
-    <function name="TexEnviv" offset="187" es1="1.1" deprecated="3.1">
+    <function name="TexEnviv" es1="1.1" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="114"/>
     </function>
 
-    <function name="TexGend" offset="188" deprecated="3.1">
+    <function name="TexGend" deprecated="3.1">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLdouble"/>
         <glx rop="115"/>
     </function>
 
-    <function name="TexGendv" offset="189" deprecated="3.1">
+    <function name="TexGendv" deprecated="3.1">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLdouble *" variable_param="pname"/>
         <glx rop="116"/>
     </function>
 
-    <function name="TexGenf" offset="190" deprecated="3.1">
+    <function name="TexGenf" deprecated="3.1">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx rop="117"/>
     </function>
 
-    <function name="TexGenfv" offset="191" deprecated="3.1">
+    <function name="TexGenfv" deprecated="3.1">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="118"/>
     </function>
 
-    <function name="TexGeni" offset="192" deprecated="3.1">
+    <function name="TexGeni" deprecated="3.1">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx rop="119"/>
     </function>
 
-    <function name="TexGeniv" offset="193" deprecated="3.1">
+    <function name="TexGeniv" deprecated="3.1">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="120"/>
     </function>
 
-    <function name="FeedbackBuffer" offset="194" deprecated="3.1">
+    <function name="FeedbackBuffer" deprecated="3.1">
         <param name="size" type="GLsizei"/>
         <param name="type" type="GLenum"/>
         <param name="buffer" type="GLfloat *" output="true"/>
         <glx sop="105" handcode="true"/>
     </function>
 
-    <function name="SelectBuffer" offset="195" deprecated="3.1">
+    <function name="SelectBuffer" deprecated="3.1">
         <param name="size" type="GLsizei"/>
         <param name="buffer" type="GLuint *" output="true"/>
         <glx sop="106" handcode="true"/>
     </function>
 
-    <function name="RenderMode" offset="196" deprecated="3.1">
+    <function name="RenderMode" deprecated="3.1">
         <param name="mode" type="GLenum"/>
         <return type="GLint"/>
         <glx sop="107" handcode="true"/>
     </function>
 
-    <function name="InitNames" offset="197" deprecated="3.1">
+    <function name="InitNames" deprecated="3.1">
         <glx rop="121"/>
     </function>
 
-    <function name="LoadName" offset="198" deprecated="3.1">
+    <function name="LoadName" deprecated="3.1">
         <param name="name" type="GLuint"/>
         <glx rop="122"/>
     </function>
 
-    <function name="PassThrough" offset="199" deprecated="3.1">
+    <function name="PassThrough" deprecated="3.1">
         <param name="token" type="GLfloat"/>
         <glx rop="123"/>
     </function>
 
-    <function name="PopName" offset="200" deprecated="3.1">
+    <function name="PopName" deprecated="3.1">
         <glx rop="124"/>
     </function>
 
-    <function name="PushName" offset="201" deprecated="3.1">
+    <function name="PushName" deprecated="3.1">
         <param name="name" type="GLuint"/>
         <glx rop="125"/>
     </function>
 
-    <function name="DrawBuffer" offset="202">
+    <function name="DrawBuffer">
         <param name="mode" type="GLenum"/>
         <glx rop="126"/>
     </function>
 
-    <function name="Clear" offset="203" es1="1.0" es2="2.0">
+    <function name="Clear" es1="1.0" es2="2.0">
         <param name="mask" type="GLbitfield"/>
         <glx rop="127"/>
     </function>
 
-    <function name="ClearAccum" offset="204" deprecated="3.1">
+    <function name="ClearAccum" deprecated="3.1">
         <param name="red" type="GLfloat"/>
         <param name="green" type="GLfloat"/>
         <param name="blue" type="GLfloat"/>
@@ -2363,12 +2304,12 @@
         <glx rop="128"/>
     </function>
 
-    <function name="ClearIndex" offset="205" deprecated="3.1">
+    <function name="ClearIndex" deprecated="3.1">
         <param name="c" type="GLfloat"/>
         <glx rop="129"/>
     </function>
 
-    <function name="ClearColor" offset="206" es1="1.0" es2="2.0">
+    <function name="ClearColor" es1="1.0" es2="2.0">
         <param name="red" type="GLclampf"/>
         <param name="green" type="GLclampf"/>
         <param name="blue" type="GLclampf"/>
@@ -2376,22 +2317,22 @@
         <glx rop="130"/>
     </function>
 
-    <function name="ClearStencil" offset="207" es1="1.0" es2="2.0">
+    <function name="ClearStencil" es1="1.0" es2="2.0">
         <param name="s" type="GLint"/>
         <glx rop="131"/>
     </function>
 
-    <function name="ClearDepth" offset="208">
+    <function name="ClearDepth">
         <param name="depth" type="GLclampd"/>
         <glx rop="132"/>
     </function>
 
-    <function name="StencilMask" offset="209" es1="1.0" es2="2.0">
+    <function name="StencilMask" es1="1.0" es2="2.0">
         <param name="mask" type="GLuint"/>
         <glx rop="133"/>
     </function>
 
-    <function name="ColorMask" offset="210" es1="1.0" es2="2.0">
+    <function name="ColorMask" es1="1.0" es2="2.0">
         <param name="red" type="GLboolean"/>
         <param name="green" type="GLboolean"/>
         <param name="blue" type="GLboolean"/>
@@ -2399,50 +2340,50 @@
         <glx rop="134"/>
     </function>
 
-    <function name="DepthMask" offset="211" es1="1.0" es2="2.0">
+    <function name="DepthMask" es1="1.0" es2="2.0">
         <param name="flag" type="GLboolean"/>
         <glx rop="135"/>
     </function>
 
-    <function name="IndexMask" offset="212" deprecated="3.1">
+    <function name="IndexMask" deprecated="3.1">
         <param name="mask" type="GLuint"/>
         <glx rop="136"/>
     </function>
 
-    <function name="Accum" offset="213" deprecated="3.1">
+    <function name="Accum" deprecated="3.1">
         <param name="op" type="GLenum"/>
         <param name="value" type="GLfloat"/>
         <glx rop="137"/>
     </function>
 
-    <function name="Disable" offset="214" es1="1.0" es2="2.0">
+    <function name="Disable" es1="1.0" es2="2.0">
         <param name="cap" type="GLenum"/>
         <glx rop="138" handcode="client"/>
     </function>
 
-    <function name="Enable" offset="215" es1="1.0" es2="2.0">
+    <function name="Enable" es1="1.0" es2="2.0">
         <param name="cap" type="GLenum"/>
         <glx rop="139" handcode="client"/>
     </function>
 
-    <function name="Finish" offset="216" es1="1.0" es2="2.0">
+    <function name="Finish" es1="1.0" es2="2.0">
         <glx sop="108" handcode="true"/>
     </function>
 
-    <function name="Flush" offset="217" es1="1.0" es2="2.0">
+    <function name="Flush" es1="1.0" es2="2.0">
         <glx sop="142" handcode="true"/>
     </function>
 
-    <function name="PopAttrib" offset="218" deprecated="3.1">
+    <function name="PopAttrib" deprecated="3.1">
         <glx rop="141"/>
     </function>
 
-    <function name="PushAttrib" offset="219" deprecated="3.1">
+    <function name="PushAttrib" deprecated="3.1">
         <param name="mask" type="GLbitfield"/>
         <glx rop="142"/>
     </function>
 
-    <function name="Map1d" offset="220" deprecated="3.1">
+    <function name="Map1d" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="u1" type="GLdouble"/>
         <param name="u2" type="GLdouble"/>
@@ -2452,7 +2393,7 @@
         <glx rop="143" handcode="true"/>
     </function>
 
-    <function name="Map1f" offset="221" deprecated="3.1">
+    <function name="Map1f" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="u1" type="GLfloat"/>
         <param name="u2" type="GLfloat"/>
@@ -2462,7 +2403,7 @@
         <glx rop="144" handcode="true"/>
     </function>
 
-    <function name="Map2d" offset="222" deprecated="3.1">
+    <function name="Map2d" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="u1" type="GLdouble"/>
         <param name="u2" type="GLdouble"/>
@@ -2476,7 +2417,7 @@
         <glx rop="145" handcode="true"/>
     </function>
 
-    <function name="Map2f" offset="223" deprecated="3.1">
+    <function name="Map2f" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="u1" type="GLfloat"/>
         <param name="u2" type="GLfloat"/>
@@ -2490,21 +2431,21 @@
         <glx rop="146" handcode="true"/>
     </function>
 
-    <function name="MapGrid1d" offset="224" deprecated="3.1">
+    <function name="MapGrid1d" deprecated="3.1">
         <param name="un" type="GLint"/>
         <param name="u1" type="GLdouble"/>
         <param name="u2" type="GLdouble"/>
         <glx rop="147"/>
     </function>
 
-    <function name="MapGrid1f" offset="225" deprecated="3.1">
+    <function name="MapGrid1f" deprecated="3.1">
         <param name="un" type="GLint"/>
         <param name="u1" type="GLfloat"/>
         <param name="u2" type="GLfloat"/>
         <glx rop="148"/>
     </function>
 
-    <function name="MapGrid2d" offset="226" deprecated="3.1">
+    <function name="MapGrid2d" deprecated="3.1">
         <param name="un" type="GLint"/>
         <param name="u1" type="GLdouble"/>
         <param name="u2" type="GLdouble"/>
@@ -2514,7 +2455,7 @@
         <glx rop="149"/>
     </function>
 
-    <function name="MapGrid2f" offset="227" deprecated="3.1">
+    <function name="MapGrid2f" deprecated="3.1">
         <param name="un" type="GLint"/>
         <param name="u1" type="GLfloat"/>
         <param name="u2" type="GLfloat"/>
@@ -2524,64 +2465,59 @@
         <glx rop="150"/>
     </function>
 
-    <function name="EvalCoord1d" offset="228" vectorequiv="EvalCoord1dv"
-              deprecated="3.1">
+    <function name="EvalCoord1d" vectorequiv="EvalCoord1dv" deprecated="3.1">
         <param name="u" type="GLdouble"/>
     </function>
 
-    <function name="EvalCoord1dv" offset="229" deprecated="3.1">
+    <function name="EvalCoord1dv" deprecated="3.1">
         <param name="u" type="const GLdouble *" count="1"/>
         <glx rop="151"/>
     </function>
 
-    <function name="EvalCoord1f" offset="230" vectorequiv="EvalCoord1fv"
+    <function name="EvalCoord1f" vectorequiv="EvalCoord1fv"
               deprecated="3.1" exec="dynamic">
         <param name="u" type="GLfloat"/>
     </function>
 
-    <function name="EvalCoord1fv" offset="231" deprecated="3.1">
+    <function name="EvalCoord1fv" deprecated="3.1">
         <param name="u" type="const GLfloat *" count="1"/>
         <glx rop="152"/>
     </function>
 
-    <function name="EvalCoord2d" offset="232" vectorequiv="EvalCoord2dv"
-              deprecated="3.1">
+    <function name="EvalCoord2d" vectorequiv="EvalCoord2dv" deprecated="3.1">
         <param name="u" type="GLdouble"/>
         <param name="v" type="GLdouble"/>
     </function>
 
-    <function name="EvalCoord2dv" offset="233" deprecated="3.1">
+    <function name="EvalCoord2dv" deprecated="3.1">
         <param name="u" type="const GLdouble *" count="2"/>
         <glx rop="153"/>
     </function>
 
-    <function name="EvalCoord2f" offset="234" vectorequiv="EvalCoord2fv"
+    <function name="EvalCoord2f" vectorequiv="EvalCoord2fv"
               deprecated="3.1" exec="dynamic">
         <param name="u" type="GLfloat"/>
         <param name="v" type="GLfloat"/>
     </function>
 
-    <function name="EvalCoord2fv" offset="235" deprecated="3.1">
+    <function name="EvalCoord2fv" deprecated="3.1">
         <param name="u" type="const GLfloat *" count="2"/>
         <glx rop="154"/>
     </function>
 
-    <function name="EvalMesh1" offset="236" deprecated="3.1"
-              exec="dynamic">
+    <function name="EvalMesh1" deprecated="3.1" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="i1" type="GLint"/>
         <param name="i2" type="GLint"/>
         <glx rop="155"/>
     </function>
 
-    <function name="EvalPoint1" offset="237" deprecated="3.1"
-              exec="dynamic">
+    <function name="EvalPoint1" deprecated="3.1" exec="dynamic">
         <param name="i" type="GLint"/>
         <glx rop="156"/>
     </function>
 
-    <function name="EvalMesh2" offset="238" deprecated="3.1"
-              exec="dynamic">
+    <function name="EvalMesh2" deprecated="3.1" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="i1" type="GLint"/>
         <param name="i2" type="GLint"/>
@@ -2590,106 +2526,105 @@
         <glx rop="157"/>
     </function>
 
-    <function name="EvalPoint2" offset="239" deprecated="3.1"
-              exec="dynamic">
+    <function name="EvalPoint2" deprecated="3.1" exec="dynamic">
         <param name="i" type="GLint"/>
         <param name="j" type="GLint"/>
         <glx rop="158"/>
     </function>
 
-    <function name="AlphaFunc" offset="240" es1="1.0" deprecated="3.1">
+    <function name="AlphaFunc" es1="1.0" deprecated="3.1">
         <param name="func" type="GLenum"/>
         <param name="ref" type="GLclampf"/>
         <glx rop="159"/>
     </function>
 
-    <function name="BlendFunc" offset="241" es1="1.0" es2="2.0">
+    <function name="BlendFunc" es1="1.0" es2="2.0">
         <param name="sfactor" type="GLenum"/>
         <param name="dfactor" type="GLenum"/>
         <glx rop="160"/>
     </function>
 
-    <function name="LogicOp" offset="242" es1="1.0">
+    <function name="LogicOp" es1="1.0">
         <param name="opcode" type="GLenum"/>
         <glx rop="161"/>
     </function>
 
-    <function name="StencilFunc" offset="243" es1="1.0" es2="2.0">
+    <function name="StencilFunc" es1="1.0" es2="2.0">
         <param name="func" type="GLenum"/>
         <param name="ref" type="GLint"/>
         <param name="mask" type="GLuint"/>
         <glx rop="162"/>
     </function>
 
-    <function name="StencilOp" offset="244" es1="1.0" es2="2.0">
+    <function name="StencilOp" es1="1.0" es2="2.0">
         <param name="fail" type="GLenum"/>
         <param name="zfail" type="GLenum"/>
         <param name="zpass" type="GLenum"/>
         <glx rop="163"/>
     </function>
 
-    <function name="DepthFunc" offset="245" es1="1.0" es2="2.0">
+    <function name="DepthFunc" es1="1.0" es2="2.0">
         <param name="func" type="GLenum"/>
         <glx rop="164"/>
     </function>
 
-    <function name="PixelZoom" offset="246" deprecated="3.1">
+    <function name="PixelZoom" deprecated="3.1">
         <param name="xfactor" type="GLfloat"/>
         <param name="yfactor" type="GLfloat"/>
         <glx rop="165"/>
     </function>
 
-    <function name="PixelTransferf" offset="247" deprecated="3.1">
+    <function name="PixelTransferf" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx rop="166"/>
     </function>
 
-    <function name="PixelTransferi" offset="248" deprecated="3.1">
+    <function name="PixelTransferi" deprecated="3.1">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx rop="167"/>
     </function>
 
-    <function name="PixelStoref" offset="249">
+    <function name="PixelStoref">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx sop="109" handcode="client"/>
     </function>
 
-    <function name="PixelStorei" offset="250" es1="1.0" es2="2.0">
+    <function name="PixelStorei" es1="1.0" es2="2.0">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx sop="110" handcode="client"/>
     </function>
 
-    <function name="PixelMapfv" offset="251" deprecated="3.1">
+    <function name="PixelMapfv" deprecated="3.1">
         <param name="map" type="GLenum"/>
         <param name="mapsize" type="GLsizei" counter="true"/>
         <param name="values" type="const GLfloat *" count="mapsize"/>
         <glx rop="168" large="true"/>
     </function>
 
-    <function name="PixelMapuiv" offset="252" deprecated="3.1">
+    <function name="PixelMapuiv" deprecated="3.1">
         <param name="map" type="GLenum"/>
         <param name="mapsize" type="GLsizei" counter="true"/>
         <param name="values" type="const GLuint *" count="mapsize"/>
         <glx rop="169" large="true"/>
     </function>
 
-    <function name="PixelMapusv" offset="253" deprecated="3.1">
+    <function name="PixelMapusv" deprecated="3.1">
         <param name="map" type="GLenum"/>
         <param name="mapsize" type="GLsizei" counter="true"/>
         <param name="values" type="const GLushort *" count="mapsize"/>
         <glx rop="170" large="true"/>
     </function>
 
-    <function name="ReadBuffer" offset="254" es2="3.0">
+    <function name="ReadBuffer" es2="3.0">
         <param name="mode" type="GLenum"/>
         <glx rop="171"/>
     </function>
 
-    <function name="CopyPixels" offset="255" deprecated="3.1">
+    <function name="CopyPixels" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="width" type="GLsizei"/>
@@ -2698,7 +2633,7 @@
         <glx rop="172"/>
     </function>
 
-    <function name="ReadPixels" offset="256" es1="1.0" es2="2.0">
+    <function name="ReadPixels" es1="1.0" es2="2.0">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="width" type="GLsizei"/>
@@ -2709,7 +2644,7 @@
         <glx sop="111"/>
     </function>
 
-    <function name="DrawPixels" offset="257" deprecated="3.1">
+    <function name="DrawPixels" deprecated="3.1">
         <param name="width" type="GLsizei"/>
         <param name="height" type="GLsizei"/>
         <param name="format" type="GLenum"/>
@@ -2718,155 +2653,155 @@
         <glx rop="173" large="true"/>
     </function>
 
-    <function name="GetBooleanv" offset="258" es1="1.1" es2="2.0">
+    <function name="GetBooleanv" es1="1.1" es2="2.0">
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLboolean *" output="true" variable_param="pname"/>
         <glx sop="112" handcode="client"/>
     </function>
 
-    <function name="GetClipPlane" offset="259" deprecated="3.1">
+    <function name="GetClipPlane" deprecated="3.1">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="GLdouble *" output="true" count="4"/>
         <glx sop="113" always_array="true"/>
     </function>
 
-    <function name="GetDoublev" offset="260">
+    <function name="GetDoublev">
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLdouble *" output="true" variable_param="pname"/>
         <glx sop="114" handcode="client"/>
     </function>
 
-    <function name="GetError" offset="261" es1="1.0" es2="2.0">
+    <function name="GetError" es1="1.0" es2="2.0">
         <return type="GLenum"/>
         <glx sop="115" handcode="client"/>
     </function>
 
-    <function name="GetFloatv" offset="262" es1="1.1" es2="2.0">
+    <function name="GetFloatv" es1="1.1" es2="2.0">
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="116" handcode="client"/>
     </function>
 
-    <function name="GetIntegerv" offset="263" es1="1.0" es2="2.0">
+    <function name="GetIntegerv" es1="1.0" es2="2.0">
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="117" handcode="client"/>
     </function>
 
-    <function name="GetLightfv" offset="264" es1="1.1" deprecated="3.1">
+    <function name="GetLightfv" es1="1.1" deprecated="3.1">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="118"/>
     </function>
 
-    <function name="GetLightiv" offset="265" deprecated="3.1">
+    <function name="GetLightiv" deprecated="3.1">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="119"/>
     </function>
 
-    <function name="GetMapdv" offset="266" deprecated="3.1">
+    <function name="GetMapdv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="query" type="GLenum"/>
         <param name="v" type="GLdouble *" output="true" variable_param="target query"/>
         <glx sop="120"/>
     </function>
 
-    <function name="GetMapfv" offset="267" deprecated="3.1">
+    <function name="GetMapfv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="query" type="GLenum"/>
         <param name="v" type="GLfloat *" output="true" variable_param="target query"/>
         <glx sop="121"/>
     </function>
 
-    <function name="GetMapiv" offset="268" deprecated="3.1">
+    <function name="GetMapiv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="query" type="GLenum"/>
         <param name="v" type="GLint *" output="true" variable_param="target query"/>
         <glx sop="122"/>
     </function>
 
-    <function name="GetMaterialfv" offset="269" es1="1.1" deprecated="3.1">
+    <function name="GetMaterialfv" es1="1.1" deprecated="3.1">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="123"/>
     </function>
 
-    <function name="GetMaterialiv" offset="270" deprecated="3.1">
+    <function name="GetMaterialiv" deprecated="3.1">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="124"/>
     </function>
 
-    <function name="GetPixelMapfv" offset="271" deprecated="3.1">
+    <function name="GetPixelMapfv" deprecated="3.1">
         <param name="map" type="GLenum"/>
         <param name="values" type="GLfloat *" output="true" variable_param="map"/>
         <glx sop="125"/>
     </function>
 
-    <function name="GetPixelMapuiv" offset="272" deprecated="3.1">
+    <function name="GetPixelMapuiv" deprecated="3.1">
         <param name="map" type="GLenum"/>
         <param name="values" type="GLuint *" output="true" variable_param="map"/>
         <glx sop="126"/>
     </function>
 
-    <function name="GetPixelMapusv" offset="273" deprecated="3.1">
+    <function name="GetPixelMapusv" deprecated="3.1">
         <param name="map" type="GLenum"/>
         <param name="values" type="GLushort *" output="true" variable_param="map"/>
         <glx sop="127"/>
     </function>
 
-    <function name="GetPolygonStipple" offset="274" deprecated="3.1">
+    <function name="GetPolygonStipple" deprecated="3.1">
         <param name="mask" type="GLubyte *" output="true" img_width="32" img_height="32" img_format="GL_COLOR_INDEX" img_type="GL_BITMAP"/>
         <glx sop="128"/>
     </function>
 
-    <function name="GetString" offset="275" es1="1.0" es2="2.0">
+    <function name="GetString" es1="1.0" es2="2.0">
         <param name="name" type="GLenum"/>
         <return type="const GLubyte *"/>
         <glx sop="129" handcode="true"/>
     </function>
 
-    <function name="GetTexEnvfv" offset="276" es1="1.1" deprecated="3.1">
+    <function name="GetTexEnvfv" es1="1.1" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="130"/>
     </function>
 
-    <function name="GetTexEnviv" offset="277" es1="1.1" deprecated="3.1">
+    <function name="GetTexEnviv" es1="1.1" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="131"/>
     </function>
 
-    <function name="GetTexGendv" offset="278" deprecated="3.1">
+    <function name="GetTexGendv" deprecated="3.1">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLdouble *" output="true" variable_param="pname"/>
         <glx sop="132"/>
     </function>
 
-    <function name="GetTexGenfv" offset="279" deprecated="3.1">
+    <function name="GetTexGenfv" deprecated="3.1">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="133"/>
     </function>
 
-    <function name="GetTexGeniv" offset="280" deprecated="3.1">
+    <function name="GetTexGeniv" deprecated="3.1">
         <param name="coord" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="134"/>
     </function>
 
-    <function name="GetTexImage" offset="281">
+    <function name="GetTexImage">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="format" type="GLenum"/>
@@ -2875,21 +2810,21 @@
         <glx sop="135" dimensions_in_reply="true"/>
     </function>
 
-    <function name="GetTexParameterfv" offset="282" es1="1.1" es2="2.0">
+    <function name="GetTexParameterfv" es1="1.1" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="136"/>
     </function>
 
-    <function name="GetTexParameteriv" offset="283" es1="1.1" es2="2.0">
+    <function name="GetTexParameteriv" es1="1.1" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="137"/>
     </function>
 
-    <function name="GetTexLevelParameterfv" offset="284">
+    <function name="GetTexLevelParameterfv" es2="3.1">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="pname" type="GLenum"/>
@@ -2897,7 +2832,7 @@
         <glx sop="138"/>
     </function>
 
-    <function name="GetTexLevelParameteriv" offset="285">
+    <function name="GetTexLevelParameteriv" es2="3.1">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="pname" type="GLenum"/>
@@ -2905,25 +2840,25 @@
         <glx sop="139"/>
     </function>
 
-    <function name="IsEnabled" offset="286" es1="1.1" es2="2.0">
+    <function name="IsEnabled" es1="1.1" es2="2.0">
         <param name="cap" type="GLenum"/>
         <return type="GLboolean"/>
         <glx sop="140" handcode="client"/>
     </function>
 
-    <function name="IsList" offset="287" deprecated="3.1">
+    <function name="IsList" deprecated="3.1">
         <param name="list" type="GLuint"/>
         <return type="GLboolean"/>
         <glx sop="141"/>
     </function>
 
-    <function name="DepthRange" offset="288">
+    <function name="DepthRange">
         <param name="zNear" type="GLclampd"/>
         <param name="zFar" type="GLclampd"/>
         <glx rop="174"/>
     </function>
 
-    <function name="Frustum" offset="289" deprecated="3.1">
+    <function name="Frustum" deprecated="3.1">
         <param name="left" type="GLdouble"/>
         <param name="right" type="GLdouble"/>
         <param name="bottom" type="GLdouble"/>
@@ -2933,36 +2868,36 @@
         <glx rop="175"/>
     </function>
 
-    <function name="LoadIdentity" offset="290" es1="1.0" deprecated="3.1">
+    <function name="LoadIdentity" es1="1.0" deprecated="3.1">
         <glx rop="176"/>
     </function>
 
-    <function name="LoadMatrixf" offset="291" es1="1.0" deprecated="3.1">
+    <function name="LoadMatrixf" es1="1.0" deprecated="3.1">
         <param name="m" type="const GLfloat *" count="16"/>
         <glx rop="177"/>
     </function>
 
-    <function name="LoadMatrixd" offset="292" deprecated="3.1">
+    <function name="LoadMatrixd" deprecated="3.1">
         <param name="m" type="const GLdouble *" count="16"/>
         <glx rop="178"/>
     </function>
 
-    <function name="MatrixMode" offset="293" es1="1.0" deprecated="3.1">
+    <function name="MatrixMode" es1="1.0" deprecated="3.1">
         <param name="mode" type="GLenum"/>
         <glx rop="179"/>
     </function>
 
-    <function name="MultMatrixf" offset="294" es1="1.0" deprecated="3.1">
+    <function name="MultMatrixf" es1="1.0" deprecated="3.1">
         <param name="m" type="const GLfloat *" count="16"/>
         <glx rop="180"/>
     </function>
 
-    <function name="MultMatrixd" offset="295" deprecated="3.1">
+    <function name="MultMatrixd" deprecated="3.1">
         <param name="m" type="const GLdouble *" count="16"/>
         <glx rop="181"/>
     </function>
 
-    <function name="Ortho" offset="296" deprecated="3.1">
+    <function name="Ortho" deprecated="3.1">
         <param name="left" type="GLdouble"/>
         <param name="right" type="GLdouble"/>
         <param name="bottom" type="GLdouble"/>
@@ -2972,15 +2907,15 @@
         <glx rop="182"/>
     </function>
 
-    <function name="PopMatrix" offset="297" es1="1.0" deprecated="3.1">
+    <function name="PopMatrix" es1="1.0" deprecated="3.1">
         <glx rop="183"/>
     </function>
 
-    <function name="PushMatrix" offset="298" es1="1.0" deprecated="3.1">
+    <function name="PushMatrix" es1="1.0" deprecated="3.1">
         <glx rop="184"/>
     </function>
 
-    <function name="Rotated" offset="299" deprecated="3.1">
+    <function name="Rotated" deprecated="3.1">
         <param name="angle" type="GLdouble"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
@@ -2988,7 +2923,7 @@
         <glx rop="185"/>
     </function>
 
-    <function name="Rotatef" offset="300" es1="1.0" deprecated="3.1">
+    <function name="Rotatef" es1="1.0" deprecated="3.1">
         <param name="angle" type="GLfloat"/>
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
@@ -2996,35 +2931,35 @@
         <glx rop="186"/>
     </function>
 
-    <function name="Scaled" offset="301" deprecated="3.1">
+    <function name="Scaled" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
         <glx rop="187"/>
     </function>
 
-    <function name="Scalef" offset="302" es1="1.0" deprecated="3.1">
+    <function name="Scalef" es1="1.0" deprecated="3.1">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
         <glx rop="188"/>
     </function>
 
-    <function name="Translated" offset="303" deprecated="3.1">
+    <function name="Translated" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
         <glx rop="189"/>
     </function>
 
-    <function name="Translatef" offset="304" es1="1.0" deprecated="3.1">
+    <function name="Translatef" es1="1.0" deprecated="3.1">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
         <glx rop="190"/>
     </function>
 
-    <function name="Viewport" offset="305" es1="1.0" es2="2.0">
+    <function name="Viewport" es1="1.0" es2="2.0">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="width" type="GLsizei"/>
@@ -3207,13 +3142,12 @@
     <enum name="ALL_CLIENT_ATTRIB_BITS"                   value="0xFFFFFFFF"/>
     <enum name="CLIENT_ALL_ATTRIB_BITS"                   value="0xFFFFFFFF"/>
 
-    <function name="ArrayElement" offset="306" deprecated="3.1"
-              exec="dynamic">
+    <function name="ArrayElement" deprecated="3.1" exec="dynamic">
         <param name="i" type="GLint"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="ColorPointer" offset="308" es1="1.0" deprecated="3.1">
+    <function name="ColorPointer" es1="1.0" deprecated="3.1">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -3221,22 +3155,19 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="DisableClientState" offset="309" es1="1.0"
-              deprecated="3.1">
+    <function name="DisableClientState" es1="1.0" deprecated="3.1">
         <param name="array" type="GLenum"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="DrawArrays" offset="310" es1="1.0" es2="2.0"
-              exec="dynamic">
+    <function name="DrawArrays" es1="1.0" es2="2.0" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="first" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <glx rop="193" handcode="true"/>
     </function>
 
-    <function name="DrawElements" offset="311" es1="1.0" es2="2.0"
-              exec="dynamic">
+    <function name="DrawElements" es1="1.0" es2="2.0" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="count" type="GLsizei"/>
         <param name="type" type="GLenum"/>
@@ -3244,45 +3175,45 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="EdgeFlagPointer" offset="312" deprecated="3.1">
+    <function name="EdgeFlagPointer" deprecated="3.1">
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="EnableClientState" offset="313" es1="1.0" deprecated="3.1">
+    <function name="EnableClientState" es1="1.0" deprecated="3.1">
         <param name="array" type="GLenum"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="GetPointerv" offset="329" es1="1.1">
+    <function name="GetPointerv" es1="1.1">
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLvoid **" output="true"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="IndexPointer" offset="314" deprecated="3.1">
+    <function name="IndexPointer" deprecated="3.1">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="InterleavedArrays" offset="317" deprecated="3.1">
+    <function name="InterleavedArrays" deprecated="3.1">
         <param name="format" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="NormalPointer" offset="318" es1="1.0" deprecated="3.1">
+    <function name="NormalPointer" es1="1.0" deprecated="3.1">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="TexCoordPointer" offset="320" es1="1.0" deprecated="3.1">
+    <function name="TexCoordPointer" es1="1.0" deprecated="3.1">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -3290,7 +3221,7 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="VertexPointer" offset="321" es1="1.0" deprecated="3.1">
+    <function name="VertexPointer" es1="1.0" deprecated="3.1">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -3298,13 +3229,13 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="PolygonOffset" offset="319" es1="1.0" es2="2.0">
+    <function name="PolygonOffset" es1="1.0" es2="2.0">
         <param name="factor" type="GLfloat"/>
         <param name="units" type="GLfloat"/>
         <glx rop="192"/>
     </function>
 
-    <function name="CopyTexImage1D" offset="323">
+    <function name="CopyTexImage1D">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -3315,7 +3246,7 @@
         <glx rop="4119"/>
     </function>
 
-    <function name="CopyTexImage2D" offset="324" es1="1.0" es2="2.0">
+    <function name="CopyTexImage2D" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -3327,7 +3258,7 @@
         <glx rop="4120"/>
     </function>
 
-    <function name="CopyTexSubImage1D" offset="325">
+    <function name="CopyTexSubImage1D">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -3337,7 +3268,7 @@
         <glx rop="4121"/>
     </function>
 
-    <function name="CopyTexSubImage2D" offset="326" es1="1.0" es2="2.0">
+    <function name="CopyTexSubImage2D" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -3349,7 +3280,7 @@
         <glx rop="4122"/>
     </function>
 
-    <function name="TexSubImage1D" offset="332">
+    <function name="TexSubImage1D">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -3361,7 +3292,7 @@
         <glx rop="4099" large="true"/>
     </function>
 
-    <function name="TexSubImage2D" offset="333" es1="1.0" es2="2.0">
+    <function name="TexSubImage2D" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -3375,7 +3306,7 @@
         <glx rop="4100" large="true"/>
     </function>
 
-    <function name="AreTexturesResident" offset="322" deprecated="3.1">
+    <function name="AreTexturesResident" deprecated="3.1">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="textures" type="const GLuint *" count="n"/>
         <param name="residences" type="GLboolean *" output="true" count="n"/>
@@ -3383,52 +3314,51 @@
         <glx sop="143" handcode="client" always_array="true"/>
     </function>
 
-    <function name="BindTexture" offset="307" es1="1.0" es2="2.0">
+    <function name="BindTexture" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="texture" type="GLuint"/>
         <glx rop="4117"/>
     </function>
 
-    <function name="DeleteTextures" offset="327" es1="1.0" es2="2.0">
+    <function name="DeleteTextures" es1="1.0" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="textures" type="const GLuint *" count="n"/>
         <glx sop="144"/>
     </function>
 
-    <function name="GenTextures" offset="328" es1="1.0" es2="2.0">
+    <function name="GenTextures" es1="1.0" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="textures" type="GLuint *" output="true" count="n"/>
         <glx sop="145" always_array="true"/>
     </function>
 
-    <function name="IsTexture" offset="330" es1="1.1" es2="2.0">
+    <function name="IsTexture" es1="1.1" es2="2.0">
         <param name="texture" type="GLuint"/>
         <return type="GLboolean"/>
         <glx sop="146"/>
     </function>
 
-    <function name="PrioritizeTextures" offset="331" deprecated="3.1">
+    <function name="PrioritizeTextures" deprecated="3.1">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="textures" type="const GLuint *" count="n"/>
         <param name="priorities" type="const GLclampf *" count="n"/>
         <glx rop="4118"/>
     </function>
 
-    <function name="Indexub" offset="315" vectorequiv="Indexubv"
-              deprecated="3.1">
+    <function name="Indexub" vectorequiv="Indexubv" deprecated="3.1">
         <param name="c" type="GLubyte"/>
     </function>
 
-    <function name="Indexubv" offset="316" deprecated="3.1">
+    <function name="Indexubv" deprecated="3.1">
         <param name="c" type="const GLubyte *" count="1"/>
         <glx rop="194"/>
     </function>
 
-    <function name="PopClientAttrib" offset="334" deprecated="3.1">
+    <function name="PopClientAttrib" deprecated="3.1">
         <glx handcode="true"/>
     </function>
 
-    <function name="PushClientAttrib" offset="335" deprecated="3.1">
+    <function name="PushClientAttrib" deprecated="3.1">
         <param name="mask" type="GLbitfield"/>
         <glx handcode="true"/>
     </function>
@@ -3781,7 +3711,7 @@
     </enum>
 
 
-    <function name="BlendColor" offset="336" es2="2.0">
+    <function name="BlendColor" es2="2.0">
         <param name="red" type="GLclampf"/>
         <param name="green" type="GLclampf"/>
         <param name="blue" type="GLclampf"/>
@@ -3789,13 +3719,12 @@
         <glx rop="4096"/>
     </function>
 
-    <function name="BlendEquation" offset="337" es2="2.0">
+    <function name="BlendEquation" es2="2.0">
         <param name="mode" type="GLenum"/>
         <glx rop="4097"/>
     </function>
 
-    <function name="DrawRangeElements" offset="338" es2="3.0"
-              exec="dynamic">
+    <function name="DrawRangeElements" es2="3.0" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="start" type="GLuint"/>
         <param name="end" type="GLuint"/>
@@ -3805,7 +3734,7 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="ColorTable" offset="339" deprecated="3.1">
+    <function name="ColorTable" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -3815,21 +3744,21 @@
         <glx rop="2053" large="true"/>
     </function>
 
-    <function name="ColorTableParameterfv" offset="340" deprecated="3.1">
+    <function name="ColorTableParameterfv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="2054"/>
     </function>
 
-    <function name="ColorTableParameteriv" offset="341" deprecated="3.1">
+    <function name="ColorTableParameteriv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="2055"/>
     </function>
 
-    <function name="CopyColorTable" offset="342" deprecated="3.1">
+    <function name="CopyColorTable" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="x" type="GLint"/>
@@ -3838,7 +3767,7 @@
         <glx rop="2056"/>
     </function>
 
-    <function name="GetColorTable" offset="343" deprecated="3.1">
+    <function name="GetColorTable" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
@@ -3846,21 +3775,21 @@
         <glx sop="147" dimensions_in_reply="true"/>
     </function>
 
-    <function name="GetColorTableParameterfv" offset="344" deprecated="3.1">
+    <function name="GetColorTableParameterfv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="148"/>
     </function>
 
-    <function name="GetColorTableParameteriv" offset="345" deprecated="3.1">
+    <function name="GetColorTableParameteriv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="149"/>
     </function>
 
-    <function name="ColorSubTable" offset="346" deprecated="3.1">
+    <function name="ColorSubTable" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="start" type="GLsizei"/>
         <param name="count" type="GLsizei"/>
@@ -3870,7 +3799,7 @@
         <glx rop="195" large="true"/>
     </function>
 
-    <function name="CopyColorSubTable" offset="347" deprecated="3.1">
+    <function name="CopyColorSubTable" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="start" type="GLsizei"/>
         <param name="x" type="GLint"/>
@@ -3879,7 +3808,7 @@
         <glx rop="196"/>
     </function>
 
-    <function name="ConvolutionFilter1D" offset="348" deprecated="3.1">
+    <function name="ConvolutionFilter1D" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -3889,7 +3818,7 @@
         <glx rop="4101" large="true"/>
     </function>
 
-    <function name="ConvolutionFilter2D" offset="349" deprecated="3.1">
+    <function name="ConvolutionFilter2D" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -3900,35 +3829,35 @@
         <glx rop="4102" large="true"/>
     </function>
 
-    <function name="ConvolutionParameterf" offset="350" deprecated="3.1">
+    <function name="ConvolutionParameterf" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat"/>
         <glx rop="4103"/>
     </function>
 
-    <function name="ConvolutionParameterfv" offset="351" deprecated="3.1">
+    <function name="ConvolutionParameterfv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="4104"/>
     </function>
 
-    <function name="ConvolutionParameteri" offset="352" deprecated="3.1">
+    <function name="ConvolutionParameteri" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint"/>
         <glx rop="4105"/>
     </function>
 
-    <function name="ConvolutionParameteriv" offset="353" deprecated="3.1">
+    <function name="ConvolutionParameteriv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="4106"/>
     </function>
 
-    <function name="CopyConvolutionFilter1D" offset="354" deprecated="3.1">
+    <function name="CopyConvolutionFilter1D" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="x" type="GLint"/>
@@ -3937,7 +3866,7 @@
         <glx rop="4107"/>
     </function>
 
-    <function name="CopyConvolutionFilter2D" offset="355" deprecated="3.1">
+    <function name="CopyConvolutionFilter2D" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="x" type="GLint"/>
@@ -3947,7 +3876,7 @@
         <glx rop="4108"/>
     </function>
 
-    <function name="GetConvolutionFilter" offset="356" deprecated="3.1">
+    <function name="GetConvolutionFilter" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
@@ -3955,21 +3884,21 @@
         <glx sop="150" dimensions_in_reply="true"/>
     </function>
 
-    <function name="GetConvolutionParameterfv" offset="357" deprecated="3.1">
+    <function name="GetConvolutionParameterfv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="151"/>
     </function>
 
-    <function name="GetConvolutionParameteriv" offset="358" deprecated="3.1">
+    <function name="GetConvolutionParameteriv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="152"/>
     </function>
 
-    <function name="GetSeparableFilter" offset="359" deprecated="3.1">
+    <function name="GetSeparableFilter" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
@@ -3979,7 +3908,7 @@
         <glx sop="153" handcode="true"/>
     </function>
 
-    <function name="SeparableFilter2D" offset="360" deprecated="3.1">
+    <function name="SeparableFilter2D" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -3991,7 +3920,7 @@
         <glx rop="4109" handcode="true"/>
     </function>
 
-    <function name="GetHistogram" offset="361" deprecated="3.1">
+    <function name="GetHistogram" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="reset" type="GLboolean"/>
         <param name="format" type="GLenum"/>
@@ -4000,21 +3929,21 @@
         <glx sop="154" dimensions_in_reply="true" img_reset="reset"/>
     </function>
 
-    <function name="GetHistogramParameterfv" offset="362" deprecated="3.1">
+    <function name="GetHistogramParameterfv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="155"/>
     </function>
 
-    <function name="GetHistogramParameteriv" offset="363" deprecated="3.1">
+    <function name="GetHistogramParameteriv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="156"/>
     </function>
 
-    <function name="GetMinmax" offset="364" deprecated="3.1">
+    <function name="GetMinmax" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="reset" type="GLboolean"/>
         <param name="format" type="GLenum"/>
@@ -4023,21 +3952,21 @@
         <glx sop="157" img_reset="reset"/>
     </function>
 
-    <function name="GetMinmaxParameterfv" offset="365" deprecated="3.1">
+    <function name="GetMinmaxParameterfv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx sop="158"/>
     </function>
 
-    <function name="GetMinmaxParameteriv" offset="366" deprecated="3.1">
+    <function name="GetMinmaxParameteriv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="159"/>
     </function>
 
-    <function name="Histogram" offset="367" deprecated="3.1">
+    <function name="Histogram" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="width" type="GLsizei"/>
         <param name="internalformat" type="GLenum"/>
@@ -4045,24 +3974,24 @@
         <glx rop="4110"/>
     </function>
 
-    <function name="Minmax" offset="368" deprecated="3.1">
+    <function name="Minmax" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="sink" type="GLboolean"/>
         <glx rop="4111"/>
     </function>
 
-    <function name="ResetHistogram" offset="369" deprecated="3.1">
+    <function name="ResetHistogram" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <glx rop="4112"/>
     </function>
 
-    <function name="ResetMinmax" offset="370" deprecated="3.1">
+    <function name="ResetMinmax" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <glx rop="4113"/>
     </function>
 
-    <function name="TexImage3D" offset="371" es2="3.0">
+    <function name="TexImage3D" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLint"/>
@@ -4076,7 +4005,7 @@
         <glx rop="4114" large="true"/>
     </function>
 
-    <function name="TexSubImage3D" offset="372" es2="3.0">
+    <function name="TexSubImage3D" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4092,7 +4021,7 @@
         <glx rop="4115" large="true"/>
     </function>
 
-    <function name="CopyTexSubImage3D" offset="373" es2="3.0">
+    <function name="CopyTexSubImage3D" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4319,33 +4248,28 @@
     <enum name="DOT3_RGB"                                 value="0x86AE"/>
     <enum name="DOT3_RGBA"                                value="0x86AF"/>
 
-    <function name="ActiveTexture" es1="1.0"
-              es2="2.0" offset="374">
+    <function name="ActiveTexture" es1="1.0" es2="2.0">
         <param name="texture" type="GLenum"/>
         <glx rop="197"/>
     </function>
 
-    <function name="ClientActiveTexture"
-              es1="1.0" deprecated="3.1" offset="375">
+    <function name="ClientActiveTexture" es1="1.0" deprecated="3.1">
         <param name="texture" type="GLenum"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="MultiTexCoord1d"
-              deprecated="3.1" offset="376">
+    <function name="MultiTexCoord1d" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLdouble"/>
     </function>
 
-    <function name="MultiTexCoord1dv"
-              deprecated="3.1" offset="377">
+    <function name="MultiTexCoord1dv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLdouble *" count="1"/>
         <glx rop="198"/>
     </function>
 
-    <function name="MultiTexCoord1f" alias="MultiTexCoord1fARB"
-              deprecated="3.1">
+    <function name="MultiTexCoord1f" alias="MultiTexCoord1fARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLfloat"/>
     </function>
@@ -4356,41 +4280,35 @@
         <param name="v" type="const GLfloat *"/>
     </function>
 
-    <function name="MultiTexCoord1i"
-              deprecated="3.1" offset="380">
+    <function name="MultiTexCoord1i" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLint"/>
     </function>
 
-    <function name="MultiTexCoord1iv"
-              deprecated="3.1" offset="381">
+    <function name="MultiTexCoord1iv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLint *" count="1"/>
         <glx rop="200"/>
     </function>
 
-    <function name="MultiTexCoord1s"
-              deprecated="3.1" offset="382">
+    <function name="MultiTexCoord1s" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLshort"/>
     </function>
 
-    <function name="MultiTexCoord1sv"
-              deprecated="3.1" offset="383">
+    <function name="MultiTexCoord1sv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLshort *" count="1"/>
         <glx rop="201"/>
     </function>
 
-    <function name="MultiTexCoord2d"
-              deprecated="3.1" offset="384">
+    <function name="MultiTexCoord2d" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLdouble"/>
         <param name="t" type="GLdouble"/>
     </function>
 
-    <function name="MultiTexCoord2dv"
-              deprecated="3.1" offset="385">
+    <function name="MultiTexCoord2dv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLdouble *" count="2"/>
         <glx rop="202"/>
@@ -4409,44 +4327,38 @@
         <param name="v" type="const GLfloat *"/>
     </function>
 
-    <function name="MultiTexCoord2i"
-              deprecated="3.1" offset="388">
+    <function name="MultiTexCoord2i" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLint"/>
         <param name="t" type="GLint"/>
     </function>
 
-    <function name="MultiTexCoord2iv"
-              deprecated="3.1" offset="389">
+    <function name="MultiTexCoord2iv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLint *" count="2"/>
         <glx rop="204"/>
     </function>
 
-    <function name="MultiTexCoord2s"
-              deprecated="3.1" offset="390">
+    <function name="MultiTexCoord2s" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLshort"/>
         <param name="t" type="GLshort"/>
     </function>
 
-    <function name="MultiTexCoord2sv"
-              deprecated="3.1" offset="391">
+    <function name="MultiTexCoord2sv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLshort *" count="2"/>
         <glx rop="205"/>
     </function>
 
-    <function name="MultiTexCoord3d"
-              deprecated="3.1" offset="392">
+    <function name="MultiTexCoord3d" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLdouble"/>
         <param name="t" type="GLdouble"/>
         <param name="r" type="GLdouble"/>
     </function>
 
-    <function name="MultiTexCoord3dv"
-              deprecated="3.1" offset="393">
+    <function name="MultiTexCoord3dv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLdouble *" count="3"/>
         <glx rop="206"/>
@@ -4466,38 +4378,33 @@
         <param name="v" type="const GLfloat *"/>
     </function>
 
-    <function name="MultiTexCoord3i"
-              deprecated="3.1" offset="396">
+    <function name="MultiTexCoord3i" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLint"/>
         <param name="t" type="GLint"/>
         <param name="r" type="GLint"/>
     </function>
 
-    <function name="MultiTexCoord3iv"
-              deprecated="3.1" offset="397">
+    <function name="MultiTexCoord3iv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLint *" count="3"/>
         <glx rop="208"/>
     </function>
 
-    <function name="MultiTexCoord3s"
-              deprecated="3.1" offset="398">
+    <function name="MultiTexCoord3s" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLshort"/>
         <param name="t" type="GLshort"/>
         <param name="r" type="GLshort"/>
     </function>
 
-    <function name="MultiTexCoord3sv"
-              deprecated="3.1" offset="399">
+    <function name="MultiTexCoord3sv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLshort *" count="3"/>
         <glx rop="209"/>
     </function>
 
-    <function name="MultiTexCoord4d"
-              deprecated="3.1" offset="400">
+    <function name="MultiTexCoord4d" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLdouble"/>
         <param name="t" type="GLdouble"/>
@@ -4505,8 +4412,7 @@
         <param name="q" type="GLdouble"/>
     </function>
 
-    <function name="MultiTexCoord4dv"
-              deprecated="3.1" offset="401">
+    <function name="MultiTexCoord4dv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLdouble *" count="4"/>
         <glx rop="210"/>
@@ -4527,8 +4433,7 @@
         <param name="v" type="const GLfloat *"/>
     </function>
 
-    <function name="MultiTexCoord4i"
-              deprecated="3.1" offset="404">
+    <function name="MultiTexCoord4i" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLint"/>
         <param name="t" type="GLint"/>
@@ -4536,15 +4441,13 @@
         <param name="q" type="GLint"/>
     </function>
 
-    <function name="MultiTexCoord4iv"
-              deprecated="3.1" offset="405">
+    <function name="MultiTexCoord4iv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLint *" count="4"/>
         <glx rop="212"/>
     </function>
 
-    <function name="MultiTexCoord4s"
-              deprecated="3.1" offset="406">
+    <function name="MultiTexCoord4s" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLshort"/>
         <param name="t" type="GLshort"/>
@@ -4552,45 +4455,39 @@
         <param name="q" type="GLshort"/>
     </function>
 
-    <function name="MultiTexCoord4sv"
-              deprecated="3.1" offset="407">
+    <function name="MultiTexCoord4sv" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLshort *" count="4"/>
         <glx rop="213"/>
     </function>
 
-    <function name="LoadTransposeMatrixf"
-              deprecated="3.1" offset="assign">
+    <function name="LoadTransposeMatrixf" deprecated="3.1">
         <param name="m" type="const GLfloat *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="LoadTransposeMatrixd"
-              deprecated="3.1" offset="assign">
+    <function name="LoadTransposeMatrixd" deprecated="3.1">
         <param name="m" type="const GLdouble *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="MultTransposeMatrixf"
-              deprecated="3.1" offset="assign">
+    <function name="MultTransposeMatrixf" deprecated="3.1">
         <param name="m" type="const GLfloat *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="MultTransposeMatrixd"
-              deprecated="3.1" offset="assign">
+    <function name="MultTransposeMatrixd" deprecated="3.1">
         <param name="m" type="const GLdouble *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="SampleCoverage" es1="1.0"
-              es2="2.0" offset="assign">
+    <function name="SampleCoverage" es1="1.0" es2="2.0">
         <param name="value" type="GLclampf"/>
         <param name="invert" type="GLboolean"/>
         <glx rop="229"/>
     </function>
 
-    <function name="CompressedTexImage3D" es2="3.0" offset="assign">
+    <function name="CompressedTexImage3D" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -4603,8 +4500,7 @@
         <glx rop="216" handcode="client"/>
     </function>
 
-    <function name="CompressedTexImage2D"
-              es1="1.0" es2="2.0" offset="assign">
+    <function name="CompressedTexImage2D" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -4616,7 +4512,7 @@
         <glx rop="215" handcode="client"/>
     </function>
 
-    <function name="CompressedTexImage1D" offset="assign">
+    <function name="CompressedTexImage1D">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="internalformat" type="GLenum"/>
@@ -4627,7 +4523,7 @@
         <glx rop="214" handcode="client"/>
     </function>
 
-    <function name="CompressedTexSubImage3D" es2="3.0" offset="assign">
+    <function name="CompressedTexSubImage3D" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4642,8 +4538,7 @@
         <glx rop="219" handcode="client"/>
     </function>
 
-    <function name="CompressedTexSubImage2D"
-              es1="1.0" es2="2.0" offset="assign">
+    <function name="CompressedTexSubImage2D" es1="1.0" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4656,7 +4551,7 @@
         <glx rop="218" handcode="client"/>
     </function>
 
-    <function name="CompressedTexSubImage1D" offset="assign">
+    <function name="CompressedTexSubImage1D">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="xoffset" type="GLint"/>
@@ -4667,7 +4562,7 @@
         <glx rop="217" handcode="client"/>
     </function>
 
-    <function name="GetCompressedTexImage" offset="assign">
+    <function name="GetCompressedTexImage">
         <param name="target" type="GLenum"/>
         <param name="level" type="GLint"/>
         <param name="img" type="GLvoid *" output="true"/>
@@ -4790,7 +4685,7 @@
     </enum>
     <enum name="COMPARE_R_TO_TEXTURE"                     value="0x884E"/>
 
-    <function name="BlendFuncSeparate" es2="2.0" offset="assign">
+    <function name="BlendFuncSeparate" es2="2.0">
         <param name="sfactorRGB" type="GLenum"/>
         <param name="dfactorRGB" type="GLenum"/>
         <param name="sfactorAlpha" type="GLenum"/>
@@ -4806,24 +4701,23 @@
         <param name="coord" type="const GLfloat *"/>
     </function>
 
-    <function name="FogCoordd" deprecated="3.1" offset="assign">
+    <function name="FogCoordd" deprecated="3.1">
         <param name="coord" type="GLdouble"/>
     </function>
 
-    <function name="FogCoorddv" deprecated="3.1" offset="assign">
+    <function name="FogCoorddv" deprecated="3.1">
         <param name="coord" type="const GLdouble *" count="1"/>
         <glx rop="4125"/>
     </function>
 
-    <function name="FogCoordPointer"
-              deprecated="3.1" offset="assign">
+    <function name="FogCoordPointer" deprecated="3.1">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="MultiDrawArrays" offset="assign">
+    <function name="MultiDrawArrays">
         <param name="mode" type="GLenum"/>
         <param name="first" type="const GLint *"/>
         <param name="count" type="const GLsizei *"/>
@@ -4839,52 +4733,48 @@
         <param name="primcount" type="GLsizei"/>
     </function>
 
-    <function name="PointParameterf" es1="1.1" offset="assign">
+    <function name="PointParameterf" es1="1.1">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
         <glx rop="2065"/>
     </function>
 
-    <function name="PointParameterfv" es1="1.1" offset="assign">
+    <function name="PointParameterfv" es1="1.1">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *" variable_param="pname"/>
         <glx rop="2066"/>
     </function>
 
-    <function name="PointParameteri" offset="assign">
+    <function name="PointParameteri">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
         <glx rop="4221"/>
     </function>
 
-    <function name="PointParameteriv" offset="assign">
+    <function name="PointParameteriv">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *" variable_param="pname"/>
         <glx rop="4222"/>
     </function>
 
-    <function name="SecondaryColor3b"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3b" deprecated="3.1">
         <param name="red" type="GLbyte"/>
         <param name="green" type="GLbyte"/>
         <param name="blue" type="GLbyte"/>
     </function>
 
-    <function name="SecondaryColor3bv"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3bv" deprecated="3.1">
         <param name="v" type="const GLbyte *" count="3"/>
         <glx rop="4126"/>
     </function>
 
-    <function name="SecondaryColor3d"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3d" deprecated="3.1">
         <param name="red" type="GLdouble"/>
         <param name="green" type="GLdouble"/>
         <param name="blue" type="GLdouble"/>
     </function>
 
-    <function name="SecondaryColor3dv"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3dv" deprecated="3.1">
         <param name="v" type="const GLdouble *" count="3"/>
         <glx rop="4130"/>
     </function>
@@ -4901,73 +4791,62 @@
         <param name="v" type="const GLfloat *"/>
     </function>
 
-    <function name="SecondaryColor3i"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3i" deprecated="3.1">
         <param name="red" type="GLint"/>
         <param name="green" type="GLint"/>
         <param name="blue" type="GLint"/>
     </function>
 
-    <function name="SecondaryColor3iv"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3iv" deprecated="3.1">
         <param name="v" type="const GLint *" count="3"/>
         <glx rop="4128"/>
     </function>
 
-    <function name="SecondaryColor3s"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3s" deprecated="3.1">
         <param name="red" type="GLshort"/>
         <param name="green" type="GLshort"/>
         <param name="blue" type="GLshort"/>
     </function>
 
-    <function name="SecondaryColor3sv"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3sv" deprecated="3.1">
         <param name="v" type="const GLshort *" count="3"/>
         <glx rop="4127"/>
     </function>
 
-    <function name="SecondaryColor3ub"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3ub" deprecated="3.1">
         <param name="red" type="GLubyte"/>
         <param name="green" type="GLubyte"/>
         <param name="blue" type="GLubyte"/>
     </function>
 
-    <function name="SecondaryColor3ubv"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3ubv" deprecated="3.1">
         <param name="v" type="const GLubyte *" count="3"/>
         <glx rop="4131"/>
     </function>
 
-    <function name="SecondaryColor3ui"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3ui" deprecated="3.1">
         <param name="red" type="GLuint"/>
         <param name="green" type="GLuint"/>
         <param name="blue" type="GLuint"/>
     </function>
 
-    <function name="SecondaryColor3uiv"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3uiv" deprecated="3.1">
         <param name="v" type="const GLuint *" count="3"/>
         <glx rop="4133"/>
     </function>
 
-    <function name="SecondaryColor3us"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3us" deprecated="3.1">
         <param name="red" type="GLushort"/>
         <param name="green" type="GLushort"/>
         <param name="blue" type="GLushort"/>
     </function>
 
-    <function name="SecondaryColor3usv"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColor3usv" deprecated="3.1">
         <param name="v" type="const GLushort *" count="3"/>
         <glx rop="4132"/>
     </function>
 
-    <function name="SecondaryColorPointer"
-              deprecated="3.1" offset="assign">
+    <function name="SecondaryColorPointer" deprecated="3.1">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -4975,93 +4854,93 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos2d" deprecated="3.1" offset="assign">
+    <function name="WindowPos2d" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos2dv" deprecated="3.1" offset="assign">
+    <function name="WindowPos2dv" deprecated="3.1">
         <param name="v" type="const GLdouble *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos2f" deprecated="3.1" offset="assign">
+    <function name="WindowPos2f" deprecated="3.1">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos2fv" deprecated="3.1" offset="assign">
+    <function name="WindowPos2fv" deprecated="3.1">
         <param name="v" type="const GLfloat *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos2i" deprecated="3.1" offset="assign">
+    <function name="WindowPos2i" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos2iv" deprecated="3.1" offset="assign">
+    <function name="WindowPos2iv" deprecated="3.1">
         <param name="v" type="const GLint *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos2s" deprecated="3.1" offset="assign">
+    <function name="WindowPos2s" deprecated="3.1">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos2sv" deprecated="3.1" offset="assign">
+    <function name="WindowPos2sv" deprecated="3.1">
         <param name="v" type="const GLshort *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos3d" deprecated="3.1" offset="assign">
+    <function name="WindowPos3d" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos3dv" deprecated="3.1" offset="assign">
+    <function name="WindowPos3dv" deprecated="3.1">
         <param name="v" type="const GLdouble *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos3f" deprecated="3.1" offset="assign">
+    <function name="WindowPos3f" deprecated="3.1">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
     </function>
 
-    <function name="WindowPos3fv" deprecated="3.1" offset="assign">
+    <function name="WindowPos3fv" deprecated="3.1">
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="230"/>
     </function>
 
-    <function name="WindowPos3i" deprecated="3.1" offset="assign">
+    <function name="WindowPos3i" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="z" type="GLint"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos3iv" deprecated="3.1" offset="assign">
+    <function name="WindowPos3iv" deprecated="3.1">
         <param name="v" type="const GLint *"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos3s" deprecated="3.1" offset="assign">
+    <function name="WindowPos3s" deprecated="3.1">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <param name="z" type="GLshort"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="WindowPos3sv" deprecated="3.1" offset="assign">
+    <function name="WindowPos3sv" deprecated="3.1">
         <param name="v" type="const GLshort *"/>
         <glx handcode="true"/>
     </function>
@@ -5132,13 +5011,13 @@
     <type name="intptr"   size="4"                  glx_name="CARD32"/>
     <type name="sizeiptr" size="4"  unsigned="true" glx_name="CARD32"/>
 
-    <function name="BindBuffer" es1="1.1" es2="2.0" offset="assign">
+    <function name="BindBuffer" es1="1.1" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="buffer" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="BufferData" es1="1.1" es2="2.0" offset="assign">
+    <function name="BufferData" es1="1.1" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="size" type="GLsizeiptr" counter="true"/>
         <param name="data" type="const GLvoid *" count="size" img_null_flag="true"/>
@@ -5146,8 +5025,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="BufferSubData" es1="1.1"
-              es2="2.0" offset="assign">
+    <function name="BufferSubData" es1="1.1" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="size" type="GLsizeiptr" counter="true"/>
@@ -5155,35 +5033,33 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="DeleteBuffers" es1="1.1"
-              es2="2.0" offset="assign">
+    <function name="DeleteBuffers" es1="1.1" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="buffer" type="const GLuint *" count="n"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GenBuffers" es1="1.1" es2="2.0" offset="assign">
+    <function name="GenBuffers" es1="1.1" es2="2.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="buffer" type="GLuint *" output="true" count="n"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetBufferParameteriv"
-              es1="1.1" es2="2.0" offset="assign">
+    <function name="GetBufferParameteriv" es1="1.1" es2="2.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetBufferPointerv" es2="3.0" offset="assign">
+    <function name="GetBufferPointerv" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLvoid **" output="true"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetBufferSubData" offset="assign">
+    <function name="GetBufferSubData">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="size" type="GLsizeiptr" counter="true"/>
@@ -5191,69 +5067,69 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="IsBuffer" es1="1.1" es2="2.0" offset="assign">
+    <function name="IsBuffer" es1="1.1" es2="2.0">
         <param name="buffer" type="GLuint"/>
         <return type="GLboolean"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="MapBuffer" offset="assign">
+    <function name="MapBuffer">
         <param name="target" type="GLenum"/>
         <param name="access" type="GLenum"/>
         <return type="GLvoid *"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="UnmapBuffer" es2="3.0" offset="assign">
+    <function name="UnmapBuffer" es2="3.0">
         <param name="target" type="GLenum"/>
         <return type="GLboolean"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GenQueries" es2="3.0" offset="assign">
+    <function name="GenQueries" es2="3.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="ids" type="GLuint *" output="true" count="n"/>
         <glx sop="162" always_array="true"/>
     </function>
 
-    <function name="DeleteQueries" es2="3.0" offset="assign">
+    <function name="DeleteQueries" es2="3.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="ids" type="const GLuint *" count="n"/>
         <glx sop="161"/>
     </function>
 
-    <function name="IsQuery" es2="3.0" offset="assign">
+    <function name="IsQuery" es2="3.0">
         <param name="id" type="GLuint"/>
         <return type="GLboolean"/>
         <glx sop="163"/>
     </function>
 
-    <function name="BeginQuery" es2="3.0" offset="assign">
+    <function name="BeginQuery" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="id" type="GLuint"/>
         <glx rop="231"/>
     </function>
 
-    <function name="EndQuery" es2="3.0" offset="assign">
+    <function name="EndQuery" es2="3.0">
         <param name="target" type="GLenum"/>
         <glx rop="232"/>
     </function>
 
-    <function name="GetQueryiv" es2="3.0" offset="assign">
+    <function name="GetQueryiv" es2="3.0">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="164"/>
     </function>
 
-    <function name="GetQueryObjectiv" offset="assign">
+    <function name="GetQueryObjectiv">
         <param name="id" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx sop="165"/>
     </function>
 
-    <function name="GetQueryObjectuiv" es2="3.0" offset="assign">
+    <function name="GetQueryObjectuiv" es2="3.0">
         <param name="id" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLuint *" output="true" variable_param="pname"/>
@@ -5356,20 +5232,19 @@
     <enum name="STENCIL_BACK_VALUE_MASK"          value="0x8CA4"/>
     <enum name="STENCIL_BACK_WRITEMASK"           value="0x8CA5"/>
 
-    <function name="BlendEquationSeparate"
-              es2="2.0" offset="assign">
+    <function name="BlendEquationSeparate" es2="2.0">
         <param name="modeRGB" type="GLenum"/>
         <param name="modeA" type="GLenum"/>
         <glx rop="4228"/>
     </function>
 
-    <function name="DrawBuffers" es2="3.0" offset="assign">
+    <function name="DrawBuffers" es2="3.0">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="bufs" type="const GLenum *" count="n"/>
         <glx rop="233" large="true"/>
     </function>
 
-    <function name="StencilFuncSeparate" offset="assign" es2="2.0">
+    <function name="StencilFuncSeparate" es2="2.0">
         <param name="face" type="GLenum"/>
         <param name="func" type="GLenum"/>
         <param name="ref" type="GLint"/>
@@ -5377,7 +5252,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="StencilOpSeparate" offset="assign" es2="2.0">
+    <function name="StencilOpSeparate" es2="2.0">
         <param name="face" type="GLenum"/>
         <param name="sfail" type="GLenum"/>
         <param name="zfail" type="GLenum"/>
@@ -5385,72 +5260,70 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="StencilMaskSeparate" offset="assign" es2="2.0">
+    <function name="StencilMaskSeparate" es2="2.0">
         <param name="face" type="GLenum"/>
         <param name="mask" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="AttachShader" offset="assign" es2="2.0">
+    <function name="AttachShader" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="shader" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="BindAttribLocation"
-              es2="2.0" offset="assign">
+    <function name="BindAttribLocation" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="index" type="GLuint"/>
         <param name="name" type="const GLchar *"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="CompileShader" es2="2.0" offset="assign">
+    <function name="CompileShader" es2="2.0">
         <param name="shader" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="CreateProgram" offset="assign" es2="2.0">
+    <function name="CreateProgram" es2="2.0">
         <return type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="CreateShader" offset="assign" es2="2.0">
+    <function name="CreateShader" es2="2.0">
         <param name="type" type="GLenum"/>
         <return type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="DeleteProgram" offset="assign" es2="2.0">
+    <function name="DeleteProgram" es2="2.0">
         <param name="program" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="DeleteShader" offset="assign" es2="2.0">
+    <function name="DeleteShader" es2="2.0">
         <param name="program" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="DetachShader" offset="assign" es2="2.0">
+    <function name="DetachShader" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="shader" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="DisableVertexAttribArray" es2="2.0" offset="assign">
+    <function name="DisableVertexAttribArray" es2="2.0">
         <param name="index" type="GLuint"/>
         <glx ignore="true"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="EnableVertexAttribArray"
-              es2="2.0" offset="assign">
+    <function name="EnableVertexAttribArray" es2="2.0">
         <param name="index" type="GLuint"/>
         <glx ignore="true"/>
         <glx handcode="true"/>
     </function>
 
-    <function name="GetActiveAttrib" es2="2.0" offset="assign">
+    <function name="GetActiveAttrib" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="index" type="GLuint"/>
         <param name="bufSize" type="GLsizei "/>
@@ -5461,7 +5334,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="GetActiveUniform" es2="2.0" offset="assign">
+    <function name="GetActiveUniform" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="index" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
@@ -5472,7 +5345,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="GetAttachedShaders" offset="assign" es2="2.0">
+    <function name="GetAttachedShaders" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="maxCount" type="GLsizei"/>
         <param name="count" type="GLsizei *" output="true"/>
@@ -5480,21 +5353,21 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="GetAttribLocation" es2="2.0" offset="assign">
+    <function name="GetAttribLocation" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="name" type="const GLchar *"/>
         <return type="GLint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetProgramiv" offset="assign" es2="2.0">
+    <function name="GetProgramiv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetProgramInfoLog" offset="assign" es2="2.0">
+    <function name="GetProgramInfoLog" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="length" type="GLsizei *"/>
@@ -5502,14 +5375,14 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="GetShaderiv" offset="assign" es2="2.0">
+    <function name="GetShaderiv" es2="2.0">
         <param name="shader" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetShaderInfoLog" offset="assign" es2="2.0">
+    <function name="GetShaderInfoLog" es2="2.0">
         <param name="shader" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="length" type="GLsizei *"/>
@@ -5517,7 +5390,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="GetShaderSource" es2="2.0" offset="assign">
+    <function name="GetShaderSource" es2="2.0">
         <param name="shader" type="GLuint"/>
         <param name="bufSize" type="GLsizei"/>
         <param name="length" type="GLsizei *" output="true"/>
@@ -5525,29 +5398,28 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="GetUniformLocation"
-              es2="2.0" offset="assign">
+    <function name="GetUniformLocation" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="name" type="const GLchar *"/>
         <return type="GLint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetUniformfv" es2="2.0" offset="assign">
+    <function name="GetUniformfv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="params" type="GLfloat *" output="true"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetUniformiv" es2="2.0" offset="assign">
+    <function name="GetUniformiv" es2="2.0">
         <param name="program" type="GLuint"/>
         <param name="location" type="GLint"/>
         <param name="params" type="GLint *" output="true"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetVertexAttribdv" offset="assign">
+    <function name="GetVertexAttribdv">
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLdouble *" output="true" variable_param="pname"/>
@@ -5555,7 +5427,7 @@
         <glx handcode="client" vendorpriv="1301"/>
     </function>
 
-    <function name="GetVertexAttribfv" es2="2.0" offset="assign">
+    <function name="GetVertexAttribfv" es2="2.0">
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
@@ -5563,7 +5435,7 @@
         <glx handcode="client" vendorpriv="1302"/>
     </function>
 
-    <function name="GetVertexAttribiv" es2="2.0" offset="assign">
+    <function name="GetVertexAttribiv" es2="2.0">
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
@@ -5571,8 +5443,7 @@
         <glx handcode="client" vendorpriv="1303"/>
     </function>
 
-    <function name="GetVertexAttribPointerv"
-              es2="2.0" offset="assign">
+    <function name="GetVertexAttribPointerv" es2="2.0">
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="pointer" type="GLvoid **" output="true"/>
@@ -5580,24 +5451,24 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="IsProgram" offset="assign" es2="2.0">
+    <function name="IsProgram" es2="2.0">
         <param name="program" type="GLuint"/>
         <return type="GLboolean"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="IsShader" offset="assign" es2="2.0">
+    <function name="IsShader" es2="2.0">
         <param name="shader" type="GLuint"/>
         <return type="GLboolean"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="LinkProgram" es2="2.0" offset="assign">
+    <function name="LinkProgram" es2="2.0">
         <param name="program" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="ShaderSource" es2="2.0" offset="assign">
+    <function name="ShaderSource" es2="2.0">
         <param name="shader" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="string" type="const GLchar * const *"/>
@@ -5605,30 +5476,30 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="UseProgram" es2="2.0" offset="assign">
+    <function name="UseProgram" es2="2.0">
         <param name="program" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="Uniform1f" es2="2.0" offset="assign">
+    <function name="Uniform1f" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="v0" type="GLfloat"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform2f" es2="2.0" offset="assign">
+    <function name="Uniform2f" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="v0" type="GLfloat"/>
         <param name="v1" type="GLfloat"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform3f" es2="2.0" offset="assign">
+    <function name="Uniform3f" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="v0" type="GLfloat"/>
         <param name="v1" type="GLfloat"/>
         <param name="v2" type="GLfloat"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform4f" es2="2.0" offset="assign">
+    <function name="Uniform4f" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="v0" type="GLfloat"/>
         <param name="v1" type="GLfloat"/>
@@ -5637,25 +5508,25 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="Uniform1i" es2="2.0" offset="assign">
+    <function name="Uniform1i" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="v0" type="GLint"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform2i" es2="2.0" offset="assign">
+    <function name="Uniform2i" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="v0" type="GLint"/>
         <param name="v1" type="GLint"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform3i" es2="2.0" offset="assign">
+    <function name="Uniform3i" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="v0" type="GLint"/>
         <param name="v1" type="GLint"/>
         <param name="v2" type="GLint"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform4i" es2="2.0" offset="assign">
+    <function name="Uniform4i" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="v0" type="GLint"/>
         <param name="v1" type="GLint"/>
@@ -5664,71 +5535,71 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="Uniform1fv" es2="2.0" offset="assign">
+    <function name="Uniform1fv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="value" type="const GLfloat *" count="count"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform2fv" es2="2.0" offset="assign">
+    <function name="Uniform2fv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="2"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform3fv" es2="2.0" offset="assign">
+    <function name="Uniform3fv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="3"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform4fv" es2="2.0" offset="assign">
+    <function name="Uniform4fv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="4"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="Uniform1iv" es2="2.0" offset="assign">
+    <function name="Uniform1iv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="value" type="const GLint *" count="count"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform2iv" es2="2.0" offset="assign">
+    <function name="Uniform2iv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="value" type="const GLint *" count="count" count_scale="2"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform3iv" es2="2.0" offset="assign">
+    <function name="Uniform3iv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="value" type="const GLint *" count="count" count_scale="3"/>
         <glx ignore="true"/>
     </function>
-    <function name="Uniform4iv" es2="2.0" offset="assign">
+    <function name="Uniform4iv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="value" type="const GLint *" count="count" count_scale="4"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="UniformMatrix2fv" es2="2.0" offset="assign">
+    <function name="UniformMatrix2fv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="4"/>
         <glx ignore="true"/>
     </function>
-    <function name="UniformMatrix3fv" es2="2.0" offset="assign">
+    <function name="UniformMatrix3fv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="9"/>
         <glx ignore="true"/>
     </function>
-    <function name="UniformMatrix4fv" es2="2.0" offset="assign">
+    <function name="UniformMatrix4fv" es2="2.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="transpose" type="GLboolean"/>
@@ -5736,16 +5607,16 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="ValidateProgram" es2="2.0" offset="assign">
+    <function name="ValidateProgram" es2="2.0">
         <param name="program" type="GLuint"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="VertexAttrib1d" offset="assign">
+    <function name="VertexAttrib1d">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
     </function>
-    <function name="VertexAttrib1dv" offset="assign">
+    <function name="VertexAttrib1dv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *" count="1"/>
         <glx rop="4197" doubles_in_order="true"/>
@@ -5758,22 +5629,22 @@
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *"/>
     </function>
-    <function name="VertexAttrib1s" offset="assign">
+    <function name="VertexAttrib1s">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLshort"/>
     </function>
-    <function name="VertexAttrib1sv" offset="assign">
+    <function name="VertexAttrib1sv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLshort *" count="1"/>
         <glx rop="4189"/>
     </function>
 
-    <function name="VertexAttrib2d" offset="assign">
+    <function name="VertexAttrib2d">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
     </function>
-    <function name="VertexAttrib2dv" offset="assign">
+    <function name="VertexAttrib2dv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *" count="2"/>
         <glx rop="4198" doubles_in_order="true"/>
@@ -5787,24 +5658,24 @@
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *"/>
     </function>
-    <function name="VertexAttrib2s" offset="assign">
+    <function name="VertexAttrib2s">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
     </function>
-    <function name="VertexAttrib2sv" offset="assign">
+    <function name="VertexAttrib2sv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLshort *" count="2"/>
         <glx rop="4190"/>
     </function>
 
-    <function name="VertexAttrib3d" offset="assign">
+    <function name="VertexAttrib3d">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
     </function>
-    <function name="VertexAttrib3dv" offset="assign">
+    <function name="VertexAttrib3dv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *" count="3"/>
         <glx rop="4199" doubles_in_order="true"/>
@@ -5819,68 +5690,68 @@
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *"/>
     </function>
-    <function name="VertexAttrib3s" offset="assign">
+    <function name="VertexAttrib3s">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <param name="z" type="GLshort"/>
     </function>
-    <function name="VertexAttrib3sv" offset="assign">
+    <function name="VertexAttrib3sv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLshort *" count="3"/>
         <glx rop="4191"/>
     </function>
 
-    <function name="VertexAttrib4Nbv" offset="assign">
+    <function name="VertexAttrib4Nbv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLbyte *" count="4"/>
         <glx rop="4235"/>
     </function>
-    <function name="VertexAttrib4Niv" offset="assign">
+    <function name="VertexAttrib4Niv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLint *" count="4"/>
         <glx rop="4237"/>
     </function>
-    <function name="VertexAttrib4Nsv" offset="assign">
+    <function name="VertexAttrib4Nsv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLshort *" count="4"/>
         <glx rop="4236"/>
     </function>
-    <function name="VertexAttrib4Nub" offset="assign">
+    <function name="VertexAttrib4Nub">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLubyte"/>
         <param name="y" type="GLubyte"/>
         <param name="z" type="GLubyte"/>
         <param name="w" type="GLubyte"/>
     </function>
-    <function name="VertexAttrib4Nubv" offset="assign">
+    <function name="VertexAttrib4Nubv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLubyte *" count="4"/>
         <glx rop="4201"/>
     </function>
-    <function name="VertexAttrib4Nuiv" offset="assign">
+    <function name="VertexAttrib4Nuiv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLuint *" count="4"/>
         <glx rop="4239"/>
     </function>
-    <function name="VertexAttrib4Nusv" offset="assign">
+    <function name="VertexAttrib4Nusv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLushort *" count="4"/>
         <glx rop="4238"/>
     </function>
-    <function name="VertexAttrib4bv" offset="assign">
+    <function name="VertexAttrib4bv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLbyte *" count="4"/>
         <glx rop="4230"/>
     </function>
-    <function name="VertexAttrib4d" offset="assign">
+    <function name="VertexAttrib4d">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
         <param name="w" type="GLdouble"/>
     </function>
-    <function name="VertexAttrib4dv" offset="assign">
+    <function name="VertexAttrib4dv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *" count="4"/>
         <glx rop="4200" doubles_in_order="true"/>
@@ -5896,41 +5767,40 @@
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *"/>
     </function>
-    <function name="VertexAttrib4iv" offset="assign">
+    <function name="VertexAttrib4iv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLint *" count="4"/>
         <glx rop="4231"/>
     </function>
-    <function name="VertexAttrib4s" offset="assign">
+    <function name="VertexAttrib4s">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <param name="z" type="GLshort"/>
         <param name="w" type="GLshort"/>
     </function>
-    <function name="VertexAttrib4sv" offset="assign">
+    <function name="VertexAttrib4sv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLshort *" count="4"/>
         <glx rop="4192"/>
     </function>
-    <function name="VertexAttrib4ubv" offset="assign">
+    <function name="VertexAttrib4ubv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLubyte *" count="4"/>
         <glx rop="4232"/>
     </function>
-    <function name="VertexAttrib4uiv" offset="assign">
+    <function name="VertexAttrib4uiv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLuint *" count="4"/>
         <glx rop="4234"/>
     </function>
-    <function name="VertexAttrib4usv" offset="assign">
+    <function name="VertexAttrib4usv">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLushort *" count="4"/>
         <glx rop="4233"/>
     </function>
 
-    <function name="VertexAttribPointer"
-              es2="2.0" offset="assign">
+    <function name="VertexAttribPointer" es2="2.0">
         <param name="index" type="GLuint"/>
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
@@ -5971,42 +5841,42 @@
     <enum name="COMPRESSED_SLUMINANCE"          value="0x8C4A"/>
     <enum name="COMPRESSED_SLUMINANCE_ALPHA"    value="0x8C4B"/>
 
-    <function name="UniformMatrix2x3fv" offset="assign" es2="3.0">
+    <function name="UniformMatrix2x3fv" es2="3.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="6"/>
         <glx ignore="true"/>
     </function>
-    <function name="UniformMatrix3x2fv" offset="assign" es2="3.0">
+    <function name="UniformMatrix3x2fv" es2="3.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="6"/>
         <glx ignore="true"/>
     </function>
-    <function name="UniformMatrix2x4fv" offset="assign" es2="3.0">
+    <function name="UniformMatrix2x4fv" es2="3.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="6"/>
         <glx ignore="true"/>
     </function>
-    <function name="UniformMatrix4x2fv" offset="assign" es2="3.0">
+    <function name="UniformMatrix4x2fv" es2="3.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="8"/>
         <glx ignore="true"/>
     </function>
-    <function name="UniformMatrix3x4fv" offset="assign" es2="3.0">
+    <function name="UniformMatrix3x4fv" es2="3.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="transpose" type="GLboolean"/>
         <param name="value" type="const GLfloat *" count="count" count_scale="12"/>
         <glx ignore="true"/>
     </function>
-    <function name="UniformMatrix4x3fv" offset="assign" es2="3.0">
+    <function name="UniformMatrix4x3fv" es2="3.0">
         <param name="location" type="GLint"/>
         <param name="count" type="GLsizei" counter="true"/>
         <param name="transpose" type="GLboolean"/>
@@ -6081,13 +5951,13 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="MultiTexCoord1fARB" offset="378"
+    <function name="MultiTexCoord1fARB"
               vectorequiv="MultiTexCoord1fvARB" exec="dynamic">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLfloat"/>
     </function>
 
-    <function name="MultiTexCoord1fvARB" offset="379" exec="dynamic">
+    <function name="MultiTexCoord1fvARB" exec="dynamic">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLfloat *" count="1"/>
         <glx rop="199"/>
@@ -6127,14 +5997,14 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="MultiTexCoord2fARB" offset="386"
+    <function name="MultiTexCoord2fARB"
               vectorequiv="MultiTexCoord2fvARB" exec="dynamic">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLfloat"/>
         <param name="t" type="GLfloat"/>
     </function>
 
-    <function name="MultiTexCoord2fvARB" offset="387" exec="dynamic">
+    <function name="MultiTexCoord2fvARB" exec="dynamic">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLfloat *" count="2"/>
         <glx rop="203"/>
@@ -6177,7 +6047,7 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="MultiTexCoord3fARB" offset="394"
+    <function name="MultiTexCoord3fARB"
               vectorequiv="MultiTexCoord3fvARB" exec="dynamic">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLfloat"/>
@@ -6185,7 +6055,7 @@
         <param name="r" type="GLfloat"/>
     </function>
 
-    <function name="MultiTexCoord3fvARB" offset="395" exec="dynamic">
+    <function name="MultiTexCoord3fvARB" exec="dynamic">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="207"/>
@@ -6231,7 +6101,7 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="MultiTexCoord4fARB" offset="402"
+    <function name="MultiTexCoord4fARB"
               vectorequiv="MultiTexCoord4fvARB" exec="dynamic">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLfloat"/>
@@ -6240,7 +6110,7 @@
         <param name="q" type="GLfloat"/>
     </function>
 
-    <function name="MultiTexCoord4fvARB" offset="403" exec="dynamic">
+    <function name="MultiTexCoord4fvARB" exec="dynamic">
         <param name="target" type="GLenum"/>
         <param name="v" type="const GLfloat *" count="4"/>
         <glx rop="211"/>
@@ -6672,7 +6542,7 @@
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
-        <glx handcode="true"/>
+        <glx ignore="true" handcode="true"/>
     </function>
 
     <function name="VertexBlendARB" exec="skip">
@@ -6738,7 +6608,7 @@
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="pointer" type="const GLvoid *"/>
-        <glx handcode="true"/>
+        <glx ignore="true" handcode="true"/>
     </function>
 </category>
 
@@ -7108,13 +6978,13 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="VertexAttrib1fARB" offset="assign"
-              vectorequiv="VertexAttrib1fvARB" exec="dynamic">
+    <function name="VertexAttrib1fARB"
+	      vectorequiv="VertexAttrib1fvARB" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
     </function>
 
-    <function name="VertexAttrib1fvARB" offset="assign" exec="dynamic">
+    <function name="VertexAttrib1fvARB" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *" count="1"/>
         <glx rop="4193"/>
@@ -7143,14 +7013,14 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="VertexAttrib2fARB" offset="assign"
+    <function name="VertexAttrib2fARB"
               vectorequiv="VertexAttrib2fvARB" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
     </function>
 
-    <function name="VertexAttrib2fvARB" offset="assign" exec="dynamic">
+    <function name="VertexAttrib2fvARB" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *" count="2"/>
         <glx rop="4194"/>
@@ -7181,7 +7051,7 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="VertexAttrib3fARB" offset="assign"
+    <function name="VertexAttrib3fARB"
               vectorequiv="VertexAttrib3fvARB" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
@@ -7189,7 +7059,7 @@
         <param name="z" type="GLfloat"/>
     </function>
 
-    <function name="VertexAttrib3fvARB" offset="assign" exec="dynamic">
+    <function name="VertexAttrib3fvARB" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="4195"/>
@@ -7222,7 +7092,7 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="VertexAttrib4fARB" offset="assign"
+    <function name="VertexAttrib4fARB"
               vectorequiv="VertexAttrib4fvARB" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
@@ -7231,7 +7101,7 @@
         <param name="w" type="GLfloat"/>
     </function>
 
-    <function name="VertexAttrib4fvARB" offset="assign" exec="dynamic">
+    <function name="VertexAttrib4fvARB" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *" count="4"/>
         <glx rop="4196"/>
@@ -7332,7 +7202,7 @@
         <param name="index" type="GLuint"/>
     </function>
 
-    <function name="ProgramStringARB" offset="assign" deprecated="3.1">
+    <function name="ProgramStringARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="len" type="GLsizei" counter="true"/>
@@ -7340,31 +7210,31 @@
         <glx rop="4217" large="true"/>
     </function>
 
-    <function name="BindProgramARB" offset="assign">
+    <function name="BindProgramARB">
         <param name="target" type="GLenum"/>
         <param name="program" type="GLuint"/>
         <glx rop="4180"/>
     </function>
 
-    <function name="DeleteProgramsARB" offset="assign">
+    <function name="DeleteProgramsARB">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="programs" type="const GLuint *" count="n"/>
         <glx vendorpriv="1294"/>
     </function>
 
-    <function name="GenProgramsARB" offset="assign">
+    <function name="GenProgramsARB">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="programs" type="GLuint *" output="true" count="n"/>
         <glx vendorpriv="1295" always_array="true"/>
     </function>
 
-    <function name="IsProgramARB" offset="assign">
+    <function name="IsProgramARB">
         <param name="program" type="GLuint"/>
         <return type="GLboolean"/>
         <glx vendorpriv="1304"/>
     </function>
 
-    <function name="ProgramEnvParameter4dARB" offset="assign"
+    <function name="ProgramEnvParameter4dARB"
               vectorequiv="ProgramEnvParameter4dvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
@@ -7374,7 +7244,7 @@
         <param name="w" type="GLdouble"/>
     </function>
 
-    <function name="ProgramEnvParameter4dvARB" offset="assign"
+    <function name="ProgramEnvParameter4dvARB"
               deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
@@ -7382,7 +7252,7 @@
         <glx rop="4185" doubles_in_order="true"/>
     </function>
 
-    <function name="ProgramEnvParameter4fARB" offset="assign"
+    <function name="ProgramEnvParameter4fARB"
               vectorequiv="ProgramEnvParameter4fvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
@@ -7392,7 +7262,7 @@
         <param name="w" type="GLfloat"/>
     </function>
 
-    <function name="ProgramEnvParameter4fvARB" offset="assign"
+    <function name="ProgramEnvParameter4fvARB"
               deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
@@ -7400,7 +7270,7 @@
         <glx rop="4184"/>
     </function>
 
-    <function name="ProgramLocalParameter4dARB" offset="assign"
+    <function name="ProgramLocalParameter4dARB"
               vectorequiv="ProgramLocalParameter4dvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
@@ -7410,7 +7280,7 @@
         <param name="w" type="GLdouble"/>
     </function>
 
-    <function name="ProgramLocalParameter4dvARB" offset="assign"
+    <function name="ProgramLocalParameter4dvARB"
               deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
@@ -7418,7 +7288,7 @@
         <glx rop="4216" doubles_in_order="true"/>
     </function>
 
-    <function name="ProgramLocalParameter4fARB" offset="assign"
+    <function name="ProgramLocalParameter4fARB"
               vectorequiv="ProgramLocalParameter4fvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
@@ -7428,8 +7298,7 @@
         <param name="w" type="GLfloat"/>
     </function>
 
-    <function name="ProgramLocalParameter4fvARB" offset="assign"
-              deprecated="3.1">
+    <function name="ProgramLocalParameter4fvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="params" type="const GLfloat *" count="4"/>
@@ -7448,32 +7317,28 @@
          the ARB_vertex_program protocol to unused padding.
       -->
 
-    <function name="GetProgramEnvParameterdvARB" offset="assign"
-              deprecated="3.1">
+    <function name="GetProgramEnvParameterdvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="params" type="GLdouble *" output="true" count="4"/>
         <glx vendorpriv="1297" handcode="client" doubles_in_order="true"/>
     </function>
 
-    <function name="GetProgramEnvParameterfvARB" offset="assign"
-              deprecated="3.1">
+    <function name="GetProgramEnvParameterfvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="params" type="GLfloat *" output="true" count="4"/>
         <glx vendorpriv="1296" handcode="client"/>
     </function>
 
-    <function name="GetProgramLocalParameterdvARB" offset="assign"
-              deprecated="3.1">
+    <function name="GetProgramLocalParameterdvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="params" type="GLdouble *" output="true" count="4"/>
         <glx vendorpriv="1306" handcode="client" doubles_in_order="true"/>
     </function>
 
-    <function name="GetProgramLocalParameterfvARB" offset="assign"
-              deprecated="3.1">
+    <function name="GetProgramLocalParameterfvARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="params" type="GLfloat *" output="true" count="4"/>
@@ -7481,14 +7346,14 @@
     </function>
 
 
-    <function name="GetProgramivARB" offset="assign" deprecated="3.1">
+    <function name="GetProgramivARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx vendorpriv="1307"/>
     </function>
 
-    <function name="GetProgramStringARB" offset="assign" deprecated="3.1">
+    <function name="GetProgramStringARB" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="string" type="GLvoid *" output="true"/>
@@ -7774,24 +7639,24 @@
     <type name="charARB"   size="1" glx_name="CARD8"/>
     <type name="handleARB" size="4" glx_name="CARD32"/>
 
-    <function name="DeleteObjectARB" offset="assign">
+    <function name="DeleteObjectARB">
         <param name="obj" type="GLhandleARB"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetHandleARB" offset="assign">
+    <function name="GetHandleARB">
         <param name="pname" type="GLenum"/>
         <return type="GLhandleARB"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="DetachObjectARB" offset="assign">
+    <function name="DetachObjectARB">
         <param name="containerObj" type="GLhandleARB"/>
         <param name="attachedObj" type="GLhandleARB"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="CreateShaderObjectARB" offset="assign">
+    <function name="CreateShaderObjectARB">
         <param name="shaderType" type="GLenum"/>
         <return type="GLhandleARB"/>
         <glx ignore="true"/>
@@ -7808,12 +7673,12 @@
         <param name="shader" type="GLhandleARB"/>
     </function>
 
-    <function name="CreateProgramObjectARB" offset="assign">
+    <function name="CreateProgramObjectARB">
         <return type="GLhandleARB"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="AttachObjectARB" offset="assign">
+    <function name="AttachObjectARB">
         <param name="containerObj" type="GLhandleARB"/>
         <param name="obj" type="GLhandleARB"/>
         <glx ignore="true"/>
@@ -7952,21 +7817,21 @@
         <param name="value" type="const GLfloat *"/>
     </function>
 
-    <function name="GetObjectParameterfvARB" offset="assign">
+    <function name="GetObjectParameterfvARB">
         <param name="obj" type="GLhandleARB"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetObjectParameterivARB" offset="assign">
+    <function name="GetObjectParameterivARB">
         <param name="obj" type="GLhandleARB"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="GetInfoLogARB" offset="assign">
+    <function name="GetInfoLogARB">
         <param name="obj" type="GLhandleARB"/>
         <param name="maxLength" type="GLsizei"/>
         <param name="length" type="GLsizei *" output="true"/>
@@ -7974,7 +7839,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="GetAttachedObjectsARB" offset="assign">
+    <function name="GetAttachedObjectsARB">
         <param name="containerObj" type="GLhandleARB"/>
         <param name="maxLength" type="GLsizei"/>
         <param name="length" type="GLsizei *" output="true"/>
@@ -8189,17 +8054,17 @@
     <enum name="TIMESTAMP" value="0x8E28"/>
     <type name="int64"                  size="8"/>
     <type name="uint64" unsigned="true" size="8"/>
-    <function name="GetQueryObjecti64v" static_dispatch="false" offset="assign">
+    <function name="GetQueryObjecti64v">
         <param name="id" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint64 *"/>
     </function>
-    <function name="GetQueryObjectui64v" static_dispatch="false" offset="assign">
+    <function name="GetQueryObjectui64v">
         <param name="id" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLuint64 *"/>
     </function>
-    <function name="QueryCounter" offset="assign" static_dispatch="false">
+    <function name="QueryCounter">
         <param name="id" type="GLuint"/>
         <param name="target" type="GLenum"/>
     </function>
@@ -8219,25 +8084,24 @@
   <enum name="MAX_TRANSFORM_FEEDBACK_BUFFERS" value="0x8E70"/>
   <enum name="MAX_VERTEX_STREAMS"             value="0x8E71"/>
 
-  <function name="DrawTransformFeedbackStream" offset="assign"
-            exec="dynamic">
+  <function name="DrawTransformFeedbackStream" exec="dynamic">
     <param name="mode" type="GLenum"/>
     <param name="id" type="GLuint"/>
     <param name="stream" type="GLuint"/>
   </function>
 
-  <function name="BeginQueryIndexed" offset="assign">
+  <function name="BeginQueryIndexed">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
     <param name="id" type="GLuint"/>
   </function>
 
-  <function name="EndQueryIndexed" offset="assign">
+  <function name="EndQueryIndexed">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
   </function>
 
-  <function name="GetQueryIndexediv" offset="assign">
+  <function name="GetQueryIndexediv">
     <param name="target" type="GLenum"/>
     <param name="index" type="GLuint"/>
     <param name="pname" type="GLenum"/>
@@ -8268,15 +8132,13 @@
 <xi:include href="ARB_base_instance.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
 
 <category name="GL_ARB_transform_feedback_instanced" number="109">
-  <function name="DrawTransformFeedbackInstanced" offset="assign"
-            exec="dynamic">
+  <function name="DrawTransformFeedbackInstanced" exec="dynamic">
     <param name="mode" type="GLenum"/>
     <param name="id" type="GLuint"/>
     <param name="primcount" type="GLsizei"/>
   </function>
 
-  <function name="DrawTransformFeedbackStreamInstanced" offset="assign"
-            exec="dynamic">
+  <function name="DrawTransformFeedbackStreamInstanced" exec="dynamic">
     <param name="mode" type="GLenum"/>
     <param name="id" type="GLuint"/>
     <param name="stream" type="GLuint"/>
@@ -8326,7 +8188,9 @@
     <!-- No new functions, types, enums. -->
 </category>
 
-<!-- ARB extensions #130..#131 -->
+<xi:include href="ARB_framebuffer_no_attachments.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/>
+
+<!-- ARB extensions #131 -->
 
 <category name="GL_ARB_explicit_uniform_location" number="128">
     <enum name="MAX_UNIFORM_LOCATIONS" count="1" value="0x826E" >
@@ -8360,7 +8224,7 @@
     <enum name="BUFFER_STORAGE_FLAGS" value="0x8220" />
     <enum name="CLIENT_MAPPED_BUFFER_BARRIER_BIT" value="0x4000" />
 
-    <function name="BufferStorage" offset="assign">
+    <function name="BufferStorage">
         <param name="target" type="GLenum"/>
         <param name="size" type="GLsizeiptr"/>
         <param name="data" type="const GLvoid *"/>
@@ -8420,7 +8284,7 @@
 <category name="GL_EXT_polygon_offset" number="3">
     <enum name="POLYGON_OFFSET_BIAS_EXT"                  value="0x8039"/>
 
-    <function name="PolygonOffsetEXT" offset="assign" deprecated="3.1">
+    <function name="PolygonOffsetEXT" deprecated="3.1">
         <param name="factor" type="GLfloat"/>
         <param name="bias" type="GLfloat"/>
         <glx rop="4098" ignore="true"/>
@@ -8669,7 +8533,7 @@
     </enum>
     <enum name="TABLE_TOO_LARGE_EXT"                      value="0x8031"/>
 
-    <function name="GetHistogramEXT" alias="GetHistogram" static_dispatch="false">
+    <function name="GetHistogramEXT" alias="GetHistogram">
         <param name="target" type="GLenum"/>
         <param name="reset" type="GLboolean"/>
         <param name="format" type="GLenum"/>
@@ -8678,21 +8542,21 @@
         <glx vendorpriv="5" dimensions_in_reply="true" img_reset="reset"/>
     </function>
 
-    <function name="GetHistogramParameterfvEXT" alias="GetHistogramParameterfv" static_dispatch="false">
+    <function name="GetHistogramParameterfvEXT" alias="GetHistogramParameterfv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx vendorpriv="6"/>
     </function>
 
-    <function name="GetHistogramParameterivEXT" alias="GetHistogramParameteriv" static_dispatch="false">
+    <function name="GetHistogramParameterivEXT" alias="GetHistogramParameteriv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx vendorpriv="7"/>
     </function>
 
-    <function name="GetMinmaxEXT" alias="GetMinmax" static_dispatch="false">
+    <function name="GetMinmaxEXT" alias="GetMinmax">
         <param name="target" type="GLenum"/>
         <param name="reset" type="GLboolean"/>
         <param name="format" type="GLenum"/>
@@ -8701,38 +8565,38 @@
         <glx vendorpriv="8" img_reset="reset"/>
     </function>
 
-    <function name="GetMinmaxParameterfvEXT" alias="GetMinmaxParameterfv" static_dispatch="false">
+    <function name="GetMinmaxParameterfvEXT" alias="GetMinmaxParameterfv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx vendorpriv="9"/>
     </function>
 
-    <function name="GetMinmaxParameterivEXT" alias="GetMinmaxParameteriv" static_dispatch="false">
+    <function name="GetMinmaxParameterivEXT" alias="GetMinmaxParameteriv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx vendorpriv="10"/>
     </function>
 
-    <function name="HistogramEXT" alias="Histogram" static_dispatch="false">
+    <function name="HistogramEXT" alias="Histogram">
         <param name="target" type="GLenum"/>
         <param name="width" type="GLsizei"/>
         <param name="internalformat" type="GLenum"/>
         <param name="sink" type="GLboolean"/>
     </function>
 
-    <function name="MinmaxEXT" alias="Minmax" static_dispatch="false">
+    <function name="MinmaxEXT" alias="Minmax">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="sink" type="GLboolean"/>
     </function>
 
-    <function name="ResetHistogramEXT" alias="ResetHistogram" static_dispatch="false">
+    <function name="ResetHistogramEXT" alias="ResetHistogram">
         <param name="target" type="GLenum"/>
     </function>
 
-    <function name="ResetMinmaxEXT" alias="ResetMinmax" static_dispatch="false">
+    <function name="ResetMinmaxEXT" alias="ResetMinmax">
         <param name="target" type="GLenum"/>
     </function>
 </category>
@@ -8804,7 +8668,7 @@
         <size name="Get" mode="get"/>
     </enum>
 
-    <function name="ConvolutionFilter1DEXT" alias="ConvolutionFilter1D" static_dispatch="false">
+    <function name="ConvolutionFilter1DEXT" alias="ConvolutionFilter1D">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -8813,7 +8677,7 @@
         <param name="image" type="const GLvoid *"/>
     </function>
 
-    <function name="ConvolutionFilter2DEXT" alias="ConvolutionFilter2D" static_dispatch="false">
+    <function name="ConvolutionFilter2DEXT" alias="ConvolutionFilter2D">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -8823,31 +8687,31 @@
         <param name="image" type="const GLvoid *"/>
     </function>
 
-    <function name="ConvolutionParameterfEXT" alias="ConvolutionParameterf" static_dispatch="false">
+    <function name="ConvolutionParameterfEXT" alias="ConvolutionParameterf">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat"/>
     </function>
 
-    <function name="ConvolutionParameterfvEXT" alias="ConvolutionParameterfv" static_dispatch="false">
+    <function name="ConvolutionParameterfvEXT" alias="ConvolutionParameterfv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *"/>
     </function>
 
-    <function name="ConvolutionParameteriEXT" alias="ConvolutionParameteri" static_dispatch="false">
+    <function name="ConvolutionParameteriEXT" alias="ConvolutionParameteri">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint"/>
     </function>
 
-    <function name="ConvolutionParameterivEXT" alias="ConvolutionParameteriv" static_dispatch="false">
+    <function name="ConvolutionParameterivEXT" alias="ConvolutionParameteriv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *"/>
     </function>
 
-    <function name="CopyConvolutionFilter1DEXT" alias="CopyConvolutionFilter1D" static_dispatch="false">
+    <function name="CopyConvolutionFilter1DEXT" alias="CopyConvolutionFilter1D">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="x" type="GLint"/>
@@ -8855,7 +8719,7 @@
         <param name="width" type="GLsizei"/>
     </function>
 
-    <function name="CopyConvolutionFilter2DEXT" alias="CopyConvolutionFilter2D" static_dispatch="false">
+    <function name="CopyConvolutionFilter2DEXT" alias="CopyConvolutionFilter2D">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="x" type="GLint"/>
@@ -8864,7 +8728,7 @@
         <param name="height" type="GLsizei"/>
     </function>
 
-    <function name="GetConvolutionFilterEXT" alias="GetConvolutionFilter" static_dispatch="false">
+    <function name="GetConvolutionFilterEXT" alias="GetConvolutionFilter">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
@@ -8872,21 +8736,21 @@
         <glx vendorpriv="1" dimensions_in_reply="true"/>
     </function>
 
-    <function name="GetConvolutionParameterfvEXT" alias="GetConvolutionParameterfv" static_dispatch="false">
+    <function name="GetConvolutionParameterfvEXT" alias="GetConvolutionParameterfv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx vendorpriv="2"/>
     </function>
 
-    <function name="GetConvolutionParameterivEXT" alias="GetConvolutionParameteriv" static_dispatch="false">
+    <function name="GetConvolutionParameterivEXT" alias="GetConvolutionParameteriv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx vendorpriv="3"/>
     </function>
 
-    <function name="GetSeparableFilterEXT" alias="GetSeparableFilter" static_dispatch="false">
+    <function name="GetSeparableFilterEXT" alias="GetSeparableFilter">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
@@ -8896,7 +8760,7 @@
         <glx vendorpriv="4" handcode="true"/>
     </function>
 
-    <function name="SeparableFilter2DEXT" alias="SeparableFilter2D" static_dispatch="false">
+    <function name="SeparableFilter2DEXT" alias="SeparableFilter2D">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -8968,7 +8832,7 @@
         <size name="GetColorTableParameterivSGI" mode="get"/>
     </enum>
 
-    <function name="ColorTableSGI" alias="ColorTable" static_dispatch="false">
+    <function name="ColorTableSGI" alias="ColorTable">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="width" type="GLsizei"/>
@@ -8977,19 +8841,19 @@
         <param name="table" type="const GLvoid *"/>
     </function>
 
-    <function name="ColorTableParameterfvSGI" alias="ColorTableParameterfv" static_dispatch="false">
+    <function name="ColorTableParameterfvSGI" alias="ColorTableParameterfv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *"/>
     </function>
 
-    <function name="ColorTableParameterivSGI" alias="ColorTableParameteriv" static_dispatch="false">
+    <function name="ColorTableParameterivSGI" alias="ColorTableParameteriv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLint *"/>
     </function>
 
-    <function name="CopyColorTableSGI" alias="CopyColorTable" static_dispatch="false">
+    <function name="CopyColorTableSGI" alias="CopyColorTable">
         <param name="target" type="GLenum"/>
         <param name="internalformat" type="GLenum"/>
         <param name="x" type="GLint"/>
@@ -8997,7 +8861,7 @@
         <param name="width" type="GLsizei"/>
     </function>
 
-    <function name="GetColorTableSGI" alias="GetColorTable" static_dispatch="false">
+    <function name="GetColorTableSGI" alias="GetColorTable">
         <param name="target" type="GLenum"/>
         <param name="format" type="GLenum"/>
         <param name="type" type="GLenum"/>
@@ -9005,14 +8869,14 @@
         <glx vendorpriv="4098" dimensions_in_reply="true"/>
     </function>
 
-    <function name="GetColorTableParameterfvSGI" alias="GetColorTableParameterfv" static_dispatch="false">
+    <function name="GetColorTableParameterfvSGI" alias="GetColorTableParameterfv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx vendorpriv="4099"/>
     </function>
 
-    <function name="GetColorTableParameterivSGI" alias="GetColorTableParameteriv" static_dispatch="false">
+    <function name="GetColorTableParameterivSGI" alias="GetColorTableParameteriv">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
@@ -9170,15 +9034,13 @@
 </category>
 
 <category name="GL_SGIS_multisample" number="25">
-    <function name="SampleMaskSGIS" offset="assign" static_dispatch="false"
-              exec="skip">
+    <function name="SampleMaskSGIS" exec="skip">
         <param name="value" type="GLclampf"/>
         <param name="invert" type="GLboolean"/>
         <glx rop="2048"/>
     </function>
 
-    <function name="SamplePatternSGIS" offset="assign" static_dispatch="false"
-              exec="skip">
+    <function name="SamplePatternSGIS" exec="skip">
         <param name="pattern" type="GLenum"/>
         <glx rop="2049"/>
     </function>
@@ -9209,7 +9071,7 @@
         <param name="i" type="GLint"/>
     </function>
 
-    <function name="ColorPointerEXT" offset="assign" deprecated="3.1">
+    <function name="ColorPointerEXT" deprecated="3.1">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -9224,7 +9086,7 @@
         <param name="count" type="GLsizei"/>
     </function>
 
-    <function name="EdgeFlagPointerEXT" offset="assign" deprecated="3.1">
+    <function name="EdgeFlagPointerEXT" deprecated="3.1">
         <param name="stride" type="GLsizei"/>
         <param name="count" type="GLsizei"/>
         <param name="pointer" type="const GLboolean *"/>
@@ -9236,7 +9098,7 @@
         <param name="params" type="GLvoid **" output="true"/>
     </function>
 
-    <function name="IndexPointerEXT" offset="assign" deprecated="3.1">
+    <function name="IndexPointerEXT" deprecated="3.1">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="count" type="GLsizei"/>
@@ -9244,7 +9106,7 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="NormalPointerEXT" offset="assign" deprecated="3.1">
+    <function name="NormalPointerEXT" deprecated="3.1">
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
         <param name="count" type="GLsizei"/>
@@ -9252,7 +9114,7 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="TexCoordPointerEXT" offset="assign" deprecated="3.1">
+    <function name="TexCoordPointerEXT" deprecated="3.1">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -9261,7 +9123,7 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="VertexPointerEXT" offset="assign" deprecated="3.1">
+    <function name="VertexPointerEXT" deprecated="3.1">
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
         <param name="stride" type="GLsizei"/>
@@ -9589,7 +9451,7 @@
 <!-- Extension number 73 is not listed in the extension registry. -->
 
 <category name="GL_EXT_color_subtable" number="74">
-    <function name="ColorSubTableEXT" alias="ColorSubTable" static_dispatch="false">
+    <function name="ColorSubTableEXT" alias="ColorSubTable">
         <param name="target" type="GLenum"/>
         <param name="start" type="GLsizei"/>
         <param name="count" type="GLsizei"/>
@@ -9598,7 +9460,7 @@
         <param name="data" type="const GLvoid *"/>
     </function>
 
-    <function name="CopyColorSubTableEXT" alias="CopyColorSubTable" static_dispatch="false">
+    <function name="CopyColorSubTableEXT" alias="CopyColorSubTable">
         <param name="target" type="GLenum"/>
         <param name="start" type="GLsizei"/>
         <param name="x" type="GLint"/>
@@ -9800,13 +9662,13 @@
     <enum name="ARRAY_ELEMENT_LOCK_FIRST_EXT"             value="0x81A8"/>
     <enum name="ARRAY_ELEMENT_LOCK_COUNT_EXT"             value="0x81A9"/>
 
-    <function name="LockArraysEXT" offset="assign" deprecated="3.1">
+    <function name="LockArraysEXT" deprecated="3.1">
         <param name="first" type="GLint"/>
         <param name="count" type="GLsizei"/>
         <glx handcode="true" ignore="true"/>
     </function>
 
-    <function name="UnlockArraysEXT" offset="assign" deprecated="3.1">
+    <function name="UnlockArraysEXT" deprecated="3.1">
         <glx handcode="true" ignore="true"/>
     </function>
 </category>
@@ -10148,13 +10010,13 @@
         <param name="v" type="const GLdouble *"/>
     </function>
 
-    <function name="SecondaryColor3fEXT" offset="assign" vectorequiv="SecondaryColor3fvEXT">
+    <function name="SecondaryColor3fEXT" vectorequiv="SecondaryColor3fvEXT">
         <param name="red" type="GLfloat"/>
         <param name="green" type="GLfloat"/>
         <param name="blue" type="GLfloat"/>
     </function>
 
-    <function name="SecondaryColor3fvEXT" offset="assign">
+    <function name="SecondaryColor3fvEXT">
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="4129"/>
     </function>
@@ -10236,8 +10098,7 @@
         <param name="primcount" type="GLsizei"/>
     </function>
 
-    <function name="MultiDrawElementsEXT" offset="assign" es1="1.0" es2="2.0"
-              exec="dynamic">
+    <function name="MultiDrawElementsEXT" es1="1.0" es2="2.0" exec="dynamic">
         <param name="mode" type="GLenum"/>
         <param name="count" type="const GLsizei *"/>
         <param name="type" type="GLenum"/>
@@ -10248,12 +10109,11 @@
 </category>
 
 <category name="GL_EXT_fog_coord" number="149">
-    <function name="FogCoordfEXT" offset="assign" vectorequiv="FogCoordfvEXT"
-              exec="dynamic">
+    <function name="FogCoordfEXT" vectorequiv="FogCoordfvEXT" exec="dynamic">
         <param name="coord" type="GLfloat"/>
     </function>
 
-    <function name="FogCoordfvEXT" offset="assign" exec="dynamic">
+    <function name="FogCoordfvEXT" exec="dynamic">
         <param name="coord" type="const GLfloat *" count="1"/>
         <glx rop="4124"/>
     </function>
@@ -11105,7 +10965,7 @@
 </category>
 
 <category name="GL_MESA_resize_buffers" number="196">
-    <function name="ResizeBuffersMESA" offset="assign" exec="skip">
+    <function name="ResizeBuffersMESA" exec="skip">
         <glx ignore="true"/>
     </function>
 </category>
@@ -11187,7 +11047,7 @@
         <param name="v" type="const GLshort *"/>
     </function>
 
-    <function name="WindowPos4dMESA" offset="assign" deprecated="3.1">
+    <function name="WindowPos4dMESA" deprecated="3.1">
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
         <param name="z" type="GLdouble"/>
@@ -11195,12 +11055,12 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="WindowPos4dvMESA" offset="assign" deprecated="3.1">
+    <function name="WindowPos4dvMESA" deprecated="3.1">
         <param name="v" type="const GLdouble *"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="WindowPos4fMESA" offset="assign" deprecated="3.1">
+    <function name="WindowPos4fMESA" deprecated="3.1">
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
@@ -11208,12 +11068,12 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="WindowPos4fvMESA" offset="assign" deprecated="3.1">
+    <function name="WindowPos4fvMESA" deprecated="3.1">
         <param name="v" type="const GLfloat *"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="WindowPos4iMESA" offset="assign" deprecated="3.1">
+    <function name="WindowPos4iMESA" deprecated="3.1">
         <param name="x" type="GLint"/>
         <param name="y" type="GLint"/>
         <param name="z" type="GLint"/>
@@ -11221,12 +11081,12 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="WindowPos4ivMESA" offset="assign" deprecated="3.1">
+    <function name="WindowPos4ivMESA" deprecated="3.1">
         <param name="v" type="const GLint *"/>
         <glx ignore="true"/>
     </function>
 
-    <function name="WindowPos4sMESA" offset="assign" deprecated="3.1">
+    <function name="WindowPos4sMESA" deprecated="3.1">
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
         <param name="z" type="GLshort"/>
@@ -11234,7 +11094,7 @@
         <glx ignore="true"/>
     </function>
 
-    <function name="WindowPos4svMESA" offset="assign" deprecated="3.1">
+    <function name="WindowPos4svMESA" deprecated="3.1">
         <param name="v" type="const GLshort *"/>
         <glx ignore="true"/>
     </function>
@@ -11248,7 +11108,7 @@
 </category>
 
 <category name="GL_IBM_multimode_draw_arrays" number="200">
-    <function name="MultiModeDrawArraysIBM" offset="assign" static_dispatch="false">
+    <function name="MultiModeDrawArraysIBM">
         <param name="mode" type="const GLenum *"/>
         <param name="first" type="const GLint *"/>
         <param name="count" type="const GLsizei *"/>
@@ -11257,7 +11117,7 @@
         <glx handcode="true" ignore="true"/>
     </function>
 
-    <function name="MultiModeDrawElementsIBM" offset="assign" static_dispatch="false">
+    <function name="MultiModeDrawElementsIBM">
         <param name="mode" type="const GLenum *"/>
         <param name="count" type="const GLsizei *"/>
         <param name="type" type="GLenum"/>
@@ -11354,12 +11214,12 @@
 </category>
 
 <category name="GL_EXT_multisample" number="209">
-    <function name="SampleMaskEXT" alias="SampleMaskSGIS" static_dispatch="false">
+    <function name="SampleMaskEXT" alias="SampleMaskSGIS">
         <param name="value" type="GLclampf"/>
         <param name="invert" type="GLboolean"/>
     </function>
 
-    <function name="SamplePatternEXT" alias="SamplePatternSGIS" static_dispatch="false">
+    <function name="SamplePatternEXT" alias="SamplePatternSGIS">
         <param name="pattern" type="GLenum"/>
     </function>
 </category>
@@ -11656,8 +11516,7 @@
     <enum name="MAP2_VERTEX_ATTRIB14_4_NV"                value="0x867E"/>
     <enum name="MAP2_VERTEX_ATTRIB15_4_NV"                value="0x867F"/>
 
-    <function name="AreProgramsResidentNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="AreProgramsResidentNV" deprecated="3.1" exec="skip">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="ids" type="const GLuint *" count="n"/>
         <param name="residences" type="GLboolean *" output="true" count="n"/>
@@ -11675,8 +11534,7 @@
         <param name="programs" type="const GLuint *"/>
     </function>
 
-    <function name="ExecuteProgramNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="ExecuteProgramNV" deprecated="3.1" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="id" type="GLuint"/>
         <param name="params" type="const GLfloat *" count="4"/>
@@ -11694,8 +11552,7 @@
          of these functions.
       -->
 
-    <function name="GetProgramParameterdvNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="GetProgramParameterdvNV" deprecated="3.1" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
@@ -11703,8 +11560,7 @@
         <glx vendorpriv="1297"/>
     </function>
 
-    <function name="GetProgramParameterfvNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="GetProgramParameterfvNV" deprecated="3.1" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
@@ -11712,24 +11568,21 @@
         <glx vendorpriv="1296"/>
     </function>
 
-    <function name="GetProgramivNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="GetProgramivNV" deprecated="3.1" exec="skip">
         <param name="id" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
         <glx vendorpriv="1298"/>
     </function>
 
-    <function name="GetProgramStringNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="GetProgramStringNV" deprecated="3.1" exec="skip">
         <param name="id" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="program" type="GLubyte *" output="true"/>
         <glx vendorpriv="1299" handcode="server" always_array="true"/>
     </function>
 
-    <function name="GetTrackMatrixivNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="GetTrackMatrixivNV" deprecated="3.1" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="address" type="GLuint"/>
         <param name="pname" type="GLenum"/>
@@ -11737,24 +11590,21 @@
         <glx vendorpriv="1300"/>
     </function>
 
-    <function name="GetVertexAttribdvNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="GetVertexAttribdvNV" deprecated="3.1" exec="skip">
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLdouble *" output="true" variable_param="pname"/>
         <glx vendorpriv="1301"/>
     </function>
 
-    <function name="GetVertexAttribfvNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="GetVertexAttribfvNV" deprecated="3.1" exec="skip">
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfloat *" output="true" variable_param="pname"/>
         <glx vendorpriv="1302"/>
     </function>
 
-    <function name="GetVertexAttribivNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="GetVertexAttribivNV" deprecated="3.1" exec="skip">
         <param name="index" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint *" output="true" variable_param="pname"/>
@@ -11772,8 +11622,7 @@
         <return type="GLboolean"/>
     </function>
 
-    <function name="LoadProgramNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="LoadProgramNV" deprecated="3.1" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="id" type="GLuint"/>
         <param name="len" type="GLsizei" counter="true"/>
@@ -11811,8 +11660,7 @@
         <param name="params" type="const GLfloat *"/>
     </function>
 
-    <function name="ProgramParameters4dvNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="ProgramParameters4dvNV" deprecated="3.1" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="num" type="GLsizei" counter="true"/>
@@ -11820,8 +11668,7 @@
         <glx rop="4187"/>
     </function>
 
-    <function name="ProgramParameters4fvNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="ProgramParameters4fvNV" deprecated="3.1" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="num" type="GLsizei" counter="true"/>
@@ -11829,15 +11676,13 @@
         <glx rop="4186"/>
     </function>
 
-    <function name="RequestResidentProgramsNV" offset="assign"
-              deprecated="3.1" exec="skip">
+    <function name="RequestResidentProgramsNV" deprecated="3.1" exec="skip">
         <param name="n" type="GLsizei" counter="true"/>
         <param name="ids" type="const GLuint *" count="n"/>
         <glx rop="4182"/>
     </function>
 
-    <function name="TrackMatrixNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="TrackMatrixNV" deprecated="3.1" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="address" type="GLuint"/>
         <param name="matrix" type="GLenum"/>
@@ -11845,8 +11690,7 @@
         <glx rop="4188"/>
     </function>
 
-    <function name="VertexAttribPointerNV" offset="assign" deprecated="3.1"
-              exec="skip">
+    <function name="VertexAttribPointerNV" deprecated="3.1" exec="skip">
         <param name="index" type="GLuint"/>
         <param name="size" type="GLint"/>
         <param name="type" type="GLenum"/>
@@ -11855,32 +11699,32 @@
         <glx handcode="true"/>
     </function>
 
-    <function name="VertexAttrib1sNV" offset="assign"
+    <function name="VertexAttrib1sNV"
               vectorequiv="VertexAttrib1svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLshort"/>
     </function>
 
-    <function name="VertexAttrib1svNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttrib1svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLshort *" count="1"/>
         <glx rop="4265"/>
     </function>
 
-    <function name="VertexAttrib2sNV" offset="assign"
+    <function name="VertexAttrib2sNV"
               vectorequiv="VertexAttrib2svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLshort"/>
         <param name="y" type="GLshort"/>
     </function>
 
-    <function name="VertexAttrib2svNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttrib2svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLshort *" count="2"/>
         <glx rop="4266"/>
     </function>
 
-    <function name="VertexAttrib3sNV" offset="assign"
+    <function name="VertexAttrib3sNV"
               vectorequiv="VertexAttrib3svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLshort"/>
@@ -11888,13 +11732,13 @@
         <param name="z" type="GLshort"/>
     </function>
 
-    <function name="VertexAttrib3svNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttrib3svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLshort *" count="3"/>
         <glx rop="4267"/>
     </function>
 
-    <function name="VertexAttrib4sNV" offset="assign"
+    <function name="VertexAttrib4sNV"
               vectorequiv="VertexAttrib4svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLshort"/>
@@ -11903,60 +11747,53 @@
         <param name="w" type="GLshort"/>
     </function>
 
-    <function name="VertexAttrib4svNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttrib4svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLshort *" count="4"/>
         <glx rop="4268"/>
     </function>
 
-    <function name="VertexAttrib1fNV" offset="assign"
-              vectorequiv="VertexAttrib1fvNV" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexAttrib1fNV"
+              vectorequiv="VertexAttrib1fvNV" deprecated="3.1" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
     </function>
 
-    <function name="VertexAttrib1fvNV" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexAttrib1fvNV" deprecated="3.1" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *" count="1"/>
         <glx rop="4269"/>
     </function>
 
-    <function name="VertexAttrib2fNV" offset="assign"
-              vectorequiv="VertexAttrib2fvNV" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexAttrib2fNV"
+              vectorequiv="VertexAttrib2fvNV" deprecated="3.1" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
     </function>
 
-    <function name="VertexAttrib2fvNV" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexAttrib2fvNV" deprecated="3.1" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *" count="2"/>
         <glx rop="4270"/>
     </function>
 
-    <function name="VertexAttrib3fNV" offset="assign"
-              vectorequiv="VertexAttrib3fvNV" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexAttrib3fNV"
+              vectorequiv="VertexAttrib3fvNV" deprecated="3.1" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
         <param name="z" type="GLfloat"/>
     </function>
 
-    <function name="VertexAttrib3fvNV" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexAttrib3fvNV" deprecated="3.1" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *" count="3"/>
         <glx rop="4271"/>
     </function>
 
-    <function name="VertexAttrib4fNV" offset="assign"
-              vectorequiv="VertexAttrib4fvNV" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexAttrib4fNV"
+              vectorequiv="VertexAttrib4fvNV" deprecated="3.1" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLfloat"/>
         <param name="y" type="GLfloat"/>
@@ -11964,39 +11801,38 @@
         <param name="w" type="GLfloat"/>
     </function>
 
-    <function name="VertexAttrib4fvNV" offset="assign" deprecated="3.1"
-              exec="dynamic">
+    <function name="VertexAttrib4fvNV" deprecated="3.1" exec="dynamic">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLfloat *" count="4"/>
         <glx rop="4272"/>
     </function>
 
-    <function name="VertexAttrib1dNV" offset="assign"
+    <function name="VertexAttrib1dNV"
               vectorequiv="VertexAttrib1dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
     </function>
 
-    <function name="VertexAttrib1dvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttrib1dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *" count="1"/>
         <glx rop="4273" doubles_in_order="true"/>
     </function>
 
-    <function name="VertexAttrib2dNV" offset="assign"
+    <function name="VertexAttrib2dNV"
               vectorequiv="VertexAttrib2dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
         <param name="y" type="GLdouble"/>
     </function>
 
-    <function name="VertexAttrib2dvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttrib2dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *" count="2"/>
         <glx rop="4274" doubles_in_order="true"/>
     </function>
 
-    <function name="VertexAttrib3dNV" offset="assign"
+    <function name="VertexAttrib3dNV"
               vectorequiv="VertexAttrib3dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
@@ -12004,13 +11840,13 @@
         <param name="z" type="GLdouble"/>
     </function>
 
-    <function name="VertexAttrib3dvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttrib3dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *" count="3"/>
         <glx rop="4275" doubles_in_order="true"/>
     </function>
 
-    <function name="VertexAttrib4dNV" offset="assign"
+    <function name="VertexAttrib4dNV"
               vectorequiv="VertexAttrib4dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLdouble"/>
@@ -12019,13 +11855,13 @@
         <param name="w" type="GLdouble"/>
     </function>
 
-    <function name="VertexAttrib4dvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttrib4dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLdouble *" count="4"/>
         <glx rop="4276" doubles_in_order="true"/>
     </function>
 
-    <function name="VertexAttrib4ubNV" offset="assign"
+    <function name="VertexAttrib4ubNV"
               vectorequiv="VertexAttrib4ubvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="x" type="GLubyte"/>
@@ -12034,97 +11870,97 @@
         <param name="w" type="GLubyte"/>
     </function>
 
-    <function name="VertexAttrib4ubvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttrib4ubvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="v" type="const GLubyte *" count="4"/>
         <glx rop="4277"/>
     </function>
 
-    <function name="VertexAttribs1svNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs1svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLshort *" count="n"/>
         <glx rop="4202"/>
     </function>
 
-    <function name="VertexAttribs2svNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs2svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLshort *" count="n" count_scale="2"/>
         <glx rop="4203"/>
     </function>
 
-    <function name="VertexAttribs3svNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs3svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLshort *" count="n" count_scale="3"/>
         <glx rop="4204"/>
     </function>
 
-    <function name="VertexAttribs4svNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs4svNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLshort *" count="n" count_scale="4"/>
         <glx rop="4205"/>
     </function>
 
-    <function name="VertexAttribs1fvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs1fvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLfloat *" count="n"/>
         <glx rop="4206"/>
     </function>
 
-    <function name="VertexAttribs2fvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs2fvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLfloat *" count="n" count_scale="2"/>
         <glx rop="4207"/>
     </function>
 
-    <function name="VertexAttribs3fvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs3fvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLfloat *" count="n" count_scale="3"/>
         <glx rop="4208"/>
     </function>
 
-    <function name="VertexAttribs4fvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs4fvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLfloat *" count="n" count_scale="4"/>
         <glx rop="4209"/>
     </function>
 
-    <function name="VertexAttribs1dvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs1dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLdouble *" count="n"/>
         <glx rop="4210" doubles_in_order="true"/>
     </function>
 
-    <function name="VertexAttribs2dvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs2dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLdouble *" count="n" count_scale="2"/>
         <glx rop="4211" doubles_in_order="true"/>
     </function>
 
-    <function name="VertexAttribs3dvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs3dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLdouble *" count="n" count_scale="3"/>
         <glx rop="4212" doubles_in_order="true"/>
     </function>
 
-    <function name="VertexAttribs4dvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs4dvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLdouble *" count="n" count_scale="4"/>
         <glx rop="4213" doubles_in_order="true"/>
     </function>
 
-    <function name="VertexAttribs4ubvNV" offset="assign" deprecated="3.1">
+    <function name="VertexAttribs4ubvNV" deprecated="3.1">
         <param name="index" type="GLuint"/>
         <param name="n" type="GLsizei" counter="true"/>
         <param name="v" type="const GLubyte *" count="n" count_scale="4"/>
@@ -12193,22 +12029,22 @@
         <size name="GetTexEnviv" mode="get"/>
         <size name="GetTexEnvfv" mode="get"/>
     </enum>
-    <function name="TexBumpParameterfvATI" offset="assign" deprecated="3.1" exec="skip">
+    <function name="TexBumpParameterfvATI" deprecated="3.1" exec="skip">
       <param name="pname" type="GLenum"/>
       <param name="param" type="const GLfloat *" variable_param="pname"/>
       <glx ignore="true"/>
     </function>
-    <function name="TexBumpParameterivATI" offset="assign" deprecated="3.1" exec="skip">
+    <function name="TexBumpParameterivATI" deprecated="3.1" exec="skip">
       <param name="pname" type="GLenum"/>
       <param name="param" type="const GLint *" variable_param="pname"/>
       <glx ignore="true"/>
     </function>
-    <function name="GetTexBumpParameterfvATI" offset="assign" deprecated="3.1" exec="skip">
+    <function name="GetTexBumpParameterfvATI" deprecated="3.1" exec="skip">
       <param name="pname" type="GLenum"/>
       <param name="param" type="GLfloat *" variable_param="pname"/>
       <glx ignore="true"/>
     </function>
-    <function name="GetTexBumpParameterivATI" offset="assign" deprecated="3.1" exec="skip">
+    <function name="GetTexBumpParameterivATI" deprecated="3.1" exec="skip">
       <param name="pname" type="GLenum"/>
       <param name="param" type="GLint *" variable_param="pname"/>
       <glx ignore="true"/>
@@ -12327,45 +12163,45 @@
     redudndant garbage.  There are a lot of enums with the value 0x00000001.
     -->
 
-    <function name="GenFragmentShadersATI" offset="assign" deprecated="3.1">
+    <function name="GenFragmentShadersATI" deprecated="3.1">
       <return type="GLuint"/>
       <param name="range" type="GLuint"/>
       <glx ignore="true"/>
     </function>
     
-    <function name="BindFragmentShaderATI" offset="assign" deprecated="3.1">
+    <function name="BindFragmentShaderATI" deprecated="3.1">
       <param name="id" type="GLuint"/>
       <glx ignore="true"/>
     </function>
 
-    <function name="DeleteFragmentShaderATI" offset="assign" deprecated="3.1">
+    <function name="DeleteFragmentShaderATI" deprecated="3.1">
       <param name="id" type="GLuint"/>
       <glx ignore="true"/>
     </function>
 
-    <function name="BeginFragmentShaderATI" offset="assign" deprecated="3.1">
+    <function name="BeginFragmentShaderATI" deprecated="3.1">
       <glx ignore="true"/>
     </function>
 
-    <function name="EndFragmentShaderATI" offset="assign" deprecated="3.1">
+    <function name="EndFragmentShaderATI" deprecated="3.1">
       <glx ignore="true"/>
     </function>
     
-    <function name="PassTexCoordATI" offset="assign" deprecated="3.1">
+    <function name="PassTexCoordATI" deprecated="3.1">
       <param name="dst" type="GLuint"/>
       <param name="coord" type="GLuint"/>
       <param name="swizzle" type="GLenum"/>
       <glx ignore="true"/>
     </function>
 
-    <function name="SampleMapATI" offset="assign" deprecated="3.1">
+    <function name="SampleMapATI" deprecated="3.1">
       <param name="dst" type="GLuint"/>
       <param name="interp" type="GLuint"/>
       <param name="swizzle" type="GLenum"/>
       <glx ignore="true"/>
     </function>
 
-    <function name="ColorFragmentOp1ATI" offset="assign" deprecated="3.1">
+    <function name="ColorFragmentOp1ATI" deprecated="3.1">
       <param name="op" type="GLenum"/>
       <param name="dst" type="GLuint"/>
       <param name="dstMask" type="GLuint"/>
@@ -12376,7 +12212,7 @@
       <glx ignore="true"/>
     </function>
 
-    <function name="ColorFragmentOp2ATI" offset="assign" deprecated="3.1">
+    <function name="ColorFragmentOp2ATI" deprecated="3.1">
       <param name="op" type="GLenum"/>
       <param name="dst" type="GLuint"/>
       <param name="dstMask" type="GLuint"/>
@@ -12390,7 +12226,7 @@
       <glx ignore="true"/>
     </function>
 
-    <function name="ColorFragmentOp3ATI" offset="assign" deprecated="3.1">
+    <function name="ColorFragmentOp3ATI" deprecated="3.1">
       <param name="op" type="GLenum"/>
       <param name="dst" type="GLuint"/>
       <param name="dstMask" type="GLuint"/>
@@ -12407,7 +12243,7 @@
       <glx ignore="true"/>
     </function>
 
-    <function name="AlphaFragmentOp1ATI" offset="assign" deprecated="3.1">
+    <function name="AlphaFragmentOp1ATI" deprecated="3.1">
       <param name="op" type="GLenum"/>
       <param name="dst" type="GLuint"/>
       <param name="dstMod" type="GLuint"/>
@@ -12417,7 +12253,7 @@
       <glx ignore="true"/>
     </function>
 
-    <function name="AlphaFragmentOp2ATI" offset="assign" deprecated="3.1">
+    <function name="AlphaFragmentOp2ATI" deprecated="3.1">
       <param name="op" type="GLenum"/>
       <param name="dst" type="GLuint"/>
       <param name="dstMod" type="GLuint"/>
@@ -12430,7 +12266,7 @@
       <glx ignore="true"/>
     </function>
 
-    <function name="AlphaFragmentOp3ATI" offset="assign" deprecated="3.1">
+    <function name="AlphaFragmentOp3ATI" deprecated="3.1">
       <param name="op" type="GLenum"/>
       <param name="dst" type="GLuint"/>
       <param name="dstMod" type="GLuint"/>
@@ -12446,8 +12282,7 @@
       <glx ignore="true"/>
     </function>
 
-    <function name="SetFragmentShaderConstantATI" offset="assign"
-              deprecated="3.1">
+    <function name="SetFragmentShaderConstantATI" deprecated="3.1">
       <param name="dst" type="GLuint"/>
       <param name="value" type="const GLfloat *"/>
       <glx ignore="true"/>
@@ -12547,8 +12382,7 @@
         <size name="Get" mode="get"/>
     </enum>
 
-    <function name="ActiveStencilFaceEXT" offset="assign"
-              static_dispatch="false" deprecated="3.1">
+    <function name="ActiveStencilFaceEXT" deprecated="3.1">
         <param name="face" type="GLenum"/>
         <glx rop="4220"/>
     </function>
@@ -12588,7 +12422,7 @@
     <enum name="MAX_FRAGMENT_PROGRAM_LOCAL_PARAMETERS_NV" value="0x8868"/>
     <enum name="PROGRAM_ERROR_STRING_NV"                  value="0x8874"/>
 
-    <function name="ProgramNamedParameter4fNV" offset="assign"
+    <function name="ProgramNamedParameter4fNV"
               vectorequiv="ProgramNamedParameter4fvNV" deprecated="3.1"
               exec="skip">
         <param name="id" type="GLuint"/>
@@ -12600,7 +12434,7 @@
         <param name="w" type="GLfloat"/>
     </function>
 
-    <function name="ProgramNamedParameter4dNV" offset="assign"
+    <function name="ProgramNamedParameter4dNV"
               vectorequiv="ProgramNamedParameter4dvNV" deprecated="3.1"
               exec="skip">
         <param name="id" type="GLuint"/>
@@ -12612,8 +12446,7 @@
         <param name="w" type="GLdouble"/>
     </function>
 
-    <function name="ProgramNamedParameter4fvNV" offset="assign"
-              deprecated="3.1" exec="skip">
+    <function name="ProgramNamedParameter4fvNV" deprecated="3.1" exec="skip">
         <param name="id" type="GLuint"/>
         <param name="len" type="GLsizei" counter="true"/>
         <param name="name" type="const GLubyte *" count="len"/>
@@ -12621,8 +12454,7 @@
         <glx rop="4218"/>
     </function>
 
-    <function name="ProgramNamedParameter4dvNV" offset="assign"
-              deprecated="3.1" exec="skip">
+    <function name="ProgramNamedParameter4dvNV" deprecated="3.1" exec="skip">
         <param name="id" type="GLuint"/>
         <param name="len" type="GLsizei" counter="true"/>
         <param name="name" type="const GLubyte *" count="len"/>
@@ -12630,8 +12462,7 @@
         <glx rop="4219"/>
     </function>
 
-    <function name="GetProgramNamedParameterfvNV" offset="assign"
-              deprecated="3.1" exec="skip">
+    <function name="GetProgramNamedParameterfvNV" deprecated="3.1" exec="skip">
         <param name="id" type="GLuint"/>
         <param name="len" type="GLsizei" counter="true"/>
         <param name="name" type="const GLubyte *" count="len"/>
@@ -12639,8 +12470,7 @@
         <glx vendorpriv="1310" always_array="true"/>
     </function>
 
-    <function name="GetProgramNamedParameterdvNV" offset="assign"
-              deprecated="3.1" exec="skip">
+    <function name="GetProgramNamedParameterdvNV" deprecated="3.1" exec="skip">
         <param name="id" type="GLuint"/>
         <param name="len" type="GLsizei" counter="true"/>
         <param name="name" type="const GLubyte *" count="len"/>
@@ -12684,7 +12514,7 @@
         <size name="Get" mode="get"/>
     </enum>
 
-    <function name="DepthBoundsEXT" offset="assign" static_dispatch="false">
+    <function name="DepthBoundsEXT">
         <param name="zmin" type="GLclampd"/>
         <param name="zmax" type="GLclampd"/>
         <glx rop="4229" ignore="true"/>
@@ -12705,7 +12535,7 @@
         <size name="Get" mode="get"/>
     </enum>
 
-    <function name="BlendEquationSeparateEXT" static_dispatch="false" alias="BlendEquationSeparate">
+    <function name="BlendEquationSeparateEXT" alias="BlendEquationSeparate">
         <param name="modeRGB" type="GLenum"/>
         <param name="modeA" type="GLenum"/>
     </function>
@@ -12789,14 +12619,12 @@
     <enum name="BUFFER_FLUSHING_UNMAP_APPLE" count="1" value="0x8A13">
         <size name="GetBufferParameteriv" mode="get"/>
     </enum>
-    <function name="BufferParameteriAPPLE" offset="assign"
-              static_dispatch="false" exec="skip">
+    <function name="BufferParameteriAPPLE" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLint"/>
     </function>
-    <function name="FlushMappedBufferRangeAPPLE" offset="assign"
-              static_dispatch="false" exec="skip">
+    <function name="FlushMappedBufferRangeAPPLE" exec="skip">
         <param name="target" type="GLenum"/>
         <param name="offset" type="GLintptr"/>
         <param name="size" type="GLsizeiptr"/>
@@ -12875,7 +12703,7 @@
     <enum name="POLYGON_OFFSET_CLAMP_EXT"             value="0x8E1B">
         <size name="Get" mode="get"/>
     </enum>
-    <function name="PolygonOffsetClampEXT" offset="assign">
+    <function name="PolygonOffsetClampEXT">
         <param name="factor" type="GLfloat"/>
         <param name="units"  type="GLfloat"/>
         <param name="clamp"  type="GLfloat"/>
@@ -12885,7 +12713,7 @@
 <!-- Unnumbered extensions sorted by name. -->
 
 <category name="GL_ATI_blend_equation_separate">
-    <function name="BlendEquationSeparateATI" alias="BlendEquationSeparate" static_dispatch="false">
+    <function name="BlendEquationSeparateATI" alias="BlendEquationSeparate">
         <param name="modeRGB" type="GLenum"/>
         <param name="modeA" type="GLenum"/>
     </function>
@@ -12905,14 +12733,13 @@
         <size name="Get" mode="get"/>
     </enum>
 
-    <function name="StencilOpSeparateATI" alias="StencilOpSeparate" static_dispatch="false">
+    <function name="StencilOpSeparateATI" alias="StencilOpSeparate">
         <param name="face" type="GLenum"/>
         <param name="sfail" type="GLenum"/>
         <param name="zfail" type="GLenum"/>
         <param name="zpass" type="GLenum"/>
     </function>
-    <function name="StencilFuncSeparateATI" offset="assign"
-              static_dispatch="false" deprecated="3.1">
+    <function name="StencilFuncSeparateATI" deprecated="3.1">
         <param name="frontfunc" type="GLenum"/>
         <param name="backfunc" type="GLenum"/>
         <param name="ref" type="GLint"/>
@@ -12922,16 +12749,14 @@
 </category>
 
 <category name="GL_EXT_gpu_program_parameters">
-    <function name="ProgramEnvParameters4fvEXT" offset="assign"
-              static_dispatch="false" deprecated="3.1">
+    <function name="ProgramEnvParameters4fvEXT" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="count" type="GLsizei"/>
         <param name="params" type="const GLfloat *"/>
     </function>
 
-    <function name="ProgramLocalParameters4fvEXT" offset="assign"
-              static_dispatch="false" deprecated="3.1">
+    <function name="ProgramLocalParameters4fvEXT" deprecated="3.1">
         <param name="target" type="GLenum"/>
         <param name="index" type="GLuint"/>
         <param name="count" type="GLsizei"/>
@@ -12943,12 +12768,12 @@
     <enum name="TIME_ELAPSED_EXT" value="0x88BF"/>
     <type name="int64EXT"                  size="8"/>
     <type name="uint64EXT" unsigned="true" size="8"/>
-    <function name="GetQueryObjecti64vEXT" static_dispatch="false" alias="GetQueryObjecti64v">
+    <function name="GetQueryObjecti64vEXT" alias="GetQueryObjecti64v">
         <param name="id" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLint64EXT *"/>
     </function>
-    <function name="GetQueryObjectui64vEXT" static_dispatch="false" alias="GetQueryObjectui64v">
+    <function name="GetQueryObjectui64vEXT" alias="GetQueryObjectui64v">
         <param name="id" type="GLuint"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLuint64EXT *"/>
@@ -12956,7 +12781,7 @@
 </category>
 
 <category name="GL_INGR_blend_func_separate">
-    <function name="BlendFuncSeparateINGR" alias="BlendFuncSeparate" static_dispatch="false">
+    <function name="BlendFuncSeparateINGR" alias="BlendFuncSeparate">
         <param name="sfactorRGB" type="GLenum"/>
         <param name="dfactorRGB" type="GLenum"/>
         <param name="sfactorAlpha" type="GLenum"/>
@@ -12984,12 +12809,12 @@
         <size name="PointParameterfv"/>
     </enum>
 
-    <function name="PointParameterfSGIS" alias="PointParameterf" static_dispatch="false">
+    <function name="PointParameterfSGIS" alias="PointParameterf">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfloat"/>
     </function>
 
-    <function name="PointParameterfvSGIS" alias="PointParameterfv" static_dispatch="false">
+    <function name="PointParameterfvSGIS" alias="PointParameterfv">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfloat *"/>
     </function>
diff --git a/src/mapi/glapi/gen/gl_SPARC_asm.py b/src/mapi/glapi/gen/gl_SPARC_asm.py
index 23e2329..fa6217e 100644
--- a/src/mapi/glapi/gen/gl_SPARC_asm.py
+++ b/src/mapi/glapi/gen/gl_SPARC_asm.py
@@ -25,9 +25,10 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+
 import license
 import gl_XML, glX_XML
-import sys, getopt
 
 class PrintGenericStubs(gl_XML.gl_print_base):
     def __init__(self):
@@ -244,30 +245,24 @@ class PrintGenericStubs(gl_XML.gl_print_base):
         return
 
 
-def show_usage():
-    print "Usage: %s [-f input_file_name] [-m output_mode]" % sys.argv[0]
-    sys.exit(1)
+def _parser():
+    """Parse arguments and return a namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f',
+                        dest='filename',
+                        default='gl_API.xml',
+                        help='An XML description of an API.')
+    return parser.parse_args()
 
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
-    mode = "generic"
 
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "m:f:")
-    except Exception,e:
-        show_usage()
+def main():
+    """Main function."""
+    args = _parser()
+    printer = PrintGenericStubs()
 
-    for (arg,val) in args:
-        if arg == '-m':
-            mode = val
-        elif arg == "-f":
-            file_name = val
+    api = gl_XML.parse_GL_API(args.filename, glX_XML.glx_item_factory())
+    printer.Print(api)
 
-    if mode == "generic":
-        printer = PrintGenericStubs()
-    else:
-        print "ERROR: Invalid mode \"%s\" specified." % mode
-        show_usage()
 
-    api = gl_XML.parse_GL_API(file_name, glX_XML.glx_item_factory())
-    printer.Print(api)
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/gl_XML.py b/src/mapi/glapi/gen/gl_XML.py
index 1a2bc2b..67aba81 100644
--- a/src/mapi/glapi/gen/gl_XML.py
+++ b/src/mapi/glapi/gen/gl_XML.py
@@ -30,6 +30,7 @@ import xml.etree.ElementTree as ET
 import re, sys, string
 import os.path
 import typeexpr
+import static_data
 
 
 def parse_GL_API( file_name, factory = None ):
@@ -625,7 +626,7 @@ class gl_function( gl_item ):
         # Decimal('1.1') }.
         self.api_map = {}
 
-        self.assign_offset = 0
+        self.assign_offset = False
 
         self.static_entry_points = []
 
@@ -649,7 +650,7 @@ class gl_function( gl_item ):
         name = element.get( "name" )
         alias = element.get( "alias" )
 
-        if is_attr_true(element, "static_dispatch", "true"):
+        if name in static_data.functions:
             self.static_entry_points.append(name)
 
         self.entry_points.append( name )
@@ -684,16 +685,11 @@ class gl_function( gl_item ):
             # Only try to set the offset when a non-alias entry-point
             # is being processed.
 
-            offset = element.get( "offset" )
-            if offset:
-                try:
-                    o = int( offset )
-                    self.offset = o
-                except Exception, e:
-                    self.offset = -1
-                    if offset == "assign":
-                        self.assign_offset = 1
-
+            if name in static_data.offsets:
+                self.offset = static_data.offsets[name]
+            else:
+                self.offset = -1
+                self.assign_offset = self.exec_flavor != "skip" or name in static_data.unused_functions
 
         if not self.name:
             self.name = true_name
diff --git a/src/mapi/glapi/gen/gl_and_es_API.xml b/src/mapi/glapi/gen/gl_and_es_API.xml
index d158a6b..fc15284 100644
--- a/src/mapi/glapi/gen/gl_and_es_API.xml
+++ b/src/mapi/glapi/gen/gl_and_es_API.xml
@@ -18,45 +18,45 @@
     <type name="fixed"   size="4"                                    />
     <type name="clampx"  size="4"                                    />
 
-    <function name="AlphaFuncx" es1="1.0" desktop="false" offset="assign">
+    <function name="AlphaFuncx" es1="1.0" desktop="false">
         <param name="func" type="GLenum"/>
         <param name="ref" type="GLclampx"/>
     </function>
 
-    <function name="ClearColorx" es1="1.0" desktop="false" offset="assign">
+    <function name="ClearColorx" es1="1.0" desktop="false">
         <param name="red" type="GLclampx"/>
         <param name="green" type="GLclampx"/>
         <param name="blue" type="GLclampx"/>
         <param name="alpha" type="GLclampx"/>
     </function>
 
-    <function name="ClearDepthx" es1="1.0" desktop="false" offset="assign">
+    <function name="ClearDepthx" es1="1.0" desktop="false">
         <param name="depth" type="GLclampx"/>
     </function>
 
-    <function name="Color4x" es1="1.0" desktop="false" offset="assign">
+    <function name="Color4x" es1="1.0" desktop="false">
         <param name="red" type="GLfixed"/>
         <param name="green" type="GLfixed"/>
         <param name="blue" type="GLfixed"/>
         <param name="alpha" type="GLfixed"/>
     </function>
 
-    <function name="DepthRangex" es1="1.0" desktop="false" offset="assign">
+    <function name="DepthRangex" es1="1.0" desktop="false">
         <param name="zNear" type="GLclampx"/>
         <param name="zFar" type="GLclampx"/>
     </function>
 
-    <function name="Fogx" es1="1.0" desktop="false" offset="assign">
+    <function name="Fogx" es1="1.0" desktop="false">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="Fogxv" es1="1.0" desktop="false" offset="assign">
+    <function name="Fogxv" es1="1.0" desktop="false">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *" variable_param="pname"/>
     </function>
 
-    <function name="Frustumx" es1="1.0" desktop="false" offset="assign">
+    <function name="Frustumx" es1="1.0" desktop="false">
         <param name="left" type="GLfixed"/>
         <param name="right" type="GLfixed"/>
         <param name="bottom" type="GLfixed"/>
@@ -65,53 +65,53 @@
         <param name="zFar" type="GLfixed"/>
     </function>
 
-    <function name="LightModelx" es1="1.0" desktop="false" offset="assign">
+    <function name="LightModelx" es1="1.0" desktop="false">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="LightModelxv" es1="1.0" desktop="false" offset="assign">
+    <function name="LightModelxv" es1="1.0" desktop="false">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *" variable_param="pname"/>
     </function>
 
-    <function name="Lightx" es1="1.0" desktop="false" offset="assign">
+    <function name="Lightx" es1="1.0" desktop="false">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="Lightxv" es1="1.0" desktop="false" offset="assign">
+    <function name="Lightxv" es1="1.0" desktop="false">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *" variable_param="pname"/>
     </function>
 
-    <function name="LineWidthx" es1="1.0" desktop="false" offset="assign">
+    <function name="LineWidthx" es1="1.0" desktop="false">
         <param name="width" type="GLfixed"/>
     </function>
 
-    <function name="LoadMatrixx" es1="1.0" desktop="false" offset="assign">
+    <function name="LoadMatrixx" es1="1.0" desktop="false">
         <param name="m" type="const GLfixed *" count="16"/>
     </function>
 
-    <function name="Materialx" es1="1.0" desktop="false" offset="assign">
+    <function name="Materialx" es1="1.0" desktop="false">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="Materialxv" es1="1.0" desktop="false" offset="assign">
+    <function name="Materialxv" es1="1.0" desktop="false">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *" variable_param="pname"/>
     </function>
 
-    <function name="MultMatrixx" es1="1.0" desktop="false" offset="assign">
+    <function name="MultMatrixx" es1="1.0" desktop="false">
         <param name="m" type="const GLfixed *" count="16"/>
     </function>
 
-    <function name="MultiTexCoord4x" es1="1.0" desktop="false" offset="assign">
+    <function name="MultiTexCoord4x" es1="1.0" desktop="false">
         <param name="target" type="GLenum"/>
         <param name="s" type="GLfixed"/>
         <param name="t" type="GLfixed"/>
@@ -119,13 +119,13 @@
         <param name="q" type="GLfixed"/>
     </function>
 
-    <function name="Normal3x" es1="1.0" desktop="false" offset="assign">
+    <function name="Normal3x" es1="1.0" desktop="false">
         <param name="nx" type="GLfixed"/>
         <param name="ny" type="GLfixed"/>
         <param name="nz" type="GLfixed"/>
     </function>
 
-    <function name="Orthox" es1="1.0" desktop="false" offset="assign">
+    <function name="Orthox" es1="1.0" desktop="false">
         <param name="left" type="GLfixed"/>
         <param name="right" type="GLfixed"/>
         <param name="bottom" type="GLfixed"/>
@@ -134,59 +134,59 @@
         <param name="zFar" type="GLfixed"/>
     </function>
 
-    <function name="PointSizex" es1="1.0" desktop="false" offset="assign">
+    <function name="PointSizex" es1="1.0" desktop="false">
         <param name="size" type="GLfixed"/>
     </function>
 
-    <function name="PolygonOffsetx" es1="1.0" desktop="false" offset="assign">
+    <function name="PolygonOffsetx" es1="1.0" desktop="false">
         <param name="factor" type="GLfixed"/>
         <param name="units" type="GLfixed"/>
     </function>
 
-    <function name="Rotatex" es1="1.0" desktop="false" offset="assign">
+    <function name="Rotatex" es1="1.0" desktop="false">
         <param name="angle" type="GLfixed"/>
         <param name="x" type="GLfixed"/>
         <param name="y" type="GLfixed"/>
         <param name="z" type="GLfixed"/>
     </function>
 
-    <function name="SampleCoveragex" es1="1.0" desktop="false" offset="assign">
+    <function name="SampleCoveragex" es1="1.0" desktop="false">
         <param name="value" type="GLclampx"/>
         <param name="invert" type="GLboolean"/>
     </function>
 
-    <function name="Scalex" es1="1.0" desktop="false" offset="assign">
+    <function name="Scalex" es1="1.0" desktop="false">
         <param name="x" type="GLfixed"/>
         <param name="y" type="GLfixed"/>
         <param name="z" type="GLfixed"/>
     </function>
 
-    <function name="TexEnvx" es1="1.0" desktop="false" offset="assign">
+    <function name="TexEnvx" es1="1.0" desktop="false">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="TexEnvxv" es1="1.0" desktop="false" offset="assign">
+    <function name="TexEnvxv" es1="1.0" desktop="false">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *" variable_param="pname"/>
     </function>
 
-    <function name="TexParameterx" es1="1.0" desktop="false" offset="assign">
+    <function name="TexParameterx" es1="1.0" desktop="false">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="Translatex" es1="1.0" desktop="false" offset="assign">
+    <function name="Translatex" es1="1.0" desktop="false">
         <param name="x" type="GLfixed"/>
         <param name="y" type="GLfixed"/>
         <param name="z" type="GLfixed"/>
     </function>
 
     <!-- from GL_OES_single_precision -->
-    <function name="Frustumf" es1="1.0" desktop="false" offset="assign">
+    <function name="Frustumf" es1="1.0" desktop="false">
         <param name="left" type="GLfloat"/>
         <param name="right" type="GLfloat"/>
         <param name="bottom" type="GLfloat"/>
@@ -195,7 +195,7 @@
         <param name="zFar" type="GLfloat"/>
     </function>
 
-    <function name="Orthof" es1="1.0" desktop="false" offset="assign">
+    <function name="Orthof" es1="1.0" desktop="false">
         <param name="left" type="GLfloat"/>
         <param name="right" type="GLfloat"/>
         <param name="bottom" type="GLfloat"/>
@@ -207,68 +207,68 @@
 
 <category name="es1.1">
     <!-- from GL_OES_fixed_point -->
-    <function name="ClipPlanex" es1="1.1" desktop="false" offset="assign">
+    <function name="ClipPlanex" es1="1.1" desktop="false">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="const GLfixed *" count="4"/>
     </function>
 
-    <function name="GetClipPlanex" es1="1.1" offset="assign">
+    <function name="GetClipPlanex" es1="1.1">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="GLfixed *" output="true" count="4"/>
     </function>
 
-    <function name="GetFixedv" es1="1.1" desktop="false" offset="assign">
+    <function name="GetFixedv" es1="1.1" desktop="false">
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *" output="true" variable_param="pname"/>
     </function>
 
-    <function name="GetLightxv" es1="1.1" desktop="false" offset="assign">
+    <function name="GetLightxv" es1="1.1" desktop="false">
         <param name="light" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *" output="true" variable_param="pname"/>
     </function>
 
-    <function name="GetMaterialxv" es1="1.1" desktop="false" offset="assign">
+    <function name="GetMaterialxv" es1="1.1" desktop="false">
         <param name="face" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *" output="true" variable_param="pname"/>
     </function>
 
-    <function name="GetTexEnvxv" es1="1.1" desktop="false" offset="assign">
+    <function name="GetTexEnvxv" es1="1.1" desktop="false">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *" output="true" variable_param="pname"/>
     </function>
 
-    <function name="GetTexParameterxv" es1="1.1" desktop="false" offset="assign">
+    <function name="GetTexParameterxv" es1="1.1" desktop="false">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="GLfixed *" output="true" variable_param="pname"/>
     </function>
 
-    <function name="PointParameterx" es1="1.1" desktop="false" offset="assign">
+    <function name="PointParameterx" es1="1.1" desktop="false">
         <param name="pname" type="GLenum"/>
         <param name="param" type="GLfixed"/>
     </function>
 
-    <function name="PointParameterxv" es1="1.1" desktop="false" offset="assign">
+    <function name="PointParameterxv" es1="1.1" desktop="false">
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *"/>
     </function>
 
-    <function name="TexParameterxv" es1="1.1" desktop="false" offset="assign">
+    <function name="TexParameterxv" es1="1.1" desktop="false">
         <param name="target" type="GLenum"/>
         <param name="pname" type="GLenum"/>
         <param name="params" type="const GLfixed *" variable_param="pname"/>
     </function>
 
     <!-- from GL_OES_single_precision -->
-    <function name="ClipPlanef" es1="1.1" desktop="false" offset="assign">
+    <function name="ClipPlanef" es1="1.1" desktop="false">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="const GLfloat *" count="4"/>
     </function>
 
-    <function name="GetClipPlanef" es1="1.1" offset="assign">
+    <function name="GetClipPlanef" es1="1.1">
         <param name="plane" type="GLenum"/>
         <param name="equation" type="GLfloat *" output="true" count="4"/>
     </function>
diff --git a/src/mapi/glapi/gen/gl_apitemp.py b/src/mapi/glapi/gen/gl_apitemp.py
index 4157032..5e985a2 100644
--- a/src/mapi/glapi/gen/gl_apitemp.py
+++ b/src/mapi/glapi/gen/gl_apitemp.py
@@ -25,9 +25,10 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+
 import gl_XML, glX_XML
 import license
-import sys, getopt
 
 class PrintGlOffsets(gl_XML.gl_print_base):
     def __init__(self, es=False):
@@ -301,27 +302,30 @@ _glapi_proc UNUSED_TABLE_NAME[] = {"""
         return
 
 
-def show_usage():
-    print "Usage: %s [-f input_file_name] [-c]" % sys.argv[0]
-    print "-c          Enable compatibility with OpenGL ES."
-    sys.exit(1)
-
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
+def _parser():
+    """Parse arguments and return a namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f',
+                        metavar='<input file name>',
+                        dest='filename',
+                        default="gl_API.xml",
+                        help="An XML file describing the API.")
+    parser.add_argument('-c',
+                        action='store_true',
+                        dest='es',
+                        help="Enable OpenGL ES compatibility")
+    return parser.parse_args()
 
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "f:c")
-    except Exception,e:
-        show_usage()
 
-    es = False
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
-        elif arg == "-c":
-            es = True
+def main():
+    """Main function."""
+    args = _parser()
 
-    api = gl_XML.parse_GL_API(file_name, glX_XML.glx_item_factory())
+    api = gl_XML.parse_GL_API(args.filename, glX_XML.glx_item_factory())
 
-    printer = PrintGlOffsets(es)
+    printer = PrintGlOffsets(args.es)
     printer.Print(api)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/gl_enums.py b/src/mapi/glapi/gen/gl_enums.py
index f45782d..955f27d 100644
--- a/src/mapi/glapi/gen/gl_enums.py
+++ b/src/mapi/glapi/gen/gl_enums.py
@@ -1,8 +1,8 @@
 #!/usr/bin/python2
 # -*- Mode: Python; py-indent-offset: 8 -*-
 
-# (C) Copyright Zack Rusin 2005
-# All Rights Reserved.
+# (C) Copyright Zack Rusin 2005. All Rights Reserved.
+# Copyright (C) 2015 Intel Corporation
 # 
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -26,6 +26,8 @@
 # Authors:
 #    Zack Rusin <zack@kde.org>
 
+import argparse
+
 import license
 import gl_XML
 import sys, getopt
@@ -201,21 +203,21 @@ _mesa_lookup_prim_by_nr(GLuint nr)
                 enum.append( [name, priority] )
 
 
-def show_usage():
-    print "Usage: %s [-f input_file_name]" % sys.argv[0]
-    sys.exit(1)
+def _parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--input_file',
+                        required=True,
+                        help="Choose an xml file to parse.")
+    return parser.parse_args()
 
-if __name__ == '__main__':
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "f:")
-    except Exception,e:
-        show_usage()
 
-    api_list = []
-    for (arg,val) in args:
-        if arg == "-f":
-            api = gl_XML.parse_GL_API( val )
-            api_list.append(api);
+def main():
+    args = _parser()
+    api_list = [gl_XML.parse_GL_API(args.input_file)]
 
     printer = PrintGlEnums()
-    printer.Print( api_list )
+    printer.Print(api_list)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/gl_genexec.py b/src/mapi/glapi/gen/gl_genexec.py
index 4e76fe3..26d8e7b 100644
--- a/src/mapi/glapi/gen/gl_genexec.py
+++ b/src/mapi/glapi/gen/gl_genexec.py
@@ -25,10 +25,12 @@
 # _mesa_initialize_exec_table().  It is responsible for populating all
 # entries in the "exec" dispatch table that aren't dynamic.
 
+import argparse
 import collections
 import license
 import gl_XML
-import sys, getopt
+import sys
+import apiexec
 
 
 exec_flavor_map = {
@@ -175,18 +177,49 @@ class PrintCode(gl_XML.gl_print_base):
                 raise Exception(
                     'Unrecognized exec flavor {0!r}'.format(f.exec_flavor))
             condition_parts = []
-            if f.desktop:
-                if f.deprecated:
+            if f.name in apiexec.functions:
+                ex = apiexec.functions[f.name]
+                unconditional_count = 0
+
+                if ex.compatibility is not None:
                     condition_parts.append('ctx->API == API_OPENGL_COMPAT')
-                else:
-                    condition_parts.append('_mesa_is_desktop_gl(ctx)')
-            if 'es1' in f.api_map:
-                condition_parts.append('ctx->API == API_OPENGLES')
-            if 'es2' in f.api_map:
-                if f.api_map['es2'] > 2.0:
-                    condition_parts.append('(ctx->API == API_OPENGLES2 && ctx->Version >= {0})'.format(int(f.api_map['es2'] * 10)))
-                else:
-                    condition_parts.append('ctx->API == API_OPENGLES2')
+                    unconditional_count += 1
+
+                if ex.core is not None:
+                    condition_parts.append('ctx->API == API_OPENGL_CORE')
+                    unconditional_count += 1
+
+                if ex.es1 is not None:
+                    condition_parts.append('ctx->API == API_OPENGLES')
+                    unconditional_count += 1
+
+                if ex.es2 is not None:
+                    if ex.es2 > 20:
+                        condition_parts.append('(ctx->API == API_OPENGLES2 && ctx->Version >= {0})'.format(ex.es2))
+                    else:
+                        condition_parts.append('ctx->API == API_OPENGLES2')
+                        unconditional_count += 1
+
+                # If the function is unconditionally available in all four
+                # APIs, then it is always available.  Replace the complex
+                # tautology condition with "true" and let GCC do the right
+                # thing.
+                if unconditional_count == 4:
+                    condition_parts = ['true']
+            else:
+                if f.desktop:
+                    if f.deprecated:
+                        condition_parts.append('ctx->API == API_OPENGL_COMPAT')
+                    else:
+                        condition_parts.append('_mesa_is_desktop_gl(ctx)')
+                if 'es1' in f.api_map:
+                    condition_parts.append('ctx->API == API_OPENGLES')
+                if 'es2' in f.api_map:
+                    if f.api_map['es2'] > 2.0:
+                        condition_parts.append('(ctx->API == API_OPENGLES2 && ctx->Version >= {0})'.format(int(f.api_map['es2'] * 10)))
+                    else:
+                        condition_parts.append('ctx->API == API_OPENGLES2')
+
             if not condition_parts:
                 # This function does not exist in any API.
                 continue
@@ -207,24 +240,23 @@ class PrintCode(gl_XML.gl_print_base):
             print '   }'
 
 
-def show_usage():
-    print "Usage: %s [-f input_file_name]" % sys.argv[0]
-    sys.exit(1)
-
+def _parser():
+    """Parse arguments and return namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f',
+                        dest='filename',
+                        default='gl_and_es_API.xml',
+                        help='an xml file describing an API')
+    return parser.parse_args()
 
-if __name__ == '__main__':
-    file_name = "gl_and_es_API.xml"
-
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "m:f:")
-    except Exception,e:
-        show_usage()
-
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
 
+def main():
+    """Main function."""
+    args = _parser()
     printer = PrintCode()
-
-    api = gl_XML.parse_GL_API(file_name)
+    api = gl_XML.parse_GL_API(args.filename)
     printer.Print(api)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/gl_gentable.py b/src/mapi/glapi/gen/gl_gentable.py
index 06a5ebf..1b3eb72 100644
--- a/src/mapi/glapi/gen/gl_gentable.py
+++ b/src/mapi/glapi/gen/gl_gentable.py
@@ -2,6 +2,7 @@
 
 # (C) Copyright IBM Corporation 2004, 2005
 # (C) Copyright Apple Inc. 2011
+# Copyright (C) 2015 Intel Corporation
 # All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
@@ -29,9 +30,10 @@
 # Based on code ogiginally by:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+
 import license
 import gl_XML, glX_XML
-import sys, getopt
 
 header = """/* GLXEXT is the define used in the xserver when the GLX extension is being
  * built.  Hijack this to determine whether this file is being built for the
@@ -186,23 +188,27 @@ class PrintCode(gl_XML.gl_print_base):
                 print body_template % vars
         return
 
-def show_usage():
-    print "Usage: %s [-f input_file_name]" % sys.argv[0]
-    sys.exit(1)
 
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
+def _parser():
+    """Parse arguments and return a namespace object."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f',
+                        dest='filename',
+                        default='gl_API.xml',
+                        help='An XML file description of an API')
+
+    return parser.parse_args()
 
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "m:f:")
-    except Exception,e:
-        show_usage()
 
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
+def main():
+    """Main function."""
+    args = _parser()
 
     printer = PrintCode()
 
-    api = gl_XML.parse_GL_API(file_name, glX_XML.glx_item_factory())
+    api = gl_XML.parse_GL_API(args.filename, glX_XML.glx_item_factory())
     printer.Print(api)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/gl_procs.py b/src/mapi/glapi/gen/gl_procs.py
index b1fffc4..685e2fa 100644
--- a/src/mapi/glapi/gen/gl_procs.py
+++ b/src/mapi/glapi/gen/gl_procs.py
@@ -25,9 +25,12 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+
 import license
-import gl_XML, glX_XML
-import sys, getopt
+import gl_XML
+import glX_XML
+
 
 class PrintGlProcs(gl_XML.gl_print_base):
     def __init__(self, es=False):
@@ -39,7 +42,6 @@ class PrintGlProcs(gl_XML.gl_print_base):
 """Copyright (C) 1999-2001  Brian Paul   All Rights Reserved.
 (C) Copyright IBM Corporation 2004, 2006""", "BRIAN PAUL, IBM")
 
-
     def printRealHeader(self):
         print """
 /* This file is only included by glapi.c and is used for
@@ -161,26 +163,28 @@ typedef struct {
         return
 
 
-def show_usage():
-    print "Usage: %s [-f input_file_name] [-c]" % sys.argv[0]
-    print "-c          Enable compatibility with OpenGL ES."
-    sys.exit(1)
+def _parser():
+    """Parse arguments and return a namespace."""
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--filename',
+                        default='gl_API.xml',
+                        metavar="input_file_name",
+                        dest='file_name',
+                        help="Path to an XML description of OpenGL API.")
+    parser.add_argument('-c', '--es-version',
+                        dest='es',
+                        action="store_true",
+                        help="filter functions for es")
+    return parser.parse_args()
+
+
+def main():
+    """Main function."""
+    args = _parser()
+    api = gl_XML.parse_GL_API(args.file_name, glX_XML.glx_item_factory())
+    PrintGlProcs(args.es).Print(api)
+
 
 if __name__ == '__main__':
-    file_name = "gl_API.xml"
-
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "f:c")
-    except Exception,e:
-        show_usage()
-
-    es = False
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
-        elif arg == "-c":
-            es = True
-
-    api = gl_XML.parse_GL_API(file_name, glX_XML.glx_item_factory())
-    printer = PrintGlProcs(es)
-    printer.Print(api)
+    main()
diff --git a/src/mapi/glapi/gen/gl_table.py b/src/mapi/glapi/gen/gl_table.py
index fd38468..e25971a 100644
--- a/src/mapi/glapi/gen/gl_table.py
+++ b/src/mapi/glapi/gen/gl_table.py
@@ -2,6 +2,7 @@
 
 # (C) Copyright IBM Corporation 2004
 # All Rights Reserved.
+# Copyright (c) 2014 Intel Corporation
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -25,35 +26,35 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+
 import gl_XML
 import license
-import sys, getopt
+
 
 class PrintGlTable(gl_XML.gl_print_base):
-    def __init__(self, es=False):
+    def __init__(self):
         gl_XML.gl_print_base.__init__(self)
 
-        self.es = es
         self.header_tag = '_GLAPI_TABLE_H_'
         self.name = "gl_table.py (from Mesa)"
         self.license = license.bsd_license_template % ( \
 """Copyright (C) 1999-2003  Brian Paul   All Rights Reserved.
 (C) Copyright IBM Corporation 2004""", "BRIAN PAUL, IBM")
-        self.ifdef_emitted = False;
+        self.ifdef_emitted = False
         return
 
-
     def printBody(self, api):
         for f in api.functionIterateByOffset():
             if not f.is_abi() and not self.ifdef_emitted:
                 print '#if !defined HAVE_SHARED_GLAPI'
                 self.ifdef_emitted = True
             arg_string = f.get_parameter_string()
-            print '   %s (GLAPIENTRYP %s)(%s); /* %d */' % (f.return_type, f.name, arg_string, f.offset)
+            print '   %s (GLAPIENTRYP %s)(%s); /* %d */' % (
+                f.return_type, f.name, arg_string, f.offset)
 
         print '#endif /* !defined HAVE_SHARED_GLAPI */'
 
-
     def printRealHeader(self):
         print '#ifndef GLAPIENTRYP'
         print '# ifndef GLAPIENTRY'
@@ -68,20 +69,19 @@ class PrintGlTable(gl_XML.gl_print_base):
         print '{'
         return
 
-
     def printRealFooter(self):
         print '};'
         return
 
 
 class PrintRemapTable(gl_XML.gl_print_base):
-    def __init__(self, es=False):
+    def __init__(self):
         gl_XML.gl_print_base.__init__(self)
 
-        self.es = es
         self.header_tag = '_DISPATCH_H_'
         self.name = "gl_table.py (from Mesa)"
-        self.license = license.bsd_license_template % ("(C) Copyright IBM Corporation 2005", "IBM")
+        self.license = license.bsd_license_template % (
+            "(C) Copyright IBM Corporation 2005", "IBM")
         return
 
 
@@ -100,6 +100,7 @@ class PrintRemapTable(gl_XML.gl_print_base):
 """
         return
 
+
     def printBody(self, api):
         print '#define CALL_by_offset(disp, cast, offset, parameters) \\'
         print '    (*(cast (GET_by_offset(disp, offset)))) parameters'
@@ -120,19 +121,13 @@ class PrintRemapTable(gl_XML.gl_print_base):
 
         functions = []
         abi_functions = []
-        alias_functions = []
         count = 0
         for f in api.functionIterateByOffset():
             if not f.is_abi():
-                functions.append( [f, count] )
+                functions.append([f, count])
                 count += 1
             else:
-                abi_functions.append( [f, -1] )
-
-            if self.es:
-                # remember functions with aliases
-                if len(f.entry_points) > 1:
-                    alias_functions.append(f)
+                abi_functions.append([f, -1])
 
         print '/* total number of offsets below */'
         print '#define _gloffset_COUNT %d' % (len(abi_functions + functions))
@@ -141,18 +136,11 @@ class PrintRemapTable(gl_XML.gl_print_base):
         for f, index in abi_functions:
             print '#define _gloffset_%s %d' % (f.name, f.offset)
 
-        if self.es:
-            remap_table = "esLocalRemapTable"
-
-            print '#define %s_size %u' % (remap_table, count)
-            print 'static int %s[ %s_size ];' % (remap_table, remap_table)
-            print ''
-        else:
-            remap_table = "driDispatchRemapTable"
+        remap_table = "driDispatchRemapTable"
 
-            print '#define %s_size %u' % (remap_table, count)
-            print 'extern int %s[ %s_size ];' % (remap_table, remap_table)
-            print ''
+        print '#define %s_size %u' % (remap_table, count)
+        print 'extern int %s[ %s_size ];' % (remap_table, remap_table)
+        print ''
 
         for f, index in functions:
             print '#define %s_remap_index %u' % (f.name, index)
@@ -165,7 +153,7 @@ class PrintRemapTable(gl_XML.gl_print_base):
         print ''
 
         for f, index in abi_functions + functions:
-            arg_string = gl_XML.create_parameter_string( f.parameters, 0 )
+            arg_string = gl_XML.create_parameter_string(f.parameters, 0)
 
             print 'typedef %s (GLAPIENTRYP _glptr_%s)(%s);' % (f.return_type, f.name, arg_string)
             print '#define CALL_%s(disp, parameters) \\' % (f.name)
@@ -179,60 +167,38 @@ class PrintRemapTable(gl_XML.gl_print_base):
             print '}'
             print
 
-        if alias_functions:
-            print ''
-            print '/* define aliases for compatibility */'
-            for f in alias_functions:
-                for name in f.entry_points:
-                    if name != f.name:
-                        print '#define CALL_%s(disp, parameters) CALL_%s(disp, parameters)' % (name, f.name)
-                        print '#define GET_%s(disp) GET_%s(disp)' % (name, f.name)
-                        print '#define SET_%s(disp, fn) SET_%s(disp, fn)' % (name, f.name)
-            print ''
-
-            for f in alias_functions:
-                for name in f.entry_points:
-                    if name != f.name:
-                        print '#define %s_remap_index %s_remap_index' % (name, f.name)
-            print ''
-
         return
 
 
-def show_usage():
-    print "Usage: %s [-f input_file_name] [-m mode] [-c ver]" % sys.argv[0]
-    print "    -m mode   Mode can be 'table' or 'remap_table'."
-    print "    -c ver    Version can be 'es1' or 'es2'."
-    sys.exit(1)
+def _parser():
+    """Parse arguments and return a namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--filename',
+                        default='gl_API.xml',
+                        metavar="input_file_name",
+                        dest='file_name',
+                        help="Path to an XML description of OpenGL API.")
+    parser.add_argument('-m', '--mode',
+                        choices=['table', 'remap_table'],
+                        default='table',
+                        metavar="mode",
+                        help="Generate either a table or a remap_table")
+    return parser.parse_args()
+
+
+def main():
+    """Main function."""
+    args = _parser()
+
+    api = gl_XML.parse_GL_API(args.file_name)
+
+    if args.mode == "table":
+        printer = PrintGlTable()
+    elif args.mode == "remap_table":
+        printer = PrintRemapTable()
+
+    printer.Print(api)
+
 
 if __name__ == '__main__':
-    file_name = "gl_API.xml"
-
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "f:m:c:")
-    except Exception,e:
-        show_usage()
-
-    mode = "table"
-    es = None
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
-        elif arg == "-m":
-            mode = val
-        elif arg == "-c":
-            es = val
-
-    if mode == "table":
-        printer = PrintGlTable(es)
-    elif mode == "remap_table":
-        printer = PrintRemapTable(es)
-    else:
-        show_usage()
-
-    api = gl_XML.parse_GL_API( file_name )
-
-    if es is not None:
-        api.filter_functions_by_api(es)
-
-    printer.Print( api )
+    main()
diff --git a/src/mapi/glapi/gen/gl_x86-64_asm.py b/src/mapi/glapi/gen/gl_x86-64_asm.py
index 7afc2b1..cf42371 100644
--- a/src/mapi/glapi/gen/gl_x86-64_asm.py
+++ b/src/mapi/glapi/gen/gl_x86-64_asm.py
@@ -25,9 +25,11 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+import copy
+
 import license
 import gl_XML, glX_XML
-import sys, getopt, copy
 
 def should_use_push(registers):
     for [reg, offset] in registers:
@@ -289,30 +291,25 @@ class PrintGenericStubs(gl_XML.gl_print_base):
 
         return
 
-def show_usage():
-    print "Usage: %s [-f input_file_name] [-m output_mode]" % sys.argv[0]
-    sys.exit(1)
 
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
-    mode = "generic"
-
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "m:f:")
-    except Exception,e:
-        show_usage()
-
-    for (arg,val) in args:
-        if arg == '-m':
-            mode = val
-        elif arg == "-f":
-            file_name = val
-
-    if mode == "generic":
-        printer = PrintGenericStubs()
-    else:
-        print "ERROR: Invalid mode \"%s\" specified." % mode
-        show_usage()
+def _parser():
+    """Parse arguments and return a namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f',
+                        default='gl_API.xml',
+                        dest='filename',
+                        help='An XML file describing an API')
+    return parser.parse_args()
+
+
+def main():
+    """Main function."""
+    args = _parser()
+    printer = PrintGenericStubs()
+    api = gl_XML.parse_GL_API(args.filename, glX_XML.glx_item_factory())
 
-    api = gl_XML.parse_GL_API(file_name, glX_XML.glx_item_factory())
     printer.Print(api)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/gl_x86_asm.py b/src/mapi/glapi/gen/gl_x86_asm.py
index f855dba..c0c7941 100644
--- a/src/mapi/glapi/gen/gl_x86_asm.py
+++ b/src/mapi/glapi/gen/gl_x86_asm.py
@@ -25,9 +25,10 @@
 # Authors:
 #    Ian Romanick <idr@us.ibm.com>
 
+import argparse
+
 import license
 import gl_XML, glX_XML
-import sys, getopt
 
 class PrintGenericStubs(gl_XML.gl_print_base):
 
@@ -217,30 +218,22 @@ class PrintGenericStubs(gl_XML.gl_print_base):
 
         return
 
-def show_usage():
-    print "Usage: %s [-f input_file_name] [-m output_mode]" % sys.argv[0]
-    sys.exit(1)
+def _parser():
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f',
+                        dest='filename',
+                        default='gl_API.xml',
+                        help='An XML file describing an API.')
+    return parser.parse_args()
 
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
-    mode = "generic"
-
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "m:f:")
-    except Exception,e:
-        show_usage()
-
-    for (arg,val) in args:
-        if arg == '-m':
-            mode = val
-        elif arg == "-f":
-            file_name = val
-
-    if mode == "generic":
-        printer = PrintGenericStubs()
-    else:
-        print "ERROR: Invalid mode \"%s\" specified." % mode
-        show_usage()
-
-    api = gl_XML.parse_GL_API(file_name, glX_XML.glx_item_factory())
+
+def main():
+    args = _parser()
+    printer = PrintGenericStubs()
+
+    api = gl_XML.parse_GL_API(args.filename, glX_XML.glx_item_factory())
     printer.Print(api)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/remap_helper.py b/src/mapi/glapi/gen/remap_helper.py
index e1a13d0..edc6c3e 100644
--- a/src/mapi/glapi/gen/remap_helper.py
+++ b/src/mapi/glapi/gen/remap_helper.py
@@ -24,9 +24,11 @@
 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 # IN THE SOFTWARE.
 
-import gl_XML
+import argparse
+
 import license
-import sys, getopt, string
+import gl_XML
+
 
 def get_function_spec(func):
     sig = ""
@@ -54,6 +56,7 @@ def get_function_spec(func):
 
     return spec
 
+
 class PrintGlRemap(gl_XML.gl_print_base):
     def __init__(self):
         gl_XML.gl_print_base.__init__(self)
@@ -163,30 +166,26 @@ class PrintGlRemap(gl_XML.gl_print_base):
         return
 
 
-def show_usage():
-    print "Usage: %s [-f input_file_name] [-c ver]" % sys.argv[0]
-    print "    -c ver    Version can be 'es1' or 'es2'."
-    sys.exit(1)
+def _parser():
+    """Parse input options and return a namespace."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f', '--filename',
+                        default="gl_API.xml",
+                        metavar="input_file_name",
+                        dest='file_name',
+                        help="An xml description file.")
+    return parser.parse_args()
 
-if __name__ == '__main__':
-    file_name = "gl_API.xml"
 
-    try:
-        (args, trail) = getopt.getopt(sys.argv[1:], "f:c:")
-    except Exception,e:
-        show_usage()
+def main():
+    """Main function."""
+    args = _parser()
 
-    es = None
-    for (arg,val) in args:
-        if arg == "-f":
-            file_name = val
-        elif arg == "-c":
-            es = val
+    api = gl_XML.parse_GL_API(args.file_name)
 
-    api = gl_XML.parse_GL_API( file_name )
+    printer = PrintGlRemap()
+    printer.Print(api)
 
-    if es is not None:
-        api.filter_functions_by_api(es)
 
-    printer = PrintGlRemap()
-    printer.Print( api )
+if __name__ == '__main__':
+    main()
diff --git a/src/mapi/glapi/gen/static_data.py b/src/mapi/glapi/gen/static_data.py
new file mode 100644
index 0000000..142c503
--- /dev/null
+++ b/src/mapi/glapi/gen/static_data.py
@@ -0,0 +1,1729 @@
+#!/usr/bin/env python
+
+# Copyright (C) 2015 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+"""Table of functions that have ABI-mandated offsets in the dispatch table.
+
+This list will never change."""
+offsets = {
+    "NewList": 0,
+    "EndList": 1,
+    "CallList": 2,
+    "CallLists": 3,
+    "DeleteLists": 4,
+    "GenLists": 5,
+    "ListBase": 6,
+    "Begin": 7,
+    "Bitmap": 8,
+    "Color3b": 9,
+    "Color3bv": 10,
+    "Color3d": 11,
+    "Color3dv": 12,
+    "Color3f": 13,
+    "Color3fv": 14,
+    "Color3i": 15,
+    "Color3iv": 16,
+    "Color3s": 17,
+    "Color3sv": 18,
+    "Color3ub": 19,
+    "Color3ubv": 20,
+    "Color3ui": 21,
+    "Color3uiv": 22,
+    "Color3us": 23,
+    "Color3usv": 24,
+    "Color4b": 25,
+    "Color4bv": 26,
+    "Color4d": 27,
+    "Color4dv": 28,
+    "Color4f": 29,
+    "Color4fv": 30,
+    "Color4i": 31,
+    "Color4iv": 32,
+    "Color4s": 33,
+    "Color4sv": 34,
+    "Color4ub": 35,
+    "Color4ubv": 36,
+    "Color4ui": 37,
+    "Color4uiv": 38,
+    "Color4us": 39,
+    "Color4usv": 40,
+    "EdgeFlag": 41,
+    "EdgeFlagv": 42,
+    "End": 43,
+    "Indexd": 44,
+    "Indexdv": 45,
+    "Indexf": 46,
+    "Indexfv": 47,
+    "Indexi": 48,
+    "Indexiv": 49,
+    "Indexs": 50,
+    "Indexsv": 51,
+    "Normal3b": 52,
+    "Normal3bv": 53,
+    "Normal3d": 54,
+    "Normal3dv": 55,
+    "Normal3f": 56,
+    "Normal3fv": 57,
+    "Normal3i": 58,
+    "Normal3iv": 59,
+    "Normal3s": 60,
+    "Normal3sv": 61,
+    "RasterPos2d": 62,
+    "RasterPos2dv": 63,
+    "RasterPos2f": 64,
+    "RasterPos2fv": 65,
+    "RasterPos2i": 66,
+    "RasterPos2iv": 67,
+    "RasterPos2s": 68,
+    "RasterPos2sv": 69,
+    "RasterPos3d": 70,
+    "RasterPos3dv": 71,
+    "RasterPos3f": 72,
+    "RasterPos3fv": 73,
+    "RasterPos3i": 74,
+    "RasterPos3iv": 75,
+    "RasterPos3s": 76,
+    "RasterPos3sv": 77,
+    "RasterPos4d": 78,
+    "RasterPos4dv": 79,
+    "RasterPos4f": 80,
+    "RasterPos4fv": 81,
+    "RasterPos4i": 82,
+    "RasterPos4iv": 83,
+    "RasterPos4s": 84,
+    "RasterPos4sv": 85,
+    "Rectd": 86,
+    "Rectdv": 87,
+    "Rectf": 88,
+    "Rectfv": 89,
+    "Recti": 90,
+    "Rectiv": 91,
+    "Rects": 92,
+    "Rectsv": 93,
+    "TexCoord1d": 94,
+    "TexCoord1dv": 95,
+    "TexCoord1f": 96,
+    "TexCoord1fv": 97,
+    "TexCoord1i": 98,
+    "TexCoord1iv": 99,
+    "TexCoord1s": 100,
+    "TexCoord1sv": 101,
+    "TexCoord2d": 102,
+    "TexCoord2dv": 103,
+    "TexCoord2f": 104,
+    "TexCoord2fv": 105,
+    "TexCoord2i": 106,
+    "TexCoord2iv": 107,
+    "TexCoord2s": 108,
+    "TexCoord2sv": 109,
+    "TexCoord3d": 110,
+    "TexCoord3dv": 111,
+    "TexCoord3f": 112,
+    "TexCoord3fv": 113,
+    "TexCoord3i": 114,
+    "TexCoord3iv": 115,
+    "TexCoord3s": 116,
+    "TexCoord3sv": 117,
+    "TexCoord4d": 118,
+    "TexCoord4dv": 119,
+    "TexCoord4f": 120,
+    "TexCoord4fv": 121,
+    "TexCoord4i": 122,
+    "TexCoord4iv": 123,
+    "TexCoord4s": 124,
+    "TexCoord4sv": 125,
+    "Vertex2d": 126,
+    "Vertex2dv": 127,
+    "Vertex2f": 128,
+    "Vertex2fv": 129,
+    "Vertex2i": 130,
+    "Vertex2iv": 131,
+    "Vertex2s": 132,
+    "Vertex2sv": 133,
+    "Vertex3d": 134,
+    "Vertex3dv": 135,
+    "Vertex3f": 136,
+    "Vertex3fv": 137,
+    "Vertex3i": 138,
+    "Vertex3iv": 139,
+    "Vertex3s": 140,
+    "Vertex3sv": 141,
+    "Vertex4d": 142,
+    "Vertex4dv": 143,
+    "Vertex4f": 144,
+    "Vertex4fv": 145,
+    "Vertex4i": 146,
+    "Vertex4iv": 147,
+    "Vertex4s": 148,
+    "Vertex4sv": 149,
+    "ClipPlane": 150,
+    "ColorMaterial": 151,
+    "CullFace": 152,
+    "Fogf": 153,
+    "Fogfv": 154,
+    "Fogi": 155,
+    "Fogiv": 156,
+    "FrontFace": 157,
+    "Hint": 158,
+    "Lightf": 159,
+    "Lightfv": 160,
+    "Lighti": 161,
+    "Lightiv": 162,
+    "LightModelf": 163,
+    "LightModelfv": 164,
+    "LightModeli": 165,
+    "LightModeliv": 166,
+    "LineStipple": 167,
+    "LineWidth": 168,
+    "Materialf": 169,
+    "Materialfv": 170,
+    "Materiali": 171,
+    "Materialiv": 172,
+    "PointSize": 173,
+    "PolygonMode": 174,
+    "PolygonStipple": 175,
+    "Scissor": 176,
+    "ShadeModel": 177,
+    "TexParameterf": 178,
+    "TexParameterfv": 179,
+    "TexParameteri": 180,
+    "TexParameteriv": 181,
+    "TexImage1D": 182,
+    "TexImage2D": 183,
+    "TexEnvf": 184,
+    "TexEnvfv": 185,
+    "TexEnvi": 186,
+    "TexEnviv": 187,
+    "TexGend": 188,
+    "TexGendv": 189,
+    "TexGenf": 190,
+    "TexGenfv": 191,
+    "TexGeni": 192,
+    "TexGeniv": 193,
+    "FeedbackBuffer": 194,
+    "SelectBuffer": 195,
+    "RenderMode": 196,
+    "InitNames": 197,
+    "LoadName": 198,
+    "PassThrough": 199,
+    "PopName": 200,
+    "PushName": 201,
+    "DrawBuffer": 202,
+    "Clear": 203,
+    "ClearAccum": 204,
+    "ClearIndex": 205,
+    "ClearColor": 206,
+    "ClearStencil": 207,
+    "ClearDepth": 208,
+    "StencilMask": 209,
+    "ColorMask": 210,
+    "DepthMask": 211,
+    "IndexMask": 212,
+    "Accum": 213,
+    "Disable": 214,
+    "Enable": 215,
+    "Finish": 216,
+    "Flush": 217,
+    "PopAttrib": 218,
+    "PushAttrib": 219,
+    "Map1d": 220,
+    "Map1f": 221,
+    "Map2d": 222,
+    "Map2f": 223,
+    "MapGrid1d": 224,
+    "MapGrid1f": 225,
+    "MapGrid2d": 226,
+    "MapGrid2f": 227,
+    "EvalCoord1d": 228,
+    "EvalCoord1dv": 229,
+    "EvalCoord1f": 230,
+    "EvalCoord1fv": 231,
+    "EvalCoord2d": 232,
+    "EvalCoord2dv": 233,
+    "EvalCoord2f": 234,
+    "EvalCoord2fv": 235,
+    "EvalMesh1": 236,
+    "EvalPoint1": 237,
+    "EvalMesh2": 238,
+    "EvalPoint2": 239,
+    "AlphaFunc": 240,
+    "BlendFunc": 241,
+    "LogicOp": 242,
+    "StencilFunc": 243,
+    "StencilOp": 244,
+    "DepthFunc": 245,
+    "PixelZoom": 246,
+    "PixelTransferf": 247,
+    "PixelTransferi": 248,
+    "PixelStoref": 249,
+    "PixelStorei": 250,
+    "PixelMapfv": 251,
+    "PixelMapuiv": 252,
+    "PixelMapusv": 253,
+    "ReadBuffer": 254,
+    "CopyPixels": 255,
+    "ReadPixels": 256,
+    "DrawPixels": 257,
+    "GetBooleanv": 258,
+    "GetClipPlane": 259,
+    "GetDoublev": 260,
+    "GetError": 261,
+    "GetFloatv": 262,
+    "GetIntegerv": 263,
+    "GetLightfv": 264,
+    "GetLightiv": 265,
+    "GetMapdv": 266,
+    "GetMapfv": 267,
+    "GetMapiv": 268,
+    "GetMaterialfv": 269,
+    "GetMaterialiv": 270,
+    "GetPixelMapfv": 271,
+    "GetPixelMapuiv": 272,
+    "GetPixelMapusv": 273,
+    "GetPolygonStipple": 274,
+    "GetString": 275,
+    "GetTexEnvfv": 276,
+    "GetTexEnviv": 277,
+    "GetTexGendv": 278,
+    "GetTexGenfv": 279,
+    "GetTexGeniv": 280,
+    "GetTexImage": 281,
+    "GetTexParameterfv": 282,
+    "GetTexParameteriv": 283,
+    "GetTexLevelParameterfv": 284,
+    "GetTexLevelParameteriv": 285,
+    "IsEnabled": 286,
+    "IsList": 287,
+    "DepthRange": 288,
+    "Frustum": 289,
+    "LoadIdentity": 290,
+    "LoadMatrixf": 291,
+    "LoadMatrixd": 292,
+    "MatrixMode": 293,
+    "MultMatrixf": 294,
+    "MultMatrixd": 295,
+    "Ortho": 296,
+    "PopMatrix": 297,
+    "PushMatrix": 298,
+    "Rotated": 299,
+    "Rotatef": 300,
+    "Scaled": 301,
+    "Scalef": 302,
+    "Translated": 303,
+    "Translatef": 304,
+    "Viewport": 305,
+    "ArrayElement": 306,
+    "ColorPointer": 308,
+    "DisableClientState": 309,
+    "DrawArrays": 310,
+    "DrawElements": 311,
+    "EdgeFlagPointer": 312,
+    "EnableClientState": 313,
+    "GetPointerv": 329,
+    "IndexPointer": 314,
+    "InterleavedArrays": 317,
+    "NormalPointer": 318,
+    "TexCoordPointer": 320,
+    "VertexPointer": 321,
+    "PolygonOffset": 319,
+    "CopyTexImage1D": 323,
+    "CopyTexImage2D": 324,
+    "CopyTexSubImage1D": 325,
+    "CopyTexSubImage2D": 326,
+    "TexSubImage1D": 332,
+    "TexSubImage2D": 333,
+    "AreTexturesResident": 322,
+    "BindTexture": 307,
+    "DeleteTextures": 327,
+    "GenTextures": 328,
+    "IsTexture": 330,
+    "PrioritizeTextures": 331,
+    "Indexub": 315,
+    "Indexubv": 316,
+    "PopClientAttrib": 334,
+    "PushClientAttrib": 335,
+    "BlendColor": 336,
+    "BlendEquation": 337,
+    "DrawRangeElements": 338,
+    "ColorTable": 339,
+    "ColorTableParameterfv": 340,
+    "ColorTableParameteriv": 341,
+    "CopyColorTable": 342,
+    "GetColorTable": 343,
+    "GetColorTableParameterfv": 344,
+    "GetColorTableParameteriv": 345,
+    "ColorSubTable": 346,
+    "CopyColorSubTable": 347,
+    "ConvolutionFilter1D": 348,
+    "ConvolutionFilter2D": 349,
+    "ConvolutionParameterf": 350,
+    "ConvolutionParameterfv": 351,
+    "ConvolutionParameteri": 352,
+    "ConvolutionParameteriv": 353,
+    "CopyConvolutionFilter1D": 354,
+    "CopyConvolutionFilter2D": 355,
+    "GetConvolutionFilter": 356,
+    "GetConvolutionParameterfv": 357,
+    "GetConvolutionParameteriv": 358,
+    "GetSeparableFilter": 359,
+    "SeparableFilter2D": 360,
+    "GetHistogram": 361,
+    "GetHistogramParameterfv": 362,
+    "GetHistogramParameteriv": 363,
+    "GetMinmax": 364,
+    "GetMinmaxParameterfv": 365,
+    "GetMinmaxParameteriv": 366,
+    "Histogram": 367,
+    "Minmax": 368,
+    "ResetHistogram": 369,
+    "ResetMinmax": 370,
+    "TexImage3D": 371,
+    "TexSubImage3D": 372,
+    "CopyTexSubImage3D": 373,
+    "ActiveTexture": 374,
+    "ClientActiveTexture": 375,
+    "MultiTexCoord1d": 376,
+    "MultiTexCoord1dv": 377,
+    "MultiTexCoord1fARB": 378,
+    "MultiTexCoord1fvARB": 379,
+    "MultiTexCoord1i": 380,
+    "MultiTexCoord1iv": 381,
+    "MultiTexCoord1s": 382,
+    "MultiTexCoord1sv": 383,
+    "MultiTexCoord2d": 384,
+    "MultiTexCoord2dv": 385,
+    "MultiTexCoord2fARB": 386,
+    "MultiTexCoord2fvARB": 387,
+    "MultiTexCoord2i": 388,
+    "MultiTexCoord2iv": 389,
+    "MultiTexCoord2s": 390,
+    "MultiTexCoord2sv": 391,
+    "MultiTexCoord3d": 392,
+    "MultiTexCoord3dv": 393,
+    "MultiTexCoord3fARB": 394,
+    "MultiTexCoord3fvARB": 395,
+    "MultiTexCoord3i": 396,
+    "MultiTexCoord3iv": 397,
+    "MultiTexCoord3s": 398,
+    "MultiTexCoord3sv": 399,
+    "MultiTexCoord4d": 400,
+    "MultiTexCoord4dv": 401,
+    "MultiTexCoord4fARB": 402,
+    "MultiTexCoord4fvARB": 403,
+    "MultiTexCoord4i": 404,
+    "MultiTexCoord4iv": 405,
+    "MultiTexCoord4s": 406,
+    "MultiTexCoord4sv": 407
+}
+
+functions = [
+   "Accum",
+   "ActiveTexture",
+   "ActiveTextureARB",
+   "AlphaFunc",
+   "AlphaFuncx",
+   "AreTexturesResident",
+   "AreTexturesResidentEXT",
+   "ArrayElement",
+   "ArrayElementEXT",
+   "AttachObjectARB",
+   "AttachShader",
+   "Begin",
+   "BeginConditionalRender",
+   "BeginConditionalRenderNV",
+   "BeginQuery",
+   "BeginQueryARB",
+   "BeginQueryIndexed",
+   "BeginTransformFeedback",
+   "BindAttribLocation",
+   "BindAttribLocationARB",
+   "BindBuffer",
+   "BindBufferARB",
+   "BindBufferBase",
+   "BindBufferRange",
+   "BindBuffersBase",
+   "BindBuffersRange",
+   "BindFragDataLocation",
+   "BindFragDataLocationEXT",
+   "BindFragDataLocationIndexed",
+   "BindFramebuffer",
+   "BindFramebufferEXT",
+   "BindImageTexture",
+   "BindImageTextures",
+   "BindProgramARB",
+   "BindRenderbuffer",
+   "BindRenderbufferEXT",
+   "BindSampler",
+   "BindSamplers",
+   "BindTexture",
+   "BindTextureEXT",
+   "BindTextures",
+   "BindTransformFeedback",
+   "BindVertexArray",
+   "BindVertexBuffer",
+   "BindVertexBuffers",
+   "Bitmap",
+   "BlendColor",
+   "BlendColorEXT",
+   "BlendEquation",
+   "BlendEquationEXT",
+   "BlendEquationiARB",
+   "BlendEquationSeparate",
+   "BlendEquationSeparateiARB",
+   "BlendFunc",
+   "BlendFunciARB",
+   "BlendFuncSeparate",
+   "BlendFuncSeparateEXT",
+   "BlendFuncSeparateiARB",
+   "BlitFramebuffer",
+   "BufferData",
+   "BufferDataARB",
+   "BufferStorage",
+   "BufferSubData",
+   "BufferSubDataARB",
+   "CallList",
+   "CallLists",
+   "CheckFramebufferStatus",
+   "CheckFramebufferStatusEXT",
+   "ClampColor",
+   "ClampColorARB",
+   "Clear",
+   "ClearAccum",
+   "ClearBufferData",
+   "ClearBufferfi",
+   "ClearBufferfv",
+   "ClearBufferiv",
+   "ClearBufferSubData",
+   "ClearBufferuiv",
+   "ClearColor",
+   "ClearColorIiEXT",
+   "ClearColorIuiEXT",
+   "ClearColorx",
+   "ClearDepth",
+   "ClearDepthf",
+   "ClearDepthx",
+   "ClearIndex",
+   "ClearStencil",
+   "ClearTexImage",
+   "ClearTexSubImage",
+   "ClientActiveTexture",
+   "ClientActiveTextureARB",
+   "ClientWaitSync",
+   "ClipPlane",
+   "ClipPlanef",
+   "ClipPlanex",
+   "Color3b",
+   "Color3bv",
+   "Color3d",
+   "Color3dv",
+   "Color3f",
+   "Color3fv",
+   "Color3i",
+   "Color3iv",
+   "Color3s",
+   "Color3sv",
+   "Color3ub",
+   "Color3ubv",
+   "Color3ui",
+   "Color3uiv",
+   "Color3us",
+   "Color3usv",
+   "Color4b",
+   "Color4bv",
+   "Color4d",
+   "Color4dv",
+   "Color4f",
+   "Color4fv",
+   "Color4i",
+   "Color4iv",
+   "Color4s",
+   "Color4sv",
+   "Color4ub",
+   "Color4ubv",
+   "Color4ui",
+   "Color4uiv",
+   "Color4us",
+   "Color4usv",
+   "Color4x",
+   "ColorMask",
+   "ColorMaski",
+   "ColorMaskIndexedEXT",
+   "ColorMaterial",
+   "ColorP3ui",
+   "ColorP3uiv",
+   "ColorP4ui",
+   "ColorP4uiv",
+   "ColorPointer",
+   "ColorPointerEXT",
+   "ColorSubTable",
+   "ColorTable",
+   "ColorTableParameterfv",
+   "ColorTableParameteriv",
+   "CompileShader",
+   "CompileShaderARB",
+   "CompressedTexImage1D",
+   "CompressedTexImage1DARB",
+   "CompressedTexImage2D",
+   "CompressedTexImage2DARB",
+   "CompressedTexImage3D",
+   "CompressedTexImage3DARB",
+   "CompressedTexSubImage1D",
+   "CompressedTexSubImage1DARB",
+   "CompressedTexSubImage2D",
+   "CompressedTexSubImage2DARB",
+   "CompressedTexSubImage3D",
+   "CompressedTexSubImage3DARB",
+   "ConvolutionFilter1D",
+   "ConvolutionFilter2D",
+   "ConvolutionParameterf",
+   "ConvolutionParameterfv",
+   "ConvolutionParameteri",
+   "ConvolutionParameteriv",
+   "CopyBufferSubData",
+   "CopyColorSubTable",
+   "CopyColorTable",
+   "CopyConvolutionFilter1D",
+   "CopyConvolutionFilter2D",
+   "CopyImageSubData",
+   "CopyPixels",
+   "CopyTexImage1D",
+   "CopyTexImage2D",
+   "CopyTexSubImage1D",
+   "CopyTexSubImage2D",
+   "CopyTexSubImage3D",
+   "CopyTexSubImage3DEXT",
+   "CreateProgram",
+   "CreateProgramObjectARB",
+   "CreateShader",
+   "CreateShaderObjectARB",
+   "CullFace",
+   "DebugMessageCallback",
+   "DebugMessageCallbackARB",
+   "DebugMessageControl",
+   "DebugMessageControlARB",
+   "DebugMessageInsert",
+   "DebugMessageInsertARB",
+   "DeleteBuffers",
+   "DeleteBuffersARB",
+   "DeleteFramebuffers",
+   "DeleteFramebuffersEXT",
+   "DeleteLists",
+   "DeleteObjectARB",
+   "DeleteProgram",
+   "DeleteProgramsARB",
+   "DeleteQueries",
+   "DeleteQueriesARB",
+   "DeleteRenderbuffers",
+   "DeleteRenderbuffersEXT",
+   "DeleteSamplers",
+   "DeleteShader",
+   "DeleteSync",
+   "DeleteTextures",
+   "DeleteTexturesEXT",
+   "DeleteTransformFeedbacks",
+   "DeleteVertexArrays",
+   "DepthFunc",
+   "DepthMask",
+   "DepthRange",
+   "DepthRangeArrayv",
+   "DepthRangef",
+   "DepthRangeIndexed",
+   "DepthRangex",
+   "DetachObjectARB",
+   "DetachShader",
+   "Disable",
+   "DisableClientState",
+   "Disablei",
+   "DisableIndexedEXT",
+   "DisableVertexAttribArray",
+   "DisableVertexAttribArrayARB",
+   "DispatchCompute",
+   "DispatchComputeIndirect",
+   "DrawArrays",
+   "DrawArraysEXT",
+   "DrawArraysIndirect",
+   "DrawArraysInstanced",
+   "DrawArraysInstancedARB",
+   "DrawArraysInstancedBaseInstance",
+   "DrawArraysInstancedEXT",
+   "DrawBuffer",
+   "DrawBuffers",
+   "DrawBuffersARB",
+   "DrawBuffersATI",
+   "DrawElements",
+   "DrawElementsBaseVertex",
+   "DrawElementsIndirect",
+   "DrawElementsInstanced",
+   "DrawElementsInstancedARB",
+   "DrawElementsInstancedBaseInstance",
+   "DrawElementsInstancedBaseVertex",
+   "DrawElementsInstancedBaseVertexBaseInstance",
+   "DrawElementsInstancedEXT",
+   "DrawPixels",
+   "DrawRangeElements",
+   "DrawRangeElementsBaseVertex",
+   "DrawRangeElementsEXT",
+   "DrawTransformFeedback",
+   "DrawTransformFeedbackInstanced",
+   "DrawTransformFeedbackStream",
+   "DrawTransformFeedbackStreamInstanced",
+   "EdgeFlag",
+   "EdgeFlagPointer",
+   "EdgeFlagPointerEXT",
+   "EdgeFlagv",
+   "Enable",
+   "EnableClientState",
+   "Enablei",
+   "EnableIndexedEXT",
+   "EnableVertexAttribArray",
+   "EnableVertexAttribArrayARB",
+   "End",
+   "EndConditionalRender",
+   "EndConditionalRenderNV",
+   "EndList",
+   "EndQuery",
+   "EndQueryARB",
+   "EndQueryIndexed",
+   "EndTransformFeedback",
+   "EvalCoord1d",
+   "EvalCoord1dv",
+   "EvalCoord1f",
+   "EvalCoord1fv",
+   "EvalCoord2d",
+   "EvalCoord2dv",
+   "EvalCoord2f",
+   "EvalCoord2fv",
+   "EvalMesh1",
+   "EvalMesh2",
+   "EvalPoint1",
+   "EvalPoint2",
+   "FeedbackBuffer",
+   "FenceSync",
+   "Finish",
+   "Flush",
+   "FlushMappedBufferRange",
+   "FogCoordd",
+   "FogCoorddEXT",
+   "FogCoorddv",
+   "FogCoorddvEXT",
+   "FogCoordf",
+   "FogCoordfEXT",
+   "FogCoordfv",
+   "FogCoordfvEXT",
+   "FogCoordPointer",
+   "FogCoordPointerEXT",
+   "Fogf",
+   "Fogfv",
+   "Fogi",
+   "Fogiv",
+   "Fogx",
+   "Fogxv",
+   "FramebufferRenderbuffer",
+   "FramebufferRenderbufferEXT",
+   "FramebufferTexture",
+   "FramebufferTexture1D",
+   "FramebufferTexture1DEXT",
+   "FramebufferTexture2D",
+   "FramebufferTexture2DEXT",
+   "FramebufferTexture3D",
+   "FramebufferTexture3DEXT",
+   "FramebufferTextureARB",
+   "FramebufferTextureLayer",
+   "FramebufferTextureLayerARB",
+   "FramebufferTextureLayerEXT",
+   "FrontFace",
+   "Frustum",
+   "Frustumf",
+   "Frustumx",
+   "GenBuffers",
+   "GenBuffersARB",
+   "GenerateMipmap",
+   "GenerateMipmapEXT",
+   "GenFramebuffers",
+   "GenFramebuffersEXT",
+   "GenLists",
+   "GenProgramsARB",
+   "GenQueries",
+   "GenQueriesARB",
+   "GenRenderbuffers",
+   "GenRenderbuffersEXT",
+   "GenSamplers",
+   "GenTextures",
+   "GenTexturesEXT",
+   "GenTransformFeedbacks",
+   "GenVertexArrays",
+   "GetActiveAtomicCounterBufferiv",
+   "GetActiveAttrib",
+   "GetActiveAttribARB",
+   "GetActiveUniform",
+   "GetActiveUniformARB",
+   "GetActiveUniformBlockiv",
+   "GetActiveUniformBlockName",
+   "GetActiveUniformName",
+   "GetActiveUniformsiv",
+   "GetAttachedObjectsARB",
+   "GetAttachedShaders",
+   "GetAttribLocation",
+   "GetAttribLocationARB",
+   "GetBooleanIndexedvEXT",
+   "GetBooleani_v",
+   "GetBooleanv",
+   "GetBufferParameteri64v",
+   "GetBufferParameteriv",
+   "GetBufferParameterivARB",
+   "GetBufferPointerv",
+   "GetBufferPointervARB",
+   "GetBufferSubData",
+   "GetBufferSubDataARB",
+   "GetClipPlane",
+   "GetClipPlanef",
+   "GetClipPlanex",
+   "GetColorTable",
+   "GetColorTableParameterfv",
+   "GetColorTableParameteriv",
+   "GetCompressedTexImage",
+   "GetCompressedTexImageARB",
+   "GetConvolutionFilter",
+   "GetConvolutionParameterfv",
+   "GetConvolutionParameteriv",
+   "GetDebugMessageLog",
+   "GetDebugMessageLogARB",
+   "GetDoublei_v",
+   "GetDoublev",
+   "GetError",
+   "GetFixedv",
+   "GetFloati_v",
+   "GetFloatv",
+   "GetFragDataIndex",
+   "GetFragDataLocation",
+   "GetFragDataLocationEXT",
+   "GetFramebufferAttachmentParameteriv",
+   "GetFramebufferAttachmentParameterivEXT",
+   "GetGraphicsResetStatusARB",
+   "GetHandleARB",
+   "GetHistogram",
+   "GetHistogramParameterfv",
+   "GetHistogramParameteriv",
+   "GetInfoLogARB",
+   "GetInteger64i_v",
+   "GetInteger64v",
+   "GetIntegerIndexedvEXT",
+   "GetIntegeri_v",
+   "GetIntegerv",
+   "GetLightfv",
+   "GetLightiv",
+   "GetLightxv",
+   "GetMapdv",
+   "GetMapfv",
+   "GetMapiv",
+   "GetMaterialfv",
+   "GetMaterialiv",
+   "GetMaterialxv",
+   "GetMinmax",
+   "GetMinmaxParameterfv",
+   "GetMinmaxParameteriv",
+   "GetMultisamplefv",
+   "GetnColorTableARB",
+   "GetnCompressedTexImageARB",
+   "GetnConvolutionFilterARB",
+   "GetnHistogramARB",
+   "GetnMapdvARB",
+   "GetnMapfvARB",
+   "GetnMapivARB",
+   "GetnMinmaxARB",
+   "GetnPixelMapfvARB",
+   "GetnPixelMapuivARB",
+   "GetnPixelMapusvARB",
+   "GetnPolygonStippleARB",
+   "GetnSeparableFilterARB",
+   "GetnTexImageARB",
+   "GetnUniformdvARB",
+   "GetnUniformfvARB",
+   "GetnUniformivARB",
+   "GetnUniformuivARB",
+   "GetObjectLabel",
+   "GetObjectParameterfvARB",
+   "GetObjectParameterivARB",
+   "GetObjectPtrLabel",
+   "GetPixelMapfv",
+   "GetPixelMapuiv",
+   "GetPixelMapusv",
+   "GetPointerv",
+   "GetPointervEXT",
+   "GetPolygonStipple",
+   "GetProgramBinary",
+   "GetProgramEnvParameterdvARB",
+   "GetProgramEnvParameterfvARB",
+   "GetProgramInfoLog",
+   "GetProgramiv",
+   "GetProgramivARB",
+   "GetProgramLocalParameterdvARB",
+   "GetProgramLocalParameterfvARB",
+   "GetProgramStringARB",
+   "GetQueryIndexediv",
+   "GetQueryiv",
+   "GetQueryivARB",
+   "GetQueryObjectiv",
+   "GetQueryObjectivARB",
+   "GetQueryObjectuiv",
+   "GetQueryObjectuivARB",
+   "GetRenderbufferParameteriv",
+   "GetRenderbufferParameterivEXT",
+   "GetSamplerParameterfv",
+   "GetSamplerParameterIiv",
+   "GetSamplerParameterIuiv",
+   "GetSamplerParameteriv",
+   "GetSeparableFilter",
+   "GetShaderInfoLog",
+   "GetShaderiv",
+   "GetShaderPrecisionFormat",
+   "GetShaderSource",
+   "GetShaderSourceARB",
+   "GetString",
+   "GetStringi",
+   "GetSynciv",
+   "GetTexEnvfv",
+   "GetTexEnviv",
+   "GetTexEnvxv",
+   "GetTexGendv",
+   "GetTexGenfv",
+   "GetTexGeniv",
+   "GetTexImage",
+   "GetTexLevelParameterfv",
+   "GetTexLevelParameteriv",
+   "GetTexParameterfv",
+   "GetTexParameterIiv",
+   "GetTexParameterIivEXT",
+   "GetTexParameterIuiv",
+   "GetTexParameterIuivEXT",
+   "GetTexParameteriv",
+   "GetTexParameterxv",
+   "GetTransformFeedbackVarying",
+   "GetUniformBlockIndex",
+   "GetUniformfv",
+   "GetUniformfvARB",
+   "GetUniformIndices",
+   "GetUniformiv",
+   "GetUniformivARB",
+   "GetUniformLocation",
+   "GetUniformLocationARB",
+   "GetUniformuiv",
+   "GetUniformuivEXT",
+   "GetVertexAttribdv",
+   "GetVertexAttribdvARB",
+   "GetVertexAttribfv",
+   "GetVertexAttribfvARB",
+   "GetVertexAttribIiv",
+   "GetVertexAttribIivEXT",
+   "GetVertexAttribIuiv",
+   "GetVertexAttribIuivEXT",
+   "GetVertexAttribiv",
+   "GetVertexAttribivARB",
+   "GetVertexAttribPointerv",
+   "GetVertexAttribPointervARB",
+   "Hint",
+   "Histogram",
+   "Indexd",
+   "Indexdv",
+   "Indexf",
+   "Indexfv",
+   "Indexi",
+   "Indexiv",
+   "IndexMask",
+   "IndexPointer",
+   "IndexPointerEXT",
+   "Indexs",
+   "Indexsv",
+   "Indexub",
+   "Indexubv",
+   "InitNames",
+   "InterleavedArrays",
+   "InvalidateBufferData",
+   "InvalidateBufferSubData",
+   "InvalidateFramebuffer",
+   "InvalidateSubFramebuffer",
+   "InvalidateTexImage",
+   "InvalidateTexSubImage",
+   "IsBuffer",
+   "IsBufferARB",
+   "IsEnabled",
+   "IsEnabledi",
+   "IsEnabledIndexedEXT",
+   "IsFramebuffer",
+   "IsFramebufferEXT",
+   "IsList",
+   "IsProgram",
+   "IsProgramARB",
+   "IsQuery",
+   "IsQueryARB",
+   "IsRenderbuffer",
+   "IsRenderbufferEXT",
+   "IsSampler",
+   "IsShader",
+   "IsSync",
+   "IsTexture",
+   "IsTextureEXT",
+   "IsTransformFeedback",
+   "IsVertexArray",
+   "Lightf",
+   "Lightfv",
+   "Lighti",
+   "Lightiv",
+   "LightModelf",
+   "LightModelfv",
+   "LightModeli",
+   "LightModeliv",
+   "LightModelx",
+   "LightModelxv",
+   "Lightx",
+   "Lightxv",
+   "LineStipple",
+   "LineWidth",
+   "LineWidthx",
+   "LinkProgram",
+   "LinkProgramARB",
+   "ListBase",
+   "LoadIdentity",
+   "LoadMatrixd",
+   "LoadMatrixf",
+   "LoadMatrixx",
+   "LoadName",
+   "LoadTransposeMatrixd",
+   "LoadTransposeMatrixdARB",
+   "LoadTransposeMatrixf",
+   "LoadTransposeMatrixfARB",
+   "LockArraysEXT",
+   "LogicOp",
+   "Map1d",
+   "Map1f",
+   "Map2d",
+   "Map2f",
+   "MapBuffer",
+   "MapBufferARB",
+   "MapBufferRange",
+   "MapGrid1d",
+   "MapGrid1f",
+   "MapGrid2d",
+   "MapGrid2f",
+   "Materialf",
+   "Materialfv",
+   "Materiali",
+   "Materialiv",
+   "Materialx",
+   "Materialxv",
+   "MatrixMode",
+   "MemoryBarrier",
+   "Minmax",
+   "MinSampleShading",
+   "MinSampleShadingARB",
+   "MultiDrawArrays",
+   "MultiDrawArraysEXT",
+   "MultiDrawArraysIndirect",
+   "MultiDrawElements",
+   "MultiDrawElementsBaseVertex",
+   "MultiDrawElementsEXT",
+   "MultiDrawElementsIndirect",
+   "MultiTexCoord1d",
+   "MultiTexCoord1dARB",
+   "MultiTexCoord1dv",
+   "MultiTexCoord1dvARB",
+   "MultiTexCoord1f",
+   "MultiTexCoord1fARB",
+   "MultiTexCoord1fv",
+   "MultiTexCoord1fvARB",
+   "MultiTexCoord1i",
+   "MultiTexCoord1iARB",
+   "MultiTexCoord1iv",
+   "MultiTexCoord1ivARB",
+   "MultiTexCoord1s",
+   "MultiTexCoord1sARB",
+   "MultiTexCoord1sv",
+   "MultiTexCoord1svARB",
+   "MultiTexCoord2d",
+   "MultiTexCoord2dARB",
+   "MultiTexCoord2dv",
+   "MultiTexCoord2dvARB",
+   "MultiTexCoord2f",
+   "MultiTexCoord2fARB",
+   "MultiTexCoord2fv",
+   "MultiTexCoord2fvARB",
+   "MultiTexCoord2i",
+   "MultiTexCoord2iARB",
+   "MultiTexCoord2iv",
+   "MultiTexCoord2ivARB",
+   "MultiTexCoord2s",
+   "MultiTexCoord2sARB",
+   "MultiTexCoord2sv",
+   "MultiTexCoord2svARB",
+   "MultiTexCoord3d",
+   "MultiTexCoord3dARB",
+   "MultiTexCoord3dv",
+   "MultiTexCoord3dvARB",
+   "MultiTexCoord3f",
+   "MultiTexCoord3fARB",
+   "MultiTexCoord3fv",
+   "MultiTexCoord3fvARB",
+   "MultiTexCoord3i",
+   "MultiTexCoord3iARB",
+   "MultiTexCoord3iv",
+   "MultiTexCoord3ivARB",
+   "MultiTexCoord3s",
+   "MultiTexCoord3sARB",
+   "MultiTexCoord3sv",
+   "MultiTexCoord3svARB",
+   "MultiTexCoord4d",
+   "MultiTexCoord4dARB",
+   "MultiTexCoord4dv",
+   "MultiTexCoord4dvARB",
+   "MultiTexCoord4f",
+   "MultiTexCoord4fARB",
+   "MultiTexCoord4fv",
+   "MultiTexCoord4fvARB",
+   "MultiTexCoord4i",
+   "MultiTexCoord4iARB",
+   "MultiTexCoord4iv",
+   "MultiTexCoord4ivARB",
+   "MultiTexCoord4s",
+   "MultiTexCoord4sARB",
+   "MultiTexCoord4sv",
+   "MultiTexCoord4svARB",
+   "MultiTexCoord4x",
+   "MultiTexCoordP1ui",
+   "MultiTexCoordP1uiv",
+   "MultiTexCoordP2ui",
+   "MultiTexCoordP2uiv",
+   "MultiTexCoordP3ui",
+   "MultiTexCoordP3uiv",
+   "MultiTexCoordP4ui",
+   "MultiTexCoordP4uiv",
+   "MultMatrixd",
+   "MultMatrixf",
+   "MultMatrixx",
+   "MultTransposeMatrixd",
+   "MultTransposeMatrixdARB",
+   "MultTransposeMatrixf",
+   "MultTransposeMatrixfARB",
+   "NewList",
+   "Normal3b",
+   "Normal3bv",
+   "Normal3d",
+   "Normal3dv",
+   "Normal3f",
+   "Normal3fv",
+   "Normal3i",
+   "Normal3iv",
+   "Normal3s",
+   "Normal3sv",
+   "Normal3x",
+   "NormalP3ui",
+   "NormalP3uiv",
+   "NormalPointer",
+   "NormalPointerEXT",
+   "ObjectLabel",
+   "ObjectPtrLabel",
+   "Ortho",
+   "Orthof",
+   "Orthox",
+   "PassThrough",
+   "PauseTransformFeedback",
+   "PixelMapfv",
+   "PixelMapuiv",
+   "PixelMapusv",
+   "PixelStoref",
+   "PixelStorei",
+   "PixelTransferf",
+   "PixelTransferi",
+   "PixelZoom",
+   "PointParameterf",
+   "PointParameterfARB",
+   "PointParameterfEXT",
+   "PointParameterfv",
+   "PointParameterfvARB",
+   "PointParameterfvEXT",
+   "PointParameteri",
+   "PointParameteriv",
+   "PointParameterx",
+   "PointParameterxv",
+   "PointSize",
+   "PointSizePointerOES",
+   "PointSizex",
+   "PolygonMode",
+   "PolygonOffset",
+   "PolygonOffsetx",
+   "PolygonStipple",
+   "PopAttrib",
+   "PopClientAttrib",
+   "PopDebugGroup",
+   "PopMatrix",
+   "PopName",
+   "PrimitiveRestartIndex",
+   "PrimitiveRestartIndexNV",
+   "PrimitiveRestartNV",
+   "PrioritizeTextures",
+   "PrioritizeTexturesEXT",
+   "ProgramBinary",
+   "ProgramEnvParameter4dARB",
+   "ProgramEnvParameter4dvARB",
+   "ProgramEnvParameter4fARB",
+   "ProgramEnvParameter4fvARB",
+   "ProgramLocalParameter4dARB",
+   "ProgramLocalParameter4dvARB",
+   "ProgramLocalParameter4fARB",
+   "ProgramLocalParameter4fvARB",
+   "ProgramParameteri",
+   "ProgramParameteriARB",
+   "ProgramStringARB",
+   "ProvokingVertex",
+   "ProvokingVertexEXT",
+   "PushAttrib",
+   "PushClientAttrib",
+   "PushDebugGroup",
+   "PushMatrix",
+   "PushName",
+   "RasterPos2d",
+   "RasterPos2dv",
+   "RasterPos2f",
+   "RasterPos2fv",
+   "RasterPos2i",
+   "RasterPos2iv",
+   "RasterPos2s",
+   "RasterPos2sv",
+   "RasterPos3d",
+   "RasterPos3dv",
+   "RasterPos3f",
+   "RasterPos3fv",
+   "RasterPos3i",
+   "RasterPos3iv",
+   "RasterPos3s",
+   "RasterPos3sv",
+   "RasterPos4d",
+   "RasterPos4dv",
+   "RasterPos4f",
+   "RasterPos4fv",
+   "RasterPos4i",
+   "RasterPos4iv",
+   "RasterPos4s",
+   "RasterPos4sv",
+   "ReadBuffer",
+   "ReadnPixelsARB",
+   "ReadPixels",
+   "Rectd",
+   "Rectdv",
+   "Rectf",
+   "Rectfv",
+   "Recti",
+   "Rectiv",
+   "Rects",
+   "Rectsv",
+   "ReleaseShaderCompiler",
+   "RenderbufferStorage",
+   "RenderbufferStorageEXT",
+   "RenderbufferStorageMultisample",
+   "RenderbufferStorageMultisampleEXT",
+   "RenderMode",
+   "ResetHistogram",
+   "ResetMinmax",
+   "ResumeTransformFeedback",
+   "Rotated",
+   "Rotatef",
+   "Rotatex",
+   "SampleCoverage",
+   "SampleCoverageARB",
+   "SampleCoveragex",
+   "SampleMaski",
+   "SamplerParameterf",
+   "SamplerParameterfv",
+   "SamplerParameteri",
+   "SamplerParameterIiv",
+   "SamplerParameterIuiv",
+   "SamplerParameteriv",
+   "Scaled",
+   "Scalef",
+   "Scalex",
+   "Scissor",
+   "ScissorArrayv",
+   "ScissorIndexed",
+   "ScissorIndexedv",
+   "SecondaryColor3b",
+   "SecondaryColor3bEXT",
+   "SecondaryColor3bv",
+   "SecondaryColor3bvEXT",
+   "SecondaryColor3d",
+   "SecondaryColor3dEXT",
+   "SecondaryColor3dv",
+   "SecondaryColor3dvEXT",
+   "SecondaryColor3f",
+   "SecondaryColor3fEXT",
+   "SecondaryColor3fv",
+   "SecondaryColor3fvEXT",
+   "SecondaryColor3i",
+   "SecondaryColor3iEXT",
+   "SecondaryColor3iv",
+   "SecondaryColor3ivEXT",
+   "SecondaryColor3s",
+   "SecondaryColor3sEXT",
+   "SecondaryColor3sv",
+   "SecondaryColor3svEXT",
+   "SecondaryColor3ub",
+   "SecondaryColor3ubEXT",
+   "SecondaryColor3ubv",
+   "SecondaryColor3ubvEXT",
+   "SecondaryColor3ui",
+   "SecondaryColor3uiEXT",
+   "SecondaryColor3uiv",
+   "SecondaryColor3uivEXT",
+   "SecondaryColor3us",
+   "SecondaryColor3usEXT",
+   "SecondaryColor3usv",
+   "SecondaryColor3usvEXT",
+   "SecondaryColorP3ui",
+   "SecondaryColorP3uiv",
+   "SecondaryColorPointer",
+   "SecondaryColorPointerEXT",
+   "SelectBuffer",
+   "SeparableFilter2D",
+   "ShadeModel",
+   "ShaderBinary",
+   "ShaderSource",
+   "ShaderSourceARB",
+   "StencilFunc",
+   "StencilFuncSeparate",
+   "StencilMask",
+   "StencilMaskSeparate",
+   "StencilOp",
+   "StencilOpSeparate",
+   "TexBuffer",
+   "TexBufferARB",
+   "TexBufferRange",
+   "TexCoord1d",
+   "TexCoord1dv",
+   "TexCoord1f",
+   "TexCoord1fv",
+   "TexCoord1i",
+   "TexCoord1iv",
+   "TexCoord1s",
+   "TexCoord1sv",
+   "TexCoord2d",
+   "TexCoord2dv",
+   "TexCoord2f",
+   "TexCoord2fv",
+   "TexCoord2i",
+   "TexCoord2iv",
+   "TexCoord2s",
+   "TexCoord2sv",
+   "TexCoord3d",
+   "TexCoord3dv",
+   "TexCoord3f",
+   "TexCoord3fv",
+   "TexCoord3i",
+   "TexCoord3iv",
+   "TexCoord3s",
+   "TexCoord3sv",
+   "TexCoord4d",
+   "TexCoord4dv",
+   "TexCoord4f",
+   "TexCoord4fv",
+   "TexCoord4i",
+   "TexCoord4iv",
+   "TexCoord4s",
+   "TexCoord4sv",
+   "TexCoordP1ui",
+   "TexCoordP1uiv",
+   "TexCoordP2ui",
+   "TexCoordP2uiv",
+   "TexCoordP3ui",
+   "TexCoordP3uiv",
+   "TexCoordP4ui",
+   "TexCoordP4uiv",
+   "TexCoordPointer",
+   "TexCoordPointerEXT",
+   "TexEnvf",
+   "TexEnvfv",
+   "TexEnvi",
+   "TexEnviv",
+   "TexEnvx",
+   "TexEnvxv",
+   "TexGend",
+   "TexGendv",
+   "TexGenf",
+   "TexGenfv",
+   "TexGeni",
+   "TexGeniv",
+   "TexImage1D",
+   "TexImage2D",
+   "TexImage2DMultisample",
+   "TexImage3D",
+   "TexImage3DEXT",
+   "TexImage3DMultisample",
+   "TexParameterf",
+   "TexParameterfv",
+   "TexParameteri",
+   "TexParameterIiv",
+   "TexParameterIivEXT",
+   "TexParameterIuiv",
+   "TexParameterIuivEXT",
+   "TexParameteriv",
+   "TexParameterx",
+   "TexParameterxv",
+   "TexStorage1D",
+   "TexStorage2D",
+   "TexStorage2DMultisample",
+   "TexStorage3D",
+   "TexStorage3DMultisample",
+   "TexSubImage1D",
+   "TexSubImage2D",
+   "TexSubImage3D",
+   "TexSubImage3DEXT",
+   "TextureBarrierNV",
+   "TextureStorage1DEXT",
+   "TextureStorage2DEXT",
+   "TextureStorage3DEXT",
+   "TextureView",
+   "TransformFeedbackVaryings",
+   "Translated",
+   "Translatef",
+   "Translatex",
+   "Uniform1f",
+   "Uniform1fARB",
+   "Uniform1fv",
+   "Uniform1fvARB",
+   "Uniform1i",
+   "Uniform1iARB",
+   "Uniform1iv",
+   "Uniform1ivARB",
+   "Uniform1ui",
+   "Uniform1uiEXT",
+   "Uniform1uiv",
+   "Uniform1uivEXT",
+   "Uniform2f",
+   "Uniform2fARB",
+   "Uniform2fv",
+   "Uniform2fvARB",
+   "Uniform2i",
+   "Uniform2iARB",
+   "Uniform2iv",
+   "Uniform2ivARB",
+   "Uniform2ui",
+   "Uniform2uiEXT",
+   "Uniform2uiv",
+   "Uniform2uivEXT",
+   "Uniform3f",
+   "Uniform3fARB",
+   "Uniform3fv",
+   "Uniform3fvARB",
+   "Uniform3i",
+   "Uniform3iARB",
+   "Uniform3iv",
+   "Uniform3ivARB",
+   "Uniform3ui",
+   "Uniform3uiEXT",
+   "Uniform3uiv",
+   "Uniform3uivEXT",
+   "Uniform4f",
+   "Uniform4fARB",
+   "Uniform4fv",
+   "Uniform4fvARB",
+   "Uniform4i",
+   "Uniform4iARB",
+   "Uniform4iv",
+   "Uniform4ivARB",
+   "Uniform4ui",
+   "Uniform4uiEXT",
+   "Uniform4uiv",
+   "Uniform4uivEXT",
+   "UniformBlockBinding",
+   "UniformMatrix2fv",
+   "UniformMatrix2fvARB",
+   "UniformMatrix2x3fv",
+   "UniformMatrix2x4fv",
+   "UniformMatrix3fv",
+   "UniformMatrix3fvARB",
+   "UniformMatrix3x2fv",
+   "UniformMatrix3x4fv",
+   "UniformMatrix4fv",
+   "UniformMatrix4fvARB",
+   "UniformMatrix4x2fv",
+   "UniformMatrix4x3fv",
+   "UnlockArraysEXT",
+   "UnmapBuffer",
+   "UnmapBufferARB",
+   "UseProgram",
+   "UseProgramObjectARB",
+   "ValidateProgram",
+   "ValidateProgramARB",
+   "Vertex2d",
+   "Vertex2dv",
+   "Vertex2f",
+   "Vertex2fv",
+   "Vertex2i",
+   "Vertex2iv",
+   "Vertex2s",
+   "Vertex2sv",
+   "Vertex3d",
+   "Vertex3dv",
+   "Vertex3f",
+   "Vertex3fv",
+   "Vertex3i",
+   "Vertex3iv",
+   "Vertex3s",
+   "Vertex3sv",
+   "Vertex4d",
+   "Vertex4dv",
+   "Vertex4f",
+   "Vertex4fv",
+   "Vertex4i",
+   "Vertex4iv",
+   "Vertex4s",
+   "Vertex4sv",
+   "VertexAttrib1d",
+   "VertexAttrib1dARB",
+   "VertexAttrib1dv",
+   "VertexAttrib1dvARB",
+   "VertexAttrib1f",
+   "VertexAttrib1fARB",
+   "VertexAttrib1fv",
+   "VertexAttrib1fvARB",
+   "VertexAttrib1s",
+   "VertexAttrib1sARB",
+   "VertexAttrib1sv",
+   "VertexAttrib1svARB",
+   "VertexAttrib2d",
+   "VertexAttrib2dARB",
+   "VertexAttrib2dv",
+   "VertexAttrib2dvARB",
+   "VertexAttrib2f",
+   "VertexAttrib2fARB",
+   "VertexAttrib2fv",
+   "VertexAttrib2fvARB",
+   "VertexAttrib2s",
+   "VertexAttrib2sARB",
+   "VertexAttrib2sv",
+   "VertexAttrib2svARB",
+   "VertexAttrib3d",
+   "VertexAttrib3dARB",
+   "VertexAttrib3dv",
+   "VertexAttrib3dvARB",
+   "VertexAttrib3f",
+   "VertexAttrib3fARB",
+   "VertexAttrib3fv",
+   "VertexAttrib3fvARB",
+   "VertexAttrib3s",
+   "VertexAttrib3sARB",
+   "VertexAttrib3sv",
+   "VertexAttrib3svARB",
+   "VertexAttrib4bv",
+   "VertexAttrib4bvARB",
+   "VertexAttrib4d",
+   "VertexAttrib4dARB",
+   "VertexAttrib4dv",
+   "VertexAttrib4dvARB",
+   "VertexAttrib4f",
+   "VertexAttrib4fARB",
+   "VertexAttrib4fv",
+   "VertexAttrib4fvARB",
+   "VertexAttrib4iv",
+   "VertexAttrib4ivARB",
+   "VertexAttrib4Nbv",
+   "VertexAttrib4NbvARB",
+   "VertexAttrib4Niv",
+   "VertexAttrib4NivARB",
+   "VertexAttrib4Nsv",
+   "VertexAttrib4NsvARB",
+   "VertexAttrib4Nub",
+   "VertexAttrib4NubARB",
+   "VertexAttrib4Nubv",
+   "VertexAttrib4NubvARB",
+   "VertexAttrib4Nuiv",
+   "VertexAttrib4NuivARB",
+   "VertexAttrib4Nusv",
+   "VertexAttrib4NusvARB",
+   "VertexAttrib4s",
+   "VertexAttrib4sARB",
+   "VertexAttrib4sv",
+   "VertexAttrib4svARB",
+   "VertexAttrib4ubv",
+   "VertexAttrib4ubvARB",
+   "VertexAttrib4uiv",
+   "VertexAttrib4uivARB",
+   "VertexAttrib4usv",
+   "VertexAttrib4usvARB",
+   "VertexAttribBinding",
+   "VertexAttribDivisor",
+   "VertexAttribDivisorARB",
+   "VertexAttribFormat",
+   "VertexAttribI1i",
+   "VertexAttribI1iEXT",
+   "VertexAttribI1iv",
+   "VertexAttribI1ivEXT",
+   "VertexAttribI1ui",
+   "VertexAttribI1uiEXT",
+   "VertexAttribI1uiv",
+   "VertexAttribI1uivEXT",
+   "VertexAttribI2i",
+   "VertexAttribI2iEXT",
+   "VertexAttribI2iv",
+   "VertexAttribI2ivEXT",
+   "VertexAttribI2ui",
+   "VertexAttribI2uiEXT",
+   "VertexAttribI2uiv",
+   "VertexAttribI2uivEXT",
+   "VertexAttribI3i",
+   "VertexAttribI3iEXT",
+   "VertexAttribI3iv",
+   "VertexAttribI3ivEXT",
+   "VertexAttribI3ui",
+   "VertexAttribI3uiEXT",
+   "VertexAttribI3uiv",
+   "VertexAttribI3uivEXT",
+   "VertexAttribI4bv",
+   "VertexAttribI4bvEXT",
+   "VertexAttribI4i",
+   "VertexAttribI4iEXT",
+   "VertexAttribI4iv",
+   "VertexAttribI4ivEXT",
+   "VertexAttribI4sv",
+   "VertexAttribI4svEXT",
+   "VertexAttribI4ubv",
+   "VertexAttribI4ubvEXT",
+   "VertexAttribI4ui",
+   "VertexAttribI4uiEXT",
+   "VertexAttribI4uiv",
+   "VertexAttribI4uivEXT",
+   "VertexAttribI4usv",
+   "VertexAttribI4usvEXT",
+   "VertexAttribIFormat",
+   "VertexAttribIPointer",
+   "VertexAttribIPointerEXT",
+   "VertexAttribLFormat",
+   "VertexAttribP1ui",
+   "VertexAttribP1uiv",
+   "VertexAttribP2ui",
+   "VertexAttribP2uiv",
+   "VertexAttribP3ui",
+   "VertexAttribP3uiv",
+   "VertexAttribP4ui",
+   "VertexAttribP4uiv",
+   "VertexAttribPointer",
+   "VertexAttribPointerARB",
+   "VertexBindingDivisor",
+   "VertexP2ui",
+   "VertexP2uiv",
+   "VertexP3ui",
+   "VertexP3uiv",
+   "VertexP4ui",
+   "VertexP4uiv",
+   "VertexPointer",
+   "VertexPointerEXT",
+   "Viewport",
+   "ViewportArrayv",
+   "ViewportIndexedf",
+   "ViewportIndexedfv",
+   "WaitSync",
+   "WindowPos2d",
+   "WindowPos2dARB",
+   "WindowPos2dv",
+   "WindowPos2dvARB",
+   "WindowPos2f",
+   "WindowPos2fARB",
+   "WindowPos2fv",
+   "WindowPos2fvARB",
+   "WindowPos2i",
+   "WindowPos2iARB",
+   "WindowPos2iv",
+   "WindowPos2ivARB",
+   "WindowPos2s",
+   "WindowPos2sARB",
+   "WindowPos2sv",
+   "WindowPos2svARB",
+   "WindowPos3d",
+   "WindowPos3dARB",
+   "WindowPos3dv",
+   "WindowPos3dvARB",
+   "WindowPos3f",
+   "WindowPos3fARB",
+   "WindowPos3fv",
+   "WindowPos3fvARB",
+   "WindowPos3i",
+   "WindowPos3iARB",
+   "WindowPos3iv",
+   "WindowPos3ivARB",
+   "WindowPos3s",
+   "WindowPos3sARB",
+   "WindowPos3sv",
+   "WindowPos3svARB",
+]
+
+"""Functions that need dispatch slots but are not used
+
+Some of these functions may have GLX protocol support (for
+indirect-rendering).  Others were used in previous versions of Mesa.  They keep
+slots in the dispatch table so that newer versions of libGL can still be used
+with older drivers."""
+unused_functions = [
+    # SGIS_multisample
+    "SampleMaskSGIS",
+    "SamplePatternSGIS",
+
+    # NV_vertex_program
+    "AreProgramsResidentNV",
+    "ExecuteProgramNV",
+    "GetProgramParameterdvNV",
+    "GetProgramParameterfvNV",
+    "GetProgramivNV",
+    "GetProgramStringNV",
+    "GetTrackMatrixivNV",
+    "GetVertexAttribdvNV",
+    "GetVertexAttribfvNV",
+    "GetVertexAttribivNV",
+    "LoadProgramNV",
+    "ProgramParameters4dvNV",
+    "ProgramParameters4fvNV",
+    "RequestResidentProgramsNV",
+    "TrackMatrixNV",
+    "VertexAttribPointerNV",
+
+    # MESA_resize_buffers
+    "ResizeBuffersMESA",
+
+    # ATI_envmap_bumpmap
+    "TexBumpParameterfvATI",
+    "TexBumpParameterivATI",
+    "GetTexBumpParameterfvATI",
+    "GetTexBumpParameterivATI",
+
+    # NV_fragment_program
+    "ProgramNamedParameter4fNV",
+    "ProgramNamedParameter4dNV",
+    "ProgramNamedParameter4fvNV",
+    "ProgramNamedParameter4dvNV",
+    "GetProgramNamedParameterfvNV",
+    "GetProgramNamedParameterdvNV",
+
+    # APPLE_flush_buffer_range
+    "BufferParameteriAPPLE",
+    "FlushMappedBufferRangeAPPLE",
+
+    # EXT_separate_shader_objects
+    "UseShaderProgramEXT",
+    "ActiveProgramEXT",
+    "CreateShaderProgramEXT",
+]
diff --git a/src/mapi/glapi/glapi_priv.h b/src/mapi/glapi/glapi_priv.h
index 50f710e..337913a 100644
--- a/src/mapi/glapi/glapi_priv.h
+++ b/src/mapi/glapi/glapi_priv.h
@@ -49,6 +49,10 @@ typedef void *GLeglImageOES;
 #include "glapi/glapi.h"
 
 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
 /* getproc */
 
 extern void
@@ -106,4 +110,8 @@ get_entrypoint_address(unsigned int functionOffset);
 #define MAX_EXTENSION_FUNCS 256
 
 
+#ifdef __cplusplus
+}
+#endif
+
 #endif
diff --git a/src/mapi/glapi/tests/check_table.cpp b/src/mapi/glapi/tests/check_table.cpp
index 5d759df..09bf4f3 100644
--- a/src/mapi/glapi/tests/check_table.cpp
+++ b/src/mapi/glapi/tests/check_table.cpp
@@ -1137,7 +1137,6 @@ const struct name_offset known_dispatch[] = {
    { "glDrawElementsInstancedARB", _O(DrawElementsInstancedARB) },
    { "glRenderbufferStorageMultisample", _O(RenderbufferStorageMultisample) },
    { "glFramebufferTexture", _O(FramebufferTexture) },
-   { "glFramebufferTextureFaceARB", _O(FramebufferTextureFaceARB) },
    { "glProgramParameteri", _O(ProgramParameteri) },
    { "glVertexAttribDivisor", _O(VertexAttribDivisor) },
    { "glFlushMappedBufferRange", _O(FlushMappedBufferRange) },
diff --git a/src/mesa/Android.gen.mk b/src/mesa/Android.gen.mk
index cc97954..145f259 100644
--- a/src/mesa/Android.gen.mk
+++ b/src/mesa/Android.gen.mk
@@ -115,9 +115,11 @@ $(intermediates)/main/api_exec.c: $(dispatch_deps)
 
 GET_HASH_GEN := $(LOCAL_PATH)/main/get_hash_generator.py
 
+$(intermediates)/main/get_hash.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(GET_HASH_GEN)
+$(intermediates)/main/get_hash.h: PRIVATE_XML := -f $(glapi)/gl_and_es_API.xml
 $(intermediates)/main/get_hash.h: $(glapi)/gl_and_es_API.xml \
                $(LOCAL_PATH)/main/get_hash_params.py $(GET_HASH_GEN)
-	@$(MESA_PYTHON2) $(GET_HASH_GEN) -f $< > $@
+	$(call es-gen)
 
 FORMAT_INFO := $(LOCAL_PATH)/main/format_info.py
 format_info_deps := \
@@ -125,8 +127,10 @@ format_info_deps := \
 	$(LOCAL_PATH)/main/format_parser.py \
 	$(FORMAT_INFO)
 
+$(intermediates)/main/format_info.h: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_INFO)
+$(intermediates)/main/format_info.h: PRIVATE_XML :=
 $(intermediates)/main/format_info.h: $(format_info_deps)
-	@$(MESA_PYTHON2) $(FORMAT_INFO) $< > $@
+	$(call es-gen, $<)
 
 FORMAT_PACK := $(LOCAL_PATH)/main/format_pack.py
 format_pack_deps := \
@@ -134,8 +138,10 @@ format_pack_deps := \
 	$(LOCAL_PATH)/main/format_parser.py \
 	$(FORMAT_PACK)
 
+$(intermediates)/main/format_pack.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_PACK)
+$(intermediates)/main/format_pack.c: PRIVATE_XML :=
 $(intermediates)/main/format_pack.c: $(format_pack_deps)
-	$(hide) $(MESA_PYTHON2) $(FORMAT_PACK) $< > $@
+	$(call es-gen, $<)
 
 FORMAT_UNPACK := $(LOCAL_PATH)/main/format_unpack.py
 format_unpack_deps := \
@@ -143,5 +149,7 @@ format_unpack_deps := \
 	$(LOCAL_PATH)/main/format_parser.py \
 	$(FORMAT_UNPACK)
 
+$(intermediates)/main/format_unpack.c: PRIVATE_SCRIPT := $(MESA_PYTHON2) $(FORMAT_UNPACK)
+$(intermediates)/main/format_unpack.c: PRIVATE_XML :=
 $(intermediates)/main/format_unpack.c: $(format_unpack_deps)
-	$(hide) $(MESA_PYTHON2) $(FORMAT_UNPACK) $< > $@
+	$(call es-gen, $<)
diff --git a/src/mesa/Android.libmesa_glsl_utils.mk b/src/mesa/Android.libmesa_glsl_utils.mk
index 3497377..ed620ac 100644
--- a/src/mesa/Android.libmesa_glsl_utils.mk
+++ b/src/mesa/Android.libmesa_glsl_utils.mk
@@ -44,7 +44,8 @@ LOCAL_C_INCLUDES := \
 LOCAL_SRC_FILES := \
 	main/imports.c \
 	program/prog_hash_table.c \
-	program/symbol_table.c
+	program/symbol_table.c \
+	program/dummy_errors.c
 
 include $(MESA_COMMON_MK)
 include $(BUILD_STATIC_LIBRARY)
@@ -68,7 +69,8 @@ LOCAL_C_INCLUDES := \
 LOCAL_SRC_FILES := \
 	main/imports.c \
 	program/prog_hash_table.c \
-	program/symbol_table.c
+	program/symbol_table.c \
+	program/dummy_errors.c
 
 include $(MESA_COMMON_MK)
 include $(BUILD_HOST_STATIC_LIBRARY)
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index 60114e4..71794b5 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -60,7 +60,6 @@ main/git_sha1.h: main/git_sha1.h.tmp
 include Makefile.sources
 
 EXTRA_DIST = \
-	drivers/haiku \
 	drivers/SConscript \
 	main/format_info.py \
 	main/format_pack.py \
diff --git a/src/mesa/drivers/SConscript b/src/mesa/drivers/SConscript
index db65678..5d654f5 100644
--- a/src/mesa/drivers/SConscript
+++ b/src/mesa/drivers/SConscript
@@ -8,6 +8,3 @@ if env['dri']:
         'dri/common/xmlpool/SConscript',
         'dri/common/SConscript',
     ])
-
-if env['platform'] == 'haiku':
-    SConscript('haiku/swrast/SConscript')
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 0d094dd..71c1a76 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -172,7 +172,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->UnmapRenderbuffer = _swrast_unmap_soft_renderbuffer;
    driver->RenderTexture = _swrast_render_texture;
    driver->FinishRenderTexture = _swrast_finish_render_texture;
-   driver->FramebufferRenderbuffer = _mesa_framebuffer_renderbuffer;
+   driver->FramebufferRenderbuffer = _mesa_FramebufferRenderbuffer_sw;
    driver->ValidateFramebuffer = _mesa_validate_framebuffer;
 
    driver->BlitFramebuffer = _swrast_BlitFramebuffer;
diff --git a/src/mesa/drivers/common/meta.c b/src/mesa/drivers/common/meta.c
index d2ab7b8..214a68a 100644
--- a/src/mesa/drivers/common/meta.c
+++ b/src/mesa/drivers/common/meta.c
@@ -1211,7 +1211,8 @@ _mesa_meta_end(struct gl_context *ctx)
       _mesa_BindRenderbuffer(GL_RENDERBUFFER, save->RenderbufferName);
 
    if (state & MESA_META_DRAW_BUFFERS) {
-      _mesa_drawbuffers(ctx, ctx->Const.MaxDrawBuffers, save->ColorDrawBuffers, NULL);
+      _mesa_drawbuffers(ctx, ctx->DrawBuffer, ctx->Const.MaxDrawBuffers,
+                        save->ColorDrawBuffers, NULL);
    }
 
    ctx->Meta->SaveStackDepth--;
diff --git a/src/mesa/drivers/common/meta_blit.c b/src/mesa/drivers/common/meta_blit.c
index bb21642..9cace2b 100644
--- a/src/mesa/drivers/common/meta_blit.c
+++ b/src/mesa/drivers/common/meta_blit.c
@@ -82,7 +82,7 @@ setup_glsl_msaa_blit_scaled_shader(struct gl_context *ctx,
    y_scale = samples * 0.5;
 
    /* We expect only power of 2 samples in source multisample buffer. */
-   assert(samples > 0 && (samples & (samples - 1)) == 0);
+   assert(samples > 0 && is_power_of_two(samples));
    while (samples >> (shader_offset + 1)) {
       shader_offset++;
    }
@@ -263,7 +263,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
    }
 
    /* We expect only power of 2 samples in source multisample buffer. */
-   assert(samples > 0 && (samples & (samples - 1)) == 0);
+   assert(samples > 0 && is_power_of_two(samples));
    while (samples >> (shader_offset + 1)) {
       shader_offset++;
    }
@@ -434,7 +434,7 @@ setup_glsl_msaa_blit_shader(struct gl_context *ctx,
           * (so the floating point exponent just gets increased), rather than
           * doing a naive sum and dividing.
           */
-         assert((samples & (samples - 1)) == 0);
+         assert(is_power_of_two(samples));
          /* Fetch each individual sample. */
          sample_resolve = rzalloc_size(mem_ctx, 1);
          for (i = 0; i < samples; i++) {
diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c
index ad6e787..d2474f5 100644
--- a/src/mesa/drivers/common/meta_tex_subimage.c
+++ b/src/mesa/drivers/common/meta_tex_subimage.c
@@ -34,6 +34,7 @@
 #include "macros.h"
 #include "meta.h"
 #include "pbo.h"
+#include "readpix.h"
 #include "shaderapi.h"
 #include "state.h"
 #include "teximage.h"
@@ -150,7 +151,8 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims,
    bool success = false;
    int z;
 
-   if (!_mesa_is_bufferobj(packing->BufferObj) && !create_pbo)
+   if (!_mesa_is_bufferobj(packing->BufferObj) &&
+       (!create_pbo || pixels == NULL))
       return false;
 
    if (format == GL_DEPTH_COMPONENT ||
@@ -257,6 +259,7 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    GLuint pbo = 0, pbo_tex = 0, fbos[2] = { 0, 0 };
    int full_height, image_height;
    struct gl_texture_image *pbo_tex_image;
+   struct gl_renderbuffer *rb = NULL;
    GLenum status;
    bool success = false;
    int z;
@@ -273,6 +276,13 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims,
    if (ctx->_ImageTransferState)
       return false;
 
+
+   if (!tex_image) {
+      rb = ctx->ReadBuffer->_ColorReadBuffer;
+      if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format))
+         return false;
+   }
+
    /* For arrays, use a tall (height * depth) 2D texture but taking into
     * account the inter-image padding specified with the image height packing
     * property.
diff --git a/src/mesa/drivers/dri/Makefile.am b/src/mesa/drivers/dri/Makefile.am
index fa1de10..08a8e64 100644
--- a/src/mesa/drivers/dri/Makefile.am
+++ b/src/mesa/drivers/dri/Makefile.am
@@ -60,6 +60,7 @@ mesa_dri_drivers_la_LIBADD = \
         ../../libmesa.la \
         common/libmegadriver_stub.la \
         common/libdricommon.la \
+        common/libxmlconfig.la \
         $(MEGADRIVERS_DEPS) \
         $(DRI_LIB_DEPS) \
         $()
diff --git a/src/mesa/drivers/dri/common/Android.mk b/src/mesa/drivers/dri/common/Android.mk
index a7fcd6d..6986f5e 100644
--- a/src/mesa/drivers/dri/common/Android.mk
+++ b/src/mesa/drivers/dri/common/Android.mk
@@ -39,7 +39,9 @@ intermediates := $(call local-generated-sources-dir)
 LOCAL_C_INCLUDES := \
     $(MESA_DRI_C_INCLUDES)
 
-LOCAL_EXPORT_C_INCLUDE_DIRS := $(intermediates)
+LOCAL_EXPORT_C_INCLUDE_DIRS := \
+    $(LOCAL_PATH) \
+    $(intermediates)
 
 # swrast only
 ifeq ($(MESA_GPU_DRIVERS),swrast)
@@ -48,7 +50,9 @@ else
 LOCAL_SHARED_LIBRARIES := libdrm
 endif
 
-LOCAL_SRC_FILES := $(DRI_COMMON_FILES)
+LOCAL_SRC_FILES := \
+	$(DRI_COMMON_FILES) \
+	$(XMLCONFIG_FILES)
 
 MESA_DRI_OPTIONS_H := $(intermediates)/xmlpool/options.h
 LOCAL_GENERATED_SOURCES := $(MESA_DRI_OPTIONS_H)
diff --git a/src/mesa/drivers/dri/common/Makefile.am b/src/mesa/drivers/dri/common/Makefile.am
index da8f97a..ae19fcb 100644
--- a/src/mesa/drivers/dri/common/Makefile.am
+++ b/src/mesa/drivers/dri/common/Makefile.am
@@ -33,16 +33,20 @@ AM_CFLAGS = \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	$(DEFINES) \
-	$(EXPAT_CFLAGS) \
 	$(VISIBILITY_CFLAGS)
 
 noinst_LTLIBRARIES = \
 	libdricommon.la \
+	libxmlconfig.la \
 	libmegadriver_stub.la \
 	libdri_test_stubs.la
 
 libdricommon_la_SOURCES = $(DRI_COMMON_FILES)
 
+libxmlconfig_la_SOURCES = $(XMLCONFIG_FILES)
+libxmlconfig_la_CFLAGS = $(AM_CFLAGS) $(EXPAT_CFLAGS)
+libxmlconfig_la_LIBADD = $(EXPAT_LIBS) -lm
+
 libdri_test_stubs_la_SOURCES = $(test_stubs_FILES)
 libdri_test_stubs_la_CFLAGS = $(AM_CFLAGS) -DNO_MAIN
 
diff --git a/src/mesa/drivers/dri/common/Makefile.sources b/src/mesa/drivers/dri/common/Makefile.sources
index d00ec5f..d5d8da8 100644
--- a/src/mesa/drivers/dri/common/Makefile.sources
+++ b/src/mesa/drivers/dri/common/Makefile.sources
@@ -2,7 +2,9 @@ DRI_COMMON_FILES := \
 	utils.c \
 	utils.h \
 	dri_util.c \
-	dri_util.h \
+	dri_util.h
+
+XMLCONFIG_FILES := \
 	xmlconfig.c \
 	xmlconfig.h
 
diff --git a/src/mesa/drivers/dri/common/SConscript b/src/mesa/drivers/dri/common/SConscript
index 0bee1b4..b402736 100644
--- a/src/mesa/drivers/dri/common/SConscript
+++ b/src/mesa/drivers/dri/common/SConscript
@@ -37,7 +37,7 @@ drienv.PkgUseModules('DRM')
 # else
 #env.Append(CPPDEFINES = ['__NOT_HAVE_DRM_H'])
 
-sources = drienv.ParseSourceList('Makefile.sources', 'DRI_COMMON_FILES')
+sources = drienv.ParseSourceList('Makefile.sources', ['DRI_COMMON_FILES', 'XMLCONFIG_FILES' ])
 
 dri_common = drienv.ConvenienceLibrary(
 	target = 'dri_common',
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index d6e875f..e7ababe 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -162,13 +162,21 @@ driCreateNewScreen2(int scrn, int fd,
 	return NULL;
     }
 
-    int gl_version_override = _mesa_get_gl_version_override();
-    if (gl_version_override >= 31) {
-       psp->max_gl_core_version = MAX2(psp->max_gl_core_version,
-                                       gl_version_override);
-    } else {
-       psp->max_gl_compat_version = MAX2(psp->max_gl_compat_version,
-                                         gl_version_override);
+    struct gl_constants consts = { 0 };
+    gl_api api;
+    unsigned version;
+
+    api = API_OPENGLES2;
+    if (_mesa_override_gl_version_contextless(&consts, &api, &version))
+       psp->max_gl_es2_version = version;
+
+    api = API_OPENGL_COMPAT;
+    if (_mesa_override_gl_version_contextless(&consts, &api, &version)) {
+       if (api == API_OPENGL_CORE) {
+          psp->max_gl_core_version = version;
+       } else {
+          psp->max_gl_compat_version = version;
+       }
     }
 
     psp->api_mask = (1 << __DRI_API_OPENGL);
diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 91da977..8ed8ff5 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -730,9 +730,9 @@ i830_update_draw_buffer(struct intel_context *intel)
     */
    if (ctx->NewState & _NEW_BUFFERS) {
       /* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
-      _mesa_update_framebuffer(ctx);
+      _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
-      _mesa_update_draw_buffer_bounds(ctx);
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
    }
 
    if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
diff --git a/src/mesa/drivers/dri/i915/i915_fragprog.c b/src/mesa/drivers/dri/i915/i915_fragprog.c
index 9b00223..03c32e5 100644
--- a/src/mesa/drivers/dri/i915/i915_fragprog.c
+++ b/src/mesa/drivers/dri/i915/i915_fragprog.c
@@ -220,7 +220,7 @@ get_result_flags(const struct prog_instruction *inst)
 {
    GLuint flags = 0;
 
-   if (inst->SaturateMode == SATURATE_ZERO_ONE)
+   if (inst->Saturate)
       flags |= A0_DEST_SATURATE;
    if (inst->DstReg.WriteMask & WRITEMASK_X)
       flags |= A0_DEST_CHANNEL_X;
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index 97bf81e..80bd249 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -732,9 +732,9 @@ i915_update_draw_buffer(struct intel_context *intel)
     */
    if (ctx->NewState & _NEW_BUFFERS) {
       /* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
-      _mesa_update_framebuffer(ctx);
+      _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
-      _mesa_update_draw_buffer_bounds(ctx);
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
    }
 
    if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
diff --git a/src/mesa/drivers/dri/i915/intel_fbo.c b/src/mesa/drivers/dri/i915/intel_fbo.c
index 24c3180..a5d5c58 100644
--- a/src/mesa/drivers/dri/i915/intel_fbo.c
+++ b/src/mesa/drivers/dri/i915/intel_fbo.c
@@ -427,7 +427,7 @@ intel_framebuffer_renderbuffer(struct gl_context * ctx,
 {
    DBG("Intel FramebufferRenderbuffer %u %u\n", fb->Name, rb ? rb->Name : 0);
 
-   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb);
+   _mesa_FramebufferRenderbuffer_sw(ctx, fb, attachment, rb);
    intel_draw_buffer(ctx);
 }
 
diff --git a/src/mesa/drivers/dri/i965/Makefile.am b/src/mesa/drivers/dri/i965/Makefile.am
index cf2424e..9c947be 100644
--- a/src/mesa/drivers/dri/i965/Makefile.am
+++ b/src/mesa/drivers/dri/i965/Makefile.am
@@ -48,6 +48,7 @@ libi965_dri_la_LIBADD = $(INTEL_LIBS)
 TEST_LIBS = \
 	libi965_dri.la \
 	../common/libdricommon.la \
+	../common/libxmlconfig.la \
 	../common/libmegadriver_stub.la \
         ../../../libmesa.la \
 	$(DRI_LIB_DEPS) \
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index 1ae93e1..981fe79 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -18,9 +18,11 @@ i965_FILES = \
 	brw_clip_unfilled.c \
 	brw_clip_util.c \
 	brw_compute.c \
+	brw_conditional_render.c \
 	brw_context.c \
 	brw_context.h \
 	brw_cs.cpp \
+	brw_cs.h \
 	brw_cubemap_normalize.cpp \
 	brw_curbe.c \
 	brw_dead_control_flow.cpp \
@@ -40,6 +42,7 @@ i965_FILES = \
 	brw_ff_gs.c \
 	brw_ff_gs_emit.c \
 	brw_ff_gs.h \
+	brw_fs_builder.h \
 	brw_fs_channel_expressions.cpp \
 	brw_fs_cmod_propagation.cpp \
 	brw_fs_combine_constants.cpp \
@@ -47,7 +50,6 @@ i965_FILES = \
 	brw_fs.cpp \
 	brw_fs_cse.cpp \
 	brw_fs_dead_code_eliminate.cpp \
-	brw_fs_fp.cpp \
 	brw_fs_generator.cpp \
 	brw_fs.h \
 	brw_fs_live_variables.cpp \
@@ -128,6 +130,7 @@ i965_FILES = \
 	brw_vs.h \
 	brw_vs_state.c \
 	brw_vs_surface_state.c \
+	brw_vue_map.c \
 	brw_wm.c \
 	brw_wm.h \
 	brw_wm_iz.cpp \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
index c1b7609..789520c 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit_eu.cpp
@@ -29,7 +29,8 @@
 brw_blorp_eu_emitter::brw_blorp_eu_emitter(struct brw_context *brw,
                                            bool debug_flag)
    : mem_ctx(ralloc_context(NULL)),
-     generator(brw, mem_ctx, (void *) rzalloc(mem_ctx, struct brw_wm_prog_key),
+     generator(brw->intelScreen->compiler, brw,
+               mem_ctx, (void *) rzalloc(mem_ctx, struct brw_wm_prog_key),
                (struct brw_stage_prog_data *) rzalloc(mem_ctx, struct brw_wm_prog_data),
                NULL, 0, false, "BLORP")
 {
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.cpp b/src/mesa/drivers/dri/i965/brw_cfg.cpp
index 7e7770e..f1f230e 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cfg.cpp
@@ -141,12 +141,12 @@ bblock_t::combine_with(bblock_t *that)
 }
 
 void
-bblock_t::dump(backend_visitor *v) const
+bblock_t::dump(backend_shader *s) const
 {
    int ip = this->start_ip;
    foreach_inst_in_block(backend_instruction, inst, this) {
       fprintf(stderr, "%5d: ", ip);
-      v->dump_instruction(inst);
+      s->dump_instruction(inst);
       ip++;
    }
 }
@@ -231,6 +231,7 @@ cfg_t::cfg_t(exec_list *instructions)
          if (cur_else) {
             cur_else->add_successor(mem_ctx, cur_endif);
          } else {
+            assert(cur_if != NULL);
             cur_if->add_successor(mem_ctx, cur_endif);
          }
 
@@ -299,6 +300,7 @@ cfg_t::cfg_t(exec_list *instructions)
          inst->exec_node::remove();
          cur->instructions.push_tail(inst);
 
+         assert(cur_do != NULL && cur_while != NULL);
 	 cur->add_successor(mem_ctx, cur_do);
 	 set_next_block(&cur, cur_while, ip);
 
@@ -411,7 +413,7 @@ cfg_t::make_block_array()
 }
 
 void
-cfg_t::dump(backend_visitor *v)
+cfg_t::dump(backend_shader *s)
 {
    if (idom_dirty)
       calculate_idom();
@@ -423,8 +425,8 @@ cfg_t::dump(backend_visitor *v)
                  link->block->num);
       }
       fprintf(stderr, "\n");
-      if (v != NULL)
-         block->dump(v);
+      if (s != NULL)
+         block->dump(s);
       fprintf(stderr, "END B%d", block->num);
       foreach_list_typed(bblock_link, link, link, &block->children) {
          fprintf(stderr, " ->B%d",
diff --git a/src/mesa/drivers/dri/i965/brw_cfg.h b/src/mesa/drivers/dri/i965/brw_cfg.h
index 56d7d07..a094917 100644
--- a/src/mesa/drivers/dri/i965/brw_cfg.h
+++ b/src/mesa/drivers/dri/i965/brw_cfg.h
@@ -60,7 +60,7 @@ struct bblock_t {
    bool is_successor_of(const bblock_t *block) const;
    bool can_combine_with(const bblock_t *that) const;
    void combine_with(bblock_t *that);
-   void dump(backend_visitor *v) const;
+   void dump(backend_shader *s) const;
 
    backend_instruction *start();
    const backend_instruction *start() const;
@@ -273,7 +273,7 @@ struct cfg_t {
    void calculate_idom();
    static bblock_t *intersect(bblock_t *b1, bblock_t *b2);
 
-   void dump(backend_visitor *v);
+   void dump(backend_shader *s);
    void dump_cfg();
    void dump_domtree();
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_clear.c b/src/mesa/drivers/dri/i965/brw_clear.c
index 1231420..1d4ba3c 100644
--- a/src/mesa/drivers/dri/i965/brw_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_clear.c
@@ -121,8 +121,9 @@ brw_fast_clear_depth(struct gl_context *ctx)
     * first.
     */
    if ((ctx->Scissor.EnableFlags & 1) && !noop_scissor(ctx, fb)) {
-      perf_debug("Failed to fast clear depth due to scissor being enabled.  "
-                 "Possible 5%% performance win if avoided.\n");
+      perf_debug("Failed to fast clear %dx%d depth because of scissors.  "
+                 "Possible 5%% performance win if avoided.\n",
+                 mt->logical_width0, mt->logical_height0);
       return false;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 3223834..dee74db 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -32,6 +32,7 @@
 #include "brw_context.h"
 #include "brw_state.h"
 #include "brw_defines.h"
+#include "main/framebuffer.h"
 
 static void
 upload_clip_vp(struct brw_context *brw)
@@ -59,7 +60,9 @@ brw_upload_clip_unit(struct brw_context *brw)
    struct brw_clip_unit_state *clip;
 
    /* _NEW_BUFFERS */
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const float fb_width = (float)_mesa_geometric_width(fb);
+   const float fb_height = (float)_mesa_geometric_height(fb);
 
    upload_clip_vp(brw);
 
@@ -127,8 +130,8 @@ brw_upload_clip_unit(struct brw_context *brw)
    /* enable guardband clipping if we can */
    if (ctx->ViewportArray[0].X == 0 &&
        ctx->ViewportArray[0].Y == 0 &&
-       ctx->ViewportArray[0].Width == (float) fb->Width &&
-       ctx->ViewportArray[0].Height == (float) fb->Height)
+       ctx->ViewportArray[0].Width == fb_width &&
+       ctx->ViewportArray[0].Height == fb_height)
    {
       clip->clip5.guard_band_enable = 1;
       clip->clip6.clipper_viewport_state_ptr =
diff --git a/src/mesa/drivers/dri/i965/brw_compute.c b/src/mesa/drivers/dri/i965/brw_compute.c
index b3d6de5..5693ab5 100644
--- a/src/mesa/drivers/dri/i965/brw_compute.c
+++ b/src/mesa/drivers/dri/i965/brw_compute.c
@@ -45,7 +45,7 @@ brw_emit_gpgpu_walker(struct brw_context *brw, const GLuint *num_groups)
    unsigned thread_width_max =
       (group_size + simd_size - 1) / simd_size;
 
-   uint32_t right_mask = (1u << simd_size) - 1;
+   uint32_t right_mask = 0xffffffffu >> (32 - simd_size);
    const unsigned right_non_aligned = group_size & (simd_size - 1);
    if (right_non_aligned != 0)
       right_mask >>= (simd_size - right_non_aligned);
diff --git a/src/mesa/drivers/dri/i965/brw_conditional_render.c b/src/mesa/drivers/dri/i965/brw_conditional_render.c
new file mode 100644
index 0000000..6d37c3b
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_conditional_render.c
@@ -0,0 +1,161 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Neil Roberts <neil@linux.intel.com>
+ */
+
+/** @file brw_conditional_render.c
+ *
+ * Support for conditional rendering based on query objects
+ * (GL_NV_conditional_render, GL_ARB_conditional_render_inverted) on Gen7+.
+ */
+
+#include "main/imports.h"
+#include "main/condrender.h"
+
+#include "brw_context.h"
+#include "brw_defines.h"
+#include "intel_batchbuffer.h"
+
+/* Record a draw/skip decision that was made on the CPU, without the GPU. */
+static void
+set_predicate_enable(struct brw_context *brw, bool value)
+{
+   /* true selects plain rendering; false marks draws as skippable. */
+   brw->predicate.state = value
+      ? BRW_PREDICATE_STATE_RENDER
+      : BRW_PREDICATE_STATE_DONT_RENDER;
+}
+
+/* Let the GPU decide whether to draw: load the values at offsets 0 and 8
+ * of the query's buffer object into the MI_PREDICATE source registers,
+ * emit an MI_PREDICATE that compares them, and switch the context to
+ * BRW_PREDICATE_STATE_USE_BIT.
+ */
+static void
+set_predicate_for_result(struct brw_context *brw,
+                         struct brw_query_object *query,
+                         bool inverted)
+{
+   /* NOTE(review): the sources are compared for equality, so presumably
+    * LOADINV is what makes the non-inverted modes draw on a difference.
+    */
+   const int load_op = inverted ? MI_PREDICATE_LOADOP_LOAD
+                                : MI_PREDICATE_LOADOP_LOADINV;
+   const uint32_t read_domains = I915_GEM_DOMAIN_INSTRUCTION;
+   const uint32_t write_domain = 0;
+
+   assert(query->bo != NULL);
+
+   /* SRC0 is loaded from offset 0 of the buffer, SRC1 from offset 8. */
+   brw_load_register_mem64(brw, MI_PREDICATE_SRC0, query->bo,
+                           read_domains, write_domain, 0);
+   brw_load_register_mem64(brw, MI_PREDICATE_SRC1, query->bo,
+                           read_domains, write_domain, 8);
+
+   BEGIN_BATCH(1);
+   OUT_BATCH(GEN7_MI_PREDICATE |
+             load_op |
+             MI_PREDICATE_COMBINEOP_SET |
+             MI_PREDICATE_COMPAREOP_SRCS_EQUAL);
+   ADVANCE_BATCH();
+
+   /* Later draws have to honour the predicate enable bit. */
+   brw->predicate.state = BRW_PREDICATE_STATE_USE_BIT;
+}
+
+/* BeginConditionalRender hook: choose how draws are predicated on query q. */
+static void
+brw_begin_conditional_render(struct gl_context *ctx,
+                             struct gl_query_object *q,
+                             GLenum mode)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *) q;
+   bool inverted;
+
+   if (!brw->predicate.supported)
+      return;
+
+   switch (mode) {
+   case GL_QUERY_WAIT_INVERTED:
+   case GL_QUERY_NO_WAIT_INVERTED:
+   case GL_QUERY_BY_REGION_WAIT_INVERTED:
+   case GL_QUERY_BY_REGION_NO_WAIT_INVERTED:
+      inverted = true;
+      break;
+   case GL_QUERY_WAIT:
+   case GL_QUERY_NO_WAIT:
+   case GL_QUERY_BY_REGION_WAIT:
+   case GL_QUERY_BY_REGION_NO_WAIT:
+      inverted = false;
+      break;
+   default:
+      unreachable("Unexpected conditional render mode");
+   }
+
+   if (!query->Base.Result && !query->Base.Ready) {
+      /* Nothing is known on the CPU yet: leave the decision to the GPU. */
+      set_predicate_for_result(brw, query, inverted);
+   } else {
+      /* Samples already counted, or the query is ready: decide on the CPU. */
+      set_predicate_enable(brw, (query->Base.Result != 0) ^ inverted);
+   }
+}
+
+/* EndConditionalRender hook: puts the predicate state back to RENDER. */
+static void
+brw_end_conditional_render(struct gl_context *ctx,
+                           struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+
+   /* With no conditional render in progress, drawing is unconditional.
+    * The query object q is not consulted here. */
+   brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
+}
+
+void
+brw_init_conditional_render_functions(struct dd_function_table *functions)
+{
+   functions->EndConditionalRender = brw_end_conditional_render;
+   functions->BeginConditionalRender = brw_begin_conditional_render;
+}
+
+/* CPU-side check: a false result means the primitives should be skipped. */
+bool
+brw_check_conditional_render(struct brw_context *brw)
+{
+   /* With predicate support, only a state already known to be DONT_RENDER
+    * skips here; USE_BIT is left for the hardware to resolve. */
+   if (brw->predicate.supported)
+      return brw->predicate.state != BRW_PREDICATE_STATE_DONT_RENDER;
+
+   /* No conditional render query is set on the context: always draw. */
+   if (!brw->ctx.Query.CondRenderQuery)
+      return true;
+
+   perf_debug("Conditional rendering is implemented in software and may "
+              "stall.\n");
+   return _mesa_check_conditional_render(&brw->ctx);
+}
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 2383805..ebf12fa 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -50,6 +50,7 @@
 
 #include "brw_context.h"
 #include "brw_defines.h"
+#include "brw_shader.h"
 #include "brw_draw.h"
 #include "brw_state.h"
 
@@ -68,8 +69,6 @@
 #include "tnl/t_pipeline.h"
 #include "util/ralloc.h"
 
-#include "glsl/nir/nir.h"
-
 /***************************************
  * Mesa's Driver Functions
  ***************************************/
@@ -289,6 +288,8 @@ brw_init_driver_functions(struct brw_context *brw,
    else
       gen4_init_queryobj_functions(functions);
    brw_init_compute_functions(functions);
+   if (brw->gen >= 7)
+      brw_init_conditional_render_functions(functions);
 
    functions->QuerySamplesForFormat = brw_query_samples_for_format;
 
@@ -427,11 +428,7 @@ brw_initialize_context_constants(struct brw_context *brw)
 
    ctx->Const.MinLineWidth = 1.0;
    ctx->Const.MinLineWidthAA = 1.0;
-   if (brw->gen >= 9 || brw->is_cherryview) {
-      ctx->Const.MaxLineWidth = 40.0;
-      ctx->Const.MaxLineWidthAA = 40.0;
-      ctx->Const.LineWidthGranularity = 0.125;
-   } else if (brw->gen >= 6) {
+   if (brw->gen >= 6) {
       ctx->Const.MaxLineWidth = 7.375;
       ctx->Const.MaxLineWidthAA = 7.375;
       ctx->Const.LineWidthGranularity = 0.125;
@@ -441,6 +438,13 @@ brw_initialize_context_constants(struct brw_context *brw)
       ctx->Const.LineWidthGranularity = 0.5;
    }
 
+   /* For non-antialiased lines, we have to round the line width to the
+    * nearest whole number. Make sure that we don't advertise a line
+    * width that, when rounded, will be beyond the actual hardware
+    * maximum.
+    */
+   assert(roundf(ctx->Const.MaxLineWidth) <= ctx->Const.MaxLineWidth);
+
    ctx->Const.MinPointSize = 1.0;
    ctx->Const.MinPointSizeAA = 1.0;
    ctx->Const.MaxPointSize = 255.0;
@@ -544,6 +548,7 @@ brw_initialize_context_constants(struct brw_context *brw)
     */
    ctx->Const.UniformBufferOffsetAlignment = 16;
    ctx->Const.TextureBufferOffsetAlignment = 16;
+   ctx->Const.MaxTextureBufferSize = 128 * 1024 * 1024;
 
    if (brw->gen >= 6) {
       ctx->Const.MaxVarying = 32;
@@ -553,51 +558,12 @@ brw_initialize_context_constants(struct brw_context *brw)
       ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxInputComponents = 128;
    }
 
-   static const nir_shader_compiler_options nir_options = {
-      .native_integers = true,
-      /* In order to help allow for better CSE at the NIR level we tell NIR
-       * to split all ffma instructions during opt_algebraic and we then
-       * re-combine them as a later step.
-       */
-      .lower_ffma = true,
-      .lower_sub = true,
-   };
-
    /* We want the GLSL compiler to emit code that uses condition codes */
    for (int i = 0; i < MESA_SHADER_STAGES; i++) {
-      ctx->Const.ShaderCompilerOptions[i].MaxIfDepth = brw->gen < 6 ? 16 : UINT_MAX;
-      ctx->Const.ShaderCompilerOptions[i].EmitCondCodes = true;
-      ctx->Const.ShaderCompilerOptions[i].EmitNoNoise = true;
-      ctx->Const.ShaderCompilerOptions[i].EmitNoMainReturn = true;
-      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectInput = true;
-      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectOutput =
-	 (i == MESA_SHADER_FRAGMENT);
-      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectTemp =
-	 (i == MESA_SHADER_FRAGMENT);
-      ctx->Const.ShaderCompilerOptions[i].EmitNoIndirectUniform = false;
-      ctx->Const.ShaderCompilerOptions[i].LowerClipDistance = true;
+      ctx->Const.ShaderCompilerOptions[i] =
+         brw->intelScreen->compiler->glsl_compiler_options[i];
    }
 
-   ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = true;
-   ctx->Const.ShaderCompilerOptions[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
-
-   if (brw->scalar_vs) {
-      /* If we're using the scalar backend for vertex shaders, we need to
-       * configure these accordingly.
-       */
-      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
-      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
-      ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].OptimizeForAOS = false;
-
-      if (brw_env_var_as_boolean("INTEL_USE_NIR", true))
-         ctx->Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions = &nir_options;
-   }
-
-   if (brw_env_var_as_boolean("INTEL_USE_NIR", true))
-      ctx->Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions = &nir_options;
-
-   ctx->Const.ShaderCompilerOptions[MESA_SHADER_COMPUTE].NirOptions = &nir_options;
-
    /* ARB_viewport_array */
    if (brw->gen >= 6 && ctx->API == API_OPENGL_CORE) {
       ctx->Const.MaxViewports = GEN6_NUM_VIEWPORTS;
@@ -612,6 +578,12 @@ brw_initialize_context_constants(struct brw_context *brw)
    /* ARB_gpu_shader5 */
    if (brw->gen >= 7)
       ctx->Const.MaxVertexStreams = MIN2(4, MAX_VERTEX_STREAMS);
+
+   /* ARB_framebuffer_no_attachments */
+   ctx->Const.MaxFramebufferWidth = ctx->Const.MaxViewportWidth;
+   ctx->Const.MaxFramebufferHeight = ctx->Const.MaxViewportHeight;
+   ctx->Const.MaxFramebufferLayers = ctx->Const.MaxArrayTextureLayers;
+   ctx->Const.MaxFramebufferSamples = max_samples;
 }
 
 static void
@@ -814,10 +786,9 @@ brwCreateContext(gl_api api,
    _mesa_meta_init(ctx);
 
    brw_process_driconf_options(brw);
-   brw_process_intel_debug_variable(brw);
 
-   if (brw->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
-      brw->scalar_vs = true;
+   if (INTEL_DEBUG & DEBUG_PERF)
+      brw->perf_debug = true;
 
    brw_initialize_context_constants(brw);
 
@@ -894,6 +865,8 @@ brwCreateContext(gl_api api,
    brw->gs.enabled = false;
    brw->sf.viewport_transform_enable = true;
 
+   brw->predicate.state = BRW_PREDICATE_STATE_RENDER;
+
    ctx->VertexProgram._MaintainTnlProgram = true;
    ctx->FragmentProgram._MaintainTexEnvProgram = true;
 
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index cb4cc7f..9e1f722 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -611,6 +611,12 @@ struct brw_ff_gs_prog_data {
    unsigned svbi_postincrement_value;
 };
 
+enum shader_dispatch_mode {   /* type of brw_vue_prog_data::dispatch_mode */
+   DISPATCH_MODE_4X1_SINGLE = 0,
+   DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
+   DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
+   DISPATCH_MODE_SIMD8 = 3,
+};
 
 /* Note: brw_vue_prog_data_compare() must be updated when adding fields to
  * this struct!
@@ -628,7 +634,7 @@ struct brw_vue_prog_data {
     */
    GLuint urb_entry_size;
 
-   bool simd8;
+   enum shader_dispatch_mode dispatch_mode;
 };
 
 
@@ -726,14 +732,6 @@ struct brw_gs_prog_data
    int invocations;
 
    /**
-    * Dispatch mode, can be any of:
-    * GEN7_GS_DISPATCH_MODE_DUAL_OBJECT
-    * GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE
-    * GEN7_GS_DISPATCH_MODE_SINGLE
-    */
-   int dispatch_mode;
-
-   /**
     * Gen6 transform feedback enabled flag.
     */
    bool gen6_xfb_enabled;
@@ -829,20 +827,10 @@ struct brw_tracked_state {
 enum shader_time_shader_type {
    ST_NONE,
    ST_VS,
-   ST_VS_WRITTEN,
-   ST_VS_RESET,
    ST_GS,
-   ST_GS_WRITTEN,
-   ST_GS_RESET,
    ST_FS8,
-   ST_FS8_WRITTEN,
-   ST_FS8_RESET,
    ST_FS16,
-   ST_FS16_WRITTEN,
-   ST_FS16_RESET,
    ST_CS,
-   ST_CS_WRITTEN,
-   ST_CS_RESET,
 };
 
 struct brw_vertex_buffer {
@@ -972,6 +960,22 @@ struct brw_stage_state
    uint32_t sampler_offset;
 };
 
+enum brw_predicate_state {
+   /* The first two states are used if we can determine whether to draw
+    * without having to look at the values in the query object buffer. This
+    * will happen if there is no conditional render in progress, if the query
+    * object is already completed or if something else has already added
+    * samples to the preliminary result such as via a BLT command.
+    */
+   BRW_PREDICATE_STATE_RENDER,       /* set at context creation and on end */
+   BRW_PREDICATE_STATE_DONT_RENDER,  /* makes the CPU draw check fail */
+   /* In this case whether to draw or not depends on the result of an
+    * MI_PREDICATE command so the predicate enable bit needs to be checked.
+    */
+   BRW_PREDICATE_STATE_USE_BIT       /* set once MI_PREDICATE is emitted */
+};
+
+struct shader_times;
 
 /**
  * brw_context is derived from gl_context.
@@ -1131,7 +1135,6 @@ struct brw_context
    bool has_pln;
    bool no_simd8;
    bool use_rep_send;
-   bool scalar_vs;
 
    /**
     * Some versions of Gen hardware don't do centroid interpolation correctly
@@ -1408,6 +1411,11 @@ struct brw_context
    } query;
 
    struct {
+      enum brw_predicate_state state;
+      bool supported;
+   } predicate;
+
+   struct {
       /** A map from pipeline statistics counter IDs to MMIO addresses. */
       const int *statistics_registers;
 
@@ -1453,6 +1461,7 @@ struct brw_context
       uint32_t offset;
       uint32_t size;
       enum aub_state_struct_type type;
+      int index;
    } *state_batch_list;
    int state_batch_count;
 
@@ -1492,7 +1501,7 @@ struct brw_context
       const char **names;
       int *ids;
       enum shader_time_shader_type *types;
-      uint64_t *cumulative;
+      struct shader_times *cumulative;
       int num_entries;
       int max_entries;
       double report_time;
@@ -1606,12 +1615,21 @@ void brw_write_depth_count(struct brw_context *brw, drm_intel_bo *bo, int idx);
 void brw_store_register_mem64(struct brw_context *brw,
                               drm_intel_bo *bo, uint32_t reg, int idx);
 
+/** brw_conditional_render.c */
+void brw_init_conditional_render_functions(struct dd_function_table *functions);
+bool brw_check_conditional_render(struct brw_context *brw);
+
 /** intel_batchbuffer.c */
 void brw_load_register_mem(struct brw_context *brw,
                            uint32_t reg,
                            drm_intel_bo *bo,
                            uint32_t read_domains, uint32_t write_domain,
                            uint32_t offset);
+void brw_load_register_mem64(struct brw_context *brw,
+                             uint32_t reg,
+                             drm_intel_bo *bo,
+                             uint32_t read_domains, uint32_t write_domain,
+                             uint32_t offset);
 
 /*======================================================================
  * brw_state_dump.c
@@ -1991,6 +2009,10 @@ void intel_context_destroy(struct brw_context *brw);
 void
 brw_initialize_context_constants(struct brw_context *brw);
 
+bool
+gen9_use_linear_1d_layout(const struct brw_context *brw,
+                          const struct intel_mipmap_tree *mt);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_cs.cpp b/src/mesa/drivers/dri/i965/brw_cs.cpp
index 2432875..42a082b 100644
--- a/src/mesa/drivers/dri/i965/brw_cs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_cs.cpp
@@ -88,9 +88,15 @@ brw_cs_emit(struct brw_context *brw,
    cfg_t *cfg = NULL;
    const char *fail_msg = NULL;
 
+   int st_index = -1;
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      st_index = brw_get_shader_time_index(brw, prog, &cp->Base, ST_CS);
+
    /* Now the main event: Visit the shader IR and generate our CS IR for it.
     */
-   fs_visitor v8(brw, mem_ctx, key, prog_data, prog, cp, 8);
+   fs_visitor v8(brw->intelScreen->compiler, brw,
+                 mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
+                 &cp->Base, 8, st_index);
    if (!v8.run_cs()) {
       fail_msg = v8.fail_msg;
    } else if (local_workgroup_size <= 8 * brw->max_cs_threads) {
@@ -98,7 +104,9 @@ brw_cs_emit(struct brw_context *brw,
       prog_data->simd_size = 8;
    }
 
-   fs_visitor v16(brw, mem_ctx, key, prog_data, prog, cp, 16);
+   fs_visitor v16(brw->intelScreen->compiler, brw,
+                  mem_ctx, MESA_SHADER_COMPUTE, key, &prog_data->base, prog,
+                  &cp->Base, 16, st_index);
    if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
        !fail_msg && !v8.simd16_unsupported &&
        local_workgroup_size <= 16 * brw->max_cs_threads) {
@@ -126,7 +134,8 @@ brw_cs_emit(struct brw_context *brw,
       return NULL;
    }
 
-   fs_generator g(brw, mem_ctx, (void*) key, &prog_data->base, &cp->Base,
+   fs_generator g(brw->intelScreen->compiler, brw,
+                  mem_ctx, (void*) key, &prog_data->base, &cp->Base,
                   v8.promoted_constants, v8.runtime_check_aads_emit, "CS");
    if (INTEL_DEBUG & DEBUG_CS) {
       char *name = ralloc_asprintf(mem_ctx, "%s compute shader %d",
@@ -368,9 +377,11 @@ brw_upload_cs_state(struct brw_context *brw)
 
 extern "C"
 const struct brw_tracked_state brw_cs_state = {
-   .dirty = {
-      .mesa  = 0,
-      .brw   = BRW_NEW_CS_PROG_DATA,
+   /* Designated initialisers aren't valid C++, so the field names are
+    * kept only as comments, for documentation purposes. */
+   /* .dirty = */{
+      /* .mesa = */ 0,
+      /* .brw = */  BRW_NEW_CS_PROG_DATA,
    },
-   .emit = brw_upload_cs_state
+   /* .emit = */ brw_upload_cs_state
 };
diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
index 03f838d..61f2581 100644
--- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
+++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.cpp
@@ -36,11 +36,11 @@
  *   - if/else/endif
  */
 bool
-dead_control_flow_eliminate(backend_visitor *v)
+dead_control_flow_eliminate(backend_shader *s)
 {
    bool progress = false;
 
-   foreach_block_safe (block, v->cfg) {
+   foreach_block_safe (block, s->cfg) {
       bblock_t *if_block = NULL, *else_block = NULL, *endif_block = block;
       bool found = false;
 
@@ -115,7 +115,7 @@ dead_control_flow_eliminate(backend_visitor *v)
    }
 
    if (progress)
-      v->invalidate_live_intervals();
+      s->invalidate_live_intervals();
 
    return progress;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
index 57a4dab..83fd9b1 100644
--- a/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
+++ b/src/mesa/drivers/dri/i965/brw_dead_control_flow.h
@@ -23,4 +23,4 @@
 
 #include "brw_shader.h"
 
-bool dead_control_flow_eliminate(backend_visitor *v);
+bool dead_control_flow_eliminate(backend_shader *s);
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 3c704ee..c113d52 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -38,6 +38,7 @@
       fieldval & field ## _MASK;                                        \
    })
 
+#define GET_BITS(data, high, low) (((data) & INTEL_MASK((high), (low))) >> (low))
 #define GET_FIELD(word, field) (((word)  & field ## _MASK) >> field ## _SHIFT)
 
 #ifndef BRW_DEFINES_H
@@ -51,6 +52,7 @@
 # define GEN4_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL (0 << 15)
 # define GEN4_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     (1 << 15)
 # define GEN7_3DPRIM_INDIRECT_PARAMETER_ENABLE      (1 << 10)
+# define GEN7_3DPRIM_PREDICATE_ENABLE               (1 << 8)
 /* DW1 */
 # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_SEQUENTIAL (0 << 8)
 # define GEN7_3DPRIM_VERTEXBUFFER_ACCESS_RANDOM     (1 << 8)
@@ -530,9 +532,11 @@
 #define GEN7_SURFACE_ARYSPC_FULL	(0 << 10)
 #define GEN7_SURFACE_ARYSPC_LOD0	(1 << 10)
 
-/* Surface state DW0 */
+/* Surface state DW1 */
 #define GEN8_SURFACE_MOCS_SHIFT         24
 #define GEN8_SURFACE_MOCS_MASK          INTEL_MASK(30, 24)
+#define GEN8_SURFACE_QPITCH_SHIFT       0
+#define GEN8_SURFACE_QPITCH_MASK        INTEL_MASK(14, 0)
 
 /* Surface state DW2 */
 #define BRW_SURFACE_HEIGHT_SHIFT	19
@@ -590,6 +594,15 @@
 #define GEN7_SURFACE_MOCS_SHIFT                 16
 #define GEN7_SURFACE_MOCS_MASK                  INTEL_MASK(19, 16)
 
+#define GEN9_SURFACE_TRMODE_SHIFT          18
+#define GEN9_SURFACE_TRMODE_MASK           INTEL_MASK(19, 18)
+#define GEN9_SURFACE_TRMODE_NONE           0
+#define GEN9_SURFACE_TRMODE_TILEYF         1
+#define GEN9_SURFACE_TRMODE_TILEYS         2
+
+#define GEN9_SURFACE_MIP_TAIL_START_LOD_SHIFT      8
+#define GEN9_SURFACE_MIP_TAIL_START_LOD_MASK       INTEL_MASK(11, 8)
+
 /* Surface state DW6 */
 #define GEN7_SURFACE_MCS_ENABLE                 (1 << 0)
 #define GEN7_SURFACE_MCS_PITCH_SHIFT            3
@@ -606,6 +619,8 @@
 #define GEN8_SURFACE_AUX_MODE_HIZ               3
 
 /* Surface state DW7 */
+#define GEN9_SURFACE_RT_COMPRESSION_SHIFT       30
+#define GEN9_SURFACE_RT_COMPRESSION_MASK        INTEL_MASK(30, 30)
 #define GEN7_SURFACE_CLEAR_COLOR_SHIFT		28
 #define GEN7_SURFACE_SCS_R_SHIFT                25
 #define GEN7_SURFACE_SCS_R_MASK                 INTEL_MASK(27, 25)
@@ -1131,6 +1146,11 @@ enum opcode {
     * Terminate the compute shader.
     */
    CS_OPCODE_CS_TERMINATE,
+
+   /**
+    * GLSL barrier()
+    */
+   SHADER_OPCODE_BARRIER,
 };
 
 enum brw_urb_write_flags {
@@ -1592,6 +1612,14 @@ enum brw_message_target {
 #define BRW_SCRATCH_SPACE_SIZE_1M     10
 #define BRW_SCRATCH_SPACE_SIZE_2M     11
 
+#define BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY         0
+#define BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY        1
+#define BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG          2
+#define BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP        3
+#define BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG          4
+#define BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE 5
+#define BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE      6
+
 
 #define CMD_URB_FENCE                 0x6000
 #define CMD_CS_URB_STATE              0x6001
@@ -1769,9 +1797,8 @@ enum brw_message_target {
 # define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID		1
 # define GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT		20
 # define GEN7_GS_INSTANCE_CONTROL_SHIFT			15
-# define GEN7_GS_DISPATCH_MODE_SINGLE			(0 << 11)
-# define GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE		(1 << 11)
-# define GEN7_GS_DISPATCH_MODE_DUAL_OBJECT		(2 << 11)
+# define GEN7_GS_DISPATCH_MODE_SHIFT                    11
+# define GEN7_GS_DISPATCH_MODE_MASK                     INTEL_MASK(12, 11)
 # define GEN6_GS_STATISTICS_ENABLE			(1 << 10)
 # define GEN6_GS_SO_STATISTICS_ENABLE			(1 << 9)
 # define GEN6_GS_RENDERING_ENABLE			(1 << 8)
@@ -2470,8 +2497,8 @@ enum brw_wm_barycentric_interp_mode {
  * cache settings.  We still use only either write-back or write-through; and
  * rely on the documented default values.
  */
-#define SKL_MOCS_WB 9
-#define SKL_MOCS_WT 5
+#define SKL_MOCS_WB (0b001001 << 1)
+#define SKL_MOCS_WT (0b000101 << 1)
 
 #define MEDIA_VFE_STATE                         0x7000
 /* GEN7 DW2, GEN8+ DW3 */
diff --git a/src/mesa/drivers/dri/i965/brw_disasm.c b/src/mesa/drivers/dri/i965/brw_disasm.c
index 95e262a..1075c5a 100644
--- a/src/mesa/drivers/dri/i965/brw_disasm.c
+++ b/src/mesa/drivers/dri/i965/brw_disasm.c
@@ -402,6 +402,16 @@ static const char *const gen6_sfid[16] = {
    [HSW_SFID_CRE]                      = "cre",
 };
 
+static const char *const gen7_gateway_subfuncid[8] = {
+   [BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY] = "open",
+   [BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY] = "close",
+   [BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG] = "forward msg",
+   [BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP] = "get timestamp",
+   [BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG] = "barrier msg",
+   [BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE] = "update state",
+   [BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE] = "mmio read/write",
+};
+
 static const char *const dp_write_port_msg_type[8] = {
    [0b000] = "OWord block write",
    [0b001] = "OWord dual block write",
@@ -977,13 +987,14 @@ src0_3src(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst)
               brw_inst_3src_src0_reg_nr(devinfo, inst));
    if (err == -1)
       return 0;
-   if (src0_subreg_nr)
+   if (src0_subreg_nr || brw_inst_3src_src0_rep_ctrl(devinfo, inst))
       format(file, ".%d", src0_subreg_nr);
    if (brw_inst_3src_src0_rep_ctrl(devinfo, inst))
       string(file, "<0,1,0>");
-   else
+   else {
       string(file, "<4,4,1>");
-   err |= src_swizzle(file, brw_inst_3src_src0_swizzle(devinfo, inst));
+      err |= src_swizzle(file, brw_inst_3src_src0_swizzle(devinfo, inst));
+   }
    err |= control(file, "src da16 reg type", three_source_reg_encoding,
                   brw_inst_3src_src_type(devinfo, inst), NULL);
    return err;
@@ -1003,13 +1014,14 @@ src1_3src(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst)
               brw_inst_3src_src1_reg_nr(devinfo, inst));
    if (err == -1)
       return 0;
-   if (src1_subreg_nr)
+   if (src1_subreg_nr || brw_inst_3src_src1_rep_ctrl(devinfo, inst))
       format(file, ".%d", src1_subreg_nr);
    if (brw_inst_3src_src1_rep_ctrl(devinfo, inst))
       string(file, "<0,1,0>");
-   else
+   else {
       string(file, "<4,4,1>");
-   err |= src_swizzle(file, brw_inst_3src_src1_swizzle(devinfo, inst));
+      err |= src_swizzle(file, brw_inst_3src_src1_swizzle(devinfo, inst));
+   }
    err |= control(file, "src da16 reg type", three_source_reg_encoding,
                   brw_inst_3src_src_type(devinfo, inst), NULL);
    return err;
@@ -1030,13 +1042,14 @@ src2_3src(FILE *file, const struct brw_device_info *devinfo, brw_inst *inst)
               brw_inst_3src_src2_reg_nr(devinfo, inst));
    if (err == -1)
       return 0;
-   if (src2_subreg_nr)
+   if (src2_subreg_nr || brw_inst_3src_src2_rep_ctrl(devinfo, inst))
       format(file, ".%d", src2_subreg_nr);
    if (brw_inst_3src_src2_rep_ctrl(devinfo, inst))
       string(file, "<0,1,0>");
-   else
+   else {
       string(file, "<4,4,1>");
-   err |= src_swizzle(file, brw_inst_3src_src2_swizzle(devinfo, inst));
+      err |= src_swizzle(file, brw_inst_3src_src2_swizzle(devinfo, inst));
+   }
    err |= control(file, "src da16 reg type", three_source_reg_encoding,
                   brw_inst_3src_src_type(devinfo, inst), NULL);
    return err;
@@ -1495,6 +1508,19 @@ brw_disassemble_inst(FILE *file, const struct brw_device_info *devinfo,
             break;
          case BRW_SFID_THREAD_SPAWNER:
             break;
+
+         case BRW_SFID_MESSAGE_GATEWAY: {
+            /* gen7_gateway_subfuncid[] names only sub-functions 0..6 of its
+             * 8 slots; an encoding that selects the unnamed slot would hand
+             * a NULL pointer to "%s", so print a placeholder instead.
+             */
+            const char *subfunc =
+               gen7_gateway_subfuncid[brw_inst_gateway_subfuncid(devinfo, inst)];
+
+            format(file, " (%s)", subfunc ? subfunc : "unknown");
+            break;
+         }
+
          case GEN7_SFID_DATAPORT_DATA_CACHE:
             if (devinfo->gen >= 7) {
                format(file, " (");
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 96e2369..b91597a 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -92,8 +92,10 @@ get_hw_prim_for_gl_prim(int mode)
 {
    if (mode >= BRW_PRIM_OFFSET)
       return mode - BRW_PRIM_OFFSET;
-   else
+   else {
+      assert(mode < ARRAY_SIZE(prim_to_hw_prim));
       return prim_to_hw_prim[mode];
+   }
 }
 
 
@@ -178,6 +180,7 @@ static void brw_emit_prim(struct brw_context *brw,
    int verts_per_instance;
    int vertex_access_type;
    int indirect_flag;
+   int predicate_enable;
 
    DBG("PRIM: %s %d %d\n", _mesa_lookup_enum_by_nr(prim->mode),
        prim->start, prim->count);
@@ -258,10 +261,14 @@ static void brw_emit_prim(struct brw_context *brw,
       indirect_flag = 0;
    }
 
-
    if (brw->gen >= 7) {
+      if (brw->predicate.state == BRW_PREDICATE_STATE_USE_BIT)
+         predicate_enable = GEN7_3DPRIM_PREDICATE_ENABLE;
+      else
+         predicate_enable = 0;
+
       BEGIN_BATCH(7);
-      OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag);
+      OUT_BATCH(CMD_3D_PRIM << 16 | (7 - 2) | indirect_flag | predicate_enable);
       OUT_BATCH(hw_prim | vertex_access_type);
    } else {
       BEGIN_BATCH(6);
@@ -561,12 +568,7 @@ void brw_draw_prims( struct gl_context *ctx,
 
    assert(unused_tfb_object == NULL);
 
-   if (ctx->Query.CondRenderQuery) {
-      perf_debug("Conditional rendering is implemented in software and may "
-                 "stall.  This should be fixed in the driver.\n");
-   }
-
-   if (!_mesa_check_conditional_render(ctx))
+   if (!brw_check_conditional_render(brw))
       return;
 
    /* Handle primitive restart if needed */
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 0e7be1e..761aa0e 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -361,6 +361,8 @@ brw_jump_scale(const struct brw_device_info *devinfo)
    return 1;
 }
 
+void brw_barrier(struct brw_codegen *p, struct brw_reg src);
+
 /* If/else/endif.  Works by manipulating the execution flags on each
  * channel.
  */
@@ -390,6 +392,8 @@ brw_inst *brw_JMPI(struct brw_codegen *p, struct brw_reg index,
 
 void brw_NOP(struct brw_codegen *p);
 
+void brw_WAIT(struct brw_codegen *p);
+
 /* Special case: there is never a destination, execution size will be
  * taken from src0:
  */
diff --git a/src/mesa/drivers/dri/i965/brw_eu_compact.c b/src/mesa/drivers/dri/i965/brw_eu_compact.c
index 69cb114..67f0b45 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_compact.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_compact.c
@@ -849,6 +849,12 @@ set_3src_source_index(const struct brw_device_info *devinfo,
 static bool
 has_unmapped_bits(const struct brw_device_info *devinfo, brw_inst *src)
 {
+   /* EOT can only be mapped on a send if the src1 is an immediate */
+   if ((brw_inst_opcode(devinfo, src) == BRW_OPCODE_SENDC ||
+        brw_inst_opcode(devinfo, src) == BRW_OPCODE_SEND) &&
+       brw_inst_eot(devinfo, src))
+      return true;
+
    /* Check for instruction bits that don't map to any of the fields of the
     * compacted instruction.  The instruction cannot be compacted if any of
     * them are set.  They overlap with:
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index e78d0be..0f53604 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -914,6 +914,8 @@ brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
          brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
          brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
          break;
+      default:
+         unreachable("not reached");
       }
    }
 
@@ -3404,3 +3406,54 @@ void brw_shader_time_add(struct brw_codegen *p,
 
    brw_pop_insn_state(p);
 }
+
+
+/**
+ * Emit the message-gateway SEND that signals a barrier (asserts Gen7+).
+ * `src` becomes src0 of the SEND; message length 1, no response, null dest. */
+void
+brw_barrier(struct brw_codegen *p, struct brw_reg src)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+   struct brw_inst *inst;
+
+   assert(devinfo->gen >= 7);
+
+   inst = next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, inst, brw_null_reg());
+   brw_set_src0(p, inst, src);
+   brw_set_src1(p, inst, brw_null_reg());
+
+   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
+                              1 /* msg_length */,
+                              0 /* response_length */,
+                              false /* header_present */,
+                              false /* end_of_thread */);
+   /* Gateway-specific fields: set the notify bit, select the barrier msg. */
+   brw_inst_set_gateway_notify(devinfo, inst, 1);
+   brw_inst_set_gateway_subfuncid(devinfo, inst,
+                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
+
+   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
+}
+
+
+/**
+ * Emit the WAIT instruction for a barrier: exec size 1, mask disabled, with
+ * the notification register as both destination and first source. */
+void
+brw_WAIT(struct brw_codegen *p)
+{
+   const struct brw_device_info *devinfo = p->devinfo;
+   struct brw_inst *insn;
+
+   struct brw_reg src = brw_notification_reg();
+
+   insn = next_insn(p, BRW_OPCODE_WAIT);
+   brw_set_dest(p, insn, src);
+   brw_set_src0(p, insn, src);
+   brw_set_src1(p, insn, brw_null_reg());
+
+   brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
+   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 5ce1dfc..2c0ff96 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -49,6 +49,8 @@
 #include "glsl/glsl_types.h"
 #include "program/sampler.h"
 
+using namespace brw;
+
 void
 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
               const fs_reg *src, unsigned sources)
@@ -212,152 +214,13 @@ fs_inst::resize_sources(uint8_t num_sources)
    }
 }
 
-#define ALU1(op)                                                        \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
-   {                                                                    \
-      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
-   }
-
-#define ALU2(op)                                                        \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
-                  const fs_reg &src1)                                   \
-   {                                                                    \
-      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
-   }
-
-#define ALU2_ACC(op)                                                    \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
-                  const fs_reg &src1)                                   \
-   {                                                                    \
-      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
-      inst->writes_accumulator = true;                                  \
-      return inst;                                                      \
-   }
-
-#define ALU3(op)                                                        \
-   fs_inst *                                                            \
-   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
-                  const fs_reg &src1, const fs_reg &src2)               \
-   {                                                                    \
-      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
-   }
-
-ALU1(NOT)
-ALU1(MOV)
-ALU1(FRC)
-ALU1(RNDD)
-ALU1(RNDE)
-ALU1(RNDZ)
-ALU2(ADD)
-ALU2(MUL)
-ALU2_ACC(MACH)
-ALU2(AND)
-ALU2(OR)
-ALU2(XOR)
-ALU2(SHL)
-ALU2(SHR)
-ALU2(ASR)
-ALU3(LRP)
-ALU1(BFREV)
-ALU3(BFE)
-ALU2(BFI1)
-ALU3(BFI2)
-ALU1(FBH)
-ALU1(FBL)
-ALU1(CBIT)
-ALU3(MAD)
-ALU2_ACC(ADDC)
-ALU2_ACC(SUBB)
-ALU2(SEL)
-ALU2(MAC)
-
-/** Gen4 predicated IF. */
-fs_inst *
-fs_visitor::IF(enum brw_predicate predicate)
-{
-   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
-   inst->predicate = predicate;
-   return inst;
-}
-
-/** Gen6 IF with embedded comparison. */
-fs_inst *
-fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
-               enum brw_conditional_mod condition)
-{
-   assert(devinfo->gen == 6);
-   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
-                                        reg_null_d, src0, src1);
-   inst->conditional_mod = condition;
-   return inst;
-}
-
-/**
- * CMP: Sets the low bit of the destination channels with the result
- * of the comparison, while the upper bits are undefined, and updates
- * the flag register with the packed 16 bits of the result.
- */
-fs_inst *
-fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
-                enum brw_conditional_mod condition)
-{
-   fs_inst *inst;
-
-   /* Take the instruction:
-    *
-    * CMP null<d> src0<f> src1<f>
-    *
-    * Original gen4 does type conversion to the destination type before
-    * comparison, producing garbage results for floating point comparisons.
-    *
-    * The destination type doesn't matter on newer generations, so we set the
-    * type to match src0 so we can compact the instruction.
-    */
-   dst.type = src0.type;
-   if (dst.file == HW_REG)
-      dst.fixed_hw_reg.type = dst.type;
-
-   resolve_ud_negate(&src0);
-   resolve_ud_negate(&src1);
-
-   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
-   inst->conditional_mod = condition;
-
-   return inst;
-}
-
-fs_inst *
-fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
-                         int header_size)
-{
-   assert(dst.width % 8 == 0);
-   fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
-                                        dst, src, sources);
-   inst->header_size = header_size;
-
-   for (int i = 0; i < header_size; i++)
-      assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
-   inst->regs_written = header_size;
-
-   for (int i = header_size; i < sources; ++i)
-      assert(src[i].file != GRF || src[i].width == dst.width);
-   inst->regs_written += (sources - header_size) * (dst.width / 8);
-
-   return inst;
-}
-
-exec_list
-fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
+void
+fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
+                                       const fs_reg &dst,
                                        const fs_reg &surf_index,
                                        const fs_reg &varying_offset,
                                        uint32_t const_offset)
 {
-   exec_list instructions;
-   fs_inst *inst;
-
    /* We have our constant surface use a pitch of 4 bytes, so our index can
     * be any component of a vector, and then we load 4 contiguous
     * components starting from that.
@@ -370,8 +233,7 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
     * the redundant ones.
     */
    fs_reg vec4_offset = vgrf(glsl_type::int_type);
-   instructions.push_tail(ADD(vec4_offset,
-                              varying_offset, fs_reg(const_offset & ~3)));
+   bld.ADD(vec4_offset, varying_offset, fs_reg(const_offset & ~3));
 
    int scale = 1;
    if (devinfo->gen == 4 && dst.width == 8) {
@@ -393,9 +255,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
    int regs_written = 4 * (dst.width / 8) * scale;
    fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
                                dst.type, dst.width);
-   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
+   fs_inst *inst = bld.emit(op, vec4_result, surf_index, vec4_offset);
    inst->regs_written = regs_written;
-   instructions.push_tail(inst);
 
    if (devinfo->gen < 7) {
       inst->base_mrf = 13;
@@ -406,30 +267,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
          inst->mlen = 1 + dispatch_width / 8;
    }
 
-   fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
-   instructions.push_tail(MOV(dst, result));
-
-   return instructions;
+   bld.MOV(dst, offset(vec4_result, (const_offset & 3) * scale));
 }
 
 /**
  * A helper for MOV generation for fixing up broken hardware SEND dependency
  * handling.
  */
-fs_inst *
-fs_visitor::DEP_RESOLVE_MOV(int grf)
+void
+fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
 {
-   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
-
-   inst->ir = NULL;
-   inst->annotation = "send dependency resolve";
-
    /* The caller always wants uncompressed to emit the minimal extra
     * dependencies, and to avoid having to deal with aligning its regs to 2.
     */
-   inst->exec_size = 8;
+   const fs_builder ubld = bld.annotate("send dependency resolve")
+                              .half(0);
 
-   return inst;
+   ubld.MOV(ubld.null_reg_f(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
 }
 
 bool
@@ -685,7 +539,7 @@ fs_visitor::type_size(const struct glsl_type *type)
  * the destination of the MOV, with extra parameters set.
  */
 fs_reg
-fs_visitor::get_timestamp(fs_inst **out_mov)
+fs_visitor::get_timestamp(const fs_builder &bld)
 {
    assert(devinfo->gen >= 7);
 
@@ -696,11 +550,10 @@ fs_visitor::get_timestamp(fs_inst **out_mov)
 
    fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);
 
-   fs_inst *mov = MOV(dst, ts);
    /* We want to read the 3 fields we care about even if it's not enabled in
     * the dispatch.
     */
-   mov->force_writemask_all = true;
+   bld.exec_all().MOV(dst, ts);
 
    /* The caller wants the low 32 bits of the timestamp.  Since it's running
     * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
@@ -714,105 +567,60 @@ fs_visitor::get_timestamp(fs_inst **out_mov)
     */
    dst.set_smear(0);
 
-   *out_mov = mov;
    return dst;
 }
 
 void
 fs_visitor::emit_shader_time_begin()
 {
-   current_annotation = "shader time start";
-   fs_inst *mov;
-   shader_start_time = get_timestamp(&mov);
-   emit(mov);
+   shader_start_time = get_timestamp(bld.annotate("shader time start"));
 }
 
 void
 fs_visitor::emit_shader_time_end()
 {
-   current_annotation = "shader time end";
-
-   enum shader_time_shader_type type, written_type, reset_type;
-   switch (stage) {
-   case MESA_SHADER_VERTEX:
-      type = ST_VS;
-      written_type = ST_VS_WRITTEN;
-      reset_type = ST_VS_RESET;
-      break;
-   case MESA_SHADER_GEOMETRY:
-      type = ST_GS;
-      written_type = ST_GS_WRITTEN;
-      reset_type = ST_GS_RESET;
-      break;
-   case MESA_SHADER_FRAGMENT:
-      if (dispatch_width == 8) {
-         type = ST_FS8;
-         written_type = ST_FS8_WRITTEN;
-         reset_type = ST_FS8_RESET;
-      } else {
-         assert(dispatch_width == 16);
-         type = ST_FS16;
-         written_type = ST_FS16_WRITTEN;
-         reset_type = ST_FS16_RESET;
-      }
-      break;
-   case MESA_SHADER_COMPUTE:
-      type = ST_CS;
-      written_type = ST_CS_WRITTEN;
-      reset_type = ST_CS_RESET;
-      break;
-   default:
-      unreachable("fs_visitor::emit_shader_time_end missing code");
-   }
-
    /* Insert our code just before the final SEND with EOT. */
    exec_node *end = this->instructions.get_tail();
    assert(end && ((fs_inst *) end)->eot);
+   const fs_builder ibld = bld.annotate("shader time end")
+                              .exec_all().at(NULL, end);
 
-   fs_inst *tm_read;
-   fs_reg shader_end_time = get_timestamp(&tm_read);
-   end->insert_before(tm_read);
+   fs_reg shader_end_time = get_timestamp(ibld);
 
    /* Check that there weren't any timestamp reset events (assuming these
     * were the only two timestamp reads that happened).
     */
    fs_reg reset = shader_end_time;
    reset.set_smear(2);
-   fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
-   test->conditional_mod = BRW_CONDITIONAL_Z;
-   test->force_writemask_all = true;
-   end->insert_before(test);
-   end->insert_before(IF(BRW_PREDICATE_NORMAL));
+   set_condmod(BRW_CONDITIONAL_Z,
+               ibld.AND(ibld.null_reg_ud(), reset, fs_reg(1u)));
+   ibld.IF(BRW_PREDICATE_NORMAL);
 
    fs_reg start = shader_start_time;
    start.negate = true;
    fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
    diff.set_smear(0);
-   fs_inst *add = ADD(diff, start, shader_end_time);
-   add->force_writemask_all = true;
-   end->insert_before(add);
+   ibld.ADD(diff, start, shader_end_time);
 
    /* If there were no instructions between the two timestamp gets, the diff
     * is 2 cycles.  Remove that overhead, so I can forget about that when
     * trying to determine the time taken for single instructions.
     */
-   add = ADD(diff, diff, fs_reg(-2u));
-   add->force_writemask_all = true;
-   end->insert_before(add);
-
-   end->insert_before(SHADER_TIME_ADD(type, diff));
-   end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
-   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
-   end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
-   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
+   ibld.ADD(diff, diff, fs_reg(-2u));
+   SHADER_TIME_ADD(ibld, 0, diff);        /* sub-index 0: elapsed time */
+   SHADER_TIME_ADD(ibld, 1, fs_reg(1u));  /* sub-index 1: "written" count */
+   ibld.emit(BRW_OPCODE_ELSE);
+   SHADER_TIME_ADD(ibld, 2, fs_reg(1u));  /* sub-index 2: "reset" count */
+   ibld.emit(BRW_OPCODE_ENDIF);
 }
 
-fs_inst *
-fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
+void
+fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
+                            int shader_time_subindex,
+                            fs_reg value)
 {
-   int shader_time_index =
-      brw_get_shader_time_index(brw, shader_prog, prog, type);
-   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
+   int index = shader_time_index * 3 + shader_time_subindex;
+   fs_reg offset = fs_reg(index * SHADER_TIME_STRIDE);
 
    fs_reg payload;
    if (dispatch_width == 8)
@@ -820,8 +628,7 @@ fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
    else
       payload = vgrf(glsl_type::uint_type);
 
-   return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
-                               fs_reg(), payload, offset, value);
+   bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
 }
 
 void
@@ -864,65 +671,16 @@ fs_visitor::fail(const char *format, ...)
  * During a SIMD16 compile (if one happens anyway), this just calls fail().
  */
 void
-fs_visitor::no16(const char *format, ...)
+fs_visitor::no16(const char *msg)
 {
-   va_list va;
-
-   va_start(va, format);
-
    if (dispatch_width == 16) {
-      vfail(format, va);
+      fail("%s", msg);
    } else {
       simd16_unsupported = true;
 
-      if (brw->perf_debug) {
-         if (no16_msg)
-            ralloc_vasprintf_append(&no16_msg, format, va);
-         else
-            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
-      }
+      compiler->shader_perf_log(log_data,
+                                "SIMD16 shader failed to compile: %s", msg);
    }
-
-   va_end(va);
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1, const fs_reg &src2)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
-}
-
-fs_inst *
-fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
-                 fs_reg src[], int sources)
-{
-   return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
 }
 
 /**
@@ -1051,7 +809,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
       return inst->mlen;
    case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
-      return 2;
+      return inst->mlen;
    case SHADER_OPCODE_UNTYPED_ATOMIC:
    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
@@ -1077,14 +835,6 @@ fs_visitor::vgrf(const glsl_type *const type)
                  brw_type_for_base_type(type), dispatch_width);
 }
 
-fs_reg
-fs_visitor::vgrf(int num_components)
-{
-   int reg_width = dispatch_width / 8;
-   return fs_reg(GRF, alloc.allocate(num_components * reg_width),
-                 BRW_REGISTER_TYPE_F, dispatch_width);
-}
-
 /** Fixed HW reg constructor. */
 fs_reg::fs_reg(enum register_file file, int reg)
 {
@@ -1130,117 +880,18 @@ fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
    this->width = width;
 }
 
-fs_reg *
-fs_visitor::variable_storage(ir_variable *var)
-{
-   return (fs_reg *)hash_table_find(this->variable_ht, var);
-}
-
-void
-import_uniforms_callback(const void *key,
-			 void *data,
-			 void *closure)
-{
-   struct hash_table *dst_ht = (struct hash_table *)closure;
-   const fs_reg *reg = (const fs_reg *)data;
-
-   if (reg->file != UNIFORM)
-      return;
-
-   hash_table_insert(dst_ht, data, key);
-}
-
 /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
  * This brings in those uniform definitions
  */
 void
 fs_visitor::import_uniforms(fs_visitor *v)
 {
-   hash_table_call_foreach(v->variable_ht,
-			   import_uniforms_callback,
-			   variable_ht);
    this->push_constant_loc = v->push_constant_loc;
    this->pull_constant_loc = v->pull_constant_loc;
    this->uniforms = v->uniforms;
    this->param_size = v->param_size;
 }
 
-/* Our support for uniforms is piggy-backed on the struct
- * gl_fragment_program, because that's where the values actually
- * get stored, rather than in some global gl_shader_program uniform
- * store.
- */
-void
-fs_visitor::setup_uniform_values(ir_variable *ir)
-{
-   int namelen = strlen(ir->name);
-
-   /* The data for our (non-builtin) uniforms is stored in a series of
-    * gl_uniform_driver_storage structs for each subcomponent that
-    * glGetUniformLocation() could name.  We know it's been set up in the same
-    * order we'd walk the type, so walk the list of storage and find anything
-    * with our name, or the prefix of a component that starts with our name.
-    */
-   unsigned params_before = uniforms;
-   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
-      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
-
-      if (strncmp(ir->name, storage->name, namelen) != 0 ||
-          (storage->name[namelen] != 0 &&
-           storage->name[namelen] != '.' &&
-           storage->name[namelen] != '[')) {
-         continue;
-      }
-
-      unsigned slots = storage->type->component_slots();
-      if (storage->array_elements)
-         slots *= storage->array_elements;
-
-      for (unsigned i = 0; i < slots; i++) {
-         stage_prog_data->param[uniforms++] = &storage->storage[i];
-      }
-   }
-
-   /* Make sure we actually initialized the right amount of stuff here. */
-   assert(params_before + ir->type->component_slots() == uniforms);
-   (void)params_before;
-}
-
-
-/* Our support for builtin uniforms is even scarier than non-builtin.
- * It sits on top of the PROG_STATE_VAR parameters that are
- * automatically updated from GL context state.
- */
-void
-fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
-{
-   const ir_state_slot *const slots = ir->get_state_slots();
-   assert(slots != NULL);
-
-   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
-      /* This state reference has already been setup by ir_to_mesa, but we'll
-       * get the same index back here.
-       */
-      int index = _mesa_add_state_reference(this->prog->Parameters,
-					    (gl_state_index *)slots[i].tokens);
-
-      /* Add each of the unique swizzles of the element as a parameter.
-       * This'll end up matching the expected layout of the
-       * array/matrix/structure we're trying to fill in.
-       */
-      int last_swiz = -1;
-      for (unsigned int j = 0; j < 4; j++) {
-	 int swiz = GET_SWZ(slots[i].swizzle, j);
-	 if (swiz == last_swiz)
-	    break;
-	 last_swiz = swiz;
-
-         stage_prog_data->param[uniforms++] =
-            &prog->Parameters->ParameterValues[index][swiz];
-      }
-   }
-}
-
 fs_reg *
 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                          bool origin_upper_left)
@@ -1253,15 +904,15 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 
    /* gl_FragCoord.x */
    if (pixel_center_integer) {
-      emit(MOV(wpos, this->pixel_x));
+      bld.MOV(wpos, this->pixel_x);
    } else {
-      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
+      bld.ADD(wpos, this->pixel_x, fs_reg(0.5f));
    }
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.y */
    if (!flip && pixel_center_integer) {
-      emit(MOV(wpos, this->pixel_y));
+      bld.MOV(wpos, this->pixel_y);
    } else {
       fs_reg pixel_y = this->pixel_y;
       float offset = (pixel_center_integer ? 0.0 : 0.5);
@@ -1271,22 +922,22 @@ fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
 	 offset += key->drawable_height - 1.0;
       }
 
-      emit(ADD(wpos, pixel_y, fs_reg(offset)));
+      bld.ADD(wpos, pixel_y, fs_reg(offset));
    }
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.z */
    if (devinfo->gen >= 6) {
-      emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
+      bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
    } else {
-      emit(FS_OPCODE_LINTERP, wpos,
+      bld.emit(FS_OPCODE_LINTERP, wpos,
            this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
            interp_reg(VARYING_SLOT_POS, 2));
    }
    wpos = offset(wpos, 1);
 
    /* gl_FragCoord.w: Already set up in emit_interpolation */
-   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
+   bld.MOV(wpos, this->wpos_w);
 
    return reg;
 }
@@ -1321,8 +972,8 @@ fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
        */
       barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
    }
-   return emit(FS_OPCODE_LINTERP, attr,
-               this->delta_xy[barycoord_mode], interp);
+   return bld.emit(FS_OPCODE_LINTERP, attr,
+                   this->delta_xy[barycoord_mode], interp);
 }
 
 void
@@ -1380,7 +1031,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
 	       struct brw_reg interp = interp_reg(location, k);
 	       interp = suboffset(interp, 3);
                interp.type = attr.type;
-	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
+               bld.emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
 	       attr = offset(attr, 1);
 	    }
 	 } else {
@@ -1393,7 +1044,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                    * unlit, replace the centroid data with non-centroid
                    * data.
                    */
-                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+                  bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
 
                   fs_inst *inst;
                   inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
@@ -1417,7 +1068,7 @@ fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                                mod_sample || key->persample_shading);
                }
                if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
-                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
+                  bld.MUL(attr, attr, this->pixel_w);
                }
 	       attr = offset(attr, 1);
 	    }
@@ -1448,7 +1099,7 @@ fs_visitor::emit_frontfacing_interpolation()
       fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
       g0.negate = true;
 
-      emit(ASR(*reg, g0, fs_reg(15)));
+      bld.ASR(*reg, g0, fs_reg(15));
    } else {
       /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
        * a boolean result from this (1/true or 0/false).
@@ -1463,7 +1114,7 @@ fs_visitor::emit_frontfacing_interpolation()
       fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
       g1_6.negate = true;
 
-      emit(ASR(*reg, g1_6, fs_reg(31)));
+      bld.ASR(*reg, g1_6, fs_reg(31));
    }
 
    return reg;
@@ -1478,9 +1129,9 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
 
    if (key->compute_pos_offset) {
       /* Convert int_sample_pos to floating point */
-      emit(MOV(dst, int_sample_pos));
+      bld.MOV(dst, int_sample_pos);
       /* Scale to the range [0, 1] */
-      emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
+      bld.MUL(dst, dst, fs_reg(1 / 16.0f));
    }
    else {
       /* From ARB_sample_shading specification:
@@ -1488,7 +1139,7 @@ fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
        *  rasterization is disabled, gl_SamplePosition will always be
        *  (0.5, 0.5).
        */
-      emit(MOV(dst, fs_reg(0.5f)));
+      bld.MOV(dst, fs_reg(0.5f));
    }
 }
 
@@ -1497,7 +1148,7 @@ fs_visitor::emit_samplepos_setup()
 {
    assert(devinfo->gen >= 6);
 
-   this->current_annotation = "compute sample position";
+   const fs_builder abld = bld.annotate("compute sample position");
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
    fs_reg pos = *reg;
    fs_reg int_sample_x = vgrf(glsl_type::int_type);
@@ -1519,22 +1170,22 @@ fs_visitor::emit_samplepos_setup()
                     BRW_REGISTER_TYPE_B), 16, 8, 2);
 
    if (dispatch_width == 8) {
-      emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
+      abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
    } else {
-      emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
-      emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
-         ->force_sechalf = true;
+      abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
+      abld.half(1).MOV(half(int_sample_x, 1),
+                       fs_reg(suboffset(sample_pos_reg, 16)));
    }
    /* Compute gl_SamplePosition.x */
    compute_sample_position(pos, int_sample_x);
    pos = offset(pos, 1);
    if (dispatch_width == 8) {
-      emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
+      abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
    } else {
-      emit(MOV(half(int_sample_y, 0),
-               fs_reg(suboffset(sample_pos_reg, 1))));
-      emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
-         ->force_sechalf = true;
+      abld.half(0).MOV(half(int_sample_y, 0),
+                       fs_reg(suboffset(sample_pos_reg, 1)));
+      abld.half(1).MOV(half(int_sample_y, 1),
+                       fs_reg(suboffset(sample_pos_reg, 17)));
    }
    /* Compute gl_SamplePosition.y */
    compute_sample_position(pos, int_sample_y);
@@ -1548,7 +1199,7 @@ fs_visitor::emit_sampleid_setup()
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
    assert(devinfo->gen >= 6);
 
-   this->current_annotation = "compute sample id";
+   const fs_builder abld = bld.annotate("compute sample id");
    fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
 
    if (key->compute_sample_id) {
@@ -1575,26 +1226,25 @@ fs_visitor::emit_sampleid_setup()
        * are sample 1 of subspan 0; the third group is sample 0 of
        * subspan 1, and finally sample 1 of subspan 1.
        */
-      fs_inst *inst;
-      inst = emit(BRW_OPCODE_AND, t1,
-                  fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
-                  fs_reg(0xc0));
-      inst->force_writemask_all = true;
-      inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
-      inst->force_writemask_all = true;
+      abld.exec_all()
+          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+               fs_reg(0xc0));
+      abld.exec_all().SHR(t1, t1, fs_reg(5));
+
       /* This works for both SIMD8 and SIMD16 */
-      inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
-      inst->force_writemask_all = true;
+      abld.exec_all()
+          .MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210));
+
       /* This special instruction takes care of setting vstride=1,
        * width=4, hstride=0 of t2 during an ADD instruction.
        */
-      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
+      abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
    } else {
       /* As per GL_ARB_sample_shading specification:
        * "When rendering to a non-multisample buffer, or if multisample
        *  rasterization is disabled, gl_SampleID will always be zero."
        */
-      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
+      abld.MOV(*reg, fs_reg(0));
    }
 
    return reg;
@@ -1606,111 +1256,11 @@ fs_visitor::resolve_source_modifiers(fs_reg *src)
    if (!src->abs && !src->negate)
       return;
 
-   fs_reg temp = retype(vgrf(1), src->type);
-   emit(MOV(temp, *src));
+   fs_reg temp = bld.vgrf(src->type);
+   bld.MOV(temp, *src);
    *src = temp;
 }
 
-fs_reg
-fs_visitor::fix_math_operand(fs_reg src)
-{
-   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
-    * might be able to do better by doing execsize = 1 math and then
-    * expanding that result out, but we would need to be careful with
-    * masking.
-    *
-    * The hardware ignores source modifiers (negate and abs) on math
-    * instructions, so we also move to a temp to set those up.
-    */
-   if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
-       !src.abs && !src.negate)
-      return src;
-
-   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
-    * operands to math
-    */
-   if (devinfo->gen >= 7 && src.file != IMM)
-      return src;
-
-   fs_reg expanded = vgrf(glsl_type::float_type);
-   expanded.type = src.type;
-   emit(BRW_OPCODE_MOV, expanded, src);
-   return expanded;
-}
-
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
-{
-   switch (opcode) {
-   case SHADER_OPCODE_RCP:
-   case SHADER_OPCODE_RSQ:
-   case SHADER_OPCODE_SQRT:
-   case SHADER_OPCODE_EXP2:
-   case SHADER_OPCODE_LOG2:
-   case SHADER_OPCODE_SIN:
-   case SHADER_OPCODE_COS:
-      break;
-   default:
-      unreachable("not reached: bad math opcode");
-   }
-
-   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
-    * might be able to do better by doing execsize = 1 math and then
-    * expanding that result out, but we would need to be careful with
-    * masking.
-    *
-    * Gen 6 hardware ignores source modifiers (negate and abs) on math
-    * instructions, so we also move to a temp to set those up.
-    */
-   if (devinfo->gen == 6 || devinfo->gen == 7)
-      src = fix_math_operand(src);
-
-   fs_inst *inst = emit(opcode, dst, src);
-
-   if (devinfo->gen < 6) {
-      inst->base_mrf = 2;
-      inst->mlen = dispatch_width / 8;
-   }
-
-   return inst;
-}
-
-fs_inst *
-fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
-{
-   int base_mrf = 2;
-   fs_inst *inst;
-
-   if (devinfo->gen >= 8) {
-      inst = emit(opcode, dst, src0, src1);
-   } else if (devinfo->gen >= 6) {
-      src0 = fix_math_operand(src0);
-      src1 = fix_math_operand(src1);
-
-      inst = emit(opcode, dst, src0, src1);
-   } else {
-      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
-       * "Message Payload":
-       *
-       * "Operand0[7].  For the INT DIV functions, this operand is the
-       *  denominator."
-       *  ...
-       * "Operand1[7].  For the INT DIV functions, this operand is the
-       *  numerator."
-       */
-      bool is_int_div = opcode != SHADER_OPCODE_POW;
-      fs_reg &op0 = is_int_div ? src1 : src0;
-      fs_reg &op1 = is_int_div ? src0 : src1;
-
-      emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
-      inst = emit(opcode, dst, op0, reg_null_f);
-
-      inst->base_mrf = base_mrf;
-      inst->mlen = 2 * dispatch_width / 8;
-   }
-   return inst;
-}
-
 void
 fs_visitor::emit_discard_jump()
 {
@@ -1719,7 +1269,7 @@ fs_visitor::emit_discard_jump()
    /* For performance, after a discard, jump to the end of the
     * shader if all relevant channels have been discarded.
     */
-   fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
+   fs_inst *discard_jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
    discard_jump->flag_subreg = 1;
 
    discard_jump->predicate = (dispatch_width == 8)
@@ -2317,26 +1867,22 @@ fs_visitor::demote_pull_constants()
 	    continue;
 
          /* Set up the annotation tracking for new generated instructions. */
-         base_ir = inst->ir;
-         current_annotation = inst->annotation;
-
+         const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
+                                    .at(block, inst);
          fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
          fs_reg dst = vgrf(glsl_type::float_type);
 
          /* Generate a pull load into dst. */
          if (inst->src[i].reladdr) {
-            exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
-                                                        surf_index,
-                                                        *inst->src[i].reladdr,
-                                                        pull_index);
-            inst->insert_before(block, &list);
+            VARYING_PULL_CONSTANT_LOAD(ibld, dst,
+                                       surf_index,
+                                       *inst->src[i].reladdr,
+                                       pull_index);
             inst->src[i].reladdr = NULL;
          } else {
             fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
-            fs_inst *pull =
-               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
-                                    dst, surf_index, offset);
-            inst->insert_before(block, pull);
+            ibld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                      dst, surf_index, offset);
             inst->src[i].set_smear(pull_index & 3);
          }
 
@@ -2663,6 +2209,16 @@ fs_visitor::opt_sampler_eot()
    if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
       return false;
 
+   /* This optimisation doesn't seem to work for textureGather for some
+    * reason. I can't find any documentation or known workarounds to indicate
+    * that this is expected, but considering that it is probably pretty
+    * unlikely that a shader would directly write out the results from
+    * textureGather we might as well just disable it.
+    */
+   if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
+       tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
+      return false;
+
    /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
     * It's very likely to be the previous instruction.
     */
@@ -2676,7 +2232,7 @@ fs_visitor::opt_sampler_eot()
 
    tex_inst->offset |= fb_write->target << 24;
    tex_inst->eot = true;
-   tex_inst->dst = reg_null_ud;
+   tex_inst->dst = bld.null_reg_ud();
    fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);
 
    /* If a header is present, marking the eot is sufficient. Otherwise, we need
@@ -2688,7 +2244,8 @@ fs_visitor::opt_sampler_eot()
    if (tex_inst->header_size != 0)
       return true;
 
-   fs_reg send_header = vgrf(load_payload->sources + 1);
+   fs_reg send_header = bld.vgrf(BRW_REGISTER_TYPE_F,
+                                 load_payload->sources + 1);
    fs_reg *new_sources =
       ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);
 
@@ -3041,8 +2598,8 @@ fs_visitor::emit_repclear_shader()
    fs_inst *mov;
 
    if (uniforms == 1) {
-      mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
-                     fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
+      mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
+                               fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
    } else {
       struct brw_reg reg =
          brw_reg(BRW_GENERAL_REGISTER_FILE,
@@ -3051,14 +2608,13 @@ fs_visitor::emit_repclear_shader()
                  BRW_WIDTH_2,
                  BRW_HORIZONTAL_STRIDE_4, BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
 
-      mov = emit(MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg)));
+      mov = bld.exec_all().MOV(vec4(brw_message_reg(color_mrf)),
+                               fs_reg(reg));
    }
 
-   mov->force_writemask_all = true;
-
    fs_inst *write;
    if (key->nr_color_regions == 1) {
-      write = emit(FS_OPCODE_REP_FB_WRITE);
+      write = bld.emit(FS_OPCODE_REP_FB_WRITE);
       write->saturate = key->clamp_fragment_color;
       write->base_mrf = color_mrf;
       write->target = 0;
@@ -3067,7 +2623,7 @@ fs_visitor::emit_repclear_shader()
    } else {
       assume(key->nr_color_regions > 0);
       for (int i = 0; i < key->nr_color_regions; ++i) {
-         write = emit(FS_OPCODE_REP_FB_WRITE);
+         write = bld.emit(FS_OPCODE_REP_FB_WRITE);
          write->saturate = key->clamp_fragment_color;
          write->base_mrf = base_mrf;
          write->target = i;
@@ -3223,9 +2779,8 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
        */
       if (block->start() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
-            if (needs_dep[i]) {
-               inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
-            }
+            if (needs_dep[i])
+               DEP_RESOLVE_MOV(bld.at(block, inst), first_write_grf + i);
          }
          return;
       }
@@ -3241,7 +2796,7 @@ fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
             if (reg >= first_write_grf &&
                 reg < first_write_grf + write_len &&
                 needs_dep[reg - first_write_grf]) {
-               inst->insert_before(block, DEP_RESOLVE_MOV(reg));
+               DEP_RESOLVE_MOV(bld.at(block, inst), reg);
                needs_dep[reg - first_write_grf] = false;
                if (scan_inst->exec_size == 16)
                   needs_dep[reg - first_write_grf + 1] = false;
@@ -3288,8 +2843,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
       if (block->end() == scan_inst) {
          for (int i = 0; i < write_len; i++) {
             if (needs_dep[i])
-               scan_inst->insert_before(block,
-                                        DEP_RESOLVE_MOV(first_write_grf + i));
+               DEP_RESOLVE_MOV(bld.at(block, scan_inst), first_write_grf + i);
          }
          return;
       }
@@ -3304,7 +2858,7 @@ fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_ins
           scan_inst->dst.reg >= first_write_grf &&
           scan_inst->dst.reg < first_write_grf + write_len &&
           needs_dep[scan_inst->dst.reg - first_write_grf]) {
-         scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
+         DEP_RESOLVE_MOV(bld.at(block, scan_inst), scan_inst->dst.reg);
          needs_dep[scan_inst->dst.reg - first_write_grf] = false;
       }
 
@@ -3429,6 +2983,9 @@ fs_visitor::lower_load_payload()
       assert(inst->dst.file == MRF || inst->dst.file == GRF);
       assert(inst->saturate == false);
 
+      const fs_builder ibld = bld.group(inst->exec_size, inst->force_sechalf)
+                                 .exec_all(inst->force_writemask_all)
+                                 .at(block, inst);
       fs_reg dst = inst->dst;
 
       /* Get rid of COMPR4.  We'll add it back in if we need it */
@@ -3441,9 +2998,7 @@ fs_visitor::lower_load_payload()
             fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
             fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
             mov_src.width = 8;
-            fs_inst *mov = MOV(mov_dst, mov_src);
-            mov->force_writemask_all = true;
-            inst->insert_before(block, mov);
+            ibld.exec_all().MOV(mov_dst, mov_src);
          }
          dst = offset(dst, 1);
       }
@@ -3474,23 +3029,13 @@ fs_visitor::lower_load_payload()
                if (devinfo->has_compr4) {
                   fs_reg compr4_dst = retype(dst, inst->src[i].type);
                   compr4_dst.reg |= BRW_MRF_COMPR4;
-
-                  fs_inst *mov = MOV(compr4_dst, inst->src[i]);
-                  mov->force_writemask_all = inst->force_writemask_all;
-                  inst->insert_before(block, mov);
+                  ibld.MOV(compr4_dst, inst->src[i]);
                } else {
                   /* Platform doesn't have COMPR4.  We have to fake it */
                   fs_reg mov_dst = retype(dst, inst->src[i].type);
                   mov_dst.width = 8;
-
-                  fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
-                  mov->force_writemask_all = inst->force_writemask_all;
-                  inst->insert_before(block, mov);
-
-                  mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
-                  mov->force_writemask_all = inst->force_writemask_all;
-                  mov->force_sechalf = true;
-                  inst->insert_before(block, mov);
+                  ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
+                  ibld.half(1).MOV(offset(mov_dst, 4), half(inst->src[i], 1));
                }
             }
 
@@ -3513,12 +3058,8 @@ fs_visitor::lower_load_payload()
       }
 
       for (uint8_t i = inst->header_size; i < inst->sources; i++) {
-         if (inst->src[i].file != BAD_FILE) {
-            fs_inst *mov = MOV(retype(dst, inst->src[i].type),
-                               inst->src[i]);
-            mov->force_writemask_all = inst->force_writemask_all;
-            inst->insert_before(block, mov);
-         }
+         if (inst->src[i].file != BAD_FILE)
+            ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
          dst = offset(dst, 1);
       }
 
@@ -3532,6 +3073,172 @@ fs_visitor::lower_load_payload()
    return progress;
 }
 
+bool
+fs_visitor::lower_integer_multiplication()
+{
+   bool progress = false;
+
+   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
+    * directly, but Cherryview cannot.
+    */
+   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
+      return false;
+   /* Only non-accumulator MULs with a D/UD destination are lowered. */
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MUL ||
+          inst->dst.is_accumulator() ||
+          (inst->dst.type != BRW_REGISTER_TYPE_D &&
+           inst->dst.type != BRW_REGISTER_TYPE_UD))
+         continue;
+
+      const fs_builder ibld = bld.at(block, inst);
+
+      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+       * src1 are used.
+       *
+       * If multiplying by an immediate value that fits in 16-bits, do a
+       * single MUL instruction with that value in the proper location.
+       */
+      if (inst->src[1].file == IMM &&
+          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
+         if (devinfo->gen < 7) {
+            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
+                       inst->dst.type, dispatch_width);
+            ibld.MOV(imm, inst->src[1]);
+            ibld.MUL(inst->dst, imm, inst->src[0]);
+         } else {
+            ibld.MUL(inst->dst, inst->src[0], inst->src[1]);
+         }
+      } else {
+         /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
+          * do 32-bit integer multiplication in one instruction, but instead
+          * must do a sequence (which actually calculates a 64-bit result):
+          *
+          *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
+          *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
+          *    mov(8)  g2<1>D     acc0<8,8,1>D
+          *
+          * But on Gen > 6, the ability to use second accumulator register
+          * (acc1) for non-float data types was removed, preventing a simple
+          * implementation in SIMD16. A 16-channel result can be calculated by
+          * executing the three instructions twice in SIMD8, once with quarter
+          * control of 1Q for the first eight channels and again with 2Q for
+          * the second eight channels.
+          *
+          * Which accumulator register is implicitly accessed (by AccWrEnable
+          * for instance) is determined by the quarter control. Unfortunately
+          * Ivybridge (and presumably Baytrail) has a hardware bug in which an
+          * implicit accumulator access by an instruction with 2Q will access
+          * acc1 regardless of whether the data type is usable in acc1.
+          *
+          * Specifically, the 2Q mach(8) writes acc1 which does not exist for
+          * integer data types.
+          *
+          * Since we only want the low 32-bits of the result, we can do two
+          * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
+          * adjust the high result and add them (like the mach is doing):
+          *
+          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
+          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
+          *    shl(8)  g9<1>D     g8<8,8,1>D      16D
+          *    add(8)  g2<1>D     g7<8,8,1>D      g9<8,8,1>D
+          *
+          * We avoid the shl instruction by realizing that we only want to add
+          * the low 16-bits of the "high" result to the high 16-bits of the
+          * "low" result and using proper regioning on the add:
+          *
+          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
+          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
+          *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
+          *
+          * Since it does not use the (single) accumulator register, we can
+          * schedule multi-component multiplications much better.
+          */
+         /* A null dst needs real storage: the condmod MOV below reads it. */
+         if (inst->conditional_mod && inst->dst.is_null()) {
+            inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
+                               inst->dst.type, dispatch_width);
+         }
+         fs_reg low = inst->dst;
+         fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
+                     inst->dst.type, dispatch_width);
+
+         if (devinfo->gen >= 7) {
+            fs_reg src1_0_w = inst->src[1];
+            fs_reg src1_1_w = inst->src[1];
+
+            if (inst->src[1].file == IMM) {
+               src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
+               src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
+            } else {
+               src1_0_w.type = BRW_REGISTER_TYPE_UW;
+               if (src1_0_w.stride != 0) {
+                  assert(src1_0_w.stride == 1);
+                  src1_0_w.stride = 2;
+               }
+
+               src1_1_w.type = BRW_REGISTER_TYPE_UW;
+               if (src1_1_w.stride != 0) {
+                  assert(src1_1_w.stride == 1);
+                  src1_1_w.stride = 2;
+               }
+               src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
+            }
+            ibld.MUL(low, inst->src[0], src1_0_w);
+            ibld.MUL(high, inst->src[0], src1_1_w);
+         } else {
+            fs_reg src0_0_w = inst->src[0];
+            fs_reg src0_1_w = inst->src[0];
+
+            src0_0_w.type = BRW_REGISTER_TYPE_UW;
+            if (src0_0_w.stride != 0) {
+               assert(src0_0_w.stride == 1);
+               src0_0_w.stride = 2;
+            }
+
+            src0_1_w.type = BRW_REGISTER_TYPE_UW;
+            if (src0_1_w.stride != 0) {
+               assert(src0_1_w.stride == 1);
+               src0_1_w.stride = 2;
+            }
+            src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
+
+            ibld.MUL(low, src0_0_w, inst->src[1]);
+            ibld.MUL(high, src0_1_w, inst->src[1]);
+         }
+         /* View the registers as strided UW for the 16-bit ADD below. */
+         fs_reg dst = inst->dst;
+         dst.type = BRW_REGISTER_TYPE_UW;
+         dst.subreg_offset = 2;
+         dst.stride = 2;
+
+         high.type = BRW_REGISTER_TYPE_UW;
+         high.stride = 2;
+
+         low.type = BRW_REGISTER_TYPE_UW;
+         low.subreg_offset = 2;
+         low.stride = 2;
+
+         ibld.ADD(dst, low, high);
+         /* Re-apply the conditional mod via a MOV of the result to null. */
+         if (inst->conditional_mod) {
+            fs_reg null(retype(ibld.null_reg_f(), inst->dst.type));
+            set_condmod(inst->conditional_mod,
+                        ibld.MOV(null, inst->dst));
+         }
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
 void
 fs_visitor::dump_instructions()
 {
@@ -3602,6 +3309,9 @@ fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
    }
    fprintf(file, "(%d) ", inst->exec_size);
 
+   if (inst->mlen) {
+      fprintf(file, "(mlen: %d) ", inst->mlen);
+   }
 
    switch (inst->dst.file) {
    case GRF:
@@ -3895,7 +3605,7 @@ fs_visitor::setup_vs_payload()
 void
 fs_visitor::setup_cs_payload()
 {
-   assert(brw->gen >= 7);
+   assert(devinfo->gen >= 7);
 
    payload.num_regs = 1;
 }
@@ -3938,6 +3648,17 @@ fs_visitor::calculate_register_pressure()
 void
 fs_visitor::optimize()
 {
+   /* bld is the common builder object pointing at the end of the program we
+    * used to translate it into i965 IR.  For the optimization and lowering
+    * passes coming next, any code added after the end of the program without
+    * having explicitly called fs_builder::at() clearly points at a mistake.
+    * Ideally optimization passes wouldn't be part of the visitor so they
+    * wouldn't have access to bld at all, but they do.  In case some pass
+    * forgets to ask for a location explicitly, point bld at NULL here so
+    * that any such use trips.
+    */
+   bld = bld.at(NULL, NULL);
+
    split_virtual_grfs();
 
    move_uniform_array_access_to_pull_constants();
@@ -3953,7 +3674,7 @@ fs_visitor::optimize()
          snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,              \
                   stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                         \
-         backend_visitor::dump_instructions(filename);                  \
+         backend_shader::dump_instructions(filename);                   \
       }                                                                 \
                                                                         \
       progress = progress || this_progress;                             \
@@ -3966,7 +3687,7 @@ fs_visitor::optimize()
                stage_abbrev, dispatch_width,
                shader_prog ? shader_prog->Name : 0);
 
-      backend_visitor::dump_instructions(filename);
+      backend_shader::dump_instructions(filename);
    }
 
    bool progress;
@@ -4010,6 +3731,7 @@ fs_visitor::optimize()
    }
 
    OPT(opt_combine_constants);
+   OPT(lower_integer_multiplication);
 
    lower_uniform_pull_constant_loads();
 }
@@ -4066,9 +3788,11 @@ fs_visitor::allocate_registers()
          fail("Failure to register allocate.  Reduce number of "
               "live scalar values to avoid this.");
       } else {
-         perf_debug("%s shader triggered register spilling.  "
-                    "Try reducing the number of live scalar values to "
-                    "improve performance.\n", stage_name);
+         compiler->shader_perf_log(log_data,
+                                   "%s shader triggered register spilling.  "
+                                   "Try reducing the number of live scalar "
+                                   "values to improve performance.\n",
+                                   stage_name);
       }
 
       /* Since we're out of heuristics, just go spill registers until we
@@ -4097,7 +3821,7 @@ fs_visitor::allocate_registers()
 }
 
 bool
-fs_visitor::run_vs()
+fs_visitor::run_vs(gl_clip_plane *clip_planes)
 {
    assert(stage == MESA_SHADER_VERTEX);
 
@@ -4105,26 +3829,17 @@ fs_visitor::run_vs()
       assign_common_binding_table_offsets(0);
    setup_vs_payload();
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_begin();
 
-   if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
-      emit_nir_code();
-   } else {
-      foreach_in_list(ir_instruction, ir, shader->base.ir) {
-         base_ir = ir;
-         this->result = reg_undef;
-         ir->accept(this);
-      }
-      base_ir = NULL;
-   }
+   emit_nir_code();
 
    if (failed)
       return false;
 
-   emit_urb_writes();
+   emit_urb_writes(clip_planes);
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_end();
 
    calculate_cfg();
@@ -4141,7 +3856,7 @@ fs_visitor::run_vs()
 }
 
 bool
-fs_visitor::run_fs()
+fs_visitor::run_fs(bool do_rep_send)
 {
    brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
    brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
@@ -4160,10 +3875,11 @@ fs_visitor::run_fs()
 
    if (0) {
       emit_dummy_fs();
-   } else if (brw->use_rep_send && dispatch_width == 16) {
+   } else if (do_rep_send) {
+      assert(dispatch_width == 16);
       emit_repclear_shader();
    } else {
-      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      if (shader_time_index >= 0)
          emit_shader_time_begin();
 
       calculate_urb_setup();
@@ -4178,37 +3894,27 @@ fs_visitor::run_fs()
        * Initialize it with the dispatched pixels.
        */
       if (wm_prog_data->uses_kill) {
-         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+         fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
          discard_init->flag_subreg = 1;
       }
 
       /* Generate FS IR for main().  (the visitor only descends into
        * functions called "main").
        */
-      if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
-         emit_nir_code();
-      } else if (shader) {
-         foreach_in_list(ir_instruction, ir, shader->base.ir) {
-            base_ir = ir;
-            this->result = reg_undef;
-            ir->accept(this);
-         }
-      } else {
-         emit_fragment_program_code();
-      }
-      base_ir = NULL;
+      emit_nir_code();
+
       if (failed)
 	 return false;
 
       if (wm_prog_data->uses_kill)
-         emit(FS_OPCODE_PLACEHOLDER_HALT);
+         bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
 
       if (wm_key->alpha_test_func)
          emit_alpha_test();
 
       emit_fb_writes();
 
-      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      if (shader_time_index >= 0)
          emit_shader_time_end();
 
       calculate_cfg();
@@ -4252,7 +3958,7 @@ fs_visitor::run_cs()
 
    setup_cs_payload();
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_begin();
 
    emit_nir_code();
@@ -4262,7 +3968,7 @@ fs_visitor::run_cs()
 
    emit_cs_terminate();
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_end();
 
    calculate_cfg();
@@ -4312,10 +4018,18 @@ brw_wm_fs_emit(struct brw_context *brw,
    if (unlikely(INTEL_DEBUG & DEBUG_WM))
       brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
 
+   int st_index8 = -1, st_index16 = -1;
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME) {
+      st_index8 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS8);
+      st_index16 = brw_get_shader_time_index(brw, prog, &fp->Base, ST_FS16);
+   }
+
    /* Now the main event: Visit the shader IR and generate our FS IR for it.
     */
-   fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
-   if (!v.run_fs()) {
+   fs_visitor v(brw->intelScreen->compiler, brw,
+                mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
+                prog, &fp->Base, 8, st_index8);
+   if (!v.run_fs(false /* do_rep_send */)) {
       if (prog) {
          prog->LinkStatus = false;
          ralloc_strcat(&prog->InfoLog, v.fail_msg);
@@ -4328,20 +4042,18 @@ brw_wm_fs_emit(struct brw_context *brw,
    }
 
    cfg_t *simd16_cfg = NULL;
-   fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
+   fs_visitor v2(brw->intelScreen->compiler, brw,
+                 mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
+                 prog, &fp->Base, 16, st_index16);
    if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
       if (!v.simd16_unsupported) {
          /* Try a SIMD16 compile */
          v2.import_uniforms(&v);
-         if (!v2.run_fs()) {
-            perf_debug("SIMD16 shader failed to compile, falling back to "
-                       "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
+         if (!v2.run_fs(brw->use_rep_send)) {
+            perf_debug("SIMD16 shader failed to compile: %s", v2.fail_msg);
          } else {
             simd16_cfg = v2.cfg;
          }
-      } else {
-         perf_debug("SIMD16 shader unsupported, falling back to "
-                    "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
       }
    }
 
@@ -4355,7 +4067,8 @@ brw_wm_fs_emit(struct brw_context *brw,
       prog_data->no_8 = false;
    }
 
-   fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
+   fs_generator g(brw->intelScreen->compiler, brw,
+                  mem_ctx, (void *) key, &prog_data->base,
                   &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");
 
    if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 1d7de2e..243baf6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -29,6 +29,7 @@
 
 #include "brw_shader.h"
 #include "brw_ir_fs.h"
+#include "brw_fs_builder.h"
 
 extern "C" {
 
@@ -66,138 +67,44 @@ namespace brw {
  *
  * Translates either GLSL IR or Mesa IR (for ARB_fragment_program) into FS IR.
  */
-class fs_visitor : public backend_visitor
+class fs_visitor : public backend_shader
 {
 public:
-   const fs_reg reg_null_f;
-   const fs_reg reg_null_d;
-   const fs_reg reg_null_ud;
-
-   fs_visitor(struct brw_context *brw,
+   fs_visitor(const struct brw_compiler *compiler, void *log_data,
               void *mem_ctx,
-              const struct brw_wm_prog_key *key,
-              struct brw_wm_prog_data *prog_data,
+              gl_shader_stage stage,
+              const void *key,
+              struct brw_stage_prog_data *prog_data,
               struct gl_shader_program *shader_prog,
-              struct gl_fragment_program *fp,
-              unsigned dispatch_width);
-
-   fs_visitor(struct brw_context *brw,
-              void *mem_ctx,
-              const struct brw_vs_prog_key *key,
-              struct brw_vs_prog_data *prog_data,
-              struct gl_shader_program *shader_prog,
-              struct gl_vertex_program *cp,
-              unsigned dispatch_width);
-
-   fs_visitor(struct brw_context *brw,
-              void *mem_ctx,
-              const struct brw_cs_prog_key *key,
-              struct brw_cs_prog_data *prog_data,
-              struct gl_shader_program *shader_prog,
-              struct gl_compute_program *cp,
-              unsigned dispatch_width);
+              struct gl_program *prog,
+              unsigned dispatch_width,
+              int shader_time_index);
 
    ~fs_visitor();
-   void init();
 
-   fs_reg *variable_storage(ir_variable *var);
    fs_reg vgrf(const glsl_type *const type);
-   fs_reg vgrf(int num_components);
    void import_uniforms(fs_visitor *v);
-   void setup_uniform_clipplane_values();
-   void compute_clip_distance();
-
-   void visit(ir_variable *ir);
-   void visit(ir_assignment *ir);
-   void visit(ir_dereference_variable *ir);
-   void visit(ir_dereference_record *ir);
-   void visit(ir_dereference_array *ir);
-   void visit(ir_expression *ir);
-   void visit(ir_texture *ir);
-   void visit(ir_if *ir);
-   void visit(ir_constant *ir);
-   void visit(ir_swizzle *ir);
-   void visit(ir_return *ir);
-   void visit(ir_loop *ir);
-   void visit(ir_loop_jump *ir);
-   void visit(ir_discard *ir);
-   void visit(ir_call *ir);
-   void visit(ir_function *ir);
-   void visit(ir_function_signature *ir);
-   void visit(ir_emit_vertex *);
-   void visit(ir_end_primitive *);
+   void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
+   void compute_clip_distance(gl_clip_plane *clip_planes);
 
    uint32_t gather_channel(int orig_chan, uint32_t sampler);
    void swizzle_result(ir_texture_opcode op, int dest_components,
                        fs_reg orig_val, uint32_t sampler);
 
-   fs_inst *emit(fs_inst *inst);
-   void emit(exec_list list);
-
-   fs_inst *emit(enum opcode opcode);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
-                 const fs_reg &src1);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst,
-                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
-   fs_inst *emit(enum opcode opcode, const fs_reg &dst,
-                 fs_reg src[], int sources);
-
-   fs_inst *MOV(const fs_reg &dst, const fs_reg &src);
-   fs_inst *NOT(const fs_reg &dst, const fs_reg &src);
-   fs_inst *RNDD(const fs_reg &dst, const fs_reg &src);
-   fs_inst *RNDE(const fs_reg &dst, const fs_reg &src);
-   fs_inst *RNDZ(const fs_reg &dst, const fs_reg &src);
-   fs_inst *FRC(const fs_reg &dst, const fs_reg &src);
-   fs_inst *ADD(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *MUL(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *MACH(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *MAC(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SHL(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SHR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *ASR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *AND(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *OR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *XOR(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *IF(enum brw_predicate predicate);
-   fs_inst *IF(const fs_reg &src0, const fs_reg &src1,
-               enum brw_conditional_mod condition);
-   fs_inst *CMP(fs_reg dst, fs_reg src0, fs_reg src1,
-                enum brw_conditional_mod condition);
-   fs_inst *LRP(const fs_reg &dst, const fs_reg &a, const fs_reg &y,
-                const fs_reg &x);
-   fs_inst *DEP_RESOLVE_MOV(int grf);
-   fs_inst *BFREV(const fs_reg &dst, const fs_reg &value);
-   fs_inst *BFE(const fs_reg &dst, const fs_reg &bits, const fs_reg &offset,
-                const fs_reg &value);
-   fs_inst *BFI1(const fs_reg &dst, const fs_reg &bits, const fs_reg &offset);
-   fs_inst *BFI2(const fs_reg &dst, const fs_reg &bfi1_dst,
-                 const fs_reg &insert, const fs_reg &base);
-   fs_inst *FBH(const fs_reg &dst, const fs_reg &value);
-   fs_inst *FBL(const fs_reg &dst, const fs_reg &value);
-   fs_inst *CBIT(const fs_reg &dst, const fs_reg &value);
-   fs_inst *MAD(const fs_reg &dst, const fs_reg &c, const fs_reg &b,
-                const fs_reg &a);
-   fs_inst *ADDC(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SUBB(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-   fs_inst *SEL(const fs_reg &dst, const fs_reg &src0, const fs_reg &src1);
-
    int type_size(const struct glsl_type *type);
    fs_inst *get_instruction_generating_reg(fs_inst *start,
 					   fs_inst *end,
 					   const fs_reg &reg);
 
-   fs_inst *LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
-                         int header_size);
-
-   exec_list VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
-                                        const fs_reg &surf_index,
-                                        const fs_reg &varying_offset,
-                                        uint32_t const_offset);
+   void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
+                                   const fs_reg &dst,
+                                   const fs_reg &surf_index,
+                                   const fs_reg &varying_offset,
+                                   uint32_t const_offset);
+   void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
 
-   bool run_fs();
-   bool run_vs();
+   bool run_fs(bool do_rep_send);
+   bool run_vs(gl_clip_plane *clip_planes);
    bool run_cs();
    void optimize();
    void allocate_registers();
@@ -213,11 +120,8 @@ public:
    void assign_vs_urb_setup();
    bool assign_regs(bool allow_spilling);
    void assign_regs_trivial();
-   void get_used_mrfs(bool *mrf_used);
    void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
                                    int first_payload_node);
-   void setup_mrf_hack_interference(struct ra_graph *g,
-                                    int first_mrf_hack_node);
    int choose_spill_reg(struct ra_graph *g);
    void spill_reg(int spill_reg);
    void split_virtual_grfs();
@@ -254,9 +158,10 @@ public:
                                                      fs_inst *inst);
    void vfail(const char *msg, va_list args);
    void fail(const char *msg, ...);
-   void no16(const char *msg, ...);
+   void no16(const char *msg);
    void lower_uniform_pull_constant_loads();
    bool lower_load_payload();
+   bool lower_integer_multiplication();
    bool opt_combine_constants();
 
    void emit_dummy_fs();
@@ -318,58 +223,18 @@ public:
    fs_reg emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler);
    void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
    void resolve_source_modifiers(fs_reg *src);
-   fs_reg fix_math_operand(fs_reg src);
-   fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0);
-   fs_inst *emit_math(enum opcode op, fs_reg dst, fs_reg src0, fs_reg src1);
-   fs_inst *emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
-                     const fs_reg &a);
-   void emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
-                    const fs_reg &src0, const fs_reg &src1);
    void emit_discard_jump();
-   /** Copy any live channel from \p src to the first channel of \p dst. */
-   void emit_uniformize(const fs_reg &dst, const fs_reg &src);
-   bool try_emit_b2f_of_comparison(ir_expression *ir);
-   bool try_emit_saturate(ir_expression *ir);
-   bool try_emit_line(ir_expression *ir);
-   bool try_emit_mad(ir_expression *ir);
    bool try_replace_with_sel();
-   bool try_opt_frontfacing_ternary(ir_if *ir);
    bool opt_peephole_sel();
    bool opt_peephole_predicated_break();
    bool opt_saturate_propagation();
    bool opt_cmod_propagation();
    bool opt_zero_samples();
-   void emit_bool_to_cond_code(ir_rvalue *condition);
-   void emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3]);
-   void emit_if_gen6(ir_if *ir);
    void emit_unspill(bblock_t *block, fs_inst *inst, fs_reg reg,
                      uint32_t spill_offset, int count);
    void emit_spill(bblock_t *block, fs_inst *inst, fs_reg reg,
                    uint32_t spill_offset, int count);
 
-   void emit_fragment_program_code();
-   void setup_fp_regs();
-   fs_reg get_fp_src_reg(const prog_src_register *src);
-   fs_reg get_fp_dst_reg(const prog_dst_register *dst);
-   void emit_fp_alu1(enum opcode opcode,
-                     const struct prog_instruction *fpi,
-                     fs_reg dst, fs_reg src);
-   void emit_fp_alu2(enum opcode opcode,
-                     const struct prog_instruction *fpi,
-                     fs_reg dst, fs_reg src0, fs_reg src1);
-   void emit_fp_scalar_write(const struct prog_instruction *fpi,
-                             fs_reg dst, fs_reg src);
-   void emit_fp_scalar_math(enum opcode opcode,
-                            const struct prog_instruction *fpi,
-                            fs_reg dst, fs_reg src);
-
-   void emit_fp_minmax(const struct prog_instruction *fpi,
-                       fs_reg dst, fs_reg src0, fs_reg src1);
-
-   void emit_fp_sop(enum brw_conditional_mod conditional_mod,
-                    const struct prog_instruction *fpi,
-                    fs_reg dst, fs_reg src0, fs_reg src1, fs_reg one);
-
    void emit_nir_code();
    void nir_setup_inputs(nir_shader *shader);
    void nir_setup_outputs(nir_shader *shader);
@@ -383,13 +248,17 @@ public:
    void nir_emit_loop(nir_loop *loop);
    void nir_emit_block(nir_block *block);
    void nir_emit_instr(nir_instr *instr);
-   void nir_emit_alu(nir_alu_instr *instr);
-   void nir_emit_intrinsic(nir_intrinsic_instr *instr);
-   void nir_emit_texture(nir_tex_instr *instr);
-   void nir_emit_jump(nir_jump_instr *instr);
+   void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr);
+   void nir_emit_intrinsic(const brw::fs_builder &bld,
+                           nir_intrinsic_instr *instr);
+   void nir_emit_texture(const brw::fs_builder &bld,
+                         nir_tex_instr *instr);
+   void nir_emit_jump(const brw::fs_builder &bld,
+                      nir_jump_instr *instr);
    fs_reg get_nir_src(nir_src src);
    fs_reg get_nir_dest(nir_dest dest);
-   void emit_percomp(fs_inst *inst, unsigned wr_mask);
+   void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
+                     unsigned wr_mask);
 
    bool optimize_frontfacing_ternary(nir_alu_instr *instr,
                                      const fs_reg &result);
@@ -397,16 +266,21 @@ public:
    void setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
                             unsigned exec_size, bool use_2nd_half);
    void emit_alpha_test();
-   fs_inst *emit_single_fb_write(fs_reg color1, fs_reg color2,
+   fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
+                                 fs_reg color1, fs_reg color2,
                                  fs_reg src0_alpha, unsigned components,
                                  unsigned exec_size, bool use_2nd_half = false);
    void emit_fb_writes();
-   void emit_urb_writes();
+   void emit_urb_writes(gl_clip_plane *clip_planes);
    void emit_cs_terminate();
 
+   void emit_barrier();
+
    void emit_shader_time_begin();
    void emit_shader_time_end();
-   fs_inst *SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value);
+   void SHADER_TIME_ADD(const brw::fs_builder &bld,
+                        int shader_time_subindex,
+                        fs_reg value);
 
    void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                             fs_reg dst, fs_reg offset, fs_reg src0,
@@ -415,23 +289,9 @@ public:
    void emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
                                   fs_reg offset);
 
-   void emit_interpolate_expression(ir_expression *ir);
-
-   bool try_rewrite_rhs_to_dst(ir_assignment *ir,
-			       fs_reg dst,
-			       fs_reg src,
-			       fs_inst *pre_rhs_inst,
-			       fs_inst *last_rhs_inst);
-   void emit_assignment_writes(fs_reg &l, fs_reg &r,
-			       const glsl_type *type, bool predicated);
-   void resolve_ud_negate(fs_reg *reg);
-   void resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg);
-
-   fs_reg get_timestamp(fs_inst **out_mov);
+   fs_reg get_timestamp(const brw::fs_builder &bld);
 
    struct brw_reg interp_reg(int location, int channel);
-   void setup_uniform_values(ir_variable *ir);
-   void setup_builtin_uniform_values(ir_variable *ir);
    int implied_mrf_writes(fs_inst *inst);
 
    virtual void dump_instructions();
@@ -439,8 +299,6 @@ public:
    void dump_instruction(backend_instruction *inst);
    void dump_instruction(backend_instruction *inst, FILE *file);
 
-   void visit_atomic_counter_intrinsic(ir_call *ir);
-
    const void *const key;
    const struct brw_sampler_prog_key_data *key_tex;
 
@@ -476,7 +334,6 @@ public:
     */
    int *push_constant_loc;
 
-   struct hash_table *variable_ht;
    fs_reg frag_depth;
    fs_reg sample_mask;
    fs_reg outputs[VARYING_SLOT_MAX];
@@ -487,26 +344,18 @@ public:
    /** Either BRW_MAX_GRF or GEN7_MRF_HACK_START */
    unsigned max_grf;
 
-   fs_reg *fp_temp_regs;
-   fs_reg *fp_input_regs;
-
    fs_reg *nir_locals;
    fs_reg *nir_globals;
    fs_reg nir_inputs;
    fs_reg nir_outputs;
    fs_reg *nir_system_values;
 
-   /** @{ debug annotation info */
-   const char *current_annotation;
-   const void *base_ir;
-   /** @} */
-
    bool failed;
    char *fail_msg;
    bool simd16_unsupported;
    char *no16_msg;
 
-   /* Result of last visit() method. */
+   /* Result of last visit() method. Still used by emit_texture() */
    fs_reg result;
 
    /** Register numbers for thread payload fields. */
@@ -539,7 +388,10 @@ public:
 
    const unsigned dispatch_width; /**< 8 or 16 */
 
+   int shader_time_index;
+
    unsigned promoted_constants;
+   brw::fs_builder bld;
 };
 
 /**
@@ -550,7 +402,7 @@ public:
 class fs_generator
 {
 public:
-   fs_generator(struct brw_context *brw,
+   fs_generator(const struct brw_compiler *compiler, void *log_data,
                 void *mem_ctx,
                 const void *key,
                 struct brw_stage_prog_data *prog_data,
@@ -572,6 +424,7 @@ private:
    void generate_fb_write(fs_inst *inst, struct brw_reg payload);
    void generate_urb_write(fs_inst *inst, struct brw_reg payload);
    void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
+   void generate_barrier(fs_inst *inst, struct brw_reg src);
    void generate_blorp_fb_write(fs_inst *inst);
    void generate_linterp(fs_inst *inst, struct brw_reg dst,
 			 struct brw_reg *src);
@@ -644,7 +497,9 @@ private:
 
    bool patch_discard_jumps_to_fb_writes();
 
-   struct brw_context *brw;
+   const struct brw_compiler *compiler;
+   void *log_data; /* Passed to compiler->*_log functions */
+
    const struct brw_device_info *devinfo;
 
    struct brw_codegen *p;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_builder.h b/src/mesa/drivers/dri/i965/brw_fs_builder.h
new file mode 100644
index 0000000..58ac598
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_fs_builder.h
@@ -0,0 +1,652 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_FS_BUILDER_H
+#define BRW_FS_BUILDER_H
+
+#include "brw_ir_fs.h"
+#include "brw_shader.h"
+#include "brw_context.h"
+
+namespace brw {
+   /**
+    * Toolbox to assemble an FS IR program out of individual instructions.
+    *
+    * This object is meant to have an interface consistent with
+    * brw::vec4_builder.  They cannot be fully interchangeable because
+    * brw::fs_builder generates scalar code while brw::vec4_builder generates
+    * vector code.
+    */
+   class fs_builder {
+   public:
+      /** Type used in this IR to represent a source of an instruction. */
+      typedef fs_reg src_reg;
+
+      /** Type used in this IR to represent the destination of an instruction. */
+      typedef fs_reg dst_reg;
+
+      /** Type used in this IR to represent an instruction. */
+      typedef fs_inst instruction;
+
+      /**
+       * Build an fs_builder emitting into \p shader, where \p dispatch_width
+       * is the native execution width of the program.  The insertion point
+       * (block and cursor) starts out as NULL.
+       */
+      fs_builder(backend_shader *shader, unsigned dispatch_width) :
+         shader(shader),
+         block(NULL), cursor(NULL),
+         _dispatch_width(dispatch_width), _group(0),
+         force_writemask_all(false),
+         annotation()
+      {
+      }
+
+      /**
+       * Return a copy of this builder whose insertion point is moved to just
+       * before \p cursor in basic block \p block.  All other code generation
+       * parameters are carried over unchanged.
+       */
+      fs_builder
+      at(bblock_t *block, exec_node *cursor) const
+      {
+         fs_builder moved(*this);
+         moved.cursor = cursor;
+         moved.block = block;
+         return moved;
+      }
+
+      /**
+       * Return a copy of this builder that appends at the end of the shader's
+       * instruction list (cursor at its tail, no basic block specified).
+       */
+      fs_builder
+      at_end() const
+      {
+         exec_node *const tail = (exec_node *)&shader->instructions.tail;
+         return at(NULL, tail);
+      }
+
+      /**
+       * Return a copy of this builder restricted to a sub-group of channels.
+       *
+       * \p n becomes the SIMD width of the returned builder and must not
+       * exceed the current one.  \p i selects which group of \p n channels is
+       * used, counted in multiples of \p n relative to the current group, and
+       * must be lower than the number of such groups in the current width.
+       */
+      fs_builder
+      group(unsigned n, unsigned i) const
+      {
+         assert(n <= dispatch_width() && i < dispatch_width() / n);
+
+         fs_builder sub(*this);
+         sub._group += i * n;
+         sub._dispatch_width = n;
+         return sub;
+      }
+
+      /** Shorthand for group() selecting the \p i-th group of 8 channels. */
+      fs_builder
+      half(unsigned i) const
+      {
+         /* A "half" is always eight channels wide. */
+         const unsigned half_width = 8;
+         return group(half_width, i);
+      }
+
+      /**
+       * Return a copy of this builder that has per-channel control flow
+       * execution masking disabled if \p b is true; otherwise a plain copy.
+       */
+      fs_builder
+      exec_all(bool b = true) const
+      {
+         if (!b)
+            return *this;
+         fs_builder unmasked(*this);
+         unmasked.force_writemask_all = true;
+         return unmasked;
+      }
+
+      /**
+       * Return a copy of this builder with debug annotation \p str / \p ir.
+       */
+      fs_builder
+      annotate(const char *str, const void *ir = NULL) const
+      {
+         fs_builder tagged(*this);
+         tagged.annotation.ir = ir;
+         tagged.annotation.str = str;
+         return tagged;
+      }
+
+      /**
+       * Get the SIMD width in use by this builder (value of _dispatch_width).
+       */
+      unsigned
+      dispatch_width() const
+      {
+         return _dispatch_width;
+      }
+
+      /**
+       * Allocate a fresh virtual GRF of type \p type and of the SIMD width of
+       * this builder.  \p n is the number of dispatch_width()-wide components
+       * to make room for (one by default); the size requested from the
+       * allocator is rounded up to a whole number of REG_SIZE units.
+       */
+      dst_reg
+      vgrf(enum brw_reg_type type, unsigned n = 1) const
+      {
+         const unsigned width = dispatch_width();
+         return dst_reg(GRF,
+                        shader->alloc.allocate(
+                           DIV_ROUND_UP(n * type_sz(type) * width, REG_SIZE)),
+                        type, width);
+      }
+
+      /**
+       * Create a null register of floating type, dispatch_width() wide.
+       */
+      dst_reg
+      null_reg_f() const
+      {
+         const unsigned width = dispatch_width();
+         return dst_reg(retype(brw_null_vec(width), BRW_REGISTER_TYPE_F));
+      }
+
+      /**
+       * Create a null register of signed integer type, dispatch_width() wide.
+       */
+      dst_reg
+      null_reg_d() const
+      {
+         const unsigned width = dispatch_width();
+         return dst_reg(retype(brw_null_vec(width), BRW_REGISTER_TYPE_D));
+      }
+
+      /**
+       * Create a null register of unsigned integer type, dispatch_width() wide.
+       */
+      dst_reg
+      null_reg_ud() const
+      {
+         const unsigned width = dispatch_width();
+         return dst_reg(retype(brw_null_vec(width), BRW_REGISTER_TYPE_UD));
+      }
+
+      /**
+       * Get the register holding the mask of SIMD channels that were enabled
+       * by dispatch and have not been disabled by discard since.
+       */
+      src_reg
+      sample_mask_reg() const
+      {
+         /* Outside of fragment shaders a constant 0xffff is returned. */
+         if (shader->stage != MESA_SHADER_FRAGMENT)
+            return src_reg(0xffff);
+         if (((brw_wm_prog_data *)shader->stage_prog_data)->uses_kill)
+            return brw_flag_reg(0, 1);
+         return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
+      }
+
+      /**
+       * Insert a mem_ctx-allocated copy of \p inst into the program.
+       */
+      instruction *
+      emit(const instruction &inst) const
+      {
+         return emit(new(shader->mem_ctx) instruction(inst));
+      }
+
+      /**
+       * Create and insert a nullary (no dst, no sources) control instruction.
+       */
+      instruction *
+      emit(enum opcode opcode) const
+      {
+         return emit(instruction(opcode, dispatch_width()));
+      }
+
+      /**
+       * Create and insert a nullary instruction (destination, no sources).
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst) const
+      {
+         return emit(instruction(opcode, dst));
+      }
+
+      /**
+       * Create and insert a unary instruction, applying the math workarounds.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
+      {
+         switch (opcode) {
+         case SHADER_OPCODE_RCP:
+         case SHADER_OPCODE_RSQ:
+         case SHADER_OPCODE_SQRT:
+         case SHADER_OPCODE_EXP2:
+         case SHADER_OPCODE_LOG2:
+         case SHADER_OPCODE_SIN:
+         case SHADER_OPCODE_COS:
+            return fix_math_instruction(
+               emit(instruction(opcode, dst.width, dst,
+                                fix_math_operand(src0))));
+
+         default: /* Every other opcode is emitted without fixups. */
+            return emit(instruction(opcode, dst.width, dst, src0));
+         }
+      }
+
+      /**
+       * Create and insert a binary instruction, applying the math workarounds.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1) const
+      {
+         switch (opcode) {
+         case SHADER_OPCODE_POW:
+         case SHADER_OPCODE_INT_QUOTIENT:
+         case SHADER_OPCODE_INT_REMAINDER:
+            return fix_math_instruction(
+               emit(instruction(opcode, dst.width, dst,
+                                fix_math_operand(src0),
+                                fix_math_operand(src1))));
+
+         default: /* Every other opcode is emitted without fixups. */
+            return emit(instruction(opcode, dst.width, dst, src0, src1));
+
+         }
+      }
+
+      /**
+       * Create and insert a ternary instruction, fixing up 3-source operands.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1, const src_reg &src2) const
+      {
+         switch (opcode) {
+         case BRW_OPCODE_BFE:
+         case BRW_OPCODE_BFI2:
+         case BRW_OPCODE_MAD:
+         case BRW_OPCODE_LRP:
+            return emit(instruction(opcode, dst.width, dst,
+                                    fix_3src_operand(src0),
+                                    fix_3src_operand(src1),
+                                    fix_3src_operand(src2)));
+
+         default: /* Every other opcode is emitted without fixups. */
+            return emit(instruction(opcode, dst.width, dst, src0, src1, src2));
+         }
+      }
+
+      /**
+       * Insert a preallocated instruction before the cursor and return it.
+       */
+      instruction *
+      emit(instruction *inst) const
+      {
+         assert(inst->exec_size == dispatch_width() ||
+                force_writemask_all);
+         assert(_group == 0 || _group == 8); /* Only groups handled below. */
+
+         inst->force_sechalf = (_group == 8);
+         inst->force_writemask_all = force_writemask_all;
+         inst->annotation = annotation.str;
+         inst->ir = annotation.ir;
+
+         if (block) /* A non-NULL block is handed on to insert_before(). */
+            static_cast<instruction *>(cursor)->insert_before(block, inst);
+         else
+            cursor->insert_before(inst);
+
+         return inst;
+      }
+
+      /**
+       * Select \p src0 if the comparison of both sources with the given
+       * conditional mod evaluates to true, otherwise select \p src1.  Gen6+
+       * uses a single SEL with \p mod; older gens emit CMP + predicated SEL.
+       * Generally useful to get the minimum or maximum of two values.
+       */
+      void
+      emit_minmax(const dst_reg &dst, const src_reg &src0,
+                  const src_reg &src1, brw_conditional_mod mod) const
+      {
+         if (shader->devinfo->gen >= 6) {
+            set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
+                                 fix_unsigned_negate(src1)));
+         } else {
+            CMP(null_reg_d(), src0, src1, mod);
+            set_predicate(BRW_PREDICATE_NORMAL,
+                          SEL(dst, src0, src1));
+         }
+      }
+
+      /**
+       * Copy any live channel from \p src to the first channel of \p dst.
+       */
+      void
+      emit_uniformize(const dst_reg &dst, const src_reg &src) const
+      {
+         const fs_builder ubld = exec_all(); /* force_writemask_all builder */
+         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
+
+         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0));
+         ubld.emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
+                   src, component(chan_index, 0));
+      }
+
+      /**
+       * Assorted arithmetic ops, one method per BRW_OPCODE_<op> listed below.
+       * @{
+       */
+#define ALU1(op)                                        \
+      instruction *                                     \
+      op(const dst_reg &dst, const src_reg &src0) const \
+      {                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0);       \
+      }
+
+#define ALU2(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
+      }
+
+#define ALU2_ACC(op)                                                    \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
+         inst->writes_accumulator = true;                               \
+         return inst;                                                   \
+      }
+
+#define ALU3(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
+         const src_reg &src2) const                                     \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
+      }
+
+      ALU2(ADD)
+      ALU2_ACC(ADDC)
+      ALU2(AND)
+      ALU2(ASR)
+      ALU2(AVG)
+      ALU3(BFE)
+      ALU2(BFI1)
+      ALU3(BFI2)
+      ALU1(BFREV)
+      ALU1(CBIT)
+      ALU2(CMPN)
+      ALU3(CSEL)
+      ALU2(DP2)
+      ALU2(DP3)
+      ALU2(DP4)
+      ALU2(DPH)
+      ALU1(F16TO32)
+      ALU1(F32TO16)
+      ALU1(FBH)
+      ALU1(FBL)
+      ALU1(FRC)
+      ALU2(LINE)
+      ALU1(LZD)
+      ALU2(MAC)
+      ALU2_ACC(MACH)
+      ALU3(MAD)
+      ALU1(MOV)
+      ALU2(MUL)
+      ALU1(NOT)
+      ALU2(OR)
+      ALU2(PLN)
+      ALU1(RNDD)
+      ALU1(RNDE)
+      ALU1(RNDU)
+      ALU1(RNDZ)
+      ALU2(SAD2)
+      ALU2_ACC(SADA2)
+      ALU2(SEL)
+      ALU2(SHL)
+      ALU2(SHR)
+      ALU2_ACC(SUBB)
+      ALU2(XOR)
+
+#undef ALU3
+#undef ALU2_ACC
+#undef ALU2
+#undef ALU1
+      /** @} */
+
+      /**
+       * CMP: Sets the low bit of the destination channels with the result of
+       * comparing \p src0 and \p src1 under \p condition (the upper bits are
+       * undefined) and updates the flag register with the packed 16 result bits.
+       */
+      instruction *
+      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
+          brw_conditional_mod condition) const
+      {
+         /* Take the instruction:
+          *
+          * CMP null<d> src0<f> src1<f>
+          *
+          * Original gen4 does type conversion to the destination type
+          * before comparison, producing garbage results for floating
+          * point comparisons.
+          *
+          * The destination type doesn't matter on newer generations,
+          * so we set the type to match src0 so we can compact the
+          * instruction.
+          */
+         return set_condmod(condition,
+                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
+                                 fix_unsigned_negate(src0),
+                                 fix_unsigned_negate(src1)));
+      }
+
+      /**
+       * Gen4 predicated IF: a BRW_OPCODE_IF predicated on \p predicate.
+       */
+      instruction *
+      IF(brw_predicate predicate) const
+      {
+         return set_predicate(predicate, emit(BRW_OPCODE_IF));
+      }
+
+      /**
+       * Emit a linear interpolation between \p x and \p y with weight \p a,
+       * i.e. x*(1-a) + y*a, into \p dst.
+       */
+      instruction *
+      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
+          const src_reg &a) const
+      {
+         if (shader->devinfo->gen < 6) {
+            /* No LRP instruction to use: expand to x*(1-a) + y*a by hand. */
+            const dst_reg ya = vgrf(dst.type);
+            const dst_reg inv_a = vgrf(dst.type);
+            const dst_reg x_inv_a = vgrf(dst.type);
+
+            MUL(ya, y, a);
+            ADD(inv_a, negate(a), src_reg(1.0f));
+            MUL(x_inv_a, x, src_reg(inv_a));
+            return ADD(dst, src_reg(x_inv_a), src_reg(ya));
+         }
+
+         /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
+          * the operands are passed in (a, y, x) order.
+          */
+         return emit(BRW_OPCODE_LRP, dst, a, y, x);
+      }
+
+      /**
+       * Collect the \p sources registers in \p src contiguously into \p dst.
+       */
+      instruction *
+      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
+                   unsigned sources, unsigned header_size) const
+      {
+         assert(dst.width % 8 == 0);
+         instruction *inst = emit(instruction(SHADER_OPCODE_LOAD_PAYLOAD,
+                                              dst.width, dst, src, sources));
+         inst->header_size = header_size;
+
+         for (unsigned i = 0; i < header_size; i++)
+            assert(src[i].file != GRF ||
+                   src[i].width * type_sz(src[i].type) == 32);
+         inst->regs_written = header_size; /* One per header source. */
+
+         for (unsigned i = header_size; i < sources; ++i)
+            assert(src[i].file != GRF ||
+                   src[i].width == dst.width);
+         inst->regs_written += (sources - header_size) * (dst.width / 8);
+
+         return inst;
+      }
+
+      backend_shader *shader;
+
+   private:
+      /**
+       * Workaround for negation of UD registers: the negated value is first
+       * MOVed to a temporary.  See fs_generator::generate_code() for details.
+       */
+      src_reg
+      fix_unsigned_negate(const src_reg &src) const
+      {
+         /* Only a negated UD operand needs the detour through a temporary;
+          * everything else is handed back unchanged.
+          */
+         if (src.type != BRW_REGISTER_TYPE_UD || !src.negate)
+            return src;
+         const dst_reg plain = vgrf(BRW_REGISTER_TYPE_UD);
+         MOV(plain, src);
+         return src_reg(plain);
+      }
+
+      /**
+       * Make \p src usable as an operand of a three-source instruction, going
+       * through a temporary when its register mode can't be encoded directly.
+       */
+      src_reg
+      fix_3src_operand(const src_reg &src) const
+      {
+         /* GRF, UNIFORM and stride > 1 sources are passed through untouched. */
+         if (src.file == GRF || src.file == UNIFORM || src.stride > 1)
+            return src;
+
+         const dst_reg copy = vgrf(src.type);
+         MOV(copy, src);
+         return copy;
+      }
+
+      /**
+       * Workaround for source register modes not supported by the math
+       * instruction: offending sources are MOVed to a temporary first.
+       */
+      src_reg
+      fix_math_operand(const src_reg &src) const
+      {
+         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
+          * might be able to do better by doing execsize = 1 math and then
+          * expanding that result out, but we would need to be careful with
+          * masking.
+          *
+          * Gen6 hardware ignores source modifiers (negate and abs) on math
+          * instructions, so we also move to a temp to set those up.
+          *
+          * Gen7 relaxes most of the above restrictions, but still can't use IMM
+          * operands to math.
+          */
+         if ((shader->devinfo->gen == 6 &&
+              (src.file == IMM || src.file == UNIFORM ||
+               src.abs || src.negate)) ||
+             (shader->devinfo->gen == 7 && src.file == IMM)) {
+            const dst_reg tmp = vgrf(src.type);
+            MOV(tmp, src);
+            return tmp;
+         } else {
+            return src;
+         }
+      }
+
+      /**
+       * Work around other quirks of the math instruction (gen < 6 only).
+       */
+      instruction *
+      fix_math_instruction(instruction *inst) const
+      {
+         if (shader->devinfo->gen < 6) {
+            inst->base_mrf = 2;
+            inst->mlen = inst->sources * dispatch_width() / 8;
+
+            if (inst->sources > 1) {
+               /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
+                * "Message Payload":
+                *
+                * "Operand0[7].  For the INT DIV functions, this operand is the
+                *  denominator."
+                *  ...
+                * "Operand1[7].  For the INT DIV functions, this operand is the
+                *  numerator."
+                */
+               const bool is_int_div = inst->opcode != SHADER_OPCODE_POW;
+               const fs_reg src0 = is_int_div ? inst->src[1] : inst->src[0];
+               const fs_reg src1 = is_int_div ? inst->src[0] : inst->src[1];
+
+               inst->resize_sources(1); /* src1 is MOVed to an MRF below. */
+               inst->src[0] = src0;
+
+               at(block, inst).MOV(fs_reg(MRF, inst->base_mrf + 1, src1.type,
+                                          dispatch_width()), src1);
+            }
+         }
+
+         return inst;
+      }
+
+      bblock_t *block; /**< Handed to insert_before() by emit() if non-NULL. */
+      exec_node *cursor; /**< emit() inserts new instructions before this. */
+
+      unsigned _dispatch_width; /**< Returned by dispatch_width(). */
+      unsigned _group; /**< 0 or 8; 8 makes emit() set force_sechalf. */
+      bool force_writemask_all; /**< Copied to every emitted instruction. */
+
+      /** Debug annotation info. */
+      struct {
+         const char *str; /**< Copied to inst->annotation by emit(). */
+         const void *ir; /**< Copied to inst->ir by emit(). */
+      } annotation;
+   };
+}
+
+#endif
diff --git a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
index aa62031..0af5a91 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_combine_constants.cpp
@@ -38,6 +38,8 @@
 #include "brw_fs_live_variables.h"
 #include "brw_cfg.h"
 
+using namespace brw;
+
 /* Returns whether an instruction could co-issue if its immediate source were
  * replaced with a GRF source.
  */
@@ -270,15 +272,14 @@ fs_visitor::opt_combine_constants()
    reg.stride = 0;
    for (int i = 0; i < table.len; i++) {
       struct imm *imm = &table.imm[i];
-
-      fs_inst *mov = MOV(reg, fs_reg(imm->val));
-      mov->force_writemask_all = true;
-      if (imm->inst) {
-         imm->inst->insert_before(imm->block, mov);
-      } else {
-         backend_instruction *inst = imm->block->last_non_control_flow_inst();
-         inst->insert_after(imm->block, mov);
-      }
+      /* Insert it either before the instruction that generated the immediate
+       * or after the last non-control flow instruction of the common ancestor.
+       */
+      exec_node *n = (imm->inst ? imm->inst :
+                      imm->block->last_non_control_flow_inst()->next);
+      const fs_builder ibld = bld.at(imm->block, n).exec_all();
+
+      ibld.MOV(reg, fs_reg(imm->val));
       imm->reg = reg.reg;
       imm->subreg_offset = reg.subreg_offset;
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
index 52bfa92..c92aae4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp
@@ -541,8 +541,16 @@ fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
             /* Fit this constant in by commuting the operands.
              * Exception: we can't do this for 32-bit integer MUL/MACH
              * because it's asymmetric.
+             *
+             * The BSpec says for Broadwell that
+             *
+             *    "When multiplying DW x DW, the dst cannot be accumulator."
+             *
+             * Integer MUL with a non-accumulator destination will be lowered
+             * by lower_integer_multiplication(), so don't restrict it.
              */
-            if ((inst->opcode == BRW_OPCODE_MUL ||
+            if (((inst->opcode == BRW_OPCODE_MUL &&
+                  inst->dst.is_accumulator()) ||
                  inst->opcode == BRW_OPCODE_MACH) &&
                 (inst->src[1].type == BRW_REGISTER_TYPE_D ||
                  inst->src[1].type == BRW_REGISTER_TYPE_UD))
diff --git a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
index db01f8c..70f0217 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_cse.cpp
@@ -32,6 +32,8 @@
  * 13.1 (p378).
  */
 
+using namespace brw;
+
 namespace {
 struct aeb_entry : public exec_node {
    /** The instruction that generates the expression value. */
@@ -152,28 +154,34 @@ static bool
 instructions_match(fs_inst *a, fs_inst *b, bool *negate)
 {
    return a->opcode == b->opcode &&
+          a->force_writemask_all == b->force_writemask_all &&
+          a->exec_size == b->exec_size &&
+          a->force_sechalf == b->force_sechalf &&
           a->saturate == b->saturate &&
           a->predicate == b->predicate &&
           a->predicate_inverse == b->predicate_inverse &&
           a->conditional_mod == b->conditional_mod &&
+          a->flag_subreg == b->flag_subreg &&
           a->dst.type == b->dst.type &&
+          a->offset == b->offset &&
+          a->mlen == b->mlen &&
+          a->regs_written == b->regs_written &&
+          a->base_mrf == b->base_mrf &&
+          a->eot == b->eot &&
+          a->header_size == b->header_size &&
+          a->shadow_compare == b->shadow_compare &&
+          a->pi_noperspective == b->pi_noperspective &&
           a->sources == b->sources &&
-          (a->is_tex() ? (a->offset == b->offset &&
-                          a->mlen == b->mlen &&
-                          a->regs_written == b->regs_written &&
-                          a->base_mrf == b->base_mrf &&
-                          a->eot == b->eot &&
-                          a->header_size == b->header_size &&
-                          a->shadow_compare == b->shadow_compare)
-                       : true) &&
           operands_match(a, b, negate);
 }
 
-static fs_inst *
-create_copy_instr(fs_visitor *v, fs_inst *inst, fs_reg src, bool negate)
+static void
+create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
 {
    int written = inst->regs_written;
    int dst_width = inst->dst.width / 8;
+   const fs_builder ubld = bld.group(inst->exec_size, inst->force_sechalf)
+                              .exec_all(inst->force_writemask_all);
    fs_inst *copy;
 
    if (written > dst_width) {
@@ -189,7 +197,7 @@ create_copy_instr(fs_visitor *v, fs_inst *inst, fs_reg src, bool negate)
       }
 
       assert(src.file == GRF);
-      payload = ralloc_array(v->mem_ctx, fs_reg, sources);
+      payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
       for (int i = 0; i < header_size; i++) {
          payload[i] = src;
          payload[i].width = 8;
@@ -199,15 +207,12 @@ create_copy_instr(fs_visitor *v, fs_inst *inst, fs_reg src, bool negate)
          payload[i] = src;
          src = offset(src, 1);
       }
-      copy = v->LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
+      copy = ubld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
    } else {
-      copy = v->MOV(inst->dst, src);
-      copy->force_writemask_all = inst->force_writemask_all;
+      copy = ubld.MOV(inst->dst, src);
       copy->src[0].negate = negate;
    }
    assert(copy->regs_written == written);
-
-   return copy;
 }
 
 bool
@@ -261,9 +266,8 @@ fs_visitor::opt_cse_local(bblock_t *block)
                                    entry->generator->dst.type,
                                    entry->generator->dst.width);
 
-               fs_inst *copy = create_copy_instr(this, entry->generator,
-                                                 entry->tmp, false);
-               entry->generator->insert_after(block, copy);
+               create_copy_instr(bld.at(block, entry->generator->next),
+                                 entry->generator, entry->tmp, false);
 
                entry->generator->dst = entry->tmp;
             }
@@ -274,9 +278,7 @@ fs_visitor::opt_cse_local(bblock_t *block)
                assert(inst->dst.width == entry->generator->dst.width);
                assert(inst->dst.type == entry->tmp.type);
 
-               fs_inst *copy = create_copy_instr(this, inst,
-                                                 entry->tmp, negate);
-               inst->insert_before(block, copy);
+               create_copy_instr(bld.at(block, inst), inst, entry->tmp, negate);
             }
 
             /* Set our iterator so that next time through the loop inst->next
diff --git a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp b/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
deleted file mode 100644
index 6518ff6..0000000
--- a/src/mesa/drivers/dri/i965/brw_fs_fp.cpp
+++ /dev/null
@@ -1,742 +0,0 @@
-/*
- * Copyright © 2012 Intel Corporation
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
- * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
- * IN THE SOFTWARE.
- */
-
-/** @file brw_fs_fp.cpp
- *
- * Implementation of the compiler for GL_ARB_fragment_program shaders on top
- * of the GLSL compiler backend.
- */
-
-#include "brw_context.h"
-#include "brw_fs.h"
-
-void
-fs_visitor::emit_fp_alu1(enum opcode opcode,
-                         const struct prog_instruction *fpi,
-                         fs_reg dst, fs_reg src)
-{
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i))
-         emit(opcode, offset(dst, i), offset(src, i));
-   }
-}
-
-void
-fs_visitor::emit_fp_alu2(enum opcode opcode,
-                         const struct prog_instruction *fpi,
-                         fs_reg dst, fs_reg src0, fs_reg src1)
-{
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i))
-         emit(opcode, offset(dst, i),
-              offset(src0, i), offset(src1, i));
-   }
-}
-
-void
-fs_visitor::emit_fp_minmax(const prog_instruction *fpi,
-                           fs_reg dst, fs_reg src0, fs_reg src1)
-{
-   enum brw_conditional_mod conditionalmod;
-   if (fpi->Opcode == OPCODE_MIN)
-      conditionalmod = BRW_CONDITIONAL_L;
-   else
-      conditionalmod = BRW_CONDITIONAL_GE;
-
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i)) {
-         emit_minmax(conditionalmod, offset(dst, i),
-                     offset(src0, i), offset(src1, i));
-      }
-   }
-}
-
-void
-fs_visitor::emit_fp_sop(enum brw_conditional_mod conditional_mod,
-                        const struct prog_instruction *fpi,
-                        fs_reg dst, fs_reg src0, fs_reg src1,
-                        fs_reg one)
-{
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i)) {
-         fs_inst *inst;
-
-         emit(CMP(reg_null_d, offset(src0, i), offset(src1, i),
-                  conditional_mod));
-
-         inst = emit(BRW_OPCODE_SEL, offset(dst, i), one, fs_reg(0.0f));
-         inst->predicate = BRW_PREDICATE_NORMAL;
-      }
-   }
-}
-
-void
-fs_visitor::emit_fp_scalar_write(const struct prog_instruction *fpi,
-                                 fs_reg dst, fs_reg src)
-{
-   for (int i = 0; i < 4; i++) {
-      if (fpi->DstReg.WriteMask & (1 << i))
-         emit(MOV(offset(dst, i), src));
-   }
-}
-
-void
-fs_visitor::emit_fp_scalar_math(enum opcode opcode,
-                                const struct prog_instruction *fpi,
-                                fs_reg dst, fs_reg src)
-{
-   fs_reg temp = vgrf(glsl_type::float_type);
-   emit_math(opcode, temp, src);
-   emit_fp_scalar_write(fpi, dst, temp);
-}
-
-void
-fs_visitor::emit_fragment_program_code()
-{
-   setup_fp_regs();
-
-   /* Keep a reg with 1.0 around, for reuse by emit_fp_sop so that it can just
-    * be:
-    *
-    * sel.f0 dst 1.0 0.0
-    *
-    * instead of
-    *
-    * mov    dst 0.0
-    * mov.f0 dst 1.0
-    */
-   fs_reg one = vgrf(glsl_type::float_type);
-   emit(MOV(one, fs_reg(1.0f)));
-
-   for (unsigned int insn = 0; insn < prog->NumInstructions; insn++) {
-      const struct prog_instruction *fpi = &prog->Instructions[insn];
-      base_ir = fpi;
-
-      fs_reg dst;
-      fs_reg src[3];
-
-      /* We always emit into a temporary destination register to avoid
-       * aliasing issues.
-       */
-      dst = vgrf(glsl_type::vec4_type);
-
-      for (int i = 0; i < 3; i++)
-         src[i] = get_fp_src_reg(&fpi->SrcReg[i]);
-
-      switch (fpi->Opcode) {
-      case OPCODE_ABS:
-         src[0].abs = true;
-         src[0].negate = false;
-         emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_ADD:
-         emit_fp_alu2(BRW_OPCODE_ADD, fpi, dst, src[0], src[1]);
-         break;
-
-      case OPCODE_CMP:
-         for (int i = 0; i < 4; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               fs_inst *inst;
-
-               emit(CMP(reg_null_f, offset(src[0], i), fs_reg(0.0f),
-                        BRW_CONDITIONAL_L));
-
-               inst = emit(BRW_OPCODE_SEL, offset(dst, i),
-                           offset(src[1], i), offset(src[2], i));
-               inst->predicate = BRW_PREDICATE_NORMAL;
-            }
-         }
-         break;
-
-      case OPCODE_COS:
-         emit_fp_scalar_math(SHADER_OPCODE_COS, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_DP2:
-      case OPCODE_DP3:
-      case OPCODE_DP4:
-      case OPCODE_DPH: {
-         fs_reg mul = vgrf(glsl_type::float_type);
-         fs_reg acc = vgrf(glsl_type::float_type);
-         int count;
-
-         switch (fpi->Opcode) {
-         case OPCODE_DP2: count = 2; break;
-         case OPCODE_DP3: count = 3; break;
-         case OPCODE_DP4: count = 4; break;
-         case OPCODE_DPH: count = 3; break;
-         default: unreachable("not reached");
-         }
-
-         emit(MUL(acc, offset(src[0], 0), offset(src[1], 0)));
-         for (int i = 1; i < count; i++) {
-            emit(MUL(mul, offset(src[0], i), offset(src[1], i)));
-            emit(ADD(acc, acc, mul));
-         }
-
-         if (fpi->Opcode == OPCODE_DPH)
-            emit(ADD(acc, acc, offset(src[1], 3)));
-
-         emit_fp_scalar_write(fpi, dst, acc);
-         break;
-      }
-
-      case OPCODE_DST:
-         if (fpi->DstReg.WriteMask & WRITEMASK_X)
-            emit(MOV(dst, fs_reg(1.0f)));
-         if (fpi->DstReg.WriteMask & WRITEMASK_Y) {
-            emit(MUL(offset(dst, 1),
-                     offset(src[0], 1), offset(src[1], 1)));
-         }
-         if (fpi->DstReg.WriteMask & WRITEMASK_Z)
-            emit(MOV(offset(dst, 2), offset(src[0], 2)));
-         if (fpi->DstReg.WriteMask & WRITEMASK_W)
-            emit(MOV(offset(dst, 3), offset(src[1], 3)));
-         break;
-
-      case OPCODE_EX2:
-         emit_fp_scalar_math(SHADER_OPCODE_EXP2, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_FLR:
-         emit_fp_alu1(BRW_OPCODE_RNDD, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_FRC:
-         emit_fp_alu1(BRW_OPCODE_FRC, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_KIL: {
-         for (int i = 0; i < 4; i++) {
-            /* In most cases the argument to a KIL will be something like
-             * TEMP[0].wwww, so there's no point in checking whether .w is < 0
-             * 4 times in a row.
-             */
-            if (i > 0 &&
-                GET_SWZ(fpi->SrcReg[0].Swizzle, i) ==
-                GET_SWZ(fpi->SrcReg[0].Swizzle, i - 1) &&
-                ((fpi->SrcReg[0].Negate >> i) & 1) ==
-                ((fpi->SrcReg[0].Negate >> (i - 1)) & 1)) {
-               continue;
-            }
-
-
-            /* Emit an instruction that's predicated on the current
-             * undiscarded pixels, and updates just those pixels to be
-             * turned off.
-             */
-            fs_inst *cmp = emit(CMP(reg_null_f, offset(src[0], i),
-                                    fs_reg(0.0f), BRW_CONDITIONAL_GE));
-            cmp->predicate = BRW_PREDICATE_NORMAL;
-            cmp->flag_subreg = 1;
-
-            if (devinfo->gen >= 6)
-               emit_discard_jump();
-         }
-         break;
-      }
-
-      case OPCODE_LG2:
-         emit_fp_scalar_math(SHADER_OPCODE_LOG2, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_LIT:
-         /* From the ARB_fragment_program spec:
-          *
-          *      tmp = VectorLoad(op0);
-          *      if (tmp.x < 0) tmp.x = 0;
-          *      if (tmp.y < 0) tmp.y = 0;
-          *      if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
-          *      else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
-          *      result.x = 1.0;
-          *      result.y = tmp.x;
-          *      result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
-          *      result.w = 1.0;
-          *
-          * Note that we don't do the clamping to +/- 128.  We didn't in
-          * brw_wm_emit.c either.
-          */
-         if (fpi->DstReg.WriteMask & WRITEMASK_X)
-            emit(MOV(offset(dst, 0), fs_reg(1.0f)));
-
-         if (fpi->DstReg.WriteMask & WRITEMASK_YZ) {
-            fs_inst *inst;
-            emit(CMP(reg_null_f, offset(src[0], 0), fs_reg(0.0f),
-                     BRW_CONDITIONAL_LE));
-
-            if (fpi->DstReg.WriteMask & WRITEMASK_Y) {
-               emit(MOV(offset(dst, 1), offset(src[0], 0)));
-               inst = emit(MOV(offset(dst, 1), fs_reg(0.0f)));
-               inst->predicate = BRW_PREDICATE_NORMAL;
-            }
-
-            if (fpi->DstReg.WriteMask & WRITEMASK_Z) {
-               emit_math(SHADER_OPCODE_POW, offset(dst, 2),
-                         offset(src[0], 1), offset(src[0], 3));
-
-               inst = emit(MOV(offset(dst, 2), fs_reg(0.0f)));
-               inst->predicate = BRW_PREDICATE_NORMAL;
-            }
-         }
-
-         if (fpi->DstReg.WriteMask & WRITEMASK_W)
-            emit(MOV(offset(dst, 3), fs_reg(1.0f)));
-
-         break;
-
-      case OPCODE_LRP:
-         for (int i = 0; i < 4; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               fs_reg a = offset(src[0], i);
-               fs_reg y = offset(src[1], i);
-               fs_reg x = offset(src[2], i);
-               emit_lrp(offset(dst, i), x, y, a);
-            }
-         }
-         break;
-
-      case OPCODE_MAD:
-         for (int i = 0; i < 4; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               if (devinfo->gen >= 6) {
-                  emit(MAD(offset(dst, i), offset(src[2], i),
-                           offset(src[1], i), offset(src[0], i)));
-               } else {
-                  fs_reg temp = vgrf(glsl_type::float_type);
-                  emit(MUL(temp, offset(src[0], i), offset(src[1], i)));
-                  emit(ADD(offset(dst, i), temp, offset(src[2], i)));
-               }
-            }
-         }
-         break;
-
-      case OPCODE_MAX:
-         emit_fp_minmax(fpi, dst, src[0], src[1]);
-         break;
-
-      case OPCODE_MOV:
-         emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_MIN:
-         emit_fp_minmax(fpi, dst, src[0], src[1]);
-         break;
-
-      case OPCODE_MUL:
-         emit_fp_alu2(BRW_OPCODE_MUL, fpi, dst, src[0], src[1]);
-         break;
-
-      case OPCODE_POW: {
-         fs_reg temp = vgrf(glsl_type::float_type);
-         emit_math(SHADER_OPCODE_POW, temp, src[0], src[1]);
-         emit_fp_scalar_write(fpi, dst, temp);
-         break;
-      }
-
-      case OPCODE_RCP:
-         emit_fp_scalar_math(SHADER_OPCODE_RCP, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_RSQ:
-         emit_fp_scalar_math(SHADER_OPCODE_RSQ, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_SCS:
-         if (fpi->DstReg.WriteMask & WRITEMASK_X) {
-            emit_math(SHADER_OPCODE_COS, offset(dst, 0),
-                      offset(src[0], 0));
-         }
-
-         if (fpi->DstReg.WriteMask & WRITEMASK_Y) {
-            emit_math(SHADER_OPCODE_SIN, offset(dst, 1),
-                      offset(src[0], 1));
-         }
-         break;
-
-      case OPCODE_SGE:
-         emit_fp_sop(BRW_CONDITIONAL_GE, fpi, dst, src[0], src[1], one);
-         break;
-
-      case OPCODE_SIN:
-         emit_fp_scalar_math(SHADER_OPCODE_SIN, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_SLT:
-         emit_fp_sop(BRW_CONDITIONAL_L, fpi, dst, src[0], src[1], one);
-         break;
-
-      case OPCODE_SUB: {
-         fs_reg neg_src1 = src[1];
-         neg_src1.negate = !src[1].negate;
-
-         emit_fp_alu2(BRW_OPCODE_ADD, fpi, dst, src[0], neg_src1);
-         break;
-      }
-
-      case OPCODE_TEX:
-      case OPCODE_TXB:
-      case OPCODE_TXP: {
-         ir_texture_opcode op;
-         fs_reg lod;
-         fs_reg dpdy;
-         fs_reg coordinate = src[0];
-         fs_reg shadow_c;
-         fs_reg sample_index;
-         fs_reg texel_offset; /* No offsets; leave as BAD_FILE. */
-
-         switch (fpi->Opcode) {
-         case OPCODE_TEX:
-            op = ir_tex;
-            break;
-         case OPCODE_TXP: {
-            op = ir_tex;
-
-            coordinate = vgrf(glsl_type::vec3_type);
-            fs_reg invproj = vgrf(glsl_type::float_type);
-            emit_math(SHADER_OPCODE_RCP, invproj, offset(src[0], 3));
-            for (int i = 0; i < 3; i++) {
-               emit(MUL(offset(coordinate, i),
-                        offset(src[0], i), invproj));
-            }
-            break;
-         }
-         case OPCODE_TXB:
-            op = ir_txb;
-            lod = offset(src[0], 3);
-            break;
-         default:
-            unreachable("not reached");
-         }
-
-         int coord_components;
-         switch (fpi->TexSrcTarget) {
-         case TEXTURE_1D_INDEX:
-            coord_components = 1;
-            break;
-
-         case TEXTURE_2D_INDEX:
-         case TEXTURE_1D_ARRAY_INDEX:
-         case TEXTURE_RECT_INDEX:
-         case TEXTURE_EXTERNAL_INDEX:
-            coord_components = 2;
-            break;
-
-         case TEXTURE_3D_INDEX:
-         case TEXTURE_2D_ARRAY_INDEX:
-            coord_components = 3;
-            break;
-
-         case TEXTURE_CUBE_INDEX: {
-            coord_components = 3;
-
-            fs_reg temp = vgrf(glsl_type::float_type);
-            fs_reg cubecoord = vgrf(glsl_type::vec3_type);
-            fs_reg abscoord = coordinate;
-            abscoord.negate = false;
-            abscoord.abs = true;
-            emit_minmax(BRW_CONDITIONAL_GE, temp,
-                        offset(abscoord, 0), offset(abscoord, 1));
-            emit_minmax(BRW_CONDITIONAL_GE, temp,
-                        temp, offset(abscoord, 2));
-            emit_math(SHADER_OPCODE_RCP, temp, temp);
-            for (int i = 0; i < 3; i++) {
-               emit(MUL(offset(cubecoord, i),
-                        offset(coordinate, i), temp));
-            }
-
-            coordinate = cubecoord;
-            break;
-         }
-
-         default:
-            unreachable("not reached");
-         }
-
-         if (fpi->TexShadow)
-            shadow_c = offset(coordinate, 2);
-
-         emit_texture(op, glsl_type::vec4_type, coordinate, coord_components,
-                      shadow_c, lod, dpdy, 0, sample_index,
-                      reg_undef, /* offset */
-                      reg_undef, /* mcs */
-                      0, /* gather component */
-                      false, /* is cube array */
-                      fpi->TexSrcTarget == TEXTURE_RECT_INDEX,
-                      fpi->TexSrcUnit, fs_reg(fpi->TexSrcUnit),
-                      fpi->TexSrcUnit);
-         dst = this->result;
-
-         break;
-      }
-
-      case OPCODE_SWZ:
-         /* Note that SWZ's extended swizzles are handled in the general
-          * get_src_reg() code.
-          */
-         emit_fp_alu1(BRW_OPCODE_MOV, fpi, dst, src[0]);
-         break;
-
-      case OPCODE_XPD:
-         for (int i = 0; i < 3; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               int i1 = (i + 1) % 3;
-               int i2 = (i + 2) % 3;
-
-               fs_reg temp = vgrf(glsl_type::float_type);
-               fs_reg neg_src1_1 = offset(src[1], i1);
-               neg_src1_1.negate = !neg_src1_1.negate;
-               emit(MUL(temp, offset(src[0], i2), neg_src1_1));
-               emit(MUL(offset(dst, i),
-                        offset(src[0], i1), offset(src[1], i2)));
-               emit(ADD(offset(dst, i), offset(dst, i), temp));
-            }
-         }
-         break;
-
-      case OPCODE_END:
-         break;
-
-      default:
-         _mesa_problem(ctx, "Unsupported opcode %s in fragment program\n",
-                       _mesa_opcode_string(fpi->Opcode));
-      }
-
-      /* To handle saturates, we emit a MOV with a saturate bit, which
-       * optimization should fold into the preceding instructions when safe.
-       */
-      if (_mesa_num_inst_dst_regs(fpi->Opcode) != 0) {
-         fs_reg real_dst = get_fp_dst_reg(&fpi->DstReg);
-
-         for (int i = 0; i < 4; i++) {
-            if (fpi->DstReg.WriteMask & (1 << i)) {
-               fs_inst *inst = emit(MOV(offset(real_dst, i),
-                                        offset(dst, i)));
-               inst->saturate = fpi->SaturateMode;
-            }
-         }
-      }
-   }
-
-   /* Epilogue:
-    *
-    * Fragment depth has this strange convention of being the .z component of
-    * a vec4.  emit_fb_write() wants to see a float value, instead.
-    */
-   this->current_annotation = "result.depth write";
-   if (frag_depth.file != BAD_FILE) {
-      fs_reg temp = vgrf(glsl_type::float_type);
-      emit(MOV(temp, offset(frag_depth, 2)));
-      frag_depth = temp;
-   }
-}
-
-void
-fs_visitor::setup_fp_regs()
-{
-   /* PROGRAM_TEMPORARY */
-   int num_temp = prog->NumTemporaries;
-   fp_temp_regs = rzalloc_array(mem_ctx, fs_reg, num_temp);
-   for (int i = 0; i < num_temp; i++)
-      fp_temp_regs[i] = vgrf(glsl_type::vec4_type);
-
-   /* PROGRAM_STATE_VAR etc. */
-   if (dispatch_width == 8) {
-      for (unsigned p = 0;
-           p < prog->Parameters->NumParameters; p++) {
-         for (unsigned int i = 0; i < 4; i++) {
-            stage_prog_data->param[uniforms++] =
-               &prog->Parameters->ParameterValues[p][i];
-         }
-      }
-   }
-
-   fp_input_regs = rzalloc_array(mem_ctx, fs_reg, VARYING_SLOT_MAX);
-   for (int i = 0; i < VARYING_SLOT_MAX; i++) {
-      if (prog->InputsRead & BITFIELD64_BIT(i)) {
-         this->current_annotation = ralloc_asprintf(ctx, "interpolate input %d",
-                                                    i);
-
-         switch (i) {
-         case VARYING_SLOT_POS:
-            {
-               assert(stage == MESA_SHADER_FRAGMENT);
-               gl_fragment_program *fp = (gl_fragment_program*) prog;
-               fp_input_regs[i] =
-                  *emit_fragcoord_interpolation(fp->PixelCenterInteger,
-                                                fp->OriginUpperLeft);
-            }
-            break;
-         case VARYING_SLOT_FACE:
-            fp_input_regs[i] = *emit_frontfacing_interpolation();
-            break;
-         default:
-            fp_input_regs[i] = vgrf(glsl_type::vec4_type);
-            emit_general_interpolation(fp_input_regs[i], "fp_input",
-                                       glsl_type::vec4_type,
-                                       INTERP_QUALIFIER_NONE,
-                                       i, false, false);
-
-            if (i == VARYING_SLOT_FOGC) {
-               emit(MOV(offset(fp_input_regs[i], 1), fs_reg(0.0f)));
-               emit(MOV(offset(fp_input_regs[i], 2), fs_reg(0.0f)));
-               emit(MOV(offset(fp_input_regs[i], 3), fs_reg(1.0f)));
-            }
-
-            break;
-         }
-
-         this->current_annotation = NULL;
-      }
-   }
-}
-
-fs_reg
-fs_visitor::get_fp_dst_reg(const prog_dst_register *dst)
-{
-   assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-
-   switch (dst->File) {
-   case PROGRAM_TEMPORARY:
-      return fp_temp_regs[dst->Index];
-
-   case PROGRAM_OUTPUT:
-      if (dst->Index == FRAG_RESULT_DEPTH) {
-         if (frag_depth.file == BAD_FILE)
-            frag_depth = vgrf(glsl_type::vec4_type);
-         return frag_depth;
-      } else if (dst->Index == FRAG_RESULT_COLOR) {
-         if (outputs[0].file == BAD_FILE) {
-            outputs[0] = vgrf(glsl_type::vec4_type);
-            output_components[0] = 4;
-
-            /* Tell emit_fb_writes() to smear fragment.color across all the
-             * color attachments.
-             */
-            for (int i = 1; i < key->nr_color_regions; i++) {
-               outputs[i] = outputs[0];
-               output_components[i] = output_components[0];
-            }
-         }
-         return outputs[0];
-      } else {
-         int output_index = dst->Index - FRAG_RESULT_DATA0;
-         if (outputs[output_index].file == BAD_FILE) {
-            outputs[output_index] = vgrf(glsl_type::vec4_type);
-         }
-         output_components[output_index] = 4;
-         return outputs[output_index];
-      }
-
-   case PROGRAM_UNDEFINED:
-      return fs_reg();
-
-   default:
-      _mesa_problem(ctx, "bad dst register file: %s\n",
-                    _mesa_register_file_name((gl_register_file)dst->File));
-      return vgrf(glsl_type::vec4_type);
-   }
-}
-
-fs_reg
-fs_visitor::get_fp_src_reg(const prog_src_register *src)
-{
-   struct gl_program_parameter_list *plist = prog->Parameters;
-
-   fs_reg result;
-
-   assert(!src->Abs);
-
-   switch (src->File) {
-   case PROGRAM_UNDEFINED:
-      return fs_reg();
-   case PROGRAM_TEMPORARY:
-      result = fp_temp_regs[src->Index];
-      break;
-
-   case PROGRAM_INPUT:
-      result = fp_input_regs[src->Index];
-      break;
-
-   case PROGRAM_STATE_VAR:
-   case PROGRAM_UNIFORM:
-   case PROGRAM_CONSTANT:
-      /* We actually want to look at the type in the Parameters list for this,
-       * because this lets us upload constant builtin uniforms, as actual
-       * constants.
-       */
-      switch (plist->Parameters[src->Index].Type) {
-      case PROGRAM_CONSTANT: {
-         result = vgrf(glsl_type::vec4_type);
-
-         for (int i = 0; i < 4; i++) {
-            emit(MOV(offset(result, i),
-                     fs_reg(plist->ParameterValues[src->Index][i].f)));
-         }
-         break;
-      }
-
-      case PROGRAM_STATE_VAR:
-      case PROGRAM_UNIFORM:
-         result = fs_reg(UNIFORM, src->Index * 4);
-         break;
-
-      default:
-         _mesa_problem(ctx, "bad uniform src register file: %s\n",
-                       _mesa_register_file_name((gl_register_file)src->File));
-         return vgrf(glsl_type::vec4_type);
-      }
-      break;
-
-   default:
-      _mesa_problem(ctx, "bad src register file: %s\n",
-                    _mesa_register_file_name((gl_register_file)src->File));
-      return vgrf(glsl_type::vec4_type);
-   }
-
-   if (src->Swizzle != SWIZZLE_NOOP || src->Negate) {
-      fs_reg unswizzled = result;
-      result = vgrf(glsl_type::vec4_type);
-      for (int i = 0; i < 4; i++) {
-         bool negate = src->Negate & (1 << i);
-         /* The ZERO, ONE, and Negate options are only used for OPCODE_SWZ,
-          * but it costs us nothing to support it.
-          */
-         int src_swiz = GET_SWZ(src->Swizzle, i);
-         if (src_swiz == SWIZZLE_ZERO) {
-            emit(MOV(offset(result, i), fs_reg(0.0f)));
-         } else if (src_swiz == SWIZZLE_ONE) {
-            emit(MOV(offset(result, i),
-                     negate ? fs_reg(-1.0f) : fs_reg(1.0f)));
-         } else {
-            fs_reg src = offset(unswizzled, src_swiz);
-            if (negate)
-               src.negate = !src.negate;
-            emit(MOV(offset(result, i), src));
-         }
-      }
-   }
-
-   return result;
-}
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
index a99b7f7..2ed0bac 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp
@@ -121,7 +121,7 @@ brw_reg_from_fs_reg(fs_reg *reg)
    return brw_reg;
 }
 
-fs_generator::fs_generator(struct brw_context *brw,
+fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                            void *mem_ctx,
                            const void *key,
                            struct brw_stage_prog_data *prog_data,
@@ -130,7 +130,8 @@ fs_generator::fs_generator(struct brw_context *brw,
                            bool runtime_check_aads_emit,
                            const char *stage_abbrev)
 
-   : brw(brw), devinfo(brw->intelScreen->devinfo), key(key),
+   : compiler(compiler), log_data(log_data),
+     devinfo(compiler->devinfo), key(key),
      prog_data(prog_data),
      prog(prog), promoted_constants(promoted_constants),
      runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
@@ -401,6 +402,13 @@ fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
 }
 
 void
+fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
+{  /* Code generation for SHADER_OPCODE_BARRIER; `inst` is not read here. */
+   brw_barrier(p, src);  /* emit the barrier, passing src through unchanged */
+   brw_WAIT(p);          /* then a WAIT; presumably blocks until signalled -- confirm */
+}
+
+void
 fs_generator::generate_blorp_fb_write(fs_inst *inst)
 {
    brw_fb_WRITE(p,
@@ -779,27 +787,19 @@ fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src
       brw_mark_surface_used(prog_data, sampler + base_binding_table_index);
    } else {
       /* Non-const sampler index */
-      /* Note: this clobbers `dst` as a temporary before emitting the send */
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
-      struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));
-
       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-      /* Some care required: `sampler` and `temp` may alias:
-       *    addr = sampler & 0xff
-       *    temp = (sampler << 8) & 0xf00
-       *    addr = addr | temp
-       */
-      brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
-      brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
-      brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
-      brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
-      brw_OR(p, addr, addr, temp);
+      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
+      brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      if (base_binding_table_index)
+         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
+      brw_AND(p, addr, addr, brw_imm_ud(0xfff));
 
       brw_pop_insn_state(p);
 
@@ -941,6 +941,7 @@ fs_generator::generate_ddy(enum opcode opcode,
       brw_push_insn_state(p);
       brw_set_default_access_mode(p, BRW_ALIGN_16);
       if (unroll_to_simd8) {
+         brw_set_default_exec_size(p, BRW_EXECUTE_8);
          brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
          if (negate_value) {
             brw_ADD(p, firsthalf(dst), firsthalf(src1), negate(firsthalf(src0)));
@@ -1600,10 +1601,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          break;
       case 16:
       case 32:
-         if (type_sz(inst->dst.type) < sizeof(float))
-            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
-         else
+         /* If the instruction writes to more than one register, it needs to
+          * be a "compressed" instruction on Gen <= 5.
+          */
+         if (inst->exec_size * inst->dst.stride * type_sz(inst->dst.type) > 32)
             brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+         else
+            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
          break;
       default:
          unreachable("Invalid instruction width");
@@ -2121,6 +2125,10 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
          generate_cs_terminate(inst, src[0]);
          break;
 
+      case SHADER_OPCODE_BARRIER:
+         generate_barrier(inst, src[0]);
+         break;
+
       default:
          unreachable("Unsupported opcode");
 
@@ -2166,15 +2174,13 @@ fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
       ralloc_free(annotation.ann);
    }
 
-   static GLuint msg_id = 0;
-   _mesa_gl_debug(&brw->ctx, &msg_id,
-                  MESA_DEBUG_SOURCE_SHADER_COMPILER,
-                  MESA_DEBUG_TYPE_OTHER,
-                  MESA_DEBUG_SEVERITY_NOTIFICATION,
-                  "%s SIMD%d shader: %d inst, %d loops, %d:%d spills:fills, "
-                  "Promoted %u constants, compacted %d to %d bytes.\n",
-                  stage_abbrev, dispatch_width, before_size / 16, loop_count,
-                  spill_count, fill_count, promoted_constants, before_size, after_size);
+   compiler->shader_debug_log(log_data,
+                              "%s SIMD%d shader: %d inst, %d loops, "
+                              "%d:%d spills:fills, Promoted %u constants, "
+                              "compacted %d to %d bytes.\n",
+                              stage_abbrev, dispatch_width, before_size / 16,
+                              loop_count, spill_count, fill_count,
+                              promoted_constants, before_size, after_size);
 
    return start_offset;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 270131a..a378019 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -28,6 +28,8 @@
 #include "brw_fs.h"
 #include "brw_nir.h"
 
+using namespace brw;
+
 void
 fs_visitor::emit_nir_code()
 {
@@ -38,12 +40,12 @@ fs_visitor::emit_nir_code()
     */
 
    if (nir->num_inputs > 0) {
-      nir_inputs = vgrf(nir->num_inputs);
+      nir_inputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_inputs);
       nir_setup_inputs(nir);
    }
 
    if (nir->num_outputs > 0) {
-      nir_outputs = vgrf(nir->num_outputs);
+      nir_outputs = bld.vgrf(BRW_REGISTER_TYPE_F, nir->num_outputs);
       nir_setup_outputs(nir);
    }
 
@@ -58,7 +60,7 @@ fs_visitor::emit_nir_code()
       unsigned array_elems =
          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
       unsigned size = array_elems * reg->num_components;
-      nir_globals[reg->index] = vgrf(size);
+      nir_globals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
    }
 
    /* get the main function and emit it */
@@ -93,8 +95,8 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
          unsigned array_length = var->type->is_array() ? var->type->length : 1;
          for (unsigned i = 0; i < array_length; i++) {
             for (unsigned j = 0; j < components; j++) {
-               emit(MOV(retype(offset(input, components * i + j), type),
-                        offset(fs_reg(ATTR, var->data.location + i, type), j)));
+               bld.MOV(retype(offset(input, components * i + j), type),
+                       offset(fs_reg(ATTR, var->data.location + i, type), j));
             }
          }
          break;
@@ -107,7 +109,7 @@ fs_visitor::nir_setup_inputs(nir_shader *shader)
          if (var->data.location == VARYING_SLOT_POS) {
             reg = *emit_fragcoord_interpolation(var->data.pixel_center_integer,
                                                 var->data.origin_upper_left);
-            emit_percomp(MOV(input, reg), 0xF);
+            emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, input, reg), 0xF);
          } else {
             emit_general_interpolation(input, var->name, var->type,
                                        (glsl_interp_qualifier) var->data.interpolation,
@@ -218,9 +220,12 @@ fs_visitor::nir_setup_uniform(nir_variable *var)
       * our name.
       */
    unsigned index = var->data.driver_location;
-   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
+   for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 
+      if (storage->builtin)
+         continue;
+
       if (strncmp(var->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
          storage->name[namelen] != '.' &&
@@ -358,7 +363,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl)
       unsigned array_elems =
          reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
       unsigned size = array_elems * reg->num_components;
-      nir_locals[reg->index] = vgrf(size);
+      nir_locals[reg->index] = bld.vgrf(BRW_REGISTER_TYPE_F, size);
    }
 
    nir_emit_cf_list(&impl->body);
@@ -392,21 +397,21 @@ void
 fs_visitor::nir_emit_if(nir_if *if_stmt)
 {
    /* first, put the condition into f0 */
-   fs_inst *inst = emit(MOV(reg_null_d,
+   fs_inst *inst = bld.MOV(bld.null_reg_d(),
                             retype(get_nir_src(if_stmt->condition),
-                                   BRW_REGISTER_TYPE_D)));
+                                   BRW_REGISTER_TYPE_D));
    inst->conditional_mod = BRW_CONDITIONAL_NZ;
 
-   emit(IF(BRW_PREDICATE_NORMAL));
+   bld.IF(BRW_PREDICATE_NORMAL);
 
    nir_emit_cf_list(&if_stmt->then_list);
 
    /* note: if the else is empty, dead CF elimination will remove it */
-   emit(BRW_OPCODE_ELSE);
+   bld.emit(BRW_OPCODE_ELSE);
 
    nir_emit_cf_list(&if_stmt->else_list);
 
-   emit(BRW_OPCODE_ENDIF);
+   bld.emit(BRW_OPCODE_ENDIF);
 
    if (!try_replace_with_sel() && devinfo->gen < 6) {
       no16("Can't support (non-uniform) control flow on SIMD16\n");
@@ -420,11 +425,11 @@ fs_visitor::nir_emit_loop(nir_loop *loop)
       no16("Can't support (non-uniform) control flow on SIMD16\n");
    }
 
-   emit(BRW_OPCODE_DO);
+   bld.emit(BRW_OPCODE_DO);
 
    nir_emit_cf_list(&loop->body);
 
-   emit(BRW_OPCODE_WHILE);
+   bld.emit(BRW_OPCODE_WHILE);
 }
 
 void
@@ -438,19 +443,19 @@ fs_visitor::nir_emit_block(nir_block *block)
 void
 fs_visitor::nir_emit_instr(nir_instr *instr)
 {
-   this->base_ir = instr;
+   const fs_builder abld = bld.annotate(NULL, instr);
 
    switch (instr->type) {
    case nir_instr_type_alu:
-      nir_emit_alu(nir_instr_as_alu(instr));
+      nir_emit_alu(abld, nir_instr_as_alu(instr));
       break;
 
    case nir_instr_type_intrinsic:
-      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
+      nir_emit_intrinsic(abld, nir_instr_as_intrinsic(instr));
       break;
 
    case nir_instr_type_tex:
-      nir_emit_texture(nir_instr_as_tex(instr));
+      nir_emit_texture(abld, nir_instr_as_tex(instr));
       break;
 
    case nir_instr_type_load_const:
@@ -460,14 +465,12 @@ fs_visitor::nir_emit_instr(nir_instr *instr)
       break;
 
    case nir_instr_type_jump:
-      nir_emit_jump(nir_instr_as_jump(instr));
+      nir_emit_jump(abld, nir_instr_as_jump(instr));
       break;
 
    default:
       unreachable("unknown instruction type");
    }
-
-   this->base_ir = NULL;
 }
 
 static brw_reg_type
@@ -540,7 +543,7 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
       tmp.subreg_offset = 2;
       tmp.stride = 2;
 
-      fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
+      fs_inst *or_inst = bld.OR(tmp, g0, fs_reg(0x3f80));
       or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
 
       tmp.type = BRW_REGISTER_TYPE_D;
@@ -565,15 +568,15 @@ fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
          g1_6.negate = true;
       }
 
-      emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
+      bld.OR(tmp, g1_6, fs_reg(0x3f800000));
    }
-   emit(AND(retype(result, BRW_REGISTER_TYPE_D), tmp, fs_reg(0xbf800000)));
+   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, fs_reg(0xbf800000));
 
    return true;
 }
 
 void
-fs_visitor::nir_emit_alu(nir_alu_instr *instr)
+fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
 {
    struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
    fs_inst *inst;
@@ -605,7 +608,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
          if (!instr->src[i].src.is_ssa &&
              instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
             need_extra_copy = true;
-            temp = retype(vgrf(4), result.type);
+            temp = bld.vgrf(result.type, 4);
             break;
          }
       }
@@ -615,11 +618,11 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
             continue;
 
          if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
-            inst = emit(MOV(offset(temp, i),
-                        offset(op[0], instr->src[0].swizzle[i])));
+            inst = bld.MOV(offset(temp, i),
+                           offset(op[0], instr->src[0].swizzle[i]));
          } else {
-            inst = emit(MOV(offset(temp, i),
-                        offset(op[i], instr->src[i].swizzle[0])));
+            inst = bld.MOV(offset(temp, i),
+                           offset(op[i], instr->src[i].swizzle[0]));
          }
          inst->saturate = instr->dest.saturate;
       }
@@ -633,7 +636,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
             if (!(instr->dest.write_mask & (1 << i)))
                continue;
 
-            emit(MOV(offset(result, i), offset(temp, i)));
+            bld.MOV(offset(result, i), offset(temp, i));
          }
       }
       return;
@@ -665,13 +668,13 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    switch (instr->op) {
    case nir_op_i2f:
    case nir_op_u2f:
-      inst = emit(MOV(result, op[0]));
+      inst = bld.MOV(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_f2i:
    case nir_op_f2u:
-      emit(MOV(result, op[0]));
+      bld.MOV(result, op[0]);
       break;
 
    case nir_op_fsign: {
@@ -680,17 +683,17 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
          * zero.
          */
-      emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
+      bld.CMP(bld.null_reg_f(), op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ);
 
       fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
       op[0].type = BRW_REGISTER_TYPE_UD;
       result.type = BRW_REGISTER_TYPE_UD;
-      emit(AND(result_int, op[0], fs_reg(0x80000000u)));
+      bld.AND(result_int, op[0], fs_reg(0x80000000u));
 
-      inst = emit(OR(result_int, result_int, fs_reg(0x3f800000u)));
+      inst = bld.OR(result_int, result_int, fs_reg(0x3f800000u));
       inst->predicate = BRW_PREDICATE_NORMAL;
       if (instr->dest.saturate) {
-         inst = emit(MOV(result, result));
+         inst = bld.MOV(result, result);
          inst->saturate = true;
       }
       break;
@@ -701,120 +704,88 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
        *               -> non-negative val generates 0x00000000.
        *  Predicated OR sets 1 if val is positive.
        */
-      emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
-      emit(ASR(result, op[0], fs_reg(31)));
-      inst = emit(OR(result, result, fs_reg(1)));
+      bld.CMP(bld.null_reg_d(), op[0], fs_reg(0), BRW_CONDITIONAL_G);
+      bld.ASR(result, op[0], fs_reg(31));
+      inst = bld.OR(result, result, fs_reg(1));
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
 
    case nir_op_frcp:
-      inst = emit_math(SHADER_OPCODE_RCP, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fexp2:
-      inst = emit_math(SHADER_OPCODE_EXP2, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_flog2:
-      inst = emit_math(SHADER_OPCODE_LOG2, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fsin:
-      inst = emit_math(SHADER_OPCODE_SIN, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fcos:
-      inst = emit_math(SHADER_OPCODE_COS, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fddx:
       if (fs_key->high_quality_derivatives) {
-         inst = emit(FS_OPCODE_DDX_FINE, result, op[0]);
+         inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
       } else {
-         inst = emit(FS_OPCODE_DDX_COARSE, result, op[0]);
+         inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
       }
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddx_fine:
-      inst = emit(FS_OPCODE_DDX_FINE, result, op[0]);
+      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddx_coarse:
-      inst = emit(FS_OPCODE_DDX_COARSE, result, op[0]);
+      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddy:
       if (fs_key->high_quality_derivatives) {
-         inst = emit(FS_OPCODE_DDY_FINE, result, op[0],
-                     fs_reg(fs_key->render_to_fbo));
+         inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
+                         fs_reg(fs_key->render_to_fbo));
       } else {
-         inst = emit(FS_OPCODE_DDY_COARSE, result, op[0],
-                     fs_reg(fs_key->render_to_fbo));
+         inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
+                         fs_reg(fs_key->render_to_fbo));
       }
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddy_fine:
-      inst = emit(FS_OPCODE_DDY_FINE, result, op[0],
-                  fs_reg(fs_key->render_to_fbo));
+      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0],
+                      fs_reg(fs_key->render_to_fbo));
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fddy_coarse:
-      inst = emit(FS_OPCODE_DDY_COARSE, result, op[0],
-                  fs_reg(fs_key->render_to_fbo));
+      inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0],
+                      fs_reg(fs_key->render_to_fbo));
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fadd:
    case nir_op_iadd:
-      inst = emit(ADD(result, op[0], op[1]));
+      inst = bld.ADD(result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fmul:
-      inst = emit(MUL(result, op[0], op[1]));
+      inst = bld.MUL(result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
       break;
 
-   case nir_op_imul: {
-      if (devinfo->gen >= 8) {
-         emit(MUL(result, op[0], op[1]));
-         break;
-      } else {
-         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
-         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
-
-         if (value0 && value0->u[0] < (1 << 16)) {
-            if (devinfo->gen < 7) {
-               emit(MUL(result, op[0], op[1]));
-            } else {
-               emit(MUL(result, op[1], op[0]));
-            }
-            break;
-         } else if (value1 && value1->u[0] < (1 << 16)) {
-            if (devinfo->gen < 7) {
-               emit(MUL(result, op[1], op[0]));
-            } else {
-               emit(MUL(result, op[0], op[1]));
-            }
-            break;
-         }
-      }
-
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
-
-      emit(MUL(acc, op[0], op[1]));
-      emit(MACH(reg_null_d, op[0], op[1]));
-      emit(MOV(result, fs_reg(acc)));
+   case nir_op_imul:
+      bld.MUL(result, op[0], op[1]);
       break;
-   }
 
    case nir_op_imul_high:
    case nir_op_umul_high: {
@@ -823,8 +794,8 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
 
       struct brw_reg acc = retype(brw_acc_reg(dispatch_width), result.type);
 
-      fs_inst *mul = emit(MUL(acc, op[0], op[1]));
-      emit(MACH(result, op[0], op[1]));
+      fs_inst *mul = bld.MUL(acc, op[0], op[1]);
+      bld.MACH(result, op[0], op[1]);
 
       /* Until Gen8, integer multiplies read 32-bits from one source, and
        * 16-bits from the other, and relying on the MACH instruction to
@@ -852,7 +823,7 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
 
    case nir_op_idiv:
    case nir_op_udiv:
-      emit_math(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
+      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
       break;
 
    case nir_op_uadd_carry: {
@@ -862,8 +833,8 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                   BRW_REGISTER_TYPE_UD);
 
-      emit(ADDC(reg_null_ud, op[0], op[1]));
-      emit(MOV(result, fs_reg(acc)));
+      bld.ADDC(bld.null_reg_ud(), op[0], op[1]);
+      bld.MOV(result, fs_reg(acc));
       break;
    }
 
@@ -874,63 +845,63 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
                                   BRW_REGISTER_TYPE_UD);
 
-      emit(SUBB(reg_null_ud, op[0], op[1]));
-      emit(MOV(result, fs_reg(acc)));
+      bld.SUBB(bld.null_reg_ud(), op[0], op[1]);
+      bld.MOV(result, fs_reg(acc));
       break;
    }
 
    case nir_op_umod:
-      emit_math(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
+      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
       break;
 
    case nir_op_flt:
    case nir_op_ilt:
    case nir_op_ult:
-      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_L));
+      bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_L);
       break;
 
    case nir_op_fge:
    case nir_op_ige:
    case nir_op_uge:
-      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_GE));
+      bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_GE);
       break;
 
    case nir_op_feq:
    case nir_op_ieq:
-      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_Z));
+      bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_Z);
       break;
 
    case nir_op_fne:
    case nir_op_ine:
-      emit(CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ));
+      bld.CMP(result, op[0], op[1], BRW_CONDITIONAL_NZ);
       break;
 
    case nir_op_inot:
       if (devinfo->gen >= 8) {
          resolve_source_modifiers(&op[0]);
       }
-      emit(NOT(result, op[0]));
+      bld.NOT(result, op[0]);
       break;
    case nir_op_ixor:
       if (devinfo->gen >= 8) {
          resolve_source_modifiers(&op[0]);
          resolve_source_modifiers(&op[1]);
       }
-      emit(XOR(result, op[0], op[1]));
+      bld.XOR(result, op[0], op[1]);
       break;
    case nir_op_ior:
       if (devinfo->gen >= 8) {
          resolve_source_modifiers(&op[0]);
          resolve_source_modifiers(&op[1]);
       }
-      emit(OR(result, op[0], op[1]));
+      bld.OR(result, op[0], op[1]);
       break;
    case nir_op_iand:
       if (devinfo->gen >= 8) {
          resolve_source_modifiers(&op[0]);
          resolve_source_modifiers(&op[1]);
       }
-      emit(AND(result, op[0], op[1]));
+      bld.AND(result, op[0], op[1]);
       break;
 
    case nir_op_fdot2:
@@ -978,53 +949,53 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       unreachable("not reached: should be handled by ldexp_to_arith()");
 
    case nir_op_fsqrt:
-      inst = emit_math(SHADER_OPCODE_SQRT, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_frsq:
-      inst = emit_math(SHADER_OPCODE_RSQ, result, op[0]);
+      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_b2i:
-      emit(AND(result, op[0], fs_reg(1)));
+      bld.AND(result, op[0], fs_reg(1));
       break;
    case nir_op_b2f:
-      emit(AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], fs_reg(0x3f800000u)));
+      bld.AND(retype(result, BRW_REGISTER_TYPE_UD), op[0], fs_reg(0x3f800000u));
       break;
 
    case nir_op_f2b:
-      emit(CMP(result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
+      bld.CMP(result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ);
       break;
    case nir_op_i2b:
-      emit(CMP(result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
+      bld.CMP(result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ);
       break;
 
    case nir_op_ftrunc:
-      inst = emit(RNDZ(result, op[0]));
+      inst = bld.RNDZ(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fceil: {
       op[0].negate = !op[0].negate;
       fs_reg temp = vgrf(glsl_type::float_type);
-      emit(RNDD(temp, op[0]));
+      bld.RNDD(temp, op[0]);
       temp.negate = true;
-      inst = emit(MOV(result, temp));
+      inst = bld.MOV(result, temp);
       inst->saturate = instr->dest.saturate;
       break;
    }
    case nir_op_ffloor:
-      inst = emit(RNDD(result, op[0]));
+      inst = bld.RNDD(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_ffract:
-      inst = emit(FRC(result, op[0]));
+      inst = bld.FRC(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_fround_even:
-      inst = emit(RNDE(result, op[0]));
+      inst = bld.RNDE(result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
@@ -1032,11 +1003,11 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_imin:
    case nir_op_umin:
       if (devinfo->gen >= 6) {
-         inst = emit(BRW_OPCODE_SEL, result, op[0], op[1]);
+         inst = bld.emit(BRW_OPCODE_SEL, result, op[0], op[1]);
          inst->conditional_mod = BRW_CONDITIONAL_L;
       } else {
-         emit(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_L));
-         inst = emit(SEL(result, op[0], op[1]));
+         bld.CMP(bld.null_reg_d(), op[0], op[1], BRW_CONDITIONAL_L);
+         inst = bld.SEL(result, op[0], op[1]);
          inst->predicate = BRW_PREDICATE_NORMAL;
       }
       inst->saturate = instr->dest.saturate;
@@ -1046,11 +1017,11 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    case nir_op_imax:
    case nir_op_umax:
       if (devinfo->gen >= 6) {
-         inst = emit(BRW_OPCODE_SEL, result, op[0], op[1]);
+         inst = bld.emit(BRW_OPCODE_SEL, result, op[0], op[1]);
          inst->conditional_mod = BRW_CONDITIONAL_GE;
       } else {
-         emit(CMP(reg_null_d, op[0], op[1], BRW_CONDITIONAL_GE));
-         inst = emit(SEL(result, op[0], op[1]));
+         bld.CMP(bld.null_reg_d(), op[0], op[1], BRW_CONDITIONAL_GE);
+         inst = bld.SEL(result, op[0], op[1]);
          inst->predicate = BRW_PREDICATE_NORMAL;
       }
       inst->saturate = instr->dest.saturate;
@@ -1069,57 +1040,57 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       unreachable("not reached: should be handled by lower_packing_builtins");
 
    case nir_op_unpack_half_2x16_split_x:
-      inst = emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
+      inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
    case nir_op_unpack_half_2x16_split_y:
-      inst = emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
+      inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_fpow:
-      inst = emit_math(SHADER_OPCODE_POW, result, op[0], op[1]);
+      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_bitfield_reverse:
-      emit(BFREV(result, op[0]));
+      bld.BFREV(result, op[0]);
       break;
 
    case nir_op_bit_count:
-      emit(CBIT(result, op[0]));
+      bld.CBIT(result, op[0]);
       break;
 
    case nir_op_ufind_msb:
    case nir_op_ifind_msb: {
-      emit(FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]));
+      bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
 
       /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
        * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
        * subtract the result from 31 to convert the MSB count into an LSB count.
        */
 
-      emit(CMP(reg_null_d, result, fs_reg(-1), BRW_CONDITIONAL_NZ));
+      bld.CMP(bld.null_reg_d(), result, fs_reg(-1), BRW_CONDITIONAL_NZ);
       fs_reg neg_result(result);
       neg_result.negate = true;
-      inst = emit(ADD(result, neg_result, fs_reg(31)));
+      inst = bld.ADD(result, neg_result, fs_reg(31));
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
    }
 
    case nir_op_find_lsb:
-      emit(FBL(result, op[0]));
+      bld.FBL(result, op[0]);
       break;
 
    case nir_op_ubitfield_extract:
    case nir_op_ibitfield_extract:
-      emit(BFE(result, op[2], op[1], op[0]));
+      bld.BFE(result, op[2], op[1], op[0]);
       break;
    case nir_op_bfm:
-      emit(BFI1(result, op[0], op[1]));
+      bld.BFI1(result, op[0], op[1]);
       break;
    case nir_op_bfi:
-      emit(BFI2(result, op[0], op[1], op[2]));
+      bld.BFI2(result, op[0], op[1], op[2]);
       break;
 
    case nir_op_bitfield_insert:
@@ -1127,26 +1098,26 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
                   "lower_instructions::bitfield_insert_to_bfm_bfi");
 
    case nir_op_ishl:
-      emit(SHL(result, op[0], op[1]));
+      bld.SHL(result, op[0], op[1]);
       break;
    case nir_op_ishr:
-      emit(ASR(result, op[0], op[1]));
+      bld.ASR(result, op[0], op[1]);
       break;
    case nir_op_ushr:
-      emit(SHR(result, op[0], op[1]));
+      bld.SHR(result, op[0], op[1]);
       break;
 
    case nir_op_pack_half_2x16_split:
-      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
+      bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
       break;
 
    case nir_op_ffma:
-      inst = emit(MAD(result, op[2], op[1], op[0]));
+      inst = bld.MAD(result, op[2], op[1], op[0]);
       inst->saturate = instr->dest.saturate;
       break;
 
    case nir_op_flrp:
-      inst = emit_lrp(result, op[0], op[1], op[2]);
+      inst = bld.LRP(result, op[0], op[1], op[2]);
       inst->saturate = instr->dest.saturate;
       break;
 
@@ -1154,8 +1125,8 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
       if (optimize_frontfacing_ternary(instr, result))
          return;
 
-      emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
-      inst = emit(SEL(result, op[1], op[2]));
+      bld.CMP(bld.null_reg_d(), op[0], fs_reg(0), BRW_CONDITIONAL_NZ);
+      inst = bld.SEL(result, op[1], op[2]);
       inst->predicate = BRW_PREDICATE_NORMAL;
       break;
 
@@ -1169,9 +1140,9 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr)
    if (devinfo->gen <= 5 &&
        (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
       fs_reg masked = vgrf(glsl_type::int_type);
-      emit(AND(masked, result, fs_reg(1)));
+      bld.AND(masked, result, fs_reg(1));
       masked.negate = true;
-      emit(MOV(retype(result, BRW_REGISTER_TYPE_D), masked));
+      bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
    }
 }
 
@@ -1190,8 +1161,8 @@ fs_reg_for_nir_reg(fs_visitor *v, nir_register *nir_reg,
       int multiplier = nir_reg->num_components * (v->dispatch_width / 8);
 
       reg.reladdr = new(v->mem_ctx) fs_reg(v->vgrf(glsl_type::int_type));
-      v->emit(v->MUL(*reg.reladdr, v->get_nir_src(*indirect),
-                     fs_reg(multiplier)));
+      v->bld.MUL(*reg.reladdr, v->get_nir_src(*indirect),
+                 fs_reg(multiplier));
    }
 
    return reg;
@@ -1203,11 +1174,10 @@ fs_visitor::get_nir_src(nir_src src)
    if (src.is_ssa) {
       assert(src.ssa->parent_instr->type == nir_instr_type_load_const);
       nir_load_const_instr *load = nir_instr_as_load_const(src.ssa->parent_instr);
-      fs_reg reg = vgrf(src.ssa->num_components);
-      reg.type = BRW_REGISTER_TYPE_D;
+      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_D, src.ssa->num_components);
 
       for (unsigned i = 0; i < src.ssa->num_components; ++i)
-         emit(MOV(offset(reg, i), fs_reg(load->value.i[i])));
+         bld.MOV(offset(reg, i), fs_reg(load->value.i[i]));
 
       return reg;
    } else {
@@ -1230,24 +1200,25 @@ fs_visitor::get_nir_dest(nir_dest dest)
 }
 
 void
-fs_visitor::emit_percomp(fs_inst *inst, unsigned wr_mask)
+fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
+                         unsigned wr_mask) /* emits one copy of inst per set bit in the low 4 bits of wr_mask */
 {
    for (unsigned i = 0; i < 4; i++) {
       if (!((wr_mask >> i) & 1))
          continue;
 
-      fs_inst *new_inst = new(mem_ctx) fs_inst(*inst);
+      fs_inst *new_inst = new(mem_ctx) fs_inst(inst); /* copy of the template instruction for component i */
       new_inst->dst = offset(new_inst->dst, i);
       for (unsigned j = 0; j < new_inst->sources; j++)
-         if (inst->src[j].file == GRF)
+         if (new_inst->src[j].file == GRF) /* only GRF-file sources are offset by i; others are left as copied */
             new_inst->src[j] = offset(new_inst->src[j], i);
 
-      emit(new_inst);
+      bld.emit(new_inst); /* hand the copy to the caller-supplied builder */
    }
 }
 
 void
-fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
 {
    fs_reg dest;
    if (nir_intrinsic_infos[instr->intrinsic].has_dest)
@@ -1265,12 +1236,12 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
        */
       fs_inst *cmp;
       if (instr->intrinsic == nir_intrinsic_discard_if) {
-         cmp = emit(CMP(reg_null_f, get_nir_src(instr->src[0]),
-                        fs_reg(0), BRW_CONDITIONAL_Z));
+         cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
+                       fs_reg(0), BRW_CONDITIONAL_Z);
       } else {
          fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                        BRW_REGISTER_TYPE_UW));
-         cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
+         cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
       }
       cmp->predicate = BRW_PREDICATE_NORMAL;
       cmp->flag_subreg = 1;
@@ -1307,8 +1278,8 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    }
 
    case nir_intrinsic_load_front_face:
-      emit(MOV(retype(dest, BRW_REGISTER_TYPE_D),
-               *emit_frontfacing_interpolation()));
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+              *emit_frontfacing_interpolation());
       break;
 
    case nir_intrinsic_load_vertex_id:
@@ -1318,7 +1289,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg vertex_id = nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
       assert(vertex_id.file != BAD_FILE);
       dest.type = vertex_id.type;
-      emit(MOV(dest, vertex_id));
+      bld.MOV(dest, vertex_id);
       break;
    }
 
@@ -1326,7 +1297,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg base_vertex = nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
       assert(base_vertex.file != BAD_FILE);
       dest.type = base_vertex.type;
-      emit(MOV(dest, base_vertex));
+      bld.MOV(dest, base_vertex);
       break;
    }
 
@@ -1334,7 +1305,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg instance_id = nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
       assert(instance_id.file != BAD_FILE);
       dest.type = instance_id.type;
-      emit(MOV(dest, instance_id));
+      bld.MOV(dest, instance_id);
       break;
    }
 
@@ -1342,7 +1313,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg sample_mask_in = nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
       assert(sample_mask_in.file != BAD_FILE);
       dest.type = sample_mask_in.type;
-      emit(MOV(dest, sample_mask_in));
+      bld.MOV(dest, sample_mask_in);
       break;
    }
 
@@ -1350,8 +1321,8 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
       assert(sample_pos.file != BAD_FILE);
       dest.type = sample_pos.type;
-      emit(MOV(dest, sample_pos));
-      emit(MOV(offset(dest, 1), offset(sample_pos, 1)));
+      bld.MOV(dest, sample_pos);
+      bld.MOV(offset(dest, 1), offset(sample_pos, 1));
       break;
    }
 
@@ -1359,7 +1330,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       fs_reg sample_id = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
       assert(sample_id.file != BAD_FILE);
       dest.type = sample_id.type;
-      emit(MOV(dest, sample_id));
+      bld.MOV(dest, sample_id);
       break;
    }
 
@@ -1377,16 +1348,14 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          index -= num_direct_uniforms;
       }
 
-      for (int i = 0; i < instr->const_index[1]; i++) {
-         for (unsigned j = 0; j < instr->num_components; j++) {
-            fs_reg src = offset(retype(uniform_reg, dest.type), index);
-            if (has_indirect)
-               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
-            index++;
+      for (unsigned j = 0; j < instr->num_components; j++) {
+         fs_reg src = offset(retype(uniform_reg, dest.type), index);
+         if (has_indirect)
+            src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
+         index++;
 
-            emit(MOV(dest, src));
-            dest = offset(dest, 1);
-         }
+         bld.MOV(dest, src);
+         dest = offset(dest, 1);
       }
       break;
    }
@@ -1417,9 +1386,9 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
           * from any live channel.
           */
          surf_index = vgrf(glsl_type::uint_type);
-         emit(ADD(surf_index, get_nir_src(instr->src[0]),
-                  fs_reg(stage_prog_data->binding_table.ubo_start)));
-         emit_uniformize(surf_index, surf_index);
+         bld.ADD(surf_index, get_nir_src(instr->src[0]),
+                 fs_reg(stage_prog_data->binding_table.ubo_start));
+         bld.emit_uniformize(surf_index, surf_index);
 
          /* Assume this may touch any UBO. It would be nice to provide
           * a tighter bound, but the array information is already lowered away.
@@ -1432,21 +1401,21 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       if (has_indirect) {
          /* Turn the byte offset into a dword offset. */
          fs_reg base_offset = vgrf(glsl_type::int_type);
-         emit(SHR(base_offset, retype(get_nir_src(instr->src[1]),
-                                 BRW_REGISTER_TYPE_D),
-                  fs_reg(2)));
+         bld.SHR(base_offset, retype(get_nir_src(instr->src[1]),
+                                     BRW_REGISTER_TYPE_D),
+                 fs_reg(2));
 
          unsigned vec4_offset = instr->const_index[0] / 4;
          for (int i = 0; i < instr->num_components; i++)
-            emit(VARYING_PULL_CONSTANT_LOAD(offset(dest, i), surf_index,
-                                            base_offset, vec4_offset + i));
+            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, i), surf_index,
+                                       base_offset, vec4_offset + i);
       } else {
          fs_reg packed_consts = vgrf(glsl_type::float_type);
          packed_consts.type = dest.type;
 
          fs_reg const_offset_reg((unsigned) instr->const_index[0] & ~15);
-         emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
-              surf_index, const_offset_reg);
+         bld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, packed_consts,
+                  surf_index, const_offset_reg);
 
          for (unsigned i = 0; i < instr->num_components; i++) {
             packed_consts.set_smear(instr->const_index[0] % 16 / 4 + i);
@@ -1456,7 +1425,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
              */
             assert(packed_consts.subreg_offset < 32);
 
-            emit(MOV(dest, packed_consts));
+            bld.MOV(dest, packed_consts);
             dest = offset(dest, 1);
          }
       }
@@ -1468,17 +1437,15 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
       /* fallthrough */
    case nir_intrinsic_load_input: {
       unsigned index = 0;
-      for (int i = 0; i < instr->const_index[1]; i++) {
-         for (unsigned j = 0; j < instr->num_components; j++) {
-            fs_reg src = offset(retype(nir_inputs, dest.type),
-                                instr->const_index[0] + index);
-            if (has_indirect)
-               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
-            index++;
-
-            emit(MOV(dest, src));
-            dest = offset(dest, 1);
-         }
+      for (unsigned j = 0; j < instr->num_components; j++) {
+         fs_reg src = offset(retype(nir_inputs, dest.type),
+                             instr->const_index[0] + index);
+         if (has_indirect)
+            src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[0]));
+         index++;
+
+         bld.MOV(dest, src);
+         dest = offset(dest, 1);
       }
       break;
    }
@@ -1510,7 +1477,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
        */
       no16("interpolate_at_* not yet supported in SIMD16 mode.");
 
-      fs_reg dst_xy = vgrf(2);
+      fs_reg dst_xy = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
 
       /* For most messages, we need one reg of ignored data; the hardware
        * requires mlen==1 even when there is no payload. in the per-slot
@@ -1522,7 +1489,8 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
 
       switch (instr->intrinsic) {
       case nir_intrinsic_interp_var_at_centroid:
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
+         inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_CENTROID,
+                         dst_xy, src, fs_reg(0u));
          break;
 
       case nir_intrinsic_interp_var_at_sample: {
@@ -1530,8 +1498,8 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
          assert(const_sample);
          unsigned msg_data = const_sample ? const_sample->i[0] << 4 : 0;
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
-                     fs_reg(msg_data));
+         inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src,
+                         fs_reg(msg_data));
          break;
       }
 
@@ -1542,17 +1510,17 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
             unsigned off_x = MIN2((int)(const_offset->f[0] * 16), 7) & 0xf;
             unsigned off_y = MIN2((int)(const_offset->f[1] * 16), 7) & 0xf;
 
-            inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
-                        fs_reg(off_x | (off_y << 4)));
+            inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
+                            fs_reg(off_x | (off_y << 4)));
          } else {
             src = vgrf(glsl_type::ivec2_type);
             fs_reg offset_src = retype(get_nir_src(instr->src[0]),
                                        BRW_REGISTER_TYPE_F);
             for (int i = 0; i < 2; i++) {
                fs_reg temp = vgrf(glsl_type::float_type);
-               emit(MUL(temp, offset(offset_src, i), fs_reg(16.0f)));
+               bld.MUL(temp, offset(offset_src, i), fs_reg(16.0f));
                fs_reg itemp = vgrf(glsl_type::int_type);
-               emit(MOV(itemp, temp));  /* float to int */
+               bld.MOV(itemp, temp);  /* float to int */
 
                /* Clamp the upper end of the range to +7/16.
                 * ARB_gpu_shader5 requires that we support a maximum offset
@@ -1569,14 +1537,13 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
                 * implementation-dependent constant
                 * FRAGMENT_INTERPOLATION_OFFSET_BITS"
                 */
-
-               emit(BRW_OPCODE_SEL, offset(src, i), itemp, fs_reg(7))
-                   ->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
+               set_condmod(BRW_CONDITIONAL_L,
+                           bld.SEL(offset(src, i), itemp, fs_reg(7)));
             }
 
             mlen = 2;
-            inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
-                        fs_reg(0u));
+            inst = bld.emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
+                            fs_reg(0u));
          }
          break;
       }
@@ -1594,7 +1561,7 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
          fs_reg src = interp_reg(instr->variables[0]->var->data.location, j);
          src.type = dest.type;
 
-         emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
+         bld.emit(FS_OPCODE_LINTERP, dest, dst_xy, src);
          dest = offset(dest, 1);
       }
       break;
@@ -1606,27 +1573,29 @@ fs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
    case nir_intrinsic_store_output: {
       fs_reg src = get_nir_src(instr->src[0]);
       unsigned index = 0;
-      for (int i = 0; i < instr->const_index[1]; i++) {
-         for (unsigned j = 0; j < instr->num_components; j++) {
-            fs_reg new_dest = offset(retype(nir_outputs, src.type),
-                                     instr->const_index[0] + index);
-            if (has_indirect)
-               src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
-            index++;
-            emit(MOV(new_dest, src));
-            src = offset(src, 1);
-         }
+      for (unsigned j = 0; j < instr->num_components; j++) {
+         fs_reg new_dest = offset(retype(nir_outputs, src.type),
+                                  instr->const_index[0] + index);
+         if (has_indirect)
+            src.reladdr = new(mem_ctx) fs_reg(get_nir_src(instr->src[1]));
+         index++;
+         bld.MOV(new_dest, src);
+         src = offset(src, 1);
       }
       break;
    }
 
+   case nir_intrinsic_barrier:
+      emit_barrier();
+      break;
+
    default:
       unreachable("unknown intrinsic");
    }
 }
 
 void
-fs_visitor::nir_emit_texture(nir_tex_instr *instr)
+fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
 {
    uint32_t set = instr->sampler_set;
    uint32_t binding = instr->sampler_index;
@@ -1650,7 +1619,8 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
    bool is_cube_array = instr->sampler_dim == GLSL_SAMPLER_DIM_CUBE &&
                         instr->is_array;
 
-   int lod_components = 0, offset_components = 0;
+   int lod_components = 0;
+   int UNUSED offset_components = 0;
 
    fs_reg coordinate, shadow_comparitor, lod, lod2, sample_index, mcs, tex_offset;
 
@@ -1719,8 +1689,8 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
 
          /* Emit code to evaluate the actual indexing expression */
          sampler_reg = vgrf(glsl_type::uint_type);
-         emit(ADD(sampler_reg, src, fs_reg(sampler)));
-         emit_uniformize(sampler_reg, sampler_reg);
+         bld.ADD(sampler_reg, src, fs_reg(sampler));
+         bld.emit_uniformize(sampler_reg, sampler_reg);
          break;
       }
 
@@ -1789,18 +1759,19 @@ fs_visitor::nir_emit_texture(nir_tex_instr *instr)
    fs_reg dest = get_nir_dest(instr->dest);
    dest.type = this->result.type;
    unsigned num_components = nir_tex_instr_dest_size(instr);
-   emit_percomp(MOV(dest, this->result), (1 << num_components) - 1);
+   emit_percomp(bld, fs_inst(BRW_OPCODE_MOV, dest, this->result),
+                (1 << num_components) - 1);
 }
 
 void
-fs_visitor::nir_emit_jump(nir_jump_instr *instr)
+fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
 {
    switch (instr->type) {
    case nir_jump_break:
-      emit(BRW_OPCODE_BREAK);
+      bld.emit(BRW_OPCODE_BREAK);
       break;
    case nir_jump_continue:
-      emit(BRW_OPCODE_CONTINUE);
+      bld.emit(BRW_OPCODE_CONTINUE);
       break;
    case nir_jump_return:
    default:
diff --git a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
index cf3da7b..d92d4bbd 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_peephole_predicated_break.cpp
@@ -85,9 +85,9 @@ fs_visitor::opt_peephole_predicated_break()
        * instruction to set the flag register.
        */
       if (devinfo->gen == 6 && if_inst->conditional_mod) {
-         fs_inst *cmp_inst = CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
-                                 if_inst->conditional_mod);
-         if_inst->insert_before(if_block, cmp_inst);
+         bld.at(if_block, if_inst)
+            .CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1],
+                 if_inst->conditional_mod);
          jump_inst->predicate = BRW_PREDICATE_NORMAL;
       } else {
          jump_inst->predicate = if_inst->predicate;
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
index 582d099..364fc4a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp
@@ -30,6 +30,8 @@
 #include "glsl/glsl_types.h"
 #include "glsl/ir_optimization.h"
 
+using namespace brw;
+
 static void
 assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
 {
@@ -468,14 +470,14 @@ fs_visitor::setup_payload_interference(struct ra_graph *g,
  * see if we can actually use MRFs to do spills without overwriting normal MRF
  * contents.
  */
-void
-fs_visitor::get_used_mrfs(bool *mrf_used)
+static void
+get_used_mrfs(fs_visitor *v, bool *mrf_used)
 {
-   int reg_width = dispatch_width / 8;
+   int reg_width = v->dispatch_width / 8;
 
    memset(mrf_used, 0, BRW_MAX_MRF * sizeof(bool));
 
-   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
       if (inst->dst.file == MRF) {
          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
          mrf_used[reg] = true;
@@ -489,7 +491,7 @@ fs_visitor::get_used_mrfs(bool *mrf_used)
       }
 
       if (inst->mlen > 0) {
-	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
+	 for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
             mrf_used[inst->base_mrf + i] = true;
          }
       }
@@ -500,12 +502,14 @@ fs_visitor::get_used_mrfs(bool *mrf_used)
  * Sets interference between virtual GRFs and usage of the high GRFs for SEND
  * messages (treated as MRFs in code generation).
  */
-void
-fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
+static void
+setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g,
+                            int first_mrf_node, int *first_used_mrf)
 {
    bool mrf_used[BRW_MAX_MRF];
-   get_used_mrfs(mrf_used);
+   get_used_mrfs(v, mrf_used);
 
+   *first_used_mrf = BRW_MAX_MRF;
    for (int i = 0; i < BRW_MAX_MRF; i++) {
       /* Mark each MRF reg node as being allocated to its physical register.
        *
@@ -518,7 +522,10 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
        * that are used as conflicting with all virtual GRFs.
        */
       if (mrf_used[i]) {
-         for (unsigned j = 0; j < this->alloc.count; j++) {
+         if (i < *first_used_mrf)
+            *first_used_mrf = i;
+
+         for (unsigned j = 0; j < v->alloc.count; j++) {
             ra_add_node_interference(g, first_mrf_node + i, j);
          }
       }
@@ -528,7 +535,6 @@ fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node)
 bool
 fs_visitor::assign_regs(bool allow_spilling)
 {
-   struct brw_compiler *compiler = brw->intelScreen->compiler;
    /* Most of this allocation was written for a reg_width of 1
     * (dispatch_width == 8).  In extending to SIMD16, the code was
     * left in place and it was converted to have the hardware
@@ -584,7 +590,9 @@ fs_visitor::assign_regs(bool allow_spilling)
 
    setup_payload_interference(g, payload_node_count, first_payload_node);
    if (devinfo->gen >= 7) {
-      setup_mrf_hack_interference(g, first_mrf_hack_node);
+      int first_used_mrf = BRW_MAX_MRF;
+      setup_mrf_hack_interference(this, g, first_mrf_hack_node,
+                                  &first_used_mrf);
 
       foreach_block_and_inst(block, fs_inst, inst, cfg) {
          /* When we do send-from-GRF for FB writes, we need to ensure that
@@ -600,6 +608,13 @@ fs_visitor::assign_regs(bool allow_spilling)
          if (inst->eot) {
             int size = alloc.sizes[inst->src[0].reg];
             int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
+
+            /* If something happened to spill, we want to push the EOT send
+             * register early enough in the register file that we don't
+             * conflict with any used MRF hack registers.
+             */
+            reg -= BRW_MAX_MRF - first_used_mrf;
+
             ra_set_node_reg(g, inst->src[0].reg, reg);
             break;
          }
@@ -696,25 +711,24 @@ fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
       dst.width = 16;
    }
 
+   const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
+                              .group(reg_size * 8, 0)
+                              .at(block, inst);
+
    for (int i = 0; i < count / reg_size; i++) {
       /* The gen7 descriptor-based offset is 12 bits of HWORD units. */
       bool gen7_read = devinfo->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE;
-
-      fs_inst *unspill_inst =
-         new(mem_ctx) fs_inst(gen7_read ?
-                              SHADER_OPCODE_GEN7_SCRATCH_READ :
-                              SHADER_OPCODE_GEN4_SCRATCH_READ,
-                              dst);
+      fs_inst *unspill_inst = ibld.emit(gen7_read ?
+                                        SHADER_OPCODE_GEN7_SCRATCH_READ :
+                                        SHADER_OPCODE_GEN4_SCRATCH_READ,
+                                        dst);
       unspill_inst->offset = spill_offset;
-      unspill_inst->ir = inst->ir;
-      unspill_inst->annotation = inst->annotation;
       unspill_inst->regs_written = reg_size;
 
       if (!gen7_read) {
          unspill_inst->base_mrf = 14;
          unspill_inst->mlen = 1; /* header contains offset */
       }
-      inst->insert_before(block, unspill_inst);
 
       dst.reg_offset += reg_size;
       spill_offset += reg_size * REG_SIZE;
@@ -732,17 +746,17 @@ fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
       reg_size = 2;
    }
 
+   const fs_builder ibld = bld.annotate(inst->annotation, inst->ir)
+                              .group(reg_size * 8, 0)
+                              .at(block, inst->next);
+
    for (int i = 0; i < count / reg_size; i++) {
       fs_inst *spill_inst =
-         new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
-                              reg_size * 8, reg_null_f, src);
+         ibld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE, bld.null_reg_f(), src);
       src.reg_offset += reg_size;
       spill_inst->offset = spill_offset + i * reg_size * REG_SIZE;
-      spill_inst->ir = inst->ir;
-      spill_inst->annotation = inst->annotation;
       spill_inst->mlen = 1 + reg_size; /* header, value */
       spill_inst->base_mrf = spill_base_mrf;
-      inst->insert_after(block, spill_inst);
    }
 }
 
@@ -839,7 +853,7 @@ fs_visitor::spill_reg(int spill_reg)
     */
    if (!spilled_any_registers) {
       bool mrf_used[BRW_MAX_MRF];
-      get_used_mrfs(mrf_used);
+      get_used_mrfs(this, mrf_used);
 
       for (int i = spill_base_mrf; i < BRW_MAX_MRF; i++) {
          if (mrf_used[i]) {
diff --git a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
index 52aa559..8660ec0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_sel_peephole.cpp
@@ -37,6 +37,8 @@
  */
 #define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */
 
+using namespace brw;
+
 /**
  * Scans forwards from an IF counting consecutive MOV instructions in the
  * "then" and "else" blocks of the if statement.
@@ -153,9 +155,6 @@ fs_visitor::opt_peephole_sel()
       if (movs == 0)
          continue;
 
-      fs_inst *sel_inst[MAX_MOVS] = { NULL };
-      fs_inst *mov_imm_inst[MAX_MOVS] = { NULL };
-
       enum brw_predicate predicate;
       bool predicate_inverse;
       if (devinfo->gen == 6 && if_inst->conditional_mod) {
@@ -188,9 +187,21 @@ fs_visitor::opt_peephole_sel()
             movs = i;
             break;
          }
+      }
+
+      if (movs == 0)
+         continue;
+
+      const fs_builder ibld = bld.at(block, if_inst);
 
+      /* Emit a CMP if our IF used the embedded comparison */
+      if (devinfo->gen == 6 && if_inst->conditional_mod)
+         ibld.CMP(ibld.null_reg_d(), if_inst->src[0], if_inst->src[1],
+                  if_inst->conditional_mod);
+
+      for (int i = 0; i < movs; i++) {
          if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) {
-            sel_inst[i] = MOV(then_mov[i]->dst, then_mov[i]->src[0]);
+            ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]);
          } else {
             /* Only the last source register can be a constant, so if the MOV
              * in the "then" clause uses a constant, we need to put it in a
@@ -200,29 +211,13 @@ fs_visitor::opt_peephole_sel()
             if (src0.file == IMM) {
                src0 = vgrf(glsl_type::float_type);
                src0.type = then_mov[i]->src[0].type;
-               mov_imm_inst[i] = MOV(src0, then_mov[i]->src[0]);
+               ibld.MOV(src0, then_mov[i]->src[0]);
             }
 
-            sel_inst[i] = SEL(then_mov[i]->dst, src0, else_mov[i]->src[0]);
-            sel_inst[i]->predicate = predicate;
-            sel_inst[i]->predicate_inverse = predicate_inverse;
+            set_predicate_inv(predicate, predicate_inverse,
+                              ibld.SEL(then_mov[i]->dst, src0,
+                                       else_mov[i]->src[0]));
          }
-      }
-
-      if (movs == 0)
-         continue;
-
-      /* Emit a CMP if our IF used the embedded comparison */
-      if (devinfo->gen == 6 && if_inst->conditional_mod) {
-         fs_inst *cmp_inst = CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
-                                 if_inst->conditional_mod);
-         if_inst->insert_before(block, cmp_inst);
-      }
-
-      for (int i = 0; i < movs; i++) {
-         if (mov_imm_inst[i])
-            if_inst->insert_before(block, mov_imm_inst[i]);
-         if_inst->insert_before(block, sel_inst[i]);
 
          then_mov[i]->remove(then_block);
          else_mov[i]->remove(else_block);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index e1f47d4..9a4bad6 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -47,6 +47,7 @@
 #include "glsl/ir_optimization.h"
 #include "program/sampler.h"
 
+using namespace brw;
 
 fs_reg *
 fs_visitor::emit_vs_system_value(int location)
@@ -76,1371 +77,6 @@ fs_visitor::emit_vs_system_value(int location)
    return reg;
 }
 
-void
-fs_visitor::visit(ir_variable *ir)
-{
-   fs_reg *reg = NULL;
-
-   if (variable_storage(ir))
-      return;
-
-   if (ir->data.mode == ir_var_shader_in) {
-      assert(ir->data.location != -1);
-      if (stage == MESA_SHADER_VERTEX) {
-         reg = new(this->mem_ctx)
-            fs_reg(ATTR, ir->data.location,
-                   brw_type_for_base_type(ir->type->get_scalar_type()));
-      } else if (ir->data.location == VARYING_SLOT_POS) {
-         reg = emit_fragcoord_interpolation(ir->data.pixel_center_integer,
-                                            ir->data.origin_upper_left);
-      } else if (ir->data.location == VARYING_SLOT_FACE) {
-	 reg = emit_frontfacing_interpolation();
-      } else {
-         reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
-         emit_general_interpolation(*reg, ir->name, ir->type,
-                                    (glsl_interp_qualifier) ir->data.interpolation,
-                                    ir->data.location, ir->data.centroid,
-                                    ir->data.sample);
-      }
-      assert(reg);
-      hash_table_insert(this->variable_ht, reg, ir);
-      return;
-   } else if (ir->data.mode == ir_var_shader_out) {
-      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
-
-      if (stage == MESA_SHADER_VERTEX) {
-	 int vector_elements =
-	    ir->type->is_array() ? ir->type->fields.array->vector_elements
-				 : ir->type->vector_elements;
-
-	 for (int i = 0; i < (type_size(ir->type) + 3) / 4; i++) {
-	    int output = ir->data.location + i;
-	    this->outputs[output] = *reg;
-	    this->outputs[output].reg_offset = i * 4;
-	    this->output_components[output] = vector_elements;
-	 }
-
-      } else if (ir->data.index > 0) {
-	 assert(ir->data.location == FRAG_RESULT_DATA0);
-	 assert(ir->data.index == 1);
-	 this->dual_src_output = *reg;
-         this->do_dual_src = true;
-      } else if (ir->data.location == FRAG_RESULT_COLOR) {
-	 /* Writing gl_FragColor outputs to all color regions. */
-         assert(stage == MESA_SHADER_FRAGMENT);
-         brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-	 for (unsigned int i = 0; i < MAX2(key->nr_color_regions, 1); i++) {
-	    this->outputs[i] = *reg;
-	    this->output_components[i] = 4;
-	 }
-      } else if (ir->data.location == FRAG_RESULT_DEPTH) {
-	 this->frag_depth = *reg;
-      } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
-         this->sample_mask = *reg;
-      } else {
-	 /* gl_FragData or a user-defined FS output */
-	 assert(ir->data.location >= FRAG_RESULT_DATA0 &&
-		ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);
-
-	 int vector_elements =
-	    ir->type->is_array() ? ir->type->fields.array->vector_elements
-				 : ir->type->vector_elements;
-
-	 /* General color output. */
-	 for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
-	    int output = ir->data.location - FRAG_RESULT_DATA0 + i;
-	    this->outputs[output] = offset(*reg, vector_elements * i);
-	    this->output_components[output] = vector_elements;
-	 }
-      }
-   } else if (ir->data.mode == ir_var_uniform) {
-      int param_index = uniforms;
-
-      /* Thanks to the lower_ubo_reference pass, we will see only
-       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
-       * variables, so no need for them to be in variable_ht.
-       *
-       * Some uniforms, such as samplers and atomic counters, have no actual
-       * storage, so we should ignore them.
-       */
-      if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
-         return;
-
-      if (dispatch_width == 16) {
-	 if (!variable_storage(ir)) {
-	    fail("Failed to find uniform '%s' in SIMD16\n", ir->name);
-	 }
-	 return;
-      }
-
-      param_size[param_index] = type_size(ir->type);
-      if (!strncmp(ir->name, "gl_", 3)) {
-	 setup_builtin_uniform_values(ir);
-      } else {
-	 setup_uniform_values(ir);
-      }
-
-      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
-      reg->type = brw_type_for_base_type(ir->type);
-
-   } else if (ir->data.mode == ir_var_system_value) {
-      switch (ir->data.location) {
-      case SYSTEM_VALUE_BASE_VERTEX:
-      case SYSTEM_VALUE_VERTEX_ID:
-      case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
-      case SYSTEM_VALUE_INSTANCE_ID:
-         reg = emit_vs_system_value(ir->data.location);
-         break;
-      case SYSTEM_VALUE_SAMPLE_POS:
-	 reg = emit_samplepos_setup();
-         break;
-      case SYSTEM_VALUE_SAMPLE_ID:
-	 reg = emit_sampleid_setup();
-         break;
-      case SYSTEM_VALUE_SAMPLE_MASK_IN:
-         assert(devinfo->gen >= 7);
-         reg = new(mem_ctx)
-            fs_reg(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
-                          BRW_REGISTER_TYPE_D));
-         break;
-      }
-   }
-
-   if (!reg)
-      reg = new(this->mem_ctx) fs_reg(vgrf(ir->type));
-
-   hash_table_insert(this->variable_ht, reg, ir);
-}
-
-void
-fs_visitor::visit(ir_dereference_variable *ir)
-{
-   fs_reg *reg = variable_storage(ir->var);
-
-   if (!reg) {
-      fail("Failed to find variable storage for %s\n", ir->var->name);
-      this->result = fs_reg(reg_null_d);
-      return;
-   }
-   this->result = *reg;
-}
-
-void
-fs_visitor::visit(ir_dereference_record *ir)
-{
-   const glsl_type *struct_type = ir->record->type;
-
-   ir->record->accept(this);
-
-   unsigned int off = 0;
-   for (unsigned int i = 0; i < struct_type->length; i++) {
-      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
-	 break;
-      off += type_size(struct_type->fields.structure[i].type);
-   }
-   this->result = offset(this->result, off);
-   this->result.type = brw_type_for_base_type(ir->type);
-}
-
-void
-fs_visitor::visit(ir_dereference_array *ir)
-{
-   ir_constant *constant_index;
-   fs_reg src;
-   int element_size = type_size(ir->type);
-
-   constant_index = ir->array_index->as_constant();
-
-   ir->array->accept(this);
-   src = this->result;
-   src.type = brw_type_for_base_type(ir->type);
-
-   if (constant_index) {
-      if (src.file == ATTR) {
-         /* Attribute arrays get loaded as one vec4 per element.  In that case
-          * offset the source register.
-          */
-         src.reg += constant_index->value.i[0];
-      } else {
-         assert(src.file == UNIFORM || src.file == GRF || src.file == HW_REG);
-         src = offset(src, constant_index->value.i[0] * element_size);
-      }
-   } else {
-      /* Variable index array dereference.  We attach the variable index
-       * component to the reg as a pointer to a register containing the
-       * offset.  Currently only uniform arrays are supported in this patch,
-       * and that reladdr pointer is resolved by
-       * move_uniform_array_access_to_pull_constants().  All other array types
-       * are lowered by lower_variable_index_to_cond_assign().
-       */
-      ir->array_index->accept(this);
-
-      fs_reg index_reg;
-      index_reg = vgrf(glsl_type::int_type);
-      emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));
-
-      if (src.reladdr) {
-         emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
-      }
-
-      src.reladdr = ralloc(mem_ctx, fs_reg);
-      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
-   }
-   this->result = src;
-}
-
-fs_inst *
-fs_visitor::emit_lrp(const fs_reg &dst, const fs_reg &x, const fs_reg &y,
-                     const fs_reg &a)
-{
-   if (devinfo->gen < 6) {
-      /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
-      fs_reg y_times_a           = vgrf(glsl_type::float_type);
-      fs_reg one_minus_a         = vgrf(glsl_type::float_type);
-      fs_reg x_times_one_minus_a = vgrf(glsl_type::float_type);
-
-      emit(MUL(y_times_a, y, a));
-
-      fs_reg negative_a = a;
-      negative_a.negate = !a.negate;
-      emit(ADD(one_minus_a, negative_a, fs_reg(1.0f)));
-      emit(MUL(x_times_one_minus_a, x, one_minus_a));
-
-      return emit(ADD(dst, x_times_one_minus_a, y_times_a));
-   } else {
-      /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
-       * we need to reorder the operands.
-       */
-      return emit(LRP(dst, a, y, x));
-   }
-}
-
-void
-fs_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, const fs_reg &dst,
-                        const fs_reg &src0, const fs_reg &src1)
-{
-   assert(conditionalmod == BRW_CONDITIONAL_GE ||
-          conditionalmod == BRW_CONDITIONAL_L);
-
-   fs_inst *inst;
-
-   if (devinfo->gen >= 6) {
-      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
-      inst->conditional_mod = conditionalmod;
-   } else {
-      emit(CMP(reg_null_d, src0, src1, conditionalmod));
-
-      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
-      inst->predicate = BRW_PREDICATE_NORMAL;
-   }
-}
-
-void
-fs_visitor::emit_uniformize(const fs_reg &dst, const fs_reg &src)
-{
-   const fs_reg chan_index = vgrf(glsl_type::uint_type);
-
-   emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, component(chan_index, 0))
-      ->force_writemask_all = true;
-   emit(SHADER_OPCODE_BROADCAST, component(dst, 0),
-        src, component(chan_index, 0))
-      ->force_writemask_all = true;
-}
-
-bool
-fs_visitor::try_emit_saturate(ir_expression *ir)
-{
-   if (ir->operation != ir_unop_saturate)
-      return false;
-
-   ir_rvalue *sat_val = ir->operands[0];
-
-   fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();
-
-   sat_val->accept(this);
-   fs_reg src = this->result;
-
-   fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();
-
-   /* If the last instruction from our accept() generated our
-    * src, just set the saturate flag instead of emmitting a separate mov.
-    */
-   fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
-   if (modify && modify->regs_written == modify->dst.width / 8 &&
-       modify->can_do_saturate()) {
-      modify->saturate = true;
-      this->result = src;
-      return true;
-   }
-
-   return false;
-}
-
-bool
-fs_visitor::try_emit_line(ir_expression *ir)
-{
-   /* LINE's src0 must be of type float. */
-   if (ir->type != glsl_type::float_type)
-      return false;
-
-   ir_rvalue *nonmul = ir->operands[1];
-   ir_expression *mul = ir->operands[0]->as_expression();
-
-   if (!mul || mul->operation != ir_binop_mul) {
-      nonmul = ir->operands[0];
-      mul = ir->operands[1]->as_expression();
-
-      if (!mul || mul->operation != ir_binop_mul)
-         return false;
-   }
-
-   ir_constant *const_add = nonmul->as_constant();
-   if (!const_add)
-      return false;
-
-   int add_operand_vf = brw_float_to_vf(const_add->value.f[0]);
-   if (add_operand_vf == -1)
-      return false;
-
-   ir_rvalue *non_const_mul = mul->operands[1];
-   ir_constant *const_mul = mul->operands[0]->as_constant();
-   if (!const_mul) {
-      const_mul = mul->operands[1]->as_constant();
-
-      if (!const_mul)
-         return false;
-
-      non_const_mul = mul->operands[0];
-   }
-
-   int mul_operand_vf = brw_float_to_vf(const_mul->value.f[0]);
-   if (mul_operand_vf == -1)
-      return false;
-
-   non_const_mul->accept(this);
-   fs_reg src1 = this->result;
-
-   fs_reg src0 = vgrf(ir->type);
-   emit(BRW_OPCODE_MOV, src0,
-        fs_reg((uint8_t)mul_operand_vf, 0, 0, (uint8_t)add_operand_vf));
-
-   this->result = vgrf(ir->type);
-   emit(BRW_OPCODE_LINE, this->result, src0, src1);
-   return true;
-}
-
-bool
-fs_visitor::try_emit_mad(ir_expression *ir)
-{
-   /* 3-src instructions were introduced in gen6. */
-   if (devinfo->gen < 6)
-      return false;
-
-   /* MAD can only handle floating-point data. */
-   if (ir->type != glsl_type::float_type)
-      return false;
-
-   ir_rvalue *nonmul;
-   ir_expression *mul;
-   bool mul_negate, mul_abs;
-
-   for (int i = 0; i < 2; i++) {
-      mul_negate = false;
-      mul_abs = false;
-
-      mul = ir->operands[i]->as_expression();
-      nonmul = ir->operands[1 - i];
-
-      if (mul && mul->operation == ir_unop_abs) {
-         mul = mul->operands[0]->as_expression();
-         mul_abs = true;
-      } else if (mul && mul->operation == ir_unop_neg) {
-         mul = mul->operands[0]->as_expression();
-         mul_negate = true;
-      }
-
-      if (mul && mul->operation == ir_binop_mul)
-         break;
-   }
-
-   if (!mul || mul->operation != ir_binop_mul)
-      return false;
-
-   nonmul->accept(this);
-   fs_reg src0 = this->result;
-
-   mul->operands[0]->accept(this);
-   fs_reg src1 = this->result;
-   src1.negate ^= mul_negate;
-   src1.abs = mul_abs;
-   if (mul_abs)
-      src1.negate = false;
-
-   mul->operands[1]->accept(this);
-   fs_reg src2 = this->result;
-   src2.abs = mul_abs;
-   if (mul_abs)
-      src2.negate = false;
-
-   this->result = vgrf(ir->type);
-   emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
-
-   return true;
-}
-
-bool
-fs_visitor::try_emit_b2f_of_comparison(ir_expression *ir)
-{
-   /* On platforms that do not natively generate 0u and ~0u for Boolean
-    * results, b2f expressions that look like
-    *
-    *     f = b2f(expr cmp 0)
-    *
-    * will generate better code by pretending the expression is
-    *
-    *     f = ir_triop_csel(0.0, 1.0, expr cmp 0)
-    *
-    * This is because the last instruction of "expr" can generate the
-    * condition code for the "cmp 0".  This avoids having to do the "-(b & 1)"
-    * trick to generate 0u or ~0u for the Boolean result.  This means code like
-    *
-    *     mov(16)         g16<1>F         1F
-    *     mul.ge.f0(16)   null            g6<8,8,1>F      g14<8,8,1>F
-    *     (+f0) sel(16)   m6<1>F          g16<8,8,1>F     0F
-    *
-    * will be generated instead of
-    *
-    *     mul(16)         g2<1>F          g12<8,8,1>F     g4<8,8,1>F
-    *     cmp.ge.f0(16)   g2<1>D          g4<8,8,1>F      0F
-    *     and(16)         g4<1>D          g2<8,8,1>D      1D
-    *     and(16)         m6<1>D          -g4<8,8,1>D     0x3f800000UD
-    *
-    * When the comparison is != 0.0 using the knowledge that the false case
-    * already results in zero would allow better code generation by possibly
-    * avoiding a load-immediate instruction.
-    */
-   ir_expression *cmp = ir->operands[0]->as_expression();
-   if (cmp == NULL)
-      return false;
-
-   if (cmp->operation == ir_binop_nequal) {
-      for (unsigned i = 0; i < 2; i++) {
-         ir_constant *c = cmp->operands[i]->as_constant();
-         if (c == NULL || !c->is_zero())
-            continue;
-
-         ir_expression *expr = cmp->operands[i ^ 1]->as_expression();
-         if (expr != NULL) {
-            fs_reg op[2];
-
-            for (unsigned j = 0; j < 2; j++) {
-               cmp->operands[j]->accept(this);
-               op[j] = this->result;
-
-               resolve_ud_negate(&op[j]);
-            }
-
-            emit_bool_to_cond_code_of_reg(cmp, op);
-
-            /* In this case we know when the condition is true, op[i ^ 1]
-             * contains zero.  Invert the predicate, use op[i ^ 1] as src0,
-             * and immediate 1.0f as src1.
-             */
-            this->result = vgrf(ir->type);
-            op[i ^ 1].type = BRW_REGISTER_TYPE_F;
-
-            fs_inst *inst = emit(SEL(this->result, op[i ^ 1], fs_reg(1.0f)));
-            inst->predicate = BRW_PREDICATE_NORMAL;
-            inst->predicate_inverse = true;
-            return true;
-         }
-      }
-   }
-
-   emit_bool_to_cond_code(cmp);
-
-   fs_reg temp = vgrf(ir->type);
-   emit(MOV(temp, fs_reg(1.0f)));
-
-   this->result = vgrf(ir->type);
-   fs_inst *inst = emit(SEL(this->result, temp, fs_reg(0.0f)));
-   inst->predicate = BRW_PREDICATE_NORMAL;
-
-   return true;
-}
-
-static int
-pack_pixel_offset(float x)
-{
-   /* Clamp upper end of the range to +7/16. See explanation in non-constant
-    * offset case below. */
-   int n = MIN2((int)(x * 16), 7);
-   return n & 0xf;
-}
-
-void
-fs_visitor::emit_interpolate_expression(ir_expression *ir)
-{
-   /* in SIMD16 mode, the pixel interpolator returns coords interleaved
-    * 8 channels at a time, same as the barycentric coords presented in
-    * the FS payload. this requires a bit of extra work to support.
-    */
-   no16("interpolate_at_* not yet supported in SIMD16 mode.");
-
-   assert(stage == MESA_SHADER_FRAGMENT);
-   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-
-   ir_dereference * deref = ir->operands[0]->as_dereference();
-   ir_swizzle * swiz = NULL;
-   if (!deref) {
-      /* the api does not allow a swizzle here, but the varying packing code
-       * may have pushed one into here.
-       */
-      swiz = ir->operands[0]->as_swizzle();
-      assert(swiz);
-      deref = swiz->val->as_dereference();
-   }
-   assert(deref);
-   ir_variable * var = deref->variable_referenced();
-   assert(var);
-
-   /* 1. collect interpolation factors */
-
-   fs_reg dst_xy = vgrf(glsl_type::get_instance(ir->type->base_type, 2, 1));
-
-   /* for most messages, we need one reg of ignored data; the hardware requires mlen==1
-    * even when there is no payload. in the per-slot offset case, we'll replace this with
-    * the proper source data. */
-   fs_reg src = vgrf(glsl_type::float_type);
-   int mlen = 1;     /* one reg unless overriden */
-   int reg_width = dispatch_width / 8;
-   fs_inst *inst;
-
-   switch (ir->operation) {
-   case ir_unop_interpolate_at_centroid:
-      inst = emit(FS_OPCODE_INTERPOLATE_AT_CENTROID, dst_xy, src, fs_reg(0u));
-      break;
-
-   case ir_binop_interpolate_at_sample: {
-      ir_constant *sample_num = ir->operands[1]->as_constant();
-      assert(sample_num || !"nonconstant sample number should have been lowered.");
-
-      unsigned msg_data = sample_num->value.i[0] << 4;
-      inst = emit(FS_OPCODE_INTERPOLATE_AT_SAMPLE, dst_xy, src, fs_reg(msg_data));
-      break;
-   }
-
-   case ir_binop_interpolate_at_offset: {
-      ir_constant *const_offset = ir->operands[1]->as_constant();
-      if (const_offset) {
-         unsigned msg_data = pack_pixel_offset(const_offset->value.f[0]) |
-                            (pack_pixel_offset(const_offset->value.f[1]) << 4);
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET, dst_xy, src,
-                     fs_reg(msg_data));
-      } else {
-         /* pack the operands: hw wants offsets as 4 bit signed ints */
-         ir->operands[1]->accept(this);
-         src = vgrf(glsl_type::ivec2_type);
-         fs_reg src2 = src;
-         for (int i = 0; i < 2; i++) {
-            fs_reg temp = vgrf(glsl_type::float_type);
-            emit(MUL(temp, this->result, fs_reg(16.0f)));
-            emit(MOV(src2, temp));  /* float to int */
-
-            /* Clamp the upper end of the range to +7/16. ARB_gpu_shader5 requires
-             * that we support a maximum offset of +0.5, which isn't representable
-             * in a S0.4 value -- if we didn't clamp it, we'd end up with -8/16,
-             * which is the opposite of what the shader author wanted.
-             *
-             * This is legal due to ARB_gpu_shader5's quantization rules:
-             *
-             * "Not all values of <offset> may be supported; x and y offsets may
-             * be rounded to fixed-point values with the number of fraction bits
-             * given by the implementation-dependent constant
-             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
-             */
-
-            fs_inst *inst = emit(BRW_OPCODE_SEL, src2, src2, fs_reg(7));
-            inst->conditional_mod = BRW_CONDITIONAL_L; /* min(src2, 7) */
-
-            src2 = offset(src2, 1);
-            this->result = offset(this->result, 1);
-         }
-
-         mlen = 2 * reg_width;
-         inst = emit(FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET, dst_xy, src,
-                     fs_reg(0u));
-      }
-      break;
-   }
-
-   default:
-      unreachable("not reached");
-   }
-
-   inst->mlen = mlen;
-   inst->regs_written = 2 * reg_width; /* 2 floats per slot returned */
-   inst->pi_noperspective = var->determine_interpolation_mode(key->flat_shade) ==
-         INTERP_QUALIFIER_NOPERSPECTIVE;
-
-   /* 2. emit linterp */
-
-   fs_reg res = vgrf(ir->type);
-   this->result = res;
-
-   for (int i = 0; i < ir->type->vector_elements; i++) {
-      int ch = swiz ? ((*(int *)&swiz->mask) >> 2*i) & 3 : i;
-      emit(FS_OPCODE_LINTERP, res, dst_xy,
-           fs_reg(interp_reg(var->data.location, ch)));
-      res = offset(res, 1);
-   }
-}
-
-void
-fs_visitor::visit(ir_expression *ir)
-{
-   unsigned int operand;
-   fs_reg op[3], temp;
-   fs_inst *inst;
-   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
-
-   assert(ir->get_num_operands() <= 3);
-
-   if (try_emit_saturate(ir))
-      return;
-
-   /* Deal with the real oddball stuff first */
-   switch (ir->operation) {
-   case ir_binop_add:
-      if (devinfo->gen <= 5 && try_emit_line(ir))
-         return;
-      if (try_emit_mad(ir))
-         return;
-      break;
-
-   case ir_triop_csel:
-      ir->operands[1]->accept(this);
-      op[1] = this->result;
-      ir->operands[2]->accept(this);
-      op[2] = this->result;
-
-      emit_bool_to_cond_code(ir->operands[0]);
-
-      this->result = vgrf(ir->type);
-      inst = emit(SEL(this->result, op[1], op[2]));
-      inst->predicate = BRW_PREDICATE_NORMAL;
-      return;
-
-   case ir_unop_b2f:
-      if (devinfo->gen <= 5 && try_emit_b2f_of_comparison(ir))
-         return;
-      break;
-
-   case ir_unop_interpolate_at_centroid:
-   case ir_binop_interpolate_at_offset:
-   case ir_binop_interpolate_at_sample:
-      emit_interpolate_expression(ir);
-      return;
-
-   default:
-      break;
-   }
-
-   for (operand = 0; operand < ir->get_num_operands(); operand++) {
-      ir->operands[operand]->accept(this);
-      if (this->result.file == BAD_FILE) {
-	 fail("Failed to get tree for expression operand:\n");
-	 ir->operands[operand]->fprint(stderr);
-         fprintf(stderr, "\n");
-      }
-      assert(this->result.file == GRF ||
-             this->result.file == UNIFORM || this->result.file == ATTR);
-      op[operand] = this->result;
-
-      /* Matrix expression operands should have been broken down to vector
-       * operations already.
-       */
-      assert(!ir->operands[operand]->type->is_matrix());
-      /* And then those vector operands should have been broken down to scalar.
-       */
-      assert(!ir->operands[operand]->type->is_vector());
-   }
-
-   /* Storage for our result.  If our result goes into an assignment, it will
-    * just get copy-propagated out, so no worries.
-    */
-   this->result = vgrf(ir->type);
-
-   switch (ir->operation) {
-   case ir_unop_logic_not:
-      emit(NOT(this->result, op[0]));
-      break;
-   case ir_unop_neg:
-      op[0].negate = !op[0].negate;
-      emit(MOV(this->result, op[0]));
-      break;
-   case ir_unop_abs:
-      op[0].abs = true;
-      op[0].negate = false;
-      emit(MOV(this->result, op[0]));
-      break;
-   case ir_unop_sign:
-      if (ir->type->is_float()) {
-         /* AND(val, 0x80000000) gives the sign bit.
-          *
-          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
-          * zero.
-          */
-         emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
-
-         op[0].type = BRW_REGISTER_TYPE_UD;
-         this->result.type = BRW_REGISTER_TYPE_UD;
-         emit(AND(this->result, op[0], fs_reg(0x80000000u)));
-
-         inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
-         inst->predicate = BRW_PREDICATE_NORMAL;
-
-         this->result.type = BRW_REGISTER_TYPE_F;
-      } else {
-         /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
-          *               -> non-negative val generates 0x00000000.
-          *  Predicated OR sets 1 if val is positive.
-          */
-         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));
-
-         emit(ASR(this->result, op[0], fs_reg(31)));
-
-         inst = emit(OR(this->result, this->result, fs_reg(1)));
-         inst->predicate = BRW_PREDICATE_NORMAL;
-      }
-      break;
-   case ir_unop_rcp:
-      emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
-      break;
-
-   case ir_unop_exp2:
-      emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
-      break;
-   case ir_unop_log2:
-      emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
-      break;
-   case ir_unop_exp:
-   case ir_unop_log:
-      unreachable("not reached: should be handled by ir_explog_to_explog2");
-   case ir_unop_sin:
-      emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
-      break;
-   case ir_unop_cos:
-      emit_math(SHADER_OPCODE_COS, this->result, op[0]);
-      break;
-
-   case ir_unop_dFdx:
-      /* Select one of the two opcodes based on the glHint value. */
-      if (fs_key->high_quality_derivatives)
-         emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
-      else
-         emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
-      break;
-
-   case ir_unop_dFdx_coarse:
-      emit(FS_OPCODE_DDX_COARSE, this->result, op[0]);
-      break;
-
-   case ir_unop_dFdx_fine:
-      emit(FS_OPCODE_DDX_FINE, this->result, op[0]);
-      break;
-
-   case ir_unop_dFdy:
-      /* Select one of the two opcodes based on the glHint value. */
-      if (fs_key->high_quality_derivatives)
-         emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
-      else
-         emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
-      break;
-
-   case ir_unop_dFdy_coarse:
-      emit(FS_OPCODE_DDY_COARSE, result, op[0], fs_reg(fs_key->render_to_fbo));
-      break;
-
-   case ir_unop_dFdy_fine:
-      emit(FS_OPCODE_DDY_FINE, result, op[0], fs_reg(fs_key->render_to_fbo));
-      break;
-
-   case ir_binop_add:
-      emit(ADD(this->result, op[0], op[1]));
-      break;
-   case ir_binop_sub:
-      unreachable("not reached: should be handled by ir_sub_to_add_neg");
-
-   case ir_binop_mul:
-      if (devinfo->gen < 8 && ir->type->is_integer()) {
-	 /* For integer multiplication, the MUL uses the low 16 bits
-	  * of one of the operands (src0 on gen6, src1 on gen7).  The
-	  * MACH accumulates in the contribution of the upper 16 bits
-	  * of that operand.
-          */
-         if (ir->operands[0]->is_uint16_constant()) {
-            if (devinfo->gen < 7)
-               emit(MUL(this->result, op[0], op[1]));
-            else
-               emit(MUL(this->result, op[1], op[0]));
-         } else if (ir->operands[1]->is_uint16_constant()) {
-            if (devinfo->gen < 7)
-               emit(MUL(this->result, op[1], op[0]));
-            else
-               emit(MUL(this->result, op[0], op[1]));
-         } else {
-            if (devinfo->gen >= 7)
-               no16("SIMD16 explicit accumulator operands unsupported\n");
-
-            struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                        this->result.type);
-
-            emit(MUL(acc, op[0], op[1]));
-            emit(MACH(reg_null_d, op[0], op[1]));
-            emit(MOV(this->result, fs_reg(acc)));
-         }
-      } else {
-	 emit(MUL(this->result, op[0], op[1]));
-      }
-      break;
-   case ir_binop_imul_high: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  this->result.type);
-
-      fs_inst *mul = emit(MUL(acc, op[0], op[1]));
-      emit(MACH(this->result, op[0], op[1]));
-
-      /* Until Gen8, integer multiplies read 32-bits from one source, and
-       * 16-bits from the other, and relying on the MACH instruction to
-       * generate the high bits of the result.
-       *
-       * On Gen8, the multiply instruction does a full 32x32-bit multiply,
-       * but in order to do a 64x64-bit multiply we have to simulate the
-       * previous behavior and then use a MACH instruction.
-       *
-       * FINISHME: Don't use source modifiers on src1.
-       */
-      if (devinfo->gen >= 8) {
-         assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
-                mul->src[1].type == BRW_REGISTER_TYPE_UD);
-         if (mul->src[1].type == BRW_REGISTER_TYPE_D) {
-            mul->src[1].type = BRW_REGISTER_TYPE_W;
-            mul->src[1].stride = 2;
-         } else {
-            mul->src[1].type = BRW_REGISTER_TYPE_UW;
-            mul->src[1].stride = 2;
-         }
-      }
-
-      break;
-   }
-   case ir_binop_div:
-      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
-      assert(ir->type->is_integer());
-      emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
-      break;
-   case ir_binop_carry: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  BRW_REGISTER_TYPE_UD);
-
-      emit(ADDC(reg_null_ud, op[0], op[1]));
-      emit(MOV(this->result, fs_reg(acc)));
-      break;
-   }
-   case ir_binop_borrow: {
-      if (devinfo->gen >= 7)
-         no16("SIMD16 explicit accumulator operands unsupported\n");
-
-      struct brw_reg acc = retype(brw_acc_reg(dispatch_width),
-                                  BRW_REGISTER_TYPE_UD);
-
-      emit(SUBB(reg_null_ud, op[0], op[1]));
-      emit(MOV(this->result, fs_reg(acc)));
-      break;
-   }
-   case ir_binop_mod:
-      /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
-      assert(ir->type->is_integer());
-      emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
-      break;
-
-   case ir_binop_less:
-   case ir_binop_greater:
-   case ir_binop_lequal:
-   case ir_binop_gequal:
-   case ir_binop_equal:
-   case ir_binop_all_equal:
-   case ir_binop_nequal:
-   case ir_binop_any_nequal:
-      if (devinfo->gen <= 5) {
-         resolve_bool_comparison(ir->operands[0], &op[0]);
-         resolve_bool_comparison(ir->operands[1], &op[1]);
-      }
-
-      emit(CMP(this->result, op[0], op[1],
-               brw_conditional_for_comparison(ir->operation)));
-      break;
-
-   case ir_binop_logic_xor:
-      emit(XOR(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_logic_or:
-      emit(OR(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_logic_and:
-      emit(AND(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_dot:
-   case ir_unop_any:
-      unreachable("not reached: should be handled by brw_fs_channel_expressions");
-
-   case ir_unop_noise:
-      unreachable("not reached: should be handled by lower_noise");
-
-   case ir_quadop_vector:
-      unreachable("not reached: should be handled by lower_quadop_vector");
-
-   case ir_binop_vector_extract:
-      unreachable("not reached: should be handled by lower_vec_index_to_cond_assign()");
-
-   case ir_triop_vector_insert:
-      unreachable("not reached: should be handled by lower_vector_insert()");
-
-   case ir_binop_ldexp:
-      unreachable("not reached: should be handled by ldexp_to_arith()");
-
-   case ir_unop_sqrt:
-      emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
-      break;
-
-   case ir_unop_rsq:
-      emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
-      break;
-
-   case ir_unop_bitcast_i2f:
-   case ir_unop_bitcast_u2f:
-      op[0].type = BRW_REGISTER_TYPE_F;
-      this->result = op[0];
-      break;
-   case ir_unop_i2u:
-   case ir_unop_bitcast_f2u:
-      op[0].type = BRW_REGISTER_TYPE_UD;
-      this->result = op[0];
-      break;
-   case ir_unop_u2i:
-   case ir_unop_bitcast_f2i:
-      op[0].type = BRW_REGISTER_TYPE_D;
-      this->result = op[0];
-      break;
-   case ir_unop_i2f:
-   case ir_unop_u2f:
-   case ir_unop_f2i:
-   case ir_unop_f2u:
-      emit(MOV(this->result, op[0]));
-      break;
-
-   case ir_unop_b2i:
-      emit(AND(this->result, op[0], fs_reg(1)));
-      break;
-   case ir_unop_b2f:
-      if (devinfo->gen <= 5) {
-         resolve_bool_comparison(ir->operands[0], &op[0]);
-      }
-      op[0].type = BRW_REGISTER_TYPE_D;
-      this->result.type = BRW_REGISTER_TYPE_D;
-      emit(AND(this->result, op[0], fs_reg(0x3f800000u)));
-      this->result.type = BRW_REGISTER_TYPE_F;
-      break;
-
-   case ir_unop_f2b:
-      emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
-      break;
-   case ir_unop_i2b:
-      emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
-      break;
-
-   case ir_unop_trunc:
-      emit(RNDZ(this->result, op[0]));
-      break;
-   case ir_unop_ceil: {
-         fs_reg tmp = vgrf(ir->type);
-         op[0].negate = !op[0].negate;
-         emit(RNDD(tmp, op[0]));
-         tmp.negate = true;
-         emit(MOV(this->result, tmp));
-      }
-      break;
-   case ir_unop_floor:
-      emit(RNDD(this->result, op[0]));
-      break;
-   case ir_unop_fract:
-      emit(FRC(this->result, op[0]));
-      break;
-   case ir_unop_round_even:
-      emit(RNDE(this->result, op[0]));
-      break;
-
-   case ir_binop_min:
-   case ir_binop_max:
-      resolve_ud_negate(&op[0]);
-      resolve_ud_negate(&op[1]);
-      emit_minmax(ir->operation == ir_binop_min ?
-                  BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
-                  this->result, op[0], op[1]);
-      break;
-   case ir_unop_pack_snorm_2x16:
-   case ir_unop_pack_snorm_4x8:
-   case ir_unop_pack_unorm_2x16:
-   case ir_unop_pack_unorm_4x8:
-   case ir_unop_unpack_snorm_2x16:
-   case ir_unop_unpack_snorm_4x8:
-   case ir_unop_unpack_unorm_2x16:
-   case ir_unop_unpack_unorm_4x8:
-   case ir_unop_unpack_half_2x16:
-   case ir_unop_pack_half_2x16:
-      unreachable("not reached: should be handled by lower_packing_builtins");
-   case ir_unop_unpack_half_2x16_split_x:
-      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
-      break;
-   case ir_unop_unpack_half_2x16_split_y:
-      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
-      break;
-   case ir_binop_pow:
-      emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
-      break;
-
-   case ir_unop_bitfield_reverse:
-      emit(BFREV(this->result, op[0]));
-      break;
-   case ir_unop_bit_count:
-      emit(CBIT(this->result, op[0]));
-      break;
-   case ir_unop_find_msb:
-      temp = vgrf(glsl_type::uint_type);
-      emit(FBH(temp, op[0]));
-
-      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
-       * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
-       * subtract the result from 31 to convert the MSB count into an LSB count.
-       */
-
-      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
-      emit(MOV(this->result, temp));
-      emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));
-
-      temp.negate = true;
-      inst = emit(ADD(this->result, temp, fs_reg(31)));
-      inst->predicate = BRW_PREDICATE_NORMAL;
-      break;
-   case ir_unop_find_lsb:
-      emit(FBL(this->result, op[0]));
-      break;
-   case ir_unop_saturate:
-      inst = emit(MOV(this->result, op[0]));
-      inst->saturate = true;
-      break;
-   case ir_triop_bitfield_extract:
-      /* Note that the instruction's argument order is reversed from GLSL
-       * and the IR.
-       */
-      emit(BFE(this->result, op[2], op[1], op[0]));
-      break;
-   case ir_binop_bfm:
-      emit(BFI1(this->result, op[0], op[1]));
-      break;
-   case ir_triop_bfi:
-      emit(BFI2(this->result, op[0], op[1], op[2]));
-      break;
-   case ir_quadop_bitfield_insert:
-      unreachable("not reached: should be handled by "
-              "lower_instructions::bitfield_insert_to_bfm_bfi");
-
-   case ir_unop_bit_not:
-      emit(NOT(this->result, op[0]));
-      break;
-   case ir_binop_bit_and:
-      emit(AND(this->result, op[0], op[1]));
-      break;
-   case ir_binop_bit_xor:
-      emit(XOR(this->result, op[0], op[1]));
-      break;
-   case ir_binop_bit_or:
-      emit(OR(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_lshift:
-      emit(SHL(this->result, op[0], op[1]));
-      break;
-
-   case ir_binop_rshift:
-      if (ir->type->base_type == GLSL_TYPE_INT)
-	 emit(ASR(this->result, op[0], op[1]));
-      else
-	 emit(SHR(this->result, op[0], op[1]));
-      break;
-   case ir_binop_pack_half_2x16_split:
-      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
-      break;
-   case ir_binop_ubo_load: {
-      /* This IR node takes a constant uniform block and a constant or
-       * variable byte offset within the block and loads a vector from that.
-       */
-      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
-      ir_constant *const_offset = ir->operands[1]->as_constant();
-      fs_reg surf_index;
-      uint32_t binding, set, index, set_index;
-
-      if (const_uniform_block) {
-         /* The block index is a constant, so just emit the binding table entry
-          * as an immediate.
-          */
-         index = const_uniform_block->value.u[0];
-         set = shader->base.UniformBlocks[index].Set;
-         set_index = shader->base.UniformBlocks[index].Binding;
-         binding = stage_prog_data->bind_map[set].index[set_index];
-         surf_index = fs_reg(binding);
-      } else {
-         assert(0 && "need more info from the ir for this.");
-
-         /* The block index is not a constant. Evaluate the index expression
-          * per-channel and add the base UBO index; we have to select a value
-          * from any live channel.
-          */
-         surf_index = vgrf(glsl_type::uint_type);
-         emit(ADD(surf_index, op[0],
-                  fs_reg(stage_prog_data->binding_table.ubo_start)));
-         emit_uniformize(surf_index, surf_index);
-
-         /* Assume this may touch any UBO. It would be nice to provide
-          * a tighter bound, but the array information is already lowered away.
-          */
-         brw_mark_surface_used(prog_data,
-                               stage_prog_data->binding_table.ubo_start +
-                               shader_prog->NumUniformBlocks - 1);
-      }
-
-      if (const_offset) {
-         fs_reg packed_consts = vgrf(glsl_type::float_type);
-         packed_consts.type = result.type;
-
-         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
-         emit(new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
-                                   packed_consts, surf_index, const_offset_reg));
-
-         for (int i = 0; i < ir->type->vector_elements; i++) {
-            packed_consts.set_smear(const_offset->value.u[0] % 16 / 4 + i);
-
-            /* The std140 packing rules don't allow vectors to cross 16-byte
-             * boundaries, and a reg is 32 bytes.
-             */
-            assert(packed_consts.subreg_offset < 32);
-
-            /* UBO bools are any nonzero value.  We consider bools to be
-             * values with the low bit set to 1.  Convert them using CMP.
-             */
-            if (ir->type->base_type == GLSL_TYPE_BOOL) {
-               emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
-            } else {
-               emit(MOV(result, packed_consts));
-            }
-
-            result = offset(result, 1);
-         }
-      } else {
-         /* Turn the byte offset into a dword offset. */
-         fs_reg base_offset = vgrf(glsl_type::int_type);
-         emit(SHR(base_offset, op[1], fs_reg(2)));
-
-         for (int i = 0; i < ir->type->vector_elements; i++) {
-            emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
-                                            base_offset, i));
-
-            if (ir->type->base_type == GLSL_TYPE_BOOL)
-               emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));
-
-            result = offset(result, 1);
-         }
-      }
-
-      result.reg_offset = 0;
-      break;
-   }
-
-   case ir_triop_fma:
-      /* Note that the instruction's argument order is reversed from GLSL
-       * and the IR.
-       */
-      emit(MAD(this->result, op[2], op[1], op[0]));
-      break;
-
-   case ir_triop_lrp:
-      emit_lrp(this->result, op[0], op[1], op[2]);
-      break;
-
-   case ir_triop_csel:
-   case ir_unop_interpolate_at_centroid:
-   case ir_binop_interpolate_at_offset:
-   case ir_binop_interpolate_at_sample:
-      unreachable("already handled above");
-      break;
-
-   case ir_unop_d2f:
-   case ir_unop_f2d:
-   case ir_unop_d2i:
-   case ir_unop_i2d:
-   case ir_unop_d2u:
-   case ir_unop_u2d:
-   case ir_unop_d2b:
-   case ir_unop_pack_double_2x32:
-   case ir_unop_unpack_double_2x32:
-   case ir_unop_frexp_sig:
-   case ir_unop_frexp_exp:
-      unreachable("fp64 todo");
-      break;
-   }
-}
-
-void
-fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
-				   const glsl_type *type, bool predicated)
-{
-   switch (type->base_type) {
-   case GLSL_TYPE_FLOAT:
-   case GLSL_TYPE_UINT:
-   case GLSL_TYPE_INT:
-   case GLSL_TYPE_BOOL:
-      for (unsigned int i = 0; i < type->components(); i++) {
-	 l.type = brw_type_for_base_type(type);
-	 r.type = brw_type_for_base_type(type);
-
-	 if (predicated || !l.equals(r)) {
-	    fs_inst *inst = emit(MOV(l, r));
-	    inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
-	 }
-
-	 l = offset(l, 1);
-	 r = offset(r, 1);
-      }
-      break;
-   case GLSL_TYPE_ARRAY:
-      for (unsigned int i = 0; i < type->length; i++) {
-	 emit_assignment_writes(l, r, type->fields.array, predicated);
-      }
-      break;
-
-   case GLSL_TYPE_STRUCT:
-      for (unsigned int i = 0; i < type->length; i++) {
-	 emit_assignment_writes(l, r, type->fields.structure[i].type,
-				predicated);
-      }
-      break;
-
-   case GLSL_TYPE_SAMPLER:
-   case GLSL_TYPE_IMAGE:
-   case GLSL_TYPE_ATOMIC_UINT:
-      break;
-
-   case GLSL_TYPE_DOUBLE:
-   case GLSL_TYPE_VOID:
-   case GLSL_TYPE_ERROR:
-   case GLSL_TYPE_INTERFACE:
-   case GLSL_TYPE_FUNCTION:
-      unreachable("not reached");
-   }
-}
-
-/* If the RHS processing resulted in an instruction generating a
- * temporary value, and it would be easy to rewrite the instruction to
- * generate its result right into the LHS instead, do so.  This ends
- * up reliably removing instructions where it can be tricky to do so
- * later without real UD chain information.
- */
-bool
-fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
-                                   fs_reg dst,
-                                   fs_reg src,
-                                   fs_inst *pre_rhs_inst,
-                                   fs_inst *last_rhs_inst)
-{
-   /* Only attempt if we're doing a direct assignment. */
-   if (ir->condition ||
-       !(ir->lhs->type->is_scalar() ||
-        (ir->lhs->type->is_vector() &&
-         ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
-      return false;
-
-   /* Make sure the last instruction generated our source reg. */
-   fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
-						    last_rhs_inst,
-						    src);
-   if (!modify)
-      return false;
-
-   /* If last_rhs_inst wrote a different number of components than our LHS,
-    * we can't safely rewrite it.
-    */
-   if (alloc.sizes[dst.reg] != modify->regs_written)
-      return false;
-
-   /* Success!  Rewrite the instruction. */
-   modify->dst = dst;
-
-   return true;
-}
-
-void
-fs_visitor::visit(ir_assignment *ir)
-{
-   fs_reg l, r;
-   fs_inst *inst;
-
-   /* FINISHME: arrays on the lhs */
-   ir->lhs->accept(this);
-   l = this->result;
-
-   fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();
-
-   ir->rhs->accept(this);
-   r = this->result;
-
-   fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();
-
-   assert(l.file != BAD_FILE);
-   assert(r.file != BAD_FILE);
-
-   if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
-      return;
-
-   if (ir->condition) {
-      emit_bool_to_cond_code(ir->condition);
-   }
-
-   if (ir->lhs->type->is_scalar() ||
-       ir->lhs->type->is_vector()) {
-      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
-	 if (ir->write_mask & (1 << i)) {
-	    inst = emit(MOV(l, r));
-	    if (ir->condition)
-	       inst->predicate = BRW_PREDICATE_NORMAL;
-	    r = offset(r, 1);
-	 }
-	 l = offset(l, 1);
-      }
-   } else {
-      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
-   }
-}
-
 fs_inst *
 fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
                               fs_reg coordinate, int coord_components,
@@ -1458,7 +94,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
 
    if (shadow_c.file != BAD_FILE) {
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
 	 coordinate = offset(coordinate, 1);
       }
 
@@ -1466,7 +102,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
        * the unused slots must be zeroed.
        */
       for (int i = coord_components; i < 3; i++) {
-         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
       }
       mlen += 3;
 
@@ -1474,25 +110,25 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
 	 /* There's no plain shadow compare message, so we use shadow
 	  * compare with a bias of 0.0.
 	  */
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
 	 mlen++;
       } else if (op == ir_txb || op == ir_txl) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen), lod);
 	 mlen++;
       } else {
          unreachable("Should not get here.");
       }
 
-      emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
+      bld.MOV(fs_reg(MRF, base_mrf + mlen), shadow_c);
       mlen++;
    } else if (op == ir_tex) {
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
 	 coordinate = offset(coordinate, 1);
       }
       /* zero the others. */
       for (int i = coord_components; i<3; i++) {
-         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f));
       }
       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
       mlen += 3;
@@ -1500,7 +136,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
       fs_reg &dPdx = lod;
 
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate);
 	 coordinate = offset(coordinate, 1);
       }
       /* the slots for u and v are always present, but r is optional */
@@ -1521,20 +157,20 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
        *        m5     m6     m7     m8     m9     m10
        */
       for (int i = 0; i < grad_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdx);
 	 dPdx = offset(dPdx, 1);
       }
       mlen += MAX2(grad_components, 2);
 
       for (int i = 0; i < grad_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen), dPdy);
 	 dPdy = offset(dPdy, 1);
       }
       mlen += MAX2(grad_components, 2);
    } else if (op == ir_txs) {
       /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
       simd16 = true;
-      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
+      bld.MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod);
       mlen += 2;
    } else {
       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
@@ -1544,8 +180,8 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
       assert(op == ir_txb || op == ir_txl || op == ir_txf);
 
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
-                  coordinate));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
+                 coordinate);
 	 coordinate = offset(coordinate, 1);
       }
 
@@ -1553,13 +189,13 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
        * be necessary for TXF (ld), but seems wise to do for all messages.
        */
       for (int i = coord_components; i < 3; i++) {
-	 emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
+         bld.MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f));
       }
 
       /* lod/bias appears after u/v/r. */
       mlen += 6;
 
-      emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
+      bld.MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod);
       mlen++;
 
       /* The unused upper half. */
@@ -1587,7 +223,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
       unreachable("not reached");
    }
 
-   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
+   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
    inst->base_mrf = base_mrf;
    inst->mlen = mlen;
    inst->header_size = 1;
@@ -1595,7 +231,7 @@ fs_visitor::emit_texture_gen4(ir_texture_opcode op, fs_reg dst,
 
    if (simd16) {
       for (int i = 0; i < 4; i++) {
-	 emit(MOV(orig_dst, dst));
+         bld.MOV(orig_dst, dst);
 	 orig_dst = offset(orig_dst, 1);
 	 dst = offset(dst, 2);
       }
@@ -1621,7 +257,7 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
 
    /* Copy the coordinates. */
    for (int i = 0; i < vector_elements; i++) {
-      emit(MOV(retype(offset(message, i), coordinate.type), coordinate));
+      bld.MOV(retype(offset(message, i), coordinate.type), coordinate);
       coordinate = offset(coordinate, 1);
    }
 
@@ -1630,20 +266,20 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
    /* Messages other than sample and ld require all three components */
    if (has_lod || shadow_c.file != BAD_FILE) {
       for (int i = vector_elements; i < 3; i++) {
-         emit(MOV(offset(message, i), fs_reg(0.0f)));
+         bld.MOV(offset(message, i), fs_reg(0.0f));
       }
    }
 
    if (has_lod) {
       fs_reg msg_lod = retype(offset(message, 3), op == ir_txf ?
                               BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
-      emit(MOV(msg_lod, lod));
+      bld.MOV(msg_lod, lod);
       msg_end = offset(msg_lod, 1);
    }
 
    if (shadow_c.file != BAD_FILE) {
       fs_reg msg_ref = offset(message, 3 + has_lod);
-      emit(MOV(msg_ref, shadow_c));
+      bld.MOV(msg_ref, shadow_c);
       msg_end = offset(msg_ref, 1);
    }
 
@@ -1658,7 +294,7 @@ fs_visitor::emit_texture_gen4_simd16(ir_texture_opcode op, fs_reg dst,
    default: unreachable("not reached");
    }
 
-   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
+   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
    inst->base_mrf = message.reg - 1;
    inst->mlen = msg_end.reg - inst->base_mrf;
    inst->header_size = 1;
@@ -1698,7 +334,7 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
    }
 
    for (int i = 0; i < vector_elements; i++) {
-      emit(MOV(retype(offset(msg_coords, i), coordinate.type), coordinate));
+      bld.MOV(retype(offset(msg_coords, i), coordinate.type), coordinate);
       coordinate = offset(coordinate, 1);
    }
    fs_reg msg_end = offset(msg_coords, vector_elements);
@@ -1706,7 +342,7 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
 
    if (shadow_c.file != BAD_FILE) {
       fs_reg msg_shadow = msg_lod;
-      emit(MOV(msg_shadow, shadow_c));
+      bld.MOV(msg_shadow, shadow_c);
       msg_lod = offset(msg_shadow, 1);
       msg_end = msg_lod;
    }
@@ -1717,13 +353,13 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
       opcode = SHADER_OPCODE_TEX;
       break;
    case ir_txb:
-      emit(MOV(msg_lod, lod));
+      bld.MOV(msg_lod, lod);
       msg_end = offset(msg_lod, 1);
 
       opcode = FS_OPCODE_TXB;
       break;
    case ir_txl:
-      emit(MOV(msg_lod, lod));
+      bld.MOV(msg_lod, lod);
       msg_end = offset(msg_lod, 1);
 
       opcode = SHADER_OPCODE_TXL;
@@ -1740,11 +376,11 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
        */
       msg_end = msg_lod;
       for (int i = 0; i < grad_components; i++) {
-         emit(MOV(msg_end, lod));
+         bld.MOV(msg_end, lod);
          lod = offset(lod, 1);
          msg_end = offset(msg_end, 1);
 
-         emit(MOV(msg_end, lod2));
+         bld.MOV(msg_end, lod2);
          lod2 = offset(lod2, 1);
          msg_end = offset(msg_end, 1);
       }
@@ -1754,21 +390,21 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
    }
    case ir_txs:
       msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
-      emit(MOV(msg_lod, lod));
+      bld.MOV(msg_lod, lod);
       msg_end = offset(msg_lod, 1);
 
       opcode = SHADER_OPCODE_TXS;
       break;
    case ir_query_levels:
       msg_lod = msg_end;
-      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
       msg_end = offset(msg_lod, 1);
 
       opcode = SHADER_OPCODE_TXS;
       break;
    case ir_txf:
       msg_lod = offset(msg_coords, 3);
-      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod));
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
       msg_end = offset(msg_lod, 1);
 
       opcode = SHADER_OPCODE_TXF;
@@ -1776,9 +412,9 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
    case ir_txf_ms:
       msg_lod = offset(msg_coords, 3);
       /* lod */
-      emit(MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), fs_reg(0u));
       /* sample index */
-      emit(MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index));
+      bld.MOV(retype(offset(msg_lod, 1), BRW_REGISTER_TYPE_UD), sample_index);
       msg_end = offset(msg_lod, 2);
 
       opcode = SHADER_OPCODE_TXF_CMS;
@@ -1793,7 +429,7 @@ fs_visitor::emit_texture_gen5(ir_texture_opcode op, fs_reg dst,
       unreachable("not reached");
    }
 
-   fs_inst *inst = emit(opcode, dst, reg_undef, fs_reg(sampler));
+   fs_inst *inst = bld.emit(opcode, dst, reg_undef, fs_reg(sampler));
    inst->base_mrf = message.reg;
    inst->mlen = msg_end.reg - message.reg;
    inst->header_size = header_size;
@@ -1851,7 +487,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    }
 
    if (shadow_c.file != BAD_FILE) {
-      emit(MOV(sources[length], shadow_c));
+      bld.MOV(sources[length], shadow_c);
       length++;
    }
 
@@ -1874,11 +510,11 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    case ir_lod:
       break;
    case ir_txb:
-      emit(MOV(sources[length], lod));
+      bld.MOV(sources[length], lod);
       length++;
       break;
    case ir_txl:
-      emit(MOV(sources[length], lod));
+      bld.MOV(sources[length], lod);
       length++;
       break;
    case ir_txd: {
@@ -1888,7 +524,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
        * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
        */
       for (int i = 0; i < coord_components; i++) {
-	 emit(MOV(sources[length], coordinate));
+         bld.MOV(sources[length], coordinate);
 	 coordinate = offset(coordinate, 1);
 	 length++;
 
@@ -1896,11 +532,11 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
           * only derivatives for (u, v, r).
           */
          if (i < grad_components) {
-            emit(MOV(sources[length], lod));
+            bld.MOV(sources[length], lod);
             lod = offset(lod, 1);
             length++;
 
-            emit(MOV(sources[length], lod2));
+            bld.MOV(sources[length], lod2);
             lod2 = offset(lod2, 1);
             length++;
          }
@@ -1910,11 +546,11 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
       break;
    }
    case ir_txs:
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
       length++;
       break;
    case ir_query_levels:
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u)));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), fs_reg(0u));
       length++;
       break;
    case ir_txf:
@@ -1922,23 +558,23 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
        * On Gen9 they are u, v, lod, r
        */
 
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
       coordinate = offset(coordinate, 1);
       length++;
 
       if (devinfo->gen >= 9) {
          if (coord_components >= 2) {
-            emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
             coordinate = offset(coordinate, 1);
          }
          length++;
       }
 
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
       length++;
 
       for (int i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++) {
-	 emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
 	 coordinate = offset(coordinate, 1);
 	 length++;
       }
@@ -1946,18 +582,18 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
       coordinate_done = true;
       break;
    case ir_txf_ms:
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
       length++;
 
       /* data from the multisample control surface */
-      emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs));
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
       length++;
 
       /* there is no offsetting for this message; just copy in the integer
        * texture coordinates
        */
       for (int i = 0; i < coord_components; i++) {
-         emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate));
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), coordinate);
          coordinate = offset(coordinate, 1);
          length++;
       }
@@ -1971,19 +607,19 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
 
          /* More crazy intermixing */
          for (int i = 0; i < 2; i++) { /* u, v */
-            emit(MOV(sources[length], coordinate));
+            bld.MOV(sources[length], coordinate);
             coordinate = offset(coordinate, 1);
             length++;
          }
 
          for (int i = 0; i < 2; i++) { /* offu, offv */
-            emit(MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value));
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), offset_value);
             offset_value = offset(offset_value, 1);
             length++;
          }
 
          if (coord_components == 3) { /* r if present */
-            emit(MOV(sources[length], coordinate));
+            bld.MOV(sources[length], coordinate);
             coordinate = offset(coordinate, 1);
             length++;
          }
@@ -1996,7 +632,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    /* Set up the coordinate (except for cases where it was done above) */
    if (!coordinate_done) {
       for (int i = 0; i < coord_components; i++) {
-         emit(MOV(sources[length], coordinate));
+         bld.MOV(sources[length], coordinate);
          coordinate = offset(coordinate, 1);
          length++;
       }
@@ -2010,7 +646,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
 
    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
                                BRW_REGISTER_TYPE_F, dispatch_width);
-   emit(LOAD_PAYLOAD(src_payload, sources, length, header_size));
+   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
 
    /* Generate the SEND */
    enum opcode opcode;
@@ -2033,7 +669,7 @@ fs_visitor::emit_texture_gen7(ir_texture_opcode op, fs_reg dst,
    default:
       unreachable("not reached");
    }
-   fs_inst *inst = emit(opcode, dst, src_payload, sampler);
+   fs_inst *inst = bld.emit(opcode, dst, src_payload, sampler);
    inst->base_mrf = -1;
    inst->mlen = mlen;
    inst->header_size = header_size;
@@ -2051,7 +687,6 @@ fs_reg
 fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
                              bool is_rect, uint32_t sampler, int texunit)
 {
-   fs_inst *inst = NULL;
    bool needs_gl_clamp = true;
    fs_reg scale_x, scale_y;
 
@@ -2110,10 +745,10 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
       fs_reg src = coordinate;
       coordinate = dst;
 
-      emit(MUL(dst, src, scale_x));
+      bld.MUL(dst, src, scale_x);
       dst = offset(dst, 1);
       src = offset(src, 1);
-      emit(MUL(dst, src, scale_y));
+      bld.MUL(dst, src, scale_y);
    } else if (is_rect) {
       /* On gen6+, the sampler handles the rectangle coordinates
        * natively, without needing rescaling.  But that means we have
@@ -2127,8 +762,8 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 	    fs_reg chan = coordinate;
 	    chan = offset(chan, i);
 
-	    inst = emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f));
-	    inst->conditional_mod = BRW_CONDITIONAL_GE;
+            set_condmod(BRW_CONDITIONAL_GE,
+                        bld.emit(BRW_OPCODE_SEL, chan, chan, fs_reg(0.0f)));
 
 	    /* Our parameter comes in as 1.0/width or 1.0/height,
 	     * because that's what people normally want for doing
@@ -2137,11 +772,11 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 	     * parameter type, so just invert back.
 	     */
 	    fs_reg limit = vgrf(glsl_type::float_type);
-	    emit(MOV(limit, i == 0 ? scale_x : scale_y));
-	    emit(SHADER_OPCODE_RCP, limit, limit);
+            bld.MOV(limit, i == 0 ? scale_x : scale_y);
+            bld.emit(SHADER_OPCODE_RCP, limit, limit);
 
-	    inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
-	    inst->conditional_mod = BRW_CONDITIONAL_L;
+            set_condmod(BRW_CONDITIONAL_L,
+                        bld.emit(BRW_OPCODE_SEL, chan, chan, limit));
 	 }
       }
    }
@@ -2151,9 +786,7 @@ fs_visitor::rescale_texcoord(fs_reg coordinate, int coord_components,
 	 if (key_tex->gl_clamp_mask[i] & (1 << sampler)) {
 	    fs_reg chan = coordinate;
 	    chan = offset(chan, i);
-
-	    fs_inst *inst = emit(MOV(chan, chan));
-	    inst->saturate = true;
+            set_saturate(true, bld.MOV(chan, chan));
 	 }
       }
    }
@@ -2173,13 +806,13 @@ fs_visitor::emit_mcs_fetch(fs_reg coordinate, int components, fs_reg sampler)
    /* parameters are: u, v, r; missing parameters are treated as zero */
    for (int i = 0; i < components; i++) {
       sources[i] = vgrf(glsl_type::float_type);
-      emit(MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate));
+      bld.MOV(retype(sources[i], BRW_REGISTER_TYPE_D), coordinate);
       coordinate = offset(coordinate, 1);
    }
 
-   emit(LOAD_PAYLOAD(payload, sources, components, 0));
+   bld.LOAD_PAYLOAD(payload, sources, components, 0);
 
-   fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
+   fs_inst *inst = bld.emit(SHADER_OPCODE_TXF_MCS, dest, payload, sampler);
    inst->base_mrf = -1;
    inst->mlen = components * reg_width;
    inst->header_size = 0;
@@ -2219,7 +852,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
          this->result = res;
 
          for (int i=0; i<4; i++) {
-            emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
+            bld.MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f));
             res = offset(res, 1);
          }
          return;
@@ -2276,7 +909,7 @@ fs_visitor::emit_texture(ir_texture_opcode op,
    if (op == ir_txs && is_cube_array) {
       fs_reg depth = offset(dst, 2);
       fs_reg fixed_depth = vgrf(glsl_type::int_type);
-      emit_math(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
+      bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, fs_reg(6));
 
       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
       int components = inst->regs_written / (dst.width / 8);
@@ -2287,167 +920,12 @@ fs_visitor::emit_texture(ir_texture_opcode op,
             fixed_payload[i] = offset(dst, i);
          }
       }
-      emit(LOAD_PAYLOAD(dst, fixed_payload, components, 0));
+      bld.LOAD_PAYLOAD(dst, fixed_payload, components, 0);
    }
 
    swizzle_result(op, dest_type->vector_elements, dst, sampler);
 }
 
-void
-fs_visitor::visit(ir_texture *ir)
-{
-   uint32_t sampler;
-
-   ir_dereference_variable *deref_var = ir->sampler->as_dereference_variable();
-   assert(deref_var);
-   ir_variable *var = deref_var->var;
-
-   sampler = stage_prog_data->bind_map[var->data.set].index[var->data.index];
-
-   ir_rvalue *nonconst_sampler_index =
-      _mesa_get_sampler_array_nonconst_index(ir->sampler);
-
-   /* Handle non-constant sampler array indexing */
-   fs_reg sampler_reg;
-   if (nonconst_sampler_index) {
-      /* The highest sampler which may be used by this operation is
-       * the last element of the array. Mark it here, because the generator
-       * doesn't have enough information to determine the bound.
-       */
-      uint32_t array_size = ir->sampler->as_dereference_array()
-         ->array->type->array_size();
-
-      uint32_t max_used = sampler + array_size - 1;
-      if (ir->op == ir_tg4 && devinfo->gen < 8) {
-         max_used += stage_prog_data->binding_table.gather_texture_start;
-      } else {
-         max_used += stage_prog_data->binding_table.texture_start;
-      }
-
-      brw_mark_surface_used(prog_data, max_used);
-
-      /* Emit code to evaluate the actual indexing expression */
-      nonconst_sampler_index->accept(this);
-      fs_reg temp = vgrf(glsl_type::uint_type);
-      emit(ADD(temp, this->result, fs_reg(sampler)));
-      emit_uniformize(temp, temp);
-
-      sampler_reg = temp;
-   } else {
-      /* Single sampler, or constant array index; the indexing expression
-       * is just an immediate.
-       */
-      sampler_reg = fs_reg(sampler);
-   }
-
-   /* FINISHME: We're failing to recompile our programs when the sampler is
-    * updated.  This only matters for the texture rectangle scale parameters
-    * (pre-gen6, or gen6+ with GL_CLAMP).
-    */
-   int texunit = prog->SamplerUnits[sampler];
-
-   /* Should be lowered by do_lower_texture_projection */
-   assert(!ir->projector);
-
-   /* Should be lowered */
-   assert(!ir->offset || !ir->offset->type->is_array());
-
-   /* Generate code to compute all the subexpression trees.  This has to be
-    * done before loading any values into MRFs for the sampler message since
-    * generating these values may involve SEND messages that need the MRFs.
-    */
-   fs_reg coordinate;
-   int coord_components = 0;
-   if (ir->coordinate) {
-      coord_components = ir->coordinate->type->vector_elements;
-      ir->coordinate->accept(this);
-      coordinate = this->result;
-   }
-
-   fs_reg shadow_comparitor;
-   if (ir->shadow_comparitor) {
-      ir->shadow_comparitor->accept(this);
-      shadow_comparitor = this->result;
-   }
-
-   fs_reg offset_value;
-   if (ir->offset) {
-      ir_constant *const_offset = ir->offset->as_constant();
-      if (const_offset) {
-         /* Store the header bitfield in an IMM register.  This allows us to
-          * use offset_value.file to distinguish between no offset, a constant
-          * offset, and a non-constant offset.
-          */
-         offset_value =
-            fs_reg(brw_texture_offset(const_offset->value.i,
-                                      const_offset->type->vector_elements));
-      } else {
-         ir->offset->accept(this);
-         offset_value = this->result;
-      }
-   }
-
-   fs_reg lod, lod2, sample_index, mcs;
-   int grad_components = 0;
-   switch (ir->op) {
-   case ir_tex:
-   case ir_lod:
-   case ir_tg4:
-   case ir_query_levels:
-      break;
-   case ir_txb:
-      ir->lod_info.bias->accept(this);
-      lod = this->result;
-      break;
-   case ir_txd:
-      ir->lod_info.grad.dPdx->accept(this);
-      lod = this->result;
-
-      ir->lod_info.grad.dPdy->accept(this);
-      lod2 = this->result;
-
-      grad_components = ir->lod_info.grad.dPdx->type->vector_elements;
-      break;
-   case ir_txf:
-   case ir_txl:
-   case ir_txs:
-      ir->lod_info.lod->accept(this);
-      lod = this->result;
-      break;
-   case ir_txf_ms:
-      ir->lod_info.sample_index->accept(this);
-      sample_index = this->result;
-
-      if (devinfo->gen >= 7 &&
-          key_tex->compressed_multisample_layout_mask & (1 << sampler)) {
-         mcs = emit_mcs_fetch(coordinate, ir->coordinate->type->vector_elements,
-                              sampler_reg);
-      } else {
-         mcs = fs_reg(0u);
-      }
-      break;
-   default:
-      unreachable("Unrecognized texture opcode");
-   };
-
-   int gather_component = 0;
-   if (ir->op == ir_tg4)
-      gather_component = ir->lod_info.component->as_constant()->value.i[0];
-
-   bool is_rect =
-      ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT;
-
-   bool is_cube_array =
-      ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
-      ir->sampler->type->sampler_array;
-
-   emit_texture(ir->op, ir->type, coordinate, coord_components,
-                shadow_comparitor, lod, lod2, grad_components,
-                sample_index, offset_value, mcs,
-                gather_component, is_cube_array, is_rect, sampler,
-                sampler_reg, texunit);
-}
-
 /**
  * Apply workarounds for Gen6 gather with UINT/SINT
  */
@@ -2462,16 +940,16 @@ fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
    for (int i = 0; i < 4; i++) {
       fs_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
       /* Convert from UNORM to UINT */
-      emit(MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1))));
-      emit(MOV(dst, dst_f));
+      bld.MUL(dst_f, dst_f, fs_reg((float)((1 << width) - 1)));
+      bld.MOV(dst, dst_f);
 
       if (wa & WA_SIGN) {
          /* Reinterpret the UINT value as a signed INT value by
           * shifting the sign bit into place, then shifting back
           * preserving sign.
           */
-         emit(SHL(dst, dst, fs_reg(32 - width)));
-         emit(ASR(dst, dst, fs_reg(32 - width)));
+         bld.SHL(dst, dst, fs_reg(32 - width));
+         bld.ASR(dst, dst, fs_reg(32 - width));
       }
 
       dst = offset(dst, 1);
@@ -2535,461 +1013,18 @@ fs_visitor::swizzle_result(ir_texture_opcode op, int dest_components,
 	 l = offset(l, i);
 
 	 if (swiz == SWIZZLE_ZERO) {
-	    emit(MOV(l, fs_reg(0.0f)));
+            bld.MOV(l, fs_reg(0.0f));
 	 } else if (swiz == SWIZZLE_ONE) {
-	    emit(MOV(l, fs_reg(1.0f)));
+            bld.MOV(l, fs_reg(1.0f));
 	 } else {
-            emit(MOV(l, offset(orig_val,
-                               GET_SWZ(key_tex->swizzles[sampler], i))));
+            bld.MOV(l, offset(orig_val,
+                              GET_SWZ(key_tex->swizzles[sampler], i)));
 	 }
       }
       this->result = swizzled_result;
    }
 }
 
-void
-fs_visitor::visit(ir_swizzle *ir)
-{
-   ir->val->accept(this);
-   fs_reg val = this->result;
-
-   if (ir->type->vector_elements == 1) {
-      this->result = offset(this->result, ir->mask.x);
-      return;
-   }
-
-   fs_reg result = vgrf(ir->type);
-   this->result = result;
-
-   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
-      fs_reg channel = val;
-      int swiz = 0;
-
-      switch (i) {
-      case 0:
-	 swiz = ir->mask.x;
-	 break;
-      case 1:
-	 swiz = ir->mask.y;
-	 break;
-      case 2:
-	 swiz = ir->mask.z;
-	 break;
-      case 3:
-	 swiz = ir->mask.w;
-	 break;
-      }
-
-      emit(MOV(result, offset(channel, swiz)));
-      result = offset(result, 1);
-   }
-}
-
-void
-fs_visitor::visit(ir_discard *ir)
-{
-   /* We track our discarded pixels in f0.1.  By predicating on it, we can
-    * update just the flag bits that aren't yet discarded.  If there's no
-    * condition, we emit a CMP of g0 != g0, so all currently executing
-    * channels will get turned off.
-    */
-   fs_inst *cmp;
-   if (ir->condition) {
-      emit_bool_to_cond_code(ir->condition);
-      cmp = (fs_inst *) this->instructions.get_tail();
-      cmp->conditional_mod = brw_negate_cmod(cmp->conditional_mod);
-   } else {
-      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
-                                      BRW_REGISTER_TYPE_UW));
-      cmp = emit(CMP(reg_null_f, some_reg, some_reg, BRW_CONDITIONAL_NZ));
-   }
-   cmp->predicate = BRW_PREDICATE_NORMAL;
-   cmp->flag_subreg = 1;
-
-   if (devinfo->gen >= 6) {
-      emit_discard_jump();
-   }
-}
-
-void
-fs_visitor::visit(ir_constant *ir)
-{
-   /* Set this->result to reg at the bottom of the function because some code
-    * paths will cause this visitor to be applied to other fields.  This will
-    * cause the value stored in this->result to be modified.
-    *
-    * Make reg constant so that it doesn't get accidentally modified along the
-    * way.  Yes, I actually had this problem. :(
-    */
-   const fs_reg reg = vgrf(ir->type);
-   fs_reg dst_reg = reg;
-
-   if (ir->type->is_array()) {
-      const unsigned size = type_size(ir->type->fields.array);
-
-      for (unsigned i = 0; i < ir->type->length; i++) {
-	 ir->array_elements[i]->accept(this);
-	 fs_reg src_reg = this->result;
-
-	 dst_reg.type = src_reg.type;
-	 for (unsigned j = 0; j < size; j++) {
-	    emit(MOV(dst_reg, src_reg));
-	    src_reg = offset(src_reg, 1);
-	    dst_reg = offset(dst_reg, 1);
-	 }
-      }
-   } else if (ir->type->is_record()) {
-      foreach_in_list(ir_constant, field, &ir->components) {
-	 const unsigned size = type_size(field->type);
-
-	 field->accept(this);
-	 fs_reg src_reg = this->result;
-
-	 dst_reg.type = src_reg.type;
-	 for (unsigned j = 0; j < size; j++) {
-	    emit(MOV(dst_reg, src_reg));
-	    src_reg = offset(src_reg, 1);
-	    dst_reg = offset(dst_reg, 1);
-	 }
-      }
-   } else {
-      const unsigned size = type_size(ir->type);
-
-      for (unsigned i = 0; i < size; i++) {
-	 switch (ir->type->base_type) {
-	 case GLSL_TYPE_FLOAT:
-	    emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
-	    break;
-	 case GLSL_TYPE_UINT:
-	    emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
-	    break;
-	 case GLSL_TYPE_INT:
-	    emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
-	    break;
-	 case GLSL_TYPE_BOOL:
-            emit(MOV(dst_reg, fs_reg(ir->value.b[i] != 0 ? ~0 : 0)));
-	    break;
-	 default:
-	    unreachable("Non-float/uint/int/bool constant");
-	 }
-	 dst_reg = offset(dst_reg, 1);
-      }
-   }
-
-   this->result = reg;
-}
-
-void
-fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
-{
-   ir_expression *expr = ir->as_expression();
-
-   if (!expr || expr->operation == ir_binop_ubo_load) {
-      ir->accept(this);
-
-      fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      return;
-   }
-
-   fs_reg op[3];
-
-   assert(expr->get_num_operands() <= 3);
-   for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
-      assert(expr->operands[i]->type->is_scalar());
-
-      expr->operands[i]->accept(this);
-      op[i] = this->result;
-
-      resolve_ud_negate(&op[i]);
-   }
-
-   emit_bool_to_cond_code_of_reg(expr, op);
-}
-
-void
-fs_visitor::emit_bool_to_cond_code_of_reg(ir_expression *expr, fs_reg op[3])
-{
-   fs_inst *inst;
-
-   switch (expr->operation) {
-   case ir_unop_logic_not:
-      inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
-      inst->conditional_mod = BRW_CONDITIONAL_Z;
-      break;
-
-   case ir_binop_logic_xor:
-      if (devinfo->gen <= 5) {
-         fs_reg temp = vgrf(expr->type);
-         emit(XOR(temp, op[0], op[1]));
-         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
-      } else {
-         inst = emit(XOR(reg_null_d, op[0], op[1]));
-      }
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      break;
-
-   case ir_binop_logic_or:
-      if (devinfo->gen <= 5) {
-         fs_reg temp = vgrf(expr->type);
-         emit(OR(temp, op[0], op[1]));
-         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
-      } else {
-         inst = emit(OR(reg_null_d, op[0], op[1]));
-      }
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      break;
-
-   case ir_binop_logic_and:
-      if (devinfo->gen <= 5) {
-         fs_reg temp = vgrf(expr->type);
-         emit(AND(temp, op[0], op[1]));
-         inst = emit(AND(reg_null_d, temp, fs_reg(1)));
-      } else {
-         inst = emit(AND(reg_null_d, op[0], op[1]));
-      }
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      break;
-
-   case ir_unop_f2b:
-      if (devinfo->gen >= 6) {
-         emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
-      } else {
-         inst = emit(MOV(reg_null_f, op[0]));
-         inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      }
-      break;
-
-   case ir_unop_i2b:
-      if (devinfo->gen >= 6) {
-         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
-      } else {
-         inst = emit(MOV(reg_null_d, op[0]));
-         inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      }
-      break;
-
-   case ir_binop_greater:
-   case ir_binop_gequal:
-   case ir_binop_less:
-   case ir_binop_lequal:
-   case ir_binop_equal:
-   case ir_binop_all_equal:
-   case ir_binop_nequal:
-   case ir_binop_any_nequal:
-      if (devinfo->gen <= 5) {
-         resolve_bool_comparison(expr->operands[0], &op[0]);
-         resolve_bool_comparison(expr->operands[1], &op[1]);
-      }
-
-      emit(CMP(reg_null_d, op[0], op[1],
-               brw_conditional_for_comparison(expr->operation)));
-      break;
-
-   case ir_triop_csel: {
-      /* Expand the boolean condition into the flag register. */
-      inst = emit(MOV(reg_null_d, op[0]));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-
-      /* Select which boolean to return. */
-      fs_reg temp = vgrf(expr->operands[1]->type);
-      inst = emit(SEL(temp, op[1], op[2]));
-      inst->predicate = BRW_PREDICATE_NORMAL;
-
-      /* Expand the result to a condition code. */
-      inst = emit(MOV(reg_null_d, temp));
-      inst->conditional_mod = BRW_CONDITIONAL_NZ;
-      break;
-   }
-
-   default:
-      unreachable("not reached");
-   }
-}
-
-/**
- * Emit a gen6 IF statement with the comparison folded into the IF
- * instruction.
- */
-void
-fs_visitor::emit_if_gen6(ir_if *ir)
-{
-   ir_expression *expr = ir->condition->as_expression();
-
-   if (expr && expr->operation != ir_binop_ubo_load) {
-      fs_reg op[3];
-      fs_inst *inst;
-      fs_reg temp;
-
-      assert(expr->get_num_operands() <= 3);
-      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
-	 assert(expr->operands[i]->type->is_scalar());
-
-	 expr->operands[i]->accept(this);
-	 op[i] = this->result;
-      }
-
-      switch (expr->operation) {
-      case ir_unop_logic_not:
-         emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_Z));
-         return;
-
-      case ir_binop_logic_xor:
-         emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
-         return;
-
-      case ir_binop_logic_or:
-         temp = vgrf(glsl_type::bool_type);
-         emit(OR(temp, op[0], op[1]));
-         emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
-         return;
-
-      case ir_binop_logic_and:
-         temp = vgrf(glsl_type::bool_type);
-         emit(AND(temp, op[0], op[1]));
-         emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
-         return;
-
-      case ir_unop_f2b:
-	 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
-	 inst->conditional_mod = BRW_CONDITIONAL_NZ;
-	 return;
-
-      case ir_unop_i2b:
-	 emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
-	 return;
-
-      case ir_binop_greater:
-      case ir_binop_gequal:
-      case ir_binop_less:
-      case ir_binop_lequal:
-      case ir_binop_equal:
-      case ir_binop_all_equal:
-      case ir_binop_nequal:
-      case ir_binop_any_nequal:
-         if (devinfo->gen <= 5) {
-            resolve_bool_comparison(expr->operands[0], &op[0]);
-            resolve_bool_comparison(expr->operands[1], &op[1]);
-         }
-
-	 emit(IF(op[0], op[1],
-                 brw_conditional_for_comparison(expr->operation)));
-	 return;
-
-      case ir_triop_csel: {
-         /* Expand the boolean condition into the flag register. */
-         fs_inst *inst = emit(MOV(reg_null_d, op[0]));
-         inst->conditional_mod = BRW_CONDITIONAL_NZ;
-
-         /* Select which boolean to use as the result. */
-         fs_reg temp = vgrf(expr->operands[1]->type);
-         inst = emit(SEL(temp, op[1], op[2]));
-         inst->predicate = BRW_PREDICATE_NORMAL;
-
-	 emit(IF(temp, fs_reg(0), BRW_CONDITIONAL_NZ));
-	 return;
-      }
-
-      default:
-	 unreachable("not reached");
-      }
-   }
-
-   ir->condition->accept(this);
-   emit(IF(this->result, fs_reg(0), BRW_CONDITIONAL_NZ));
-}
-
-bool
-fs_visitor::try_opt_frontfacing_ternary(ir_if *ir)
-{
-   ir_dereference_variable *deref = ir->condition->as_dereference_variable();
-   if (!deref || strcmp(deref->var->name, "gl_FrontFacing") != 0)
-      return false;
-
-   if (ir->then_instructions.length() != 1 ||
-       ir->else_instructions.length() != 1)
-      return false;
-
-   ir_assignment *then_assign =
-         ((ir_instruction *)ir->then_instructions.head)->as_assignment();
-   ir_assignment *else_assign =
-         ((ir_instruction *)ir->else_instructions.head)->as_assignment();
-
-   if (!then_assign || then_assign->condition ||
-       !else_assign || else_assign->condition ||
-       then_assign->write_mask != else_assign->write_mask ||
-       !then_assign->lhs->equals(else_assign->lhs))
-      return false;
-
-   ir_constant *then_rhs = then_assign->rhs->as_constant();
-   ir_constant *else_rhs = else_assign->rhs->as_constant();
-
-   if (!then_rhs || !else_rhs)
-      return false;
-
-   if (then_rhs->type->base_type != GLSL_TYPE_FLOAT)
-      return false;
-
-   if ((then_rhs->is_one() && else_rhs->is_negative_one()) ||
-       (else_rhs->is_one() && then_rhs->is_negative_one())) {
-      then_assign->lhs->accept(this);
-      fs_reg dst = this->result;
-      dst.type = BRW_REGISTER_TYPE_D;
-      fs_reg tmp = vgrf(glsl_type::int_type);
-
-      if (devinfo->gen >= 6) {
-         /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
-         fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
-
-         /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
-          *
-          *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
-          *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
-          *
-          * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
-          */
-
-         if (then_rhs->is_negative_one()) {
-            assert(else_rhs->is_one());
-            g0.negate = true;
-         }
-
-         tmp.type = BRW_REGISTER_TYPE_W;
-         tmp.subreg_offset = 2;
-         tmp.stride = 2;
-
-         fs_inst *or_inst = emit(OR(tmp, g0, fs_reg(0x3f80)));
-         or_inst->src[1].type = BRW_REGISTER_TYPE_UW;
-
-         tmp.type = BRW_REGISTER_TYPE_D;
-         tmp.subreg_offset = 0;
-         tmp.stride = 1;
-      } else {
-         /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
-         fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
-
-         /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
-          *
-          *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
-          *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
-          *
-          * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
-          */
-
-         if (then_rhs->is_negative_one()) {
-            assert(else_rhs->is_one());
-            g1_6.negate = true;
-         }
-
-         emit(OR(tmp, g1_6, fs_reg(0x3f800000)));
-      }
-      emit(AND(dst, tmp, fs_reg(0xbf800000)));
-      return true;
-   }
-
-   return false;
-}
-
 /**
  * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
  *
@@ -3056,21 +1091,21 @@ fs_visitor::try_replace_with_sel()
       if (src0.file == IMM) {
          src0 = vgrf(glsl_type::float_type);
          src0.type = then_mov->src[0].type;
-         emit(MOV(src0, then_mov->src[0]));
+         bld.MOV(src0, then_mov->src[0]);
       }
 
-      fs_inst *sel;
       if (if_inst->conditional_mod) {
          /* Sandybridge-specific IF with embedded comparison */
-         emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
-                  if_inst->conditional_mod));
-         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
-         sel->predicate = BRW_PREDICATE_NORMAL;
+         bld.CMP(bld.null_reg_d(), if_inst->src[0], if_inst->src[1],
+                 if_inst->conditional_mod);
+         set_predicate(BRW_PREDICATE_NORMAL,
+                       bld.emit(BRW_OPCODE_SEL, then_mov->dst,
+                                src0, else_mov->src[0]));
       } else {
          /* Separate CMP and IF instructions */
-         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
-         sel->predicate = if_inst->predicate;
-         sel->predicate_inverse = if_inst->predicate_inverse;
+         set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
+                           bld.emit(BRW_OPCODE_SEL, then_mov->dst,
+                                    src0, else_mov->src[0]));
       }
 
       return true;
@@ -3080,178 +1115,6 @@ fs_visitor::try_replace_with_sel()
 }
 
 void
-fs_visitor::visit(ir_if *ir)
-{
-   if (try_opt_frontfacing_ternary(ir))
-      return;
-
-   /* Don't point the annotation at the if statement, because then it plus
-    * the then and else blocks get printed.
-    */
-   this->base_ir = ir->condition;
-
-   if (devinfo->gen == 6) {
-      emit_if_gen6(ir);
-   } else {
-      emit_bool_to_cond_code(ir->condition);
-
-      emit(IF(BRW_PREDICATE_NORMAL));
-   }
-
-   foreach_in_list(ir_instruction, ir_, &ir->then_instructions) {
-      this->base_ir = ir_;
-      ir_->accept(this);
-   }
-
-   if (!ir->else_instructions.is_empty()) {
-      emit(BRW_OPCODE_ELSE);
-
-      foreach_in_list(ir_instruction, ir_, &ir->else_instructions) {
-	 this->base_ir = ir_;
-	 ir_->accept(this);
-      }
-   }
-
-   emit(BRW_OPCODE_ENDIF);
-
-   if (!try_replace_with_sel() && devinfo->gen < 6) {
-      no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
-}
-
-void
-fs_visitor::visit(ir_loop *ir)
-{
-   if (devinfo->gen < 6) {
-      no16("Can't support (non-uniform) control flow on SIMD16\n");
-   }
-
-   this->base_ir = NULL;
-   emit(BRW_OPCODE_DO);
-
-   foreach_in_list(ir_instruction, ir_, &ir->body_instructions) {
-      this->base_ir = ir_;
-      ir_->accept(this);
-   }
-
-   this->base_ir = NULL;
-   emit(BRW_OPCODE_WHILE);
-}
-
-void
-fs_visitor::visit(ir_loop_jump *ir)
-{
-   switch (ir->mode) {
-   case ir_loop_jump::jump_break:
-      emit(BRW_OPCODE_BREAK);
-      break;
-   case ir_loop_jump::jump_continue:
-      emit(BRW_OPCODE_CONTINUE);
-      break;
-   }
-}
-
-void
-fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
-{
-   ir_dereference *deref = static_cast<ir_dereference *>(
-      ir->actual_parameters.get_head());
-   ir_variable *location = deref->variable_referenced();
-   unsigned surf_index = (stage_prog_data->binding_table.abo_start +
-                          location->data.binding);
-
-   /* Calculate the surface offset */
-   fs_reg offset = vgrf(glsl_type::uint_type);
-   ir_dereference_array *deref_array = deref->as_dereference_array();
-
-   if (deref_array) {
-      deref_array->array_index->accept(this);
-
-      fs_reg tmp = vgrf(glsl_type::uint_type);
-      emit(MUL(tmp, this->result, fs_reg(ATOMIC_COUNTER_SIZE)));
-      emit(ADD(offset, tmp, fs_reg(location->data.atomic.offset)));
-   } else {
-      offset = fs_reg(location->data.atomic.offset);
-   }
-
-   /* Emit the appropriate machine instruction */
-   const char *callee = ir->callee->function_name();
-   ir->return_deref->accept(this);
-   fs_reg dst = this->result;
-
-   if (!strcmp("__intrinsic_atomic_read", callee)) {
-      emit_untyped_surface_read(surf_index, dst, offset);
-
-   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
-      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
-                          fs_reg(), fs_reg());
-
-   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
-      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
-                          fs_reg(), fs_reg());
-   }
-}
-
-void
-fs_visitor::visit(ir_call *ir)
-{
-   const char *callee = ir->callee->function_name();
-
-   if (!strcmp("__intrinsic_atomic_read", callee) ||
-       !strcmp("__intrinsic_atomic_increment", callee) ||
-       !strcmp("__intrinsic_atomic_predecrement", callee)) {
-      visit_atomic_counter_intrinsic(ir);
-   } else {
-      unreachable("Unsupported intrinsic.");
-   }
-}
-
-void
-fs_visitor::visit(ir_return *)
-{
-   unreachable("FINISHME");
-}
-
-void
-fs_visitor::visit(ir_function *ir)
-{
-   /* Ignore function bodies other than main() -- we shouldn't see calls to
-    * them since they should all be inlined before we get to ir_to_mesa.
-    */
-   if (strcmp(ir->name, "main") == 0) {
-      const ir_function_signature *sig;
-      exec_list empty;
-
-      sig = ir->matching_signature(NULL, &empty, false);
-
-      assert(sig);
-
-      foreach_in_list(ir_instruction, ir_, &sig->body) {
-	 this->base_ir = ir_;
-	 ir_->accept(this);
-      }
-   }
-}
-
-void
-fs_visitor::visit(ir_function_signature *)
-{
-   unreachable("not reached");
-}
-
-void
-fs_visitor::visit(ir_emit_vertex *)
-{
-   unreachable("not reached");
-}
-
-void
-fs_visitor::visit(ir_end_primitive *)
-{
-   unreachable("not reached");
-}
-
-void
 fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                                 fs_reg dst, fs_reg offset, fs_reg src0,
                                 fs_reg src1)
@@ -3263,17 +1126,16 @@ fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
 
    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    /* Initialize the sample mask in the message header. */
-   emit(MOV(sources[0], fs_reg(0u)))
-      ->force_writemask_all = true;
+   bld.exec_all().MOV(sources[0], fs_reg(0u));
 
    if (stage == MESA_SHADER_FRAGMENT) {
       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
-         emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
-            ->force_writemask_all = true;
+         bld.exec_all()
+            .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
       } else {
-         emit(MOV(component(sources[0], 7),
-                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
-            ->force_writemask_all = true;
+         bld.exec_all()
+            .MOV(component(sources[0], 7),
+                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
       }
    } else {
       /* The execution mask is part of the side-band information sent together with
@@ -3282,37 +1144,37 @@ fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
        * the atomic operation.
        */
       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
-      emit(MOV(component(sources[0], 7),
-               fs_reg(0xffffu)))->force_writemask_all = true;
+      bld.exec_all()
+         .MOV(component(sources[0], 7), fs_reg(0xffffu));
    }
    length++;
 
    /* Set the atomic operation offset. */
    sources[1] = vgrf(glsl_type::uint_type);
-   emit(MOV(sources[1], offset));
+   bld.MOV(sources[1], offset);
    length++;
 
    /* Set the atomic operation arguments. */
    if (src0.file != BAD_FILE) {
       sources[length] = vgrf(glsl_type::uint_type);
-      emit(MOV(sources[length], src0));
+      bld.MOV(sources[length], src0);
       length++;
    }
 
    if (src1.file != BAD_FILE) {
       sources[length] = vgrf(glsl_type::uint_type);
-      emit(MOV(sources[length], src1));
+      bld.MOV(sources[length], src1);
       length++;
    }
 
    int mlen = 1 + (length - 1) * reg_width;
    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
                                BRW_REGISTER_TYPE_UD, dispatch_width);
-   emit(LOAD_PAYLOAD(src_payload, sources, length, 1));
+   bld.LOAD_PAYLOAD(src_payload, sources, length, 1);
 
    /* Emit the instruction. */
-   fs_inst *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
-                        fs_reg(surf_index), fs_reg(atomic_op));
+   fs_inst *inst = bld.emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, src_payload,
+                            fs_reg(surf_index), fs_reg(atomic_op));
    inst->mlen = mlen;
 }
 
@@ -3326,17 +1188,17 @@ fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
 
    sources[0] = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
    /* Initialize the sample mask in the message header. */
-   emit(MOV(sources[0], fs_reg(0u)))
-      ->force_writemask_all = true;
+   bld.exec_all()
+      .MOV(sources[0], fs_reg(0u));
 
    if (stage == MESA_SHADER_FRAGMENT) {
       if (((brw_wm_prog_data*)this->prog_data)->uses_kill) {
-         emit(MOV(component(sources[0], 7), brw_flag_reg(0, 1)))
-            ->force_writemask_all = true;
+         bld.exec_all()
+            .MOV(component(sources[0], 7), brw_flag_reg(0, 1));
       } else {
-         emit(MOV(component(sources[0], 7),
-                  retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
-            ->force_writemask_all = true;
+         bld.exec_all()
+            .MOV(component(sources[0], 7),
+                 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD));
       }
    } else {
       /* The execution mask is part of the side-band information sent together with
@@ -3345,48 +1207,25 @@ fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
        * the atomic operation.
        */
       assert(stage == MESA_SHADER_VERTEX || stage == MESA_SHADER_COMPUTE);
-      emit(MOV(component(sources[0], 7),
-               fs_reg(0xffffu)))->force_writemask_all = true;
+      bld.exec_all()
+         .MOV(component(sources[0], 7), fs_reg(0xffffu));
    }
 
    /* Set the surface read offset. */
    sources[1] = vgrf(glsl_type::uint_type);
-   emit(MOV(sources[1], offset));
+   bld.MOV(sources[1], offset);
 
    int mlen = 1 + reg_width;
    fs_reg src_payload = fs_reg(GRF, alloc.allocate(mlen),
                                BRW_REGISTER_TYPE_UD, dispatch_width);
-   fs_inst *inst = emit(LOAD_PAYLOAD(src_payload, sources, 2, 1));
+   fs_inst *inst = bld.LOAD_PAYLOAD(src_payload, sources, 2, 1);
 
    /* Emit the instruction. */
-   inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
-               fs_reg(surf_index), fs_reg(1));
+   inst = bld.emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, src_payload,
+                   fs_reg(surf_index), fs_reg(1));
    inst->mlen = mlen;
 }
 
-fs_inst *
-fs_visitor::emit(fs_inst *inst)
-{
-   if (dispatch_width == 16 && inst->exec_size == 8)
-      inst->force_uncompressed = true;
-
-   inst->annotation = this->current_annotation;
-   inst->ir = this->base_ir;
-
-   this->instructions.push_tail(inst);
-
-   return inst;
-}
-
-void
-fs_visitor::emit(exec_list list)
-{
-   foreach_in_list_safe(fs_inst, inst, &list) {
-      inst->exec_node::remove();
-      emit(inst);
-   }
-}
-
 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
 void
 fs_visitor::emit_dummy_fs()
@@ -3396,12 +1235,12 @@ fs_visitor::emit_dummy_fs()
    /* Everyone's favorite color. */
    const float color[4] = { 1.0, 0.0, 1.0, 0.0 };
    for (int i = 0; i < 4; i++) {
-      emit(MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
-                      dispatch_width), fs_reg(color[i])));
+      bld.MOV(fs_reg(MRF, 2 + i * reg_width, BRW_REGISTER_TYPE_F,
+                     dispatch_width), fs_reg(color[i]));
    }
 
    fs_inst *write;
-   write = emit(FS_OPCODE_FB_WRITE);
+   write = bld.emit(FS_OPCODE_FB_WRITE);
    write->eot = true;
    if (devinfo->gen >= 6) {
       write->base_mrf = 2;
@@ -3454,19 +1293,19 @@ fs_visitor::emit_interpolation_setup_gen4()
 {
    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
 
-   this->current_annotation = "compute pixel centers";
+   fs_builder abld = bld.annotate("compute pixel centers");
    this->pixel_x = vgrf(glsl_type::uint_type);
    this->pixel_y = vgrf(glsl_type::uint_type);
    this->pixel_x.type = BRW_REGISTER_TYPE_UW;
    this->pixel_y.type = BRW_REGISTER_TYPE_UW;
-   emit(ADD(this->pixel_x,
+   abld.ADD(this->pixel_x,
             fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
-            fs_reg(brw_imm_v(0x10101010))));
-   emit(ADD(this->pixel_y,
+            fs_reg(brw_imm_v(0x10101010)));
+   abld.ADD(this->pixel_y,
             fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
-            fs_reg(brw_imm_v(0x11001100))));
+            fs_reg(brw_imm_v(0x11001100)));
 
-   this->current_annotation = "compute pixel deltas from v0";
+   abld = bld.annotate("compute pixel deltas from v0");
 
    this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
       vgrf(glsl_type::vec2_type);
@@ -3475,27 +1314,27 @@ fs_visitor::emit_interpolation_setup_gen4()
    const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
 
    if (devinfo->has_pln && dispatch_width == 16) {
-      emit(ADD(half(offset(delta_xy, 0), 0), half(this->pixel_x, 0), xstart));
-      emit(ADD(half(offset(delta_xy, 0), 1), half(this->pixel_y, 0), ystart));
-      emit(ADD(half(offset(delta_xy, 1), 0), half(this->pixel_x, 1), xstart))
-         ->force_sechalf = true;
-      emit(ADD(half(offset(delta_xy, 1), 1), half(this->pixel_y, 1), ystart))
-         ->force_sechalf = true;
+      for (unsigned i = 0; i < 2; i++) {
+         abld.half(i).ADD(half(offset(delta_xy, i), 0),
+                          half(this->pixel_x, i), xstart);
+         abld.half(i).ADD(half(offset(delta_xy, i), 1),
+                          half(this->pixel_y, i), ystart);
+      }
    } else {
-      emit(ADD(offset(delta_xy, 0), this->pixel_x, xstart));
-      emit(ADD(offset(delta_xy, 1), this->pixel_y, ystart));
+      abld.ADD(offset(delta_xy, 0), this->pixel_x, xstart);
+      abld.ADD(offset(delta_xy, 1), this->pixel_y, ystart);
    }
 
-   this->current_annotation = "compute pos.w and 1/pos.w";
+   abld = bld.annotate("compute pos.w and 1/pos.w");
    /* Compute wpos.w.  It's always in our setup, since it's needed to
     * interpolate the other attributes.
     */
    this->wpos_w = vgrf(glsl_type::float_type);
-   emit(FS_OPCODE_LINTERP, wpos_w, delta_xy, interp_reg(VARYING_SLOT_POS, 3));
+   abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
+             interp_reg(VARYING_SLOT_POS, 3));
    /* Compute the pixel 1/W value from wpos.w. */
    this->pixel_w = vgrf(glsl_type::float_type);
-   emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
-   this->current_annotation = NULL;
+   abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
 }
 
 /** Emits the interpolation for the varying inputs. */
@@ -3504,8 +1343,8 @@ fs_visitor::emit_interpolation_setup_gen6()
 {
    struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
 
-   this->current_annotation = "compute pixel centers";
-   if (brw->gen >= 8 || dispatch_width == 8) {
+   fs_builder abld = bld.annotate("compute pixel centers");
+   if (devinfo->gen >= 8 || dispatch_width == 8) {
       /* The "Register Region Restrictions" page says for BDW (and newer,
        * presumably):
        *
@@ -3518,15 +1357,15 @@ fs_visitor::emit_interpolation_setup_gen6()
        */
       fs_reg int_pixel_xy(GRF, alloc.allocate(dispatch_width / 8),
                           BRW_REGISTER_TYPE_UW, dispatch_width * 2);
-      emit(ADD(int_pixel_xy,
+      abld.exec_all()
+          .ADD(int_pixel_xy,
                fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
-               fs_reg(brw_imm_v(0x11001010))))
-         ->force_writemask_all = true;
+               fs_reg(brw_imm_v(0x11001010)));
 
       this->pixel_x = vgrf(glsl_type::float_type);
       this->pixel_y = vgrf(glsl_type::float_type);
-      emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
-      emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
+      abld.emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
+      abld.emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
    } else {
       /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
        *
@@ -3540,12 +1379,12 @@ fs_visitor::emit_interpolation_setup_gen6()
       fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
       int_pixel_x.type = BRW_REGISTER_TYPE_UW;
       int_pixel_y.type = BRW_REGISTER_TYPE_UW;
-      emit(ADD(int_pixel_x,
+      abld.ADD(int_pixel_x,
                fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
-               fs_reg(brw_imm_v(0x10101010))));
-      emit(ADD(int_pixel_y,
+               fs_reg(brw_imm_v(0x10101010)));
+      abld.ADD(int_pixel_y,
                fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
-               fs_reg(brw_imm_v(0x11001100))));
+               fs_reg(brw_imm_v(0x11001100)));
 
       /* As of gen6, we can no longer mix float and int sources.  We have
        * to turn the integer pixel centers into floats for their actual
@@ -3553,21 +1392,19 @@ fs_visitor::emit_interpolation_setup_gen6()
        */
       this->pixel_x = vgrf(glsl_type::float_type);
       this->pixel_y = vgrf(glsl_type::float_type);
-      emit(MOV(this->pixel_x, int_pixel_x));
-      emit(MOV(this->pixel_y, int_pixel_y));
+      abld.MOV(this->pixel_x, int_pixel_x);
+      abld.MOV(this->pixel_y, int_pixel_y);
    }
 
-   this->current_annotation = "compute pos.w";
+   abld = bld.annotate("compute pos.w");
    this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
    this->wpos_w = vgrf(glsl_type::float_type);
-   emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
+   abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
 
    for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
       uint8_t reg = payload.barycentric_coord_reg[i];
       this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
    }
-
-   this->current_annotation = NULL;
 }
 
 void
@@ -3581,7 +1418,7 @@ fs_visitor::setup_color_payload(fs_reg *dst, fs_reg color, unsigned components,
       fs_reg tmp = vgrf(glsl_type::vec4_type);
       assert(color.type == BRW_REGISTER_TYPE_F);
       for (unsigned i = 0; i < components; i++) {
-         inst = emit(MOV(offset(tmp, i), offset(color, i)));
+         inst = bld.MOV(offset(tmp, i), offset(color, i));
          inst->saturate = true;
       }
       color = tmp;
@@ -3627,7 +1464,7 @@ fs_visitor::emit_alpha_test()
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-   this->current_annotation = "Alpha test";
+   const fs_builder abld = bld.annotate("Alpha test");
 
    fs_inst *cmp;
    if (key->alpha_test_func == GL_ALWAYS)
@@ -3637,30 +1474,29 @@ fs_visitor::emit_alpha_test()
       /* f0.1 = 0 */
       fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                       BRW_REGISTER_TYPE_UW));
-      cmp = emit(CMP(reg_null_f, some_reg, some_reg,
-                     BRW_CONDITIONAL_NEQ));
+      cmp = abld.CMP(bld.null_reg_f(), some_reg, some_reg,
+                     BRW_CONDITIONAL_NEQ);
    } else {
       /* RT0 alpha */
       fs_reg color = offset(outputs[0], 3);
 
       /* f0.1 &= func(color, ref) */
-      cmp = emit(CMP(reg_null_f, color, fs_reg(key->alpha_test_ref),
-                     cond_for_alpha_func(key->alpha_test_func)));
+      cmp = abld.CMP(bld.null_reg_f(), color, fs_reg(key->alpha_test_ref),
+                     cond_for_alpha_func(key->alpha_test_func));
    }
    cmp->predicate = BRW_PREDICATE_NORMAL;
    cmp->flag_subreg = 1;
 }
 
 fs_inst *
-fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
+fs_visitor::emit_single_fb_write(const fs_builder &bld,
+                                 fs_reg color0, fs_reg color1,
                                  fs_reg src0_alpha, unsigned components,
                                  unsigned exec_size, bool use_2nd_half)
 {
    assert(stage == MESA_SHADER_FRAGMENT);
    brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
    brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
-
-   this->current_annotation = "FB write header";
    int header_size = 2, payload_header_size;
 
    /* We can potentially have a message length of up to 15, so we have to set
@@ -3691,22 +1527,23 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
 
    if (payload.aa_dest_stencil_reg) {
       sources[length] = fs_reg(GRF, alloc.allocate(1));
-      emit(MOV(sources[length],
-               fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0))));
+      bld.exec_all().annotate("FB write stencil/AA alpha")
+         .MOV(sources[length],
+              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
       length++;
    }
 
    prog_data->uses_omask =
       prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
    if (prog_data->uses_omask) {
-      this->current_annotation = "FB write oMask";
       assert(this->sample_mask.file != BAD_FILE);
       /* Hand over gl_SampleMask. Only lower 16 bits are relevant.  Since
        * it's unsinged single words, one vgrf is always 16-wide.
        */
       sources[length] = fs_reg(GRF, alloc.allocate(1),
                                BRW_REGISTER_TYPE_UW, 16);
-      emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
+      bld.exec_all().annotate("FB write oMask")
+         .emit(FS_OPCODE_SET_OMASK, sources[length], this->sample_mask);
       length++;
    }
 
@@ -3752,7 +1589,11 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
       if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
 	 /* Hand over gl_FragDepth. */
 	 assert(this->frag_depth.file != BAD_FILE);
-         sources[length] = this->frag_depth;
+         if (exec_size < dispatch_width) {
+            sources[length] = half(this->frag_depth, use_2nd_half);
+         } else {
+            sources[length] = this->frag_depth;
+         }
       } else {
 	 /* Pass through the payload depth. */
          sources[length] = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
@@ -3763,28 +1604,29 @@ fs_visitor::emit_single_fb_write(fs_reg color0, fs_reg color1,
    if (payload.dest_depth_reg)
       sources[length++] = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
 
+   const fs_builder ubld = bld.group(exec_size, use_2nd_half);
    fs_inst *load;
    fs_inst *write;
    if (devinfo->gen >= 7) {
       /* Send from the GRF */
       fs_reg payload = fs_reg(GRF, -1, BRW_REGISTER_TYPE_F, exec_size);
-      load = emit(LOAD_PAYLOAD(payload, sources, length, payload_header_size));
+      load = ubld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
       payload.reg = alloc.allocate(load->regs_written);
       load->dst = payload;
-      write = emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
+      write = ubld.emit(FS_OPCODE_FB_WRITE, reg_undef, payload);
       write->base_mrf = -1;
    } else {
       /* Send from the MRF */
-      load = emit(LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
-                               sources, length, payload_header_size));
+      load = ubld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F, exec_size),
+                               sources, length, payload_header_size);
 
       /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
        * will do this for us if we just give it a COMPR4 destination.
        */
-      if (brw->gen < 6 && exec_size == 16)
+      if (devinfo->gen < 6 && exec_size == 16)
          load->dst.reg |= BRW_MRF_COMPR4;
 
-      write = emit(FS_OPCODE_FB_WRITE);
+      write = ubld.emit(FS_OPCODE_FB_WRITE);
       write->exec_size = exec_size;
       write->base_mrf = 1;
    }
@@ -3807,10 +1649,10 @@ fs_visitor::emit_fb_writes()
 
    fs_inst *inst = NULL;
    if (do_dual_src) {
-      this->current_annotation = ralloc_asprintf(this->mem_ctx,
-						 "FB dual-source write");
-      inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
-                                  reg_undef, 4, 8);
+      const fs_builder abld = bld.annotate("FB dual-source write");
+
+      inst = emit_single_fb_write(abld, this->outputs[0],
+                                  this->dual_src_output, reg_undef, 4, 8);
       inst->target = 0;
 
       /* SIMD16 dual source blending requires to send two SIMD8 dual source
@@ -3831,8 +1673,9 @@ fs_visitor::emit_fb_writes()
        * m + 3: a1
        */
       if (dispatch_width == 16) {
-         inst = emit_single_fb_write(this->outputs[0], this->dual_src_output,
-                                     reg_undef, 4, 8, true);
+         inst = emit_single_fb_write(abld, this->outputs[0],
+                                     this->dual_src_output, reg_undef, 4, 8,
+                                     true);
          inst->target = 0;
       }
 
@@ -3843,14 +1686,14 @@ fs_visitor::emit_fb_writes()
          if (this->outputs[target].file == BAD_FILE)
             continue;
 
-         this->current_annotation = ralloc_asprintf(this->mem_ctx,
-                                                    "FB write target %d",
-                                                    target);
+         const fs_builder abld = bld.annotate(
+            ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
+
          fs_reg src0_alpha;
          if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
             src0_alpha = offset(outputs[0], 3);
 
-         inst = emit_single_fb_write(this->outputs[target], reg_undef,
+         inst = emit_single_fb_write(abld, this->outputs[target], reg_undef,
                                      src0_alpha,
                                      this->output_components[target],
                                      dispatch_width);
@@ -3863,19 +1706,17 @@ fs_visitor::emit_fb_writes()
        * alpha out the pipeline to our null renderbuffer to support
        * alpha-testing, alpha-to-coverage, and so on.
        */
-      inst = emit_single_fb_write(reg_undef, reg_undef, reg_undef, 0,
+      inst = emit_single_fb_write(bld, reg_undef, reg_undef, reg_undef, 0,
                                   dispatch_width);
       inst->target = 0;
    }
 
    inst->eot = true;
-   this->current_annotation = NULL;
 }
 
 void
-fs_visitor::setup_uniform_clipplane_values()
+fs_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
 {
-   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
    const struct brw_vue_prog_key *key =
       (const struct brw_vue_prog_key *) this->key;
 
@@ -3889,7 +1730,7 @@ fs_visitor::setup_uniform_clipplane_values()
    }
 }
 
-void fs_visitor::compute_clip_distance()
+void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
 {
    struct brw_vue_prog_data *vue_prog_data =
       (struct brw_vue_prog_data *) prog_data;
@@ -3918,9 +1759,9 @@ void fs_visitor::compute_clip_distance()
    if (outputs[clip_vertex].file == BAD_FILE)
       return;
 
-   setup_uniform_clipplane_values();
+   setup_uniform_clipplane_values(clip_planes);
 
-   current_annotation = "user clip distances";
+   const fs_builder abld = bld.annotate("user clip distances");
 
    this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
    this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
@@ -3930,16 +1771,16 @@ void fs_visitor::compute_clip_distance()
       fs_reg output = outputs[VARYING_SLOT_CLIP_DIST0 + i / 4];
       output.reg_offset = i & 3;
 
-      emit(MUL(output, outputs[clip_vertex], u));
+      abld.MUL(output, outputs[clip_vertex], u);
       for (int j = 1; j < 4; j++) {
          u.reg = userplane[i].reg + j;
-         emit(MAD(output, output, offset(outputs[clip_vertex], j), u));
+         abld.MAD(output, output, offset(outputs[clip_vertex], j), u);
       }
    }
 }
 
 void
-fs_visitor::emit_urb_writes()
+fs_visitor::emit_urb_writes(gl_clip_plane *clip_planes)
 {
    int slot, urb_offset, length;
    struct brw_vs_prog_data *vs_prog_data =
@@ -3954,18 +1795,17 @@ fs_visitor::emit_urb_writes()
 
    /* Lower legacy ff and ClipVertex clipping to clip distances */
    if (key->base.userclip_active && !prog->UsesClipDistanceOut)
-      compute_clip_distance();
+      compute_clip_distance(clip_planes);
 
    /* If we don't have any valid slots to write, just do a minimal urb write
     * send to terminate the shader. */
    if (vue_map->slots_valid == 0) {
 
       fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-      fs_inst *inst = emit(MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
-                                                      BRW_REGISTER_TYPE_UD))));
-      inst->force_writemask_all = true;
+      bld.exec_all().MOV(payload, fs_reg(retype(brw_vec8_grf(1, 0),
+                                                BRW_REGISTER_TYPE_UD)));
 
-      inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+      fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
       inst->eot = true;
       inst->mlen = 1;
       inst->offset = 1;
@@ -3994,7 +1834,7 @@ fs_visitor::emit_urb_writes()
          }
 
          zero = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-         emit(MOV(zero, fs_reg(0u)));
+         bld.MOV(zero, fs_reg(0u));
 
          sources[length++] = zero;
          if (vue_map->slots_valid & VARYING_BIT_LAYER)
@@ -4049,8 +1889,7 @@ fs_visitor::emit_urb_writes()
             for (int i = 0; i < 4; i++) {
                reg = fs_reg(GRF, alloc.allocate(1), outputs[varying].type);
                src = offset(this->outputs[varying], i);
-               fs_inst *inst = emit(MOV(reg, src));
-               inst->saturate = true;
+               set_saturate(true, bld.MOV(reg, src));
                sources[length++] = reg;
             }
          } else {
@@ -4060,7 +1899,7 @@ fs_visitor::emit_urb_writes()
          break;
       }
 
-      current_annotation = "URB write";
+      const fs_builder abld = bld.annotate("URB write");
 
       /* If we've queued up 8 registers of payload (2 VUE slots), if this is
        * the last slot or if we need to flush (see BAD_FILE varying case
@@ -4073,22 +1912,14 @@ fs_visitor::emit_urb_writes()
          fs_reg *payload_sources = ralloc_array(mem_ctx, fs_reg, length + 1);
          fs_reg payload = fs_reg(GRF, alloc.allocate(length + 1),
                                  BRW_REGISTER_TYPE_F, dispatch_width);
-
-         /* We need WE_all on the MOV for the message header (the URB handles)
-          * so do a MOV to a dummy register and set force_writemask_all on the
-          * MOV.  LOAD_PAYLOAD will preserve that.
-          */
-         fs_reg dummy = fs_reg(GRF, alloc.allocate(1),
-                               BRW_REGISTER_TYPE_UD);
-         fs_inst *inst = emit(MOV(dummy, fs_reg(retype(brw_vec8_grf(1, 0),
-                                                       BRW_REGISTER_TYPE_UD))));
-         inst->force_writemask_all = true;
-         payload_sources[0] = dummy;
+         payload_sources[0] =
+            fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
 
          memcpy(&payload_sources[1], sources, length * sizeof sources[0]);
-         emit(LOAD_PAYLOAD(payload, payload_sources, length + 1, 1));
+         abld.LOAD_PAYLOAD(payload, payload_sources, length + 1, 1);
 
-         inst = emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+         fs_inst *inst =
+            abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
          inst->eot = last;
          inst->mlen = length + 1;
          inst->offset = urb_offset;
@@ -4100,21 +1931,9 @@ fs_visitor::emit_urb_writes()
 }
 
 void
-fs_visitor::resolve_ud_negate(fs_reg *reg)
-{
-   if (reg->type != BRW_REGISTER_TYPE_UD ||
-       !reg->negate)
-      return;
-
-   fs_reg temp = vgrf(glsl_type::uint_type);
-   emit(MOV(temp, *reg));
-   *reg = temp;
-}
-
-void
 fs_visitor::emit_cs_terminate()
 {
-   assert(brw->gen >= 7);
+   assert(devinfo->gen >= 7);
 
    /* We are getting the thread ID from the compute shader header */
    assert(stage == MESA_SHADER_COMPUTE);
@@ -4125,94 +1944,53 @@ fs_visitor::emit_cs_terminate()
     */
    struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
    fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
-   fs_inst *inst = emit(MOV(payload, g0));
-   inst->force_writemask_all = true;
+   bld.exec_all().MOV(payload, g0);
 
    /* Send a message to the thread spawner to terminate the thread. */
-   inst = emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
+   fs_inst *inst = bld.exec_all()
+                      .emit(CS_OPCODE_CS_TERMINATE, reg_undef, payload);
    inst->eot = true;
 }
 
-/**
- * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
- *
- * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
- * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
- */
 void
-fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
+fs_visitor::emit_barrier()
 {
-   assert(devinfo->gen <= 5);
+   assert(devinfo->gen >= 7);
 
-   if (rvalue->type != glsl_type::bool_type)
-      return;
+   /* We are getting the barrier ID from the compute shader header */
+   assert(stage == MESA_SHADER_COMPUTE);
 
-   fs_reg and_result = vgrf(glsl_type::bool_type);
-   fs_reg neg_result = vgrf(glsl_type::bool_type);
-   emit(AND(and_result, *reg, fs_reg(1)));
-   emit(MOV(neg_result, negate(and_result)));
-   *reg = neg_result;
-}
+   fs_reg payload = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
 
-fs_visitor::fs_visitor(struct brw_context *brw,
-                       void *mem_ctx,
-                       const struct brw_wm_prog_key *key,
-                       struct brw_wm_prog_data *prog_data,
-                       struct gl_shader_program *shader_prog,
-                       struct gl_fragment_program *fp,
-                       unsigned dispatch_width)
-   : backend_visitor(brw, shader_prog, &fp->Base, &prog_data->base,
-                     MESA_SHADER_FRAGMENT),
-     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
-     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
-     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
-     key(key), prog_data(&prog_data->base),
-     dispatch_width(dispatch_width), promoted_constants(0)
-{
-   this->mem_ctx = mem_ctx;
-   init();
-}
+   /* Clear the message payload */
+   bld.exec_all().MOV(payload, fs_reg(0u));
 
-fs_visitor::fs_visitor(struct brw_context *brw,
-                       void *mem_ctx,
-                       const struct brw_vs_prog_key *key,
-                       struct brw_vs_prog_data *prog_data,
-                       struct gl_shader_program *shader_prog,
-                       struct gl_vertex_program *cp,
-                       unsigned dispatch_width)
-   : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base.base,
-                     MESA_SHADER_VERTEX),
-     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
-     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
-     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
-     key(key), prog_data(&prog_data->base.base),
-     dispatch_width(dispatch_width), promoted_constants(0)
-{
-   this->mem_ctx = mem_ctx;
-   init();
+   /* Copy bits 27:24 of r0.2 (barrier id) to the message payload reg.2 */
+   fs_reg r0_2 = fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
+   bld.exec_all().AND(component(payload, 2), r0_2, fs_reg(0x0f000000u));
+
+   /* Emit a gateway "barrier" message using the payload we set up, followed
+    * by a wait instruction.
+    */
+   bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, payload);
 }
 
-fs_visitor::fs_visitor(struct brw_context *brw,
+fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
                        void *mem_ctx,
-                       const struct brw_cs_prog_key *key,
-                       struct brw_cs_prog_data *prog_data,
+                       gl_shader_stage stage,
+                       const void *key,
+                       struct brw_stage_prog_data *prog_data,
                        struct gl_shader_program *shader_prog,
-                       struct gl_compute_program *cp,
-                       unsigned dispatch_width)
-   : backend_visitor(brw, shader_prog, &cp->Base, &prog_data->base,
-                     MESA_SHADER_COMPUTE),
-     reg_null_f(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_F)),
-     reg_null_d(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_D)),
-     reg_null_ud(retype(brw_null_vec(dispatch_width), BRW_REGISTER_TYPE_UD)),
-     key(key), prog_data(&prog_data->base),
-     dispatch_width(dispatch_width)
-{
-   this->mem_ctx = mem_ctx;
-   init();
-}
-
-void
-fs_visitor::init()
+                       struct gl_program *prog,
+                       unsigned dispatch_width,
+                       int shader_time_index)
+   : backend_shader(compiler, log_data, mem_ctx,
+                    shader_prog, prog, prog_data, stage),
+     key(key), prog_data(prog_data),
+     dispatch_width(dispatch_width),
+     shader_time_index(shader_time_index),
+     promoted_constants(0),
+     bld(fs_builder(this, dispatch_width).at_end())
 {
    switch (stage) {
    case MESA_SHADER_FRAGMENT:
@@ -4232,9 +2010,6 @@ fs_visitor::init()
    this->failed = false;
    this->simd16_unsupported = false;
    this->no16_msg = NULL;
-   this->variable_ht = hash_table_ctor(0,
-                                       hash_table_pointer_hash,
-                                       hash_table_pointer_compare);
 
    this->nir_locals = NULL;
    this->nir_globals = NULL;
@@ -4247,9 +2022,6 @@ fs_visitor::init()
    this->first_non_payload_grf = 0;
    this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
 
-   this->current_annotation = NULL;
-   this->base_ir = NULL;
-
    this->virtual_grf_start = NULL;
    this->virtual_grf_end = NULL;
    this->live_intervals = NULL;
@@ -4269,5 +2041,4 @@ fs_visitor::init()
 
 fs_visitor::~fs_visitor()
 {
-   hash_table_dtor(this->variable_ht);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
index a323e4d..0b8bfc3 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_surface_state.c
@@ -47,11 +47,12 @@ brw_upload_gs_pull_constants(struct brw_context *brw)
       return;
 
    /* BRW_NEW_GS_PROG_DATA */
-   const struct brw_stage_prog_data *prog_data = &brw->gs.prog_data->base.base;
+   const struct brw_vue_prog_data *prog_data = &brw->gs.prog_data->base;
+   const bool dword_pitch = prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
 
    /* _NEW_PROGRAM_CONSTANTS */
    brw_upload_pull_constants(brw, BRW_NEW_GS_CONSTBUF, &gp->program.Base,
-                             stage_state, prog_data, false);
+                             stage_state, &prog_data->base, dword_pitch);
 }
 
 const struct brw_tracked_state brw_gs_pull_constants = {
@@ -77,8 +78,11 @@ brw_upload_gs_ubo_surfaces(struct brw_context *brw)
       return;
 
    /* BRW_NEW_GS_PROG_DATA */
+   struct brw_vue_prog_data *prog_data = &brw->gs.prog_data->base;
+   bool dword_pitch = prog_data->dispatch_mode == DISPATCH_MODE_SIMD8;
+
    brw_upload_ubo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_GEOMETRY],
-			   &brw->gs.base, &brw->gs.prog_data->base.base, false);
+			   &brw->gs.base, &prog_data->base, dword_pitch);
 }
 
 const struct brw_tracked_state brw_gs_ubo_surfaces = {
diff --git a/src/mesa/drivers/dri/i965/brw_inst.h b/src/mesa/drivers/dri/i965/brw_inst.h
index e347c51..7a8c210 100644
--- a/src/mesa/drivers/dri/i965/brw_inst.h
+++ b/src/mesa/drivers/dri/i965/brw_inst.h
@@ -322,6 +322,9 @@ FJ(gen4_jump_count, 111,  96, devinfo->gen < 6)
 FC(gen4_pop_count,  115, 112, devinfo->gen < 6)
 /** @} */
 
+/* Message descriptor bits */
+#define MD(x) ((x) + 96)
+
 /**
  * Fields for SEND messages:
  *  @{
@@ -347,6 +350,7 @@ FF(header_present,
    /* 6:   */ 115, 115,
    /* 7:   */ 115, 115,
    /* 8:   */ 115, 115)
+F(gateway_notify, MD(16), MD(15))
 FF(function_control,
    /* 4:   */ 111,  96,
    /* 4.5: */ 111,  96,
@@ -354,6 +358,13 @@ FF(function_control,
    /* 6:   */ 114,  96,
    /* 7:   */ 114,  96,
    /* 8:   */ 114,  96)
+FF(gateway_subfuncid,
+   /* 4:   */ MD(1), MD(0),
+   /* 4.5: */ MD(1), MD(0),
+   /* 5:   */ MD(1), MD(0), /* 2:0, but bit 2 is reserved MBZ */
+   /* 6:   */ MD(2), MD(0),
+   /* 7:   */ MD(2), MD(0),
+   /* 8:   */ MD(2), MD(0))
 FF(sfid,
    /* 4:   */ 123, 120, /* called msg_target */
    /* 4.5  */ 123, 120,
@@ -364,9 +375,6 @@ FF(sfid,
 FC(base_mrf,   27,  24, devinfo->gen < 6);
 /** @} */
 
-/* Message descriptor bits */
-#define MD(x) (x + 96)
-
 /**
  * URB message function control bits:
  *  @{
diff --git a/src/mesa/drivers/dri/i965/brw_ir_fs.h b/src/mesa/drivers/dri/i965/brw_ir_fs.h
index f3dfe79..96dc20d 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_fs.h
@@ -131,14 +131,15 @@ horiz_offset(fs_reg reg, unsigned delta)
 static inline fs_reg
 offset(fs_reg reg, unsigned delta)
 {
-   assert(reg.stride > 0);
    switch (reg.file) {
    case BAD_FILE:
       break;
    case GRF:
    case MRF:
    case ATTR:
-      return byte_offset(reg, delta * reg.width * reg.stride * type_sz(reg.type));
+      return byte_offset(reg,
+                         delta * MAX2(reg.width * reg.stride, 1) *
+                         type_sz(reg.type));
    case UNIFORM:
       reg.reg_offset += delta;
       break;
@@ -155,6 +156,7 @@ component(fs_reg reg, unsigned idx)
    assert(idx < reg.width);
    reg.subreg_offset = idx * type_sz(reg.type);
    reg.width = 1;
+   reg.stride = 0;
    return reg;
 }
 
@@ -254,9 +256,62 @@ public:
    uint8_t exec_size;
 
    bool eot:1;
-   bool force_uncompressed:1;
    bool force_sechalf:1;
    bool pi_noperspective:1;   /**< Pixel interpolator noperspective flag */
 };
 
+/**
+ * Set second-half quarter control (force_sechalf) on \p inst; returns \p inst.
+ */
+static inline fs_inst *
+set_sechalf(fs_inst *inst)
+{
+   inst->force_sechalf = true;
+   return inst;
+}
+
+/**
+ * Make the execution of \p inst dependent on predicate \p pred, inverted when
+ * \p inverse is true.  Returns \p inst so the call can be nested in others.
+ */
+static inline fs_inst *
+set_predicate_inv(enum brw_predicate pred, bool inverse,
+                  fs_inst *inst)
+{
+   inst->predicate = pred;
+   inst->predicate_inverse = inverse;
+   return inst;
+}
+
+/**
+ * Make the execution of \p inst depend on non-inverted \p pred; returns it.
+ */
+static inline fs_inst *
+set_predicate(enum brw_predicate pred, fs_inst *inst)
+{
+   return set_predicate_inv(pred, false, inst);
+}
+
+/**
+ * Set the conditional modifier of \p inst to \p mod, i.e. write the result of
+ * evaluating that condition to a flag register.  Returns \p inst.
+ */
+static inline fs_inst *
+set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
+{
+   inst->conditional_mod = mod;
+   return inst;
+}
+
+/**
+ * Enable (or disable, when \p saturate is false) clamping of the result of
+ * \p inst to the saturation range of its destination type.  Returns \p inst.
+ */
+static inline fs_inst *
+set_saturate(bool saturate, fs_inst *inst)
+{
+   inst->saturate = saturate;
+   return inst;
+}
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
index a56fdd6..fceacae 100644
--- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h
@@ -190,6 +190,50 @@ public:
    }
 };
 
+/**
+ * Make the execution of \p inst dependent on predicate \p pred, inverted when
+ * \p inverse is true.  Returns \p inst so the call can be nested in others.
+ */
+inline vec4_instruction *
+set_predicate_inv(enum brw_predicate pred, bool inverse,
+                  vec4_instruction *inst)
+{
+   inst->predicate = pred;
+   inst->predicate_inverse = inverse;
+   return inst;
+}
+
+/**
+ * Make the execution of \p inst depend on non-inverted \p pred; returns it.
+ */
+inline vec4_instruction *
+set_predicate(enum brw_predicate pred, vec4_instruction *inst)
+{
+   return set_predicate_inv(pred, false, inst);
+}
+
+/**
+ * Set the conditional modifier of \p inst to \p mod, i.e. write the result of
+ * evaluating that condition to a flag register.  Returns \p inst.
+ */
+inline vec4_instruction *
+set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst)
+{
+   inst->conditional_mod = mod;
+   return inst;
+}
+
+/**
+ * Enable (or disable, when \p saturate is false) clamping of the result of
+ * \p inst to the saturation range of its destination type.  Returns \p inst.
+ */
+inline vec4_instruction *
+set_saturate(bool saturate, vec4_instruction *inst)
+{
+   inst->saturate = saturate;
+   return inst;
+}
+
 } /* namespace brw */
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
index 0424003..7a5f983 100644
--- a/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
+++ b/src/mesa/drivers/dri/i965/brw_lower_texture_gradients.cpp
@@ -89,19 +89,18 @@ txs_type(const glsl_type *type)
 ir_visitor_status
 lower_texture_grad_visitor::visit_leave(ir_texture *ir)
 {
-   /* Only lower textureGrad with shadow samplers */
-   if (ir->op != ir_txd || !ir->shadow_comparitor)
+   /* Only lower textureGrad with cube maps or shadow samplers */
+   if (ir->op != ir_txd ||
+      (ir->sampler->type->sampler_dimensionality != GLSL_SAMPLER_DIM_CUBE &&
+       !ir->shadow_comparitor))
       return visit_continue;
 
-   /* Lower textureGrad() with samplerCubeShadow even if we have the sample_d_c
+   /* Lower textureGrad() with samplerCube* even if we have the sample_d_c
     * message.  GLSL provides gradients for the 'r' coordinate.  Unfortunately:
     *
     * From the Ivybridge PRM, Volume 4, Part 1, sample_d message description:
     * "The r coordinate contains the faceid, and the r gradients are ignored
     *  by hardware."
-    *
-    * We likely need to do a similar treatment for samplerCube and
-    * samplerCubeArray, but we have insufficient testing for that at the moment.
     */
    bool need_lowering = !has_sample_d_c ||
       ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE;
@@ -155,9 +154,20 @@ lower_texture_grad_visitor::visit_leave(ir_texture *ir)
 			       expr(ir_unop_sqrt, dot(dPdy, dPdy)));
    }
 
-   /* lambda_base = log2(rho).  We're ignoring GL state biases for now. */
+   /* lambda_base = log2(rho).  We're ignoring GL state biases for now.
+    *
+    * For cube maps these formulas give us a value of rho that is twice the
+    * value we should use, so divide it by 2 or, equivalently, subtract 1
+    * from the result of the log2 computation (which is what is done below).
+    */
    ir->op = ir_txl;
-   ir->lod_info.lod = expr(ir_unop_log2, rho);
+   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE) {
+      ir->lod_info.lod = expr(ir_binop_add,
+                              expr(ir_unop_log2, rho),
+                              new(mem_ctx) ir_constant(-1.0f));
+   } else {
+      ir->lod_info.lod = expr(ir_unop_log2, rho);
+   }
 
    progress = true;
    return visit_continue;
diff --git a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
index 06916e2..49f2e3e 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_fast_clear.c
@@ -339,8 +339,13 @@ is_color_fast_clear_compatible(struct brw_context *brw,
                                mesa_format format,
                                const union gl_color_union *color)
 {
-   if (_mesa_is_format_integer_color(format))
+   if (_mesa_is_format_integer_color(format)) {
+      if (brw->gen >= 8) {
+         perf_debug("Integer fast clear not enabled for (%s)\n",
+                    _mesa_get_format_name(format));
+      }
       return false;
+   }
 
    for (int i = 0; i < 4; i++) {
       if (color->f[i] != 0.0 && color->f[i] != 1.0 &&
@@ -466,7 +471,8 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
        *      linear (untiled) memory is UNDEFINED."
        */
       if (irb->mt->tiling == I915_TILING_NONE) {
-         perf_debug("falling back to plain clear because buffers are untiled\n");
+         perf_debug("Falling back to plain clear because %dx%d buffer is untiled\n",
+                    irb->mt->logical_width0, irb->mt->logical_height0);
          clear_type = PLAIN_CLEAR;
       }
 
@@ -477,7 +483,8 @@ brw_meta_fast_clear(struct brw_context *brw, struct gl_framebuffer *fb,
       for (int i = 0; i < 4; i++) {
          if (_mesa_format_has_color_component(irb->mt->format, i) &&
              !color_mask[i]) {
-            perf_debug("falling back to plain clear because of color mask\n");
+            perf_debug("Falling back to plain clear on %dx%d buffer because of color mask\n",
+                       irb->mt->logical_width0, irb->mt->logical_height0);
             clear_type = PLAIN_CLEAR;
          }
       }
diff --git a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
index fc7018d..d079197 100644
--- a/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
+++ b/src/mesa/drivers/dri/i965/brw_meta_stencil_blit.c
@@ -414,6 +414,12 @@ brw_meta_stencil_blit(struct brw_context *brw,
    GLenum target;
 
    _mesa_meta_fb_tex_blit_begin(ctx, &blit);
+   /* XXX: Pretend to support stencil textures so _mesa_base_tex_format()
+    * returns a valid format.  When we properly support the extension, we
+    * should remove this.
+    */
+   assert(ctx->Extensions.ARB_texture_stencil8 == false);
+   ctx->Extensions.ARB_texture_stencil8 = true;
 
    _mesa_GenFramebuffers(1, &fbo);
    /* Force the surface to be configured for level zero. */
@@ -451,6 +457,7 @@ brw_meta_stencil_blit(struct brw_context *brw,
    _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
 error:
+   ctx->Extensions.ARB_texture_stencil8 = false;
    _mesa_meta_fb_tex_blit_end(ctx, target, &blit);
    _mesa_meta_end(ctx);
 
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 67a693b..5a4515b 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -39,6 +39,7 @@
 #include "brw_state.h"
 #include "brw_defines.h"
 
+#include "main/framebuffer.h"
 #include "main/fbobject.h"
 #include "main/glformats.h"
 
@@ -46,12 +47,14 @@
 static void upload_drawing_rect(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
+   const struct gl_framebuffer *fb = ctx->DrawBuffer;
+   const unsigned int fb_width = _mesa_geometric_width(fb);
+   const unsigned int fb_height = _mesa_geometric_height(fb);
 
    BEGIN_BATCH(4);
    OUT_BATCH(_3DSTATE_DRAWING_RECTANGLE << 16 | (4 - 2));
    OUT_BATCH(0); /* xmin, ymin */
-   OUT_BATCH(((ctx->DrawBuffer->Width - 1) & 0xffff) |
-	    ((ctx->DrawBuffer->Height - 1) << 16));
+   OUT_BATCH(((fb_width - 1) & 0xffff) | ((fb_height - 1) << 16));
    OUT_BATCH(0);
    ADVANCE_BATCH();
 }
@@ -767,7 +770,7 @@ static void upload_polygon_stipple_offset(struct brw_context *brw)
     * works just fine, and there's no window system to worry about.
     */
    if (_mesa_is_winsys_fbo(ctx->DrawBuffer))
-      OUT_BATCH((32 - (ctx->DrawBuffer->Height & 31)) & 31);
+      OUT_BATCH((32 - (_mesa_geometric_height(ctx->DrawBuffer) & 31)) & 31);
    else
       OUT_BATCH(0);
    ADVANCE_BATCH();
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index e4119b1..b7bb231 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -122,18 +122,9 @@ brw_create_nir(struct brw_context *brw,
    /* Get rid of split copies */
    nir_optimize(nir);
 
-   if (shader_prog) {
-      nir_assign_var_locations_scalar_direct_first(nir, &nir->uniforms,
-                                                   &nir->num_direct_uniforms,
-                                                   &nir->num_uniforms);
-   } else {
-      /* ARB programs generally create a giant array of "uniform" data, and allow
-       * indirect addressing without any boundaries.  In the absence of bounds
-       * analysis, it's all or nothing.  num_direct_uniforms is only useful when
-       * we have some direct and some indirect access; it doesn't matter here.
-       */
-      nir->num_direct_uniforms = 0;
-   }
+   nir_assign_var_locations_scalar_direct_first(nir, &nir->uniforms,
+                                                &nir->num_direct_uniforms,
+                                                &nir->num_uniforms);
    nir_assign_var_locations_scalar(&nir->inputs, &nir->num_inputs);
    nir_assign_var_locations_scalar(&nir->outputs, &nir->num_outputs);
 
@@ -176,6 +167,12 @@ brw_create_nir(struct brw_context *brw,
    nir_validate_shader(nir);
 
    if (unlikely(debug_enabled)) {
+      /* Re-index SSA defs so we print more sensible numbers. */
+      nir_foreach_overload(nir, overload) {
+         if (overload->impl)
+            nir_index_ssa_defs(overload->impl);
+      }
+
       fprintf(stderr, "NIR (SSA form) for %s shader:\n",
               _mesa_shader_stage_to_string(stage));
       nir_print_shader(nir, stderr);
diff --git a/src/mesa/drivers/dri/i965/brw_program.c b/src/mesa/drivers/dri/i965/brw_program.c
index b056fbf..ea128cc 100644
--- a/src/mesa/drivers/dri/i965/brw_program.c
+++ b/src/mesa/drivers/dri/i965/brw_program.c
@@ -88,7 +88,7 @@ static struct gl_program *brwNewProgram( struct gl_context *ctx,
 	 return NULL;
    }
 
-   case MESA_GEOMETRY_PROGRAM: {
+   case GL_GEOMETRY_PROGRAM_NV: {
       struct brw_geometry_program *prog = CALLOC_STRUCT(brw_geometry_program);
       if (prog) {
          prog->id = get_new_program_id(brw->intelScreen);
@@ -287,18 +287,24 @@ void brwInitFragProgFuncs( struct dd_function_table *functions )
    functions->MemoryBarrier = brw_memory_barrier;
 }
 
+struct shader_times {
+   uint64_t time;    /* summed from slot 0 of the entry in the shader time BO */
+   uint64_t written; /* summed from slot 1 of the entry */
+   uint64_t reset;   /* summed from slot 2 of the entry */
+};
+
 void
 brw_init_shader_time(struct brw_context *brw)
 {
-   const int max_entries = 4096;
-   brw->shader_time.bo = drm_intel_bo_alloc(brw->bufmgr, "shader time",
-                                            max_entries * SHADER_TIME_STRIDE,
-                                            4096);
+   const int max_entries = 2048;
+   brw->shader_time.bo =
+      drm_intel_bo_alloc(brw->bufmgr, "shader time",
+                         max_entries * SHADER_TIME_STRIDE * 3, 4096);
    brw->shader_time.names = rzalloc_array(brw, const char *, max_entries);
    brw->shader_time.ids = rzalloc_array(brw, int, max_entries);
    brw->shader_time.types = rzalloc_array(brw, enum shader_time_shader_type,
                                           max_entries);
-   brw->shader_time.cumulative = rzalloc_array(brw, uint64_t,
+   brw->shader_time.cumulative = rzalloc_array(brw, struct shader_times,
                                                max_entries);
    brw->shader_time.max_entries = max_entries;
 }
@@ -319,27 +325,6 @@ compare_time(const void *a, const void *b)
 }
 
 static void
-get_written_and_reset(struct brw_context *brw, int i,
-                      uint64_t *written, uint64_t *reset)
-{
-   enum shader_time_shader_type type = brw->shader_time.types[i];
-   assert(type == ST_VS || type == ST_GS || type == ST_FS8 ||
-          type == ST_FS16 || type == ST_CS);
-
-   /* Find where we recorded written and reset. */
-   int wi, ri;
-
-   for (wi = i; brw->shader_time.types[wi] != type + 1; wi++)
-      ;
-
-   for (ri = i; brw->shader_time.types[ri] != type + 2; ri++)
-      ;
-
-   *written = brw->shader_time.cumulative[wi];
-   *reset = brw->shader_time.cumulative[ri];
-}
-
-static void
 print_shader_time_line(const char *stage, const char *name,
                        int shader_num, uint64_t time, uint64_t total)
 {
@@ -374,26 +359,13 @@ brw_report_shader_time(struct brw_context *brw)
       sorted[i] = &scaled[i];
 
       switch (type) {
-      case ST_VS_WRITTEN:
-      case ST_VS_RESET:
-      case ST_GS_WRITTEN:
-      case ST_GS_RESET:
-      case ST_FS8_WRITTEN:
-      case ST_FS8_RESET:
-      case ST_FS16_WRITTEN:
-      case ST_FS16_RESET:
-      case ST_CS_WRITTEN:
-      case ST_CS_RESET:
-         /* We'll handle these when along with the time. */
-         scaled[i] = 0;
-         continue;
-
       case ST_VS:
       case ST_GS:
       case ST_FS8:
       case ST_FS16:
       case ST_CS:
-         get_written_and_reset(brw, i, &written, &reset);
+         written = brw->shader_time.cumulative[i].written;
+         reset = brw->shader_time.cumulative[i].reset;
          break;
 
       default:
@@ -405,7 +377,7 @@ brw_report_shader_time(struct brw_context *brw)
          break;
       }
 
-      uint64_t time = brw->shader_time.cumulative[i];
+      uint64_t time = brw->shader_time.cumulative[i].time;
       if (written) {
          scaled[i] = time / written * (written + reset);
       } else {
@@ -491,16 +463,19 @@ brw_collect_shader_time(struct brw_context *brw)
     * overhead compared to the cost of tracking the time in the first place.
     */
    drm_intel_bo_map(brw->shader_time.bo, true);
-
-   uint32_t *times = brw->shader_time.bo->virtual;
+   void *bo_map = brw->shader_time.bo->virtual;
 
    for (int i = 0; i < brw->shader_time.num_entries; i++) {
-      brw->shader_time.cumulative[i] += times[i * SHADER_TIME_STRIDE / 4];
+      /* Entry i spans 3 SHADER_TIME_STRIDE-byte slots; step via char * (ISO C). */
+      uint32_t *times = (uint32_t *)((char *)bo_map + i * 3 * SHADER_TIME_STRIDE);
+      brw->shader_time.cumulative[i].time += times[SHADER_TIME_STRIDE * 0 / 4];
+      brw->shader_time.cumulative[i].written += times[SHADER_TIME_STRIDE * 1 / 4];
+      brw->shader_time.cumulative[i].reset += times[SHADER_TIME_STRIDE * 2 / 4];
    }
 
    /* Zero the BO out to clear it out for our next collection.
     */
-   memset(times, 0, brw->shader_time.bo->size);
+   memset(bo_map, 0, brw->shader_time.bo->size);
    drm_intel_bo_unmap(brw->shader_time.bo);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
index 667c900..aea4d9b 100644
--- a/src/mesa/drivers/dri/i965/brw_queryobj.c
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -66,10 +66,20 @@ brw_write_timestamp(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
 void
 brw_write_depth_count(struct brw_context *brw, drm_intel_bo *query_bo, int idx)
 {
-   brw_emit_pipe_control_write(brw,
-                               PIPE_CONTROL_WRITE_DEPTH_COUNT
-                               | PIPE_CONTROL_DEPTH_STALL,
-                               query_bo, idx * sizeof(uint64_t), 0, 0);
+   uint32_t flags;
+
+   flags = (PIPE_CONTROL_WRITE_DEPTH_COUNT |
+            PIPE_CONTROL_DEPTH_STALL);
+
+   /* Needed to ensure the memory is coherent for the MI_LOAD_REGISTER_MEM
+    * command when loading the values into the predicate source registers for
+    * conditional rendering.
+    */
+   if (brw->predicate.supported)
+      flags |= PIPE_CONTROL_FLUSH_ENABLE;
+
+   brw_emit_pipe_control_write(brw, flags, query_bo,
+                               idx * sizeof(uint64_t), 0, 0);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h
index c03a8ae..c8b1341 100644
--- a/src/mesa/drivers/dri/i965/brw_reg.h
+++ b/src/mesa/drivers/dri/i965/brw_reg.h
@@ -765,6 +765,22 @@ brw_ip_reg(void)
 }
 
 static inline struct brw_reg
+brw_notification_reg(void)
+{
+   return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                  BRW_ARF_NOTIFICATION_COUNT,
+                  0,
+                  0,
+                  0,
+                  BRW_REGISTER_TYPE_UD,
+                  BRW_VERTICAL_STRIDE_0,
+                  BRW_WIDTH_1,
+                  BRW_HORIZONTAL_STRIDE_0,
+                  BRW_SWIZZLE_XXXX,
+                  WRITEMASK_X);
+}
+
+static inline struct brw_reg
 brw_acc_reg(unsigned width)
 {
    return brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE,
@@ -778,7 +794,11 @@ brw_flag_reg(int reg, int subreg)
                       BRW_ARF_FLAG + reg, subreg);
 }
 
-
+/**
+ * Return the mask register present in Gen4-5, or the related register present
+ * in Gen7.5 and later hardware referred to as "channel enable" register in
+ * the documentation.
+ */
 static inline struct brw_reg
 brw_mask_reg(unsigned subnr)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 34f75fd..ee0add5 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -399,10 +399,10 @@ schedule_node::set_latency_gen7(bool is_haswell)
 
 class instruction_scheduler {
 public:
-   instruction_scheduler(backend_visitor *v, int grf_count,
+   instruction_scheduler(backend_shader *s, int grf_count,
                          instruction_scheduler_mode mode)
    {
-      this->bv = v;
+      this->bs = s;
       this->mem_ctx = ralloc_context(NULL);
       this->grf_count = grf_count;
       this->instructions.make_empty();
@@ -455,7 +455,7 @@ public:
    int grf_count;
    int time;
    exec_list instructions;
-   backend_visitor *bv;
+   backend_shader *bs;
 
    instruction_scheduler_mode mode;
 
@@ -606,7 +606,7 @@ vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *b
 schedule_node::schedule_node(backend_instruction *inst,
                              instruction_scheduler *sched)
 {
-   const struct brw_device_info *devinfo = sched->bv->devinfo;
+   const struct brw_device_info *devinfo = sched->bs->devinfo;
 
    this->inst = inst;
    this->child_array_size = 0;
@@ -1384,7 +1384,7 @@ vec4_instruction_scheduler::issue_time(backend_instruction *inst)
 void
 instruction_scheduler::schedule_instructions(bblock_t *block)
 {
-   const struct brw_device_info *devinfo = bv->devinfo;
+   const struct brw_device_info *devinfo = bs->devinfo;
    backend_instruction *inst = block->end();
    time = 0;
 
@@ -1419,7 +1419,7 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
 
       if (debug) {
          fprintf(stderr, "clock %4d, scheduled: ", time);
-         bv->dump_instruction(chosen->inst);
+         bs->dump_instruction(chosen->inst);
       }
 
       /* Now that we've scheduled a new instruction, some of its
@@ -1435,7 +1435,7 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
 
          if (debug) {
             fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
-            bv->dump_instruction(child->inst);
+            bs->dump_instruction(child->inst);
          }
 
          child->cand_generation = cand_generation;
@@ -1474,7 +1474,7 @@ instruction_scheduler::run(cfg_t *cfg)
    if (debug) {
       fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
               post_reg_alloc);
-      bv->dump_instructions();
+      bs->dump_instructions();
    }
 
    /* Populate the remaining GRF uses array to improve the pre-regalloc
@@ -1504,7 +1504,7 @@ instruction_scheduler::run(cfg_t *cfg)
    if (debug) {
       fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
               post_reg_alloc);
-      bv->dump_instructions();
+      bs->dump_instructions();
    }
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index 014b434..5d98922 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -52,6 +52,12 @@ static void upload_sf_vp(struct brw_context *brw)
 			 sizeof(*sfv), 32, &brw->sf.vp_offset);
    memset(sfv, 0, sizeof(*sfv));
 
+   /* Accessing the fields Width and Height of gl_framebuffer to produce the
+    * values to program the viewport and scissor is fine as long as the
+    * gl_framebuffer has at least one attachment.
+    */
+   assert(ctx->DrawBuffer->_HasAttachments);
+
    if (render_to_fbo) {
       y_scale = 1.0;
       y_bias = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index ebfb49a..06393c8 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -32,16 +32,106 @@
 #include "glsl/glsl_parser_extras.h"
 #include "main/shaderapi.h"
 
+/* Debug-log hook: hands the message to _mesa_gl_vdebug as a notification. */
+static void
+shader_debug_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+   GLuint msg_id = 0;
+   va_list ap;
+
+   va_start(ap, fmt);
+   _mesa_gl_vdebug(&brw->ctx, &msg_id, MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                   MESA_DEBUG_TYPE_OTHER, MESA_DEBUG_SEVERITY_NOTIFICATION,
+                   fmt, ap);
+   va_end(ap);
+}
+
+/* Perf-log hook: prints to stderr under DEBUG_PERF and, when brw->perf_debug
+ * is set, also reports the message through _mesa_gl_vdebug. */
+static void
+shader_perf_log_mesa(void *data, const char *fmt, ...)
+{
+   struct brw_context *brw = (struct brw_context *)data;
+   va_list ap;
+
+   va_start(ap, fmt);
+   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
+      /* Print from a copy so ap is still untouched for the call below. */
+      va_list ap_stderr;
+      va_copy(ap_stderr, ap);
+      vfprintf(stderr, fmt, ap_stderr);
+      va_end(ap_stderr);
+   }
+   if (brw->perf_debug) {
+      GLuint msg_id = 0;
+      _mesa_gl_vdebug(&brw->ctx, &msg_id, MESA_DEBUG_SOURCE_SHADER_COMPILER,
+                      MESA_DEBUG_TYPE_PERFORMANCE, MESA_DEBUG_SEVERITY_MEDIUM,
+                      fmt, ap);
+   }
+   va_end(ap);
+}
+
 struct brw_compiler *
 brw_compiler_create(void *mem_ctx, const struct brw_device_info *devinfo)
 {
    struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
 
    compiler->devinfo = devinfo;
+   compiler->shader_debug_log = shader_debug_log_mesa;
+   compiler->shader_perf_log = shader_perf_log_mesa;
 
    brw_fs_alloc_reg_sets(compiler);
    brw_vec4_alloc_reg_set(compiler);
 
+   if (devinfo->gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
+      compiler->scalar_vs = true; /* scalar VS on gen8+ unless DEBUG_VEC4VS */
+
+   nir_shader_compiler_options *nir_options =
+      rzalloc(compiler, nir_shader_compiler_options);
+   nir_options->native_integers = true;
+   /* In order to help allow for better CSE at the NIR level we tell NIR
+    * to split all ffma instructions during opt_algebraic and we then
+    * re-combine them as a later step.
+    */
+   nir_options->lower_ffma = true;
+   nir_options->lower_sub = true;
+
+   /* Per-stage GLSL options; notably, emit code that uses condition codes. */
+   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+      compiler->glsl_compiler_options[i].MaxUnrollIterations = 32;
+      compiler->glsl_compiler_options[i].MaxIfDepth =
+         devinfo->gen < 6 ? 16 : UINT_MAX; /* effectively unlimited on gen6+ */
+
+      compiler->glsl_compiler_options[i].EmitCondCodes = true;
+      compiler->glsl_compiler_options[i].EmitNoNoise = true;
+      compiler->glsl_compiler_options[i].EmitNoMainReturn = true;
+      compiler->glsl_compiler_options[i].EmitNoIndirectInput = true;
+      compiler->glsl_compiler_options[i].EmitNoIndirectOutput =
+	 (i == MESA_SHADER_FRAGMENT);
+      compiler->glsl_compiler_options[i].EmitNoIndirectTemp =
+	 (i == MESA_SHADER_FRAGMENT);
+      compiler->glsl_compiler_options[i].EmitNoIndirectUniform = false;
+      compiler->glsl_compiler_options[i].LowerClipDistance = true;
+   }
+
+   compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = true;
+   compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].OptimizeForAOS = true;
+
+   if (compiler->scalar_vs) {
+      /* If we're using the scalar backend for vertex shaders, the vertex
+       * stage takes the same indirect-access settings the loop above gave
+       * the fragment stage, drops OptimizeForAOS and gets NirOptions. */
+      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectOutput = true;
+      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].EmitNoIndirectTemp = true;
+      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].OptimizeForAOS = false;
+
+      compiler->glsl_compiler_options[MESA_SHADER_VERTEX].NirOptions = nir_options;
+   }
+
+   compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions = nir_options;
+   compiler->glsl_compiler_options[MESA_SHADER_COMPUTE].NirOptions = nir_options;
+
    return compiler;
 }
 
@@ -97,7 +187,7 @@ is_scalar_shader_stage(struct brw_context *brw, int stage)
    case MESA_SHADER_FRAGMENT:
       return true;
    case MESA_SHADER_VERTEX:
-      return brw->scalar_vs;
+      return brw->intelScreen->compiler->scalar_vs;
    default:
       return false;
    }
@@ -632,6 +722,8 @@ brw_instruction_name(enum opcode op)
       return "gs_ff_sync_set_primitives";
    case CS_OPCODE_CS_TERMINATE:
       return "cs_terminate";
+   case SHADER_OPCODE_BARRIER:
+      return "barrier";
    }
 
    unreachable("not reached");
@@ -755,19 +847,22 @@ brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg)
    return false;
 }
 
-backend_visitor::backend_visitor(struct brw_context *brw,
-                                 struct gl_shader_program *shader_prog,
-                                 struct gl_program *prog,
-                                 struct brw_stage_prog_data *stage_prog_data,
-                                 gl_shader_stage stage)
-   : brw(brw),
-     devinfo(brw->intelScreen->devinfo),
-     ctx(&brw->ctx),
+backend_shader::backend_shader(const struct brw_compiler *compiler,
+                               void *log_data,
+                               void *mem_ctx,
+                               struct gl_shader_program *shader_prog,
+                               struct gl_program *prog,
+                               struct brw_stage_prog_data *stage_prog_data,
+                               gl_shader_stage stage)
+   : compiler(compiler),
+     log_data(log_data),
+     devinfo(compiler->devinfo),
      shader(shader_prog ?
         (struct brw_shader *)shader_prog->_LinkedShaders[stage] : NULL),
      shader_prog(shader_prog),
      prog(prog),
      stage_prog_data(stage_prog_data),
+     mem_ctx(mem_ctx),
      cfg(NULL),
      stage(stage)
 {
@@ -950,7 +1045,6 @@ backend_instruction::can_do_saturate() const
    case BRW_OPCODE_LINE:
    case BRW_OPCODE_LRP:
    case BRW_OPCODE_MAC:
-   case BRW_OPCODE_MACH:
    case BRW_OPCODE_MAD:
    case BRW_OPCODE_MATH:
    case BRW_OPCODE_MOV:
@@ -1060,6 +1154,7 @@ backend_instruction::has_side_effects() const
    case SHADER_OPCODE_MEMORY_FENCE:
    case SHADER_OPCODE_URB_WRITE_SIMD8:
    case FS_OPCODE_FB_WRITE:
+   case SHADER_OPCODE_BARRIER:
       return true;
    default:
       return false;
@@ -1148,13 +1243,13 @@ backend_instruction::remove(bblock_t *block)
 }
 
 void
-backend_visitor::dump_instructions()
+backend_shader::dump_instructions()
 {
    dump_instructions(NULL);
 }
 
 void
-backend_visitor::dump_instructions(const char *name)
+backend_shader::dump_instructions(const char *name)
 {
    FILE *file = stderr;
    if (name && geteuid() != 0) {
@@ -1183,7 +1278,7 @@ backend_visitor::dump_instructions(const char *name)
 }
 
 void
-backend_visitor::calculate_cfg()
+backend_shader::calculate_cfg()
 {
    if (this->cfg)
       return;
@@ -1191,7 +1286,7 @@ backend_visitor::calculate_cfg()
 }
 
 void
-backend_visitor::invalidate_cfg()
+backend_shader::invalidate_cfg()
 {
    ralloc_free(this->cfg);
    this->cfg = NULL;
@@ -1206,7 +1301,7 @@ backend_visitor::invalidate_cfg()
  * trigger some of our asserts that surface indices are < BRW_MAX_SURFACES.
  */
 void
-backend_visitor::assign_common_binding_table_offsets(uint32_t next_binding_table_offset)
+backend_shader::assign_common_binding_table_offsets(uint32_t next_binding_table_offset)
 {
    int num_textures = _mesa_fls(prog->SamplersUsed);
 
diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h
index 59a0eff..b2c1a0b 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.h
+++ b/src/mesa/drivers/dri/i965/brw_shader.h
@@ -86,6 +86,12 @@ struct brw_compiler {
        */
       int aligned_pairs_class;
    } fs_reg_sets[2];
+
+   void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+   void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+
+   bool scalar_vs;
+   struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
 };
 
 enum PACKED register_file {
@@ -211,20 +217,23 @@ enum instruction_scheduler_mode {
    SCHEDULE_POST,
 };
 
-class backend_visitor : public ir_visitor {
+class backend_shader {
 protected:
 
-   backend_visitor(struct brw_context *brw,
-                   struct gl_shader_program *shader_prog,
-                   struct gl_program *prog,
-                   struct brw_stage_prog_data *stage_prog_data,
-                   gl_shader_stage stage);
+   backend_shader(const struct brw_compiler *compiler,
+                  void *log_data,
+                  void *mem_ctx,
+                  struct gl_shader_program *shader_prog,
+                  struct gl_program *prog,
+                  struct brw_stage_prog_data *stage_prog_data,
+                  gl_shader_stage stage);
 
 public:
 
-   struct brw_context * const brw;
+   const struct brw_compiler *compiler;
+   void *log_data; /* Passed to compiler->*_log functions */
+
    const struct brw_device_info * const devinfo;
-   struct gl_context * const ctx;
    struct brw_shader * const shader;
    struct gl_shader_program * const shader_prog;
    struct gl_program * const prog;
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 26fdae6..987672f 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -229,11 +229,14 @@ void brw_destroy_caches( struct brw_context *brw );
 #define BRW_BATCH_STRUCT(brw, s) \
    intel_batchbuffer_data(brw, (s), sizeof(*(s)), RENDER_RING)
 
-void *brw_state_batch(struct brw_context *brw,
-		      enum aub_state_struct_type type,
-		      int size,
-		      int alignment,
-		      uint32_t *out_offset);
+void *__brw_state_batch(struct brw_context *brw,
+                        enum aub_state_struct_type type,
+                        int size,
+                        int alignment,
+                        int index,
+                        uint32_t *out_offset);
+#define brw_state_batch(brw, type, size, alignment, out_offset) \
+   __brw_state_batch(brw, type, size, alignment, 0, out_offset)
 
 /* brw_wm_surface_state.c */
 void gen4_init_vtable_surface_functions(struct brw_context *brw);
@@ -246,6 +249,7 @@ void brw_configure_w_tiled(const struct intel_mipmap_tree *mt,
                            unsigned *pitch, uint32_t *tiling,
                            unsigned *format);
 
+const char *brw_surface_format_name(unsigned format);
 uint32_t brw_format_for_mesa_format(mesa_format mesa_format);
 
 GLuint translate_tex_target(GLenum target);
diff --git a/src/mesa/drivers/dri/i965/brw_state_batch.c b/src/mesa/drivers/dri/i965/brw_state_batch.c
index 45dca69..a405a80 100644
--- a/src/mesa/drivers/dri/i965/brw_state_batch.c
+++ b/src/mesa/drivers/dri/i965/brw_state_batch.c
@@ -38,7 +38,8 @@ static void
 brw_track_state_batch(struct brw_context *brw,
 		      enum aub_state_struct_type type,
 		      uint32_t offset,
-		      int size)
+                      int size,
+                      int index)
 {
    struct intel_batchbuffer *batch = &brw->batch;
 
@@ -53,6 +54,7 @@ brw_track_state_batch(struct brw_context *brw,
    brw->state_batch_list[brw->state_batch_count].offset = offset;
    brw->state_batch_list[brw->state_batch_count].size = size;
    brw->state_batch_list[brw->state_batch_count].type = type;
+   brw->state_batch_list[brw->state_batch_count].index = index;
    brw->state_batch_count++;
 }
 
@@ -108,18 +110,20 @@ brw_annotate_aub(struct brw_context *brw)
  * margin (4096 bytes, even if the object is just a 20-byte surface
  * state), and more buffers to walk and count for aperture size checking.
  *
- * However, due to the restrictions inposed by the aperture size
+ * However, due to the restrictions imposed by the aperture size
  * checking performance hacks, we can't have the batch point at a
  * separate indirect state buffer, because once the batch points at
  * it, no more relocations can be added to it.  So, we sneak these
  * buffers in at the top of the batchbuffer.
  */
 void *
-brw_state_batch(struct brw_context *brw,
-		enum aub_state_struct_type type,
-		int size,
-		int alignment,
-		uint32_t *out_offset)
+__brw_state_batch(struct brw_context *brw,
+                  enum aub_state_struct_type type,
+                  int size,
+                  int alignment,
+                  int index,
+                  uint32_t *out_offset)
+
 {
    struct intel_batchbuffer *batch = &brw->batch;
    uint32_t offset;
@@ -140,7 +144,7 @@ brw_state_batch(struct brw_context *brw,
    batch->state_batch_offset = offset;
 
    if (unlikely(INTEL_DEBUG & (DEBUG_BATCH | DEBUG_AUB)))
-      brw_track_state_batch(brw, type, offset, size);
+      brw_track_state_batch(brw, type, offset, size, index);
 
    *out_offset = offset;
    return batch->map + (offset>>2);
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index 530f5a8..b6f4d59 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -1,5 +1,5 @@
 /*
- * Copyright © 2007 Intel Corporation
+ * Copyright © 2007-2015 Intel Corporation
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -31,6 +31,41 @@
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_eu.h"
+#include "brw_state.h"
+
+static const char *sampler_mip_filter[] = { /* by BRW_SAMPLER_MIP_FILTER */
+   "NONE",
+   "NEAREST",
+   "RSVD",
+   "LINEAR"
+};
+
+static const char *sampler_mag_filter[] = { /* mag and min filter names */
+   "NEAREST",
+   "LINEAR",
+   "ANISOTROPIC",
+   "FLEXIBLE (GEN8+)",
+   "RSVD", "RSVD",
+   "MONO",
+   "RSVD"
+};
+
+static const char *sampler_addr_mode[] = { /* by BRW_SAMPLER_TC*_WRAP_MODE */
+   "WRAP",
+   "MIRROR",
+   "CLAMP",
+   "CUBE",
+   "CLAMP_BORDER",
+   "MIRROR_ONCE",
+   "HALF_BORDER" /* NOTE(review): value 7 has no entry; confirm field range */
+};
+
+static const char *surface_tiling[] = { /* indexed by surf[0] bits 13:12 */
+   "LINEAR",
+   "W-tiled",
+   "X-tiled",
+   "Y-tiled"
+};
 
 static void
 batch_out(struct brw_context *brw, const char *name, uint32_t offset,
@@ -50,6 +85,25 @@ batch_out(struct brw_context *brw, const char *name, uint32_t offset,
    va_end(va);
 }
 
+/* Print dwords index and index + 1 at offset as one 64-bit value, then fmt. */
+static void
+batch_out64(struct brw_context *brw, const char *name, uint32_t offset,
+            int index, char *fmt, ...)
+{
+   const uint32_t *dw = brw->batch.bo->virtual + offset;
+   const uint64_t lo = dw[index];
+   const uint64_t hi = dw[index + 1];
+   va_list ap;
+
+   /* The data is emitted as dwords; the later dword is the upper half. */
+   fprintf(stderr, "0x%08x:      0x%016" PRIx64 ": %8s: ",
+          offset + index * 4, hi << 32 | lo, name);
+
+   va_start(ap, fmt);
+   vfprintf(stderr, fmt, ap);
+   va_end(ap);
+}
+
 static const char *
 get_965_surfacetype(unsigned int surfacetype)
 {
@@ -64,19 +118,6 @@ get_965_surfacetype(unsigned int surfacetype)
     }
 }
 
-static const char *
-get_965_surface_format(unsigned int surface_format)
-{
-    switch (surface_format) {
-    case 0x000: return "r32g32b32a32_float";
-    case 0x0c1: return "b8g8r8a8_unorm";
-    case 0x100: return "b5g6r5_unorm";
-    case 0x102: return "b5g5r5a1_unorm";
-    case 0x104: return "b4g4r4a4_unorm";
-    default: return "unknown";
-    }
-}
-
 static void dump_vs_state(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "VS_STATE";
@@ -176,7 +217,7 @@ static void dump_surface_state(struct brw_context *brw, uint32_t offset)
 
    batch_out(brw, name, offset, 0, "%s %s\n",
 	     get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
-	     get_965_surface_format(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)));
+             brw_surface_format_name(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)));
    batch_out(brw, name, offset, 1, "offset\n");
    batch_out(brw, name, offset, 2, "%dx%d size, %d mips\n",
 	     GET_FIELD(surf[2], BRW_SURFACE_WIDTH) + 1,
@@ -200,7 +241,7 @@ static void dump_gen7_surface_state(struct brw_context *brw, uint32_t offset)
 
    batch_out(brw, name, offset, 0, "%s %s %s\n",
              get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
-             get_965_surface_format(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)),
+             brw_surface_format_name(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)),
              (surf[0] & GEN7_SURFACE_IS_ARRAY) ? "array" : "");
    batch_out(brw, name, offset, 1, "offset\n");
    batch_out(brw, name, offset, 2, "%dx%d size, %d mips, %d slices\n",
@@ -222,6 +263,87 @@ static void dump_gen7_surface_state(struct brw_context *brw, uint32_t offset)
    batch_out(brw, name, offset, 7, "\n");
 }
 
+/* Decode the unsigned fixed-point field data[integer_end:fractional_start]. */
+static float q_to_float(uint32_t data, int integer_end, int integer_start,
+                        int fractional_end, int fractional_start)
+{
+   /* Take the whole field, integer and fractional bits alike, as an integer. */
+   float n = GET_BITS(data, integer_end, fractional_start);
+   /* Scale by 2^-(number of fractional bits). */
+   return n * exp2(-(fractional_end - fractional_start + 1));
+}
+
+/* Decode the gen8+ SURFACE_STATE at offset via batch_out, named SURF<index>. */
+static void
+dump_gen8_surface_state(struct brw_context *brw, uint32_t offset, int index)
+{
+   uint32_t *surf = brw->batch.bo->virtual + offset;
+   int aux_mode = surf[6] & INTEL_MASK(2, 0);
+   const char *aux_str;
+
+   if (brw->gen >= 9 && (aux_mode == 1 || aux_mode == 5)) {
+      bool msrt = GET_BITS(surf[4], 5, 3) > 0;
+      bool compression = GET_FIELD(surf[7], GEN9_SURFACE_RT_COMPRESSION) == 1;
+      aux_str = ralloc_asprintf(NULL, "AUX_CCS_%c (%s, MULTISAMPLE_COUNT%c1)",
+                                (aux_mode == 1) ? 'D' : 'E',
+                                compression ? "Compressed RT" : "Uncompressed",
+                                msrt ? '>' : '=');
+   } else { /* aux_mode is a 3-bit value: the table needs all 8 entries */
+      static const char *surface_aux_mode[] = {
+         "AUX_NONE", "AUX_MCS", "AUX_APPEND", "AUX_HIZ",
+         "RSVD", "RSVD", "RSVD", "RSVD" };
+      aux_str = ralloc_asprintf(NULL, "%s", surface_aux_mode[aux_mode]);
+   }
+
+   char *name = ralloc_asprintf(NULL, "SURF%03d", index);
+   batch_out(brw, name, offset, 0, "%s %s %s VALIGN%d HALIGN%d %s\n",
+             get_965_surfacetype(GET_FIELD(surf[0], BRW_SURFACE_TYPE)),
+             brw_surface_format_name(GET_FIELD(surf[0], BRW_SURFACE_FORMAT)),
+             (surf[0] & GEN7_SURFACE_IS_ARRAY) ? "array" : "",
+             1 << (GET_BITS(surf[0], 17, 16) + 1), /* VALIGN */
+             1 << (GET_BITS(surf[0], 15, 14) + 1), /* HALIGN */
+             surface_tiling[GET_BITS(surf[0], 13, 12)]);
+   batch_out(brw, name, offset, 1, "MOCS: 0x%x Base MIP: %.1f (%u mips) Surface QPitch: %d\n",
+             GET_FIELD(surf[1], GEN8_SURFACE_MOCS),
+             q_to_float(surf[1], 23, 20, 19, 19),
+             surf[5] & INTEL_MASK(3, 0),
+             GET_FIELD(surf[1], GEN8_SURFACE_QPITCH) << 2);
+   batch_out(brw, name, offset, 2, "%dx%d [%s]\n",
+             GET_FIELD(surf[2], GEN7_SURFACE_WIDTH) + 1,
+             GET_FIELD(surf[2], GEN7_SURFACE_HEIGHT) + 1,
+             aux_str);
+   batch_out(brw, name, offset, 3, "%d slices (depth), pitch: %d\n",
+             GET_FIELD(surf[3], BRW_SURFACE_DEPTH) + 1,
+             (surf[3] & INTEL_MASK(17, 0)) + 1);
+   batch_out(brw, name, offset, 4, "min array element: %d, array extent %d, MULTISAMPLE_%d\n",
+             GET_FIELD(surf[4], GEN7_SURFACE_MIN_ARRAY_ELEMENT),
+             GET_FIELD(surf[4], GEN7_SURFACE_RENDER_TARGET_VIEW_EXTENT) + 1,
+             1 << GET_BITS(surf[4], 5, 3));
+   batch_out(brw, name, offset, 5, "x,y offset: %d,%d, min LOD: %d\n",
+             GET_FIELD(surf[5], BRW_SURFACE_X_OFFSET),
+             GET_FIELD(surf[5], BRW_SURFACE_Y_OFFSET),
+             GET_FIELD(surf[5], GEN7_SURFACE_MIN_LOD));
+   batch_out(brw, name, offset, 6, "AUX pitch: %d qpitch: %d\n",
+             GET_FIELD(surf[6], GEN8_SURFACE_AUX_PITCH) << 2,
+             GET_FIELD(surf[6], GEN8_SURFACE_AUX_QPITCH) << 2);
+   if (brw->gen >= 9) {
+      batch_out(brw, name, offset, 7, "Clear color: R(%x)G(%x)B(%x)A(%x)\n",
+                surf[12], surf[13], surf[14], surf[15]);
+   } else {
+      batch_out(brw, name, offset, 7, "Clear color: %c%c%c%c\n",
+                GET_BITS(surf[7], 31, 31) ? 'R' : '-',
+                GET_BITS(surf[7], 30, 30) ? 'G' : '-',
+                GET_BITS(surf[7], 29, 29) ? 'B' : '-',
+                GET_BITS(surf[7], 28, 28) ? 'A' : '-');
+   }
+
+   for (int i = 8; i < 12; i++)
+      batch_out(brw, name, offset, i, "0x%08x\n", surf[i]);
+
+   ralloc_free((void *)aux_str);
+   ralloc_free(name);
+}
+
 static void
 dump_sdc(struct brw_context *brw, uint32_t offset)
 {
@@ -229,7 +351,7 @@ dump_sdc(struct brw_context *brw, uint32_t offset)
 
    if (brw->gen >= 5 && brw->gen <= 6) {
       struct gen5_sampler_default_color *sdc = (brw->batch.bo->virtual +
-						offset);
+                                                offset);
       batch_out(brw, name, offset, 0, "unorm rgba\n");
       batch_out(brw, name, offset, 1, "r %f\n", sdc->f[0]);
       batch_out(brw, name, offset, 2, "b %f\n", sdc->f[1]);
@@ -271,6 +393,45 @@ static void dump_sampler_state(struct brw_context *brw,
    }
 }
 
+/* Decode every 4-dword SAMPLER_STATE entry found in size bytes at offset. */
+static void gen7_dump_sampler_state(struct brw_context *brw,
+                                    uint32_t offset, uint32_t size)
+{
+   const uint32_t *samp = brw->batch.bo->virtual + offset;
+   char name[20];
+
+   for (int i = 0; i < size / 16; i++) {
+      /* samp and offset are advanced to the next entry at the bottom of the
+       * loop, so the dword indices passed to batch_out are entry-relative. */
+      snprintf(name, sizeof(name), "SAMPLER_STATE %d", i);
+      batch_out(brw, name, offset, 0,
+                "Disabled = %s, Base Mip: %u.%u, Mip/Mag/Min Filter: %s/%s/%s, LOD Bias: %d.%d\n",
+                GET_BITS(samp[0], 31, 31) ? "yes" : "no",
+                GET_BITS(samp[0], 26, 23),
+                GET_BITS(samp[0], 22, 22),
+                sampler_mip_filter[GET_FIELD(samp[0], BRW_SAMPLER_MIP_FILTER)],
+                sampler_mag_filter[GET_FIELD(samp[0], BRW_SAMPLER_MAG_FILTER)],
+                /* min filter defs are the same as mag */
+                sampler_mag_filter[GET_FIELD(samp[0], BRW_SAMPLER_MIN_FILTER)],
+                GET_BITS(samp[0], 13, 10),
+                GET_BITS(samp[0], 9, 1));
+      batch_out(brw, name, offset, 1, "Min LOD: %u.%u, Max LOD: %u.%u\n",
+                GET_BITS(samp[1], 31, 28),
+                GET_BITS(samp[1], 27, 20),
+                GET_BITS(samp[1], 19, 16),
+                GET_BITS(samp[1], 15, 8));
+      batch_out(brw, name, offset, 2, "Border Color\n"); /* FINISHME: gen8+ */
+      batch_out(brw, name, offset, 3, "Max aniso: RATIO %d:1, TC[XYZ] Address Control: %s|%s|%s\n",
+                (GET_FIELD(samp[3], BRW_SAMPLER_MAX_ANISOTROPY) + 1) * 2,
+                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCX_WRAP_MODE)],
+                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCY_WRAP_MODE)],
+                sampler_addr_mode[GET_FIELD(samp[3], BRW_SAMPLER_TCZ_WRAP_MODE)]);
+
+      samp += 4;
+      offset += 4 * sizeof(uint32_t);
+   }
+}
+
 static void dump_sf_viewport_state(struct brw_context *brw,
 				   uint32_t offset)
 {
@@ -320,10 +481,17 @@ static void dump_sf_clip_viewport_state(struct brw_context *brw,
    batch_out(brw, name, offset, 3, "m30 = %f\n", vp->viewport.m30);
    batch_out(brw, name, offset, 4, "m31 = %f\n", vp->viewport.m31);
    batch_out(brw, name, offset, 5, "m32 = %f\n", vp->viewport.m32);
-   batch_out(brw, name, offset, 6, "guardband xmin = %f\n", vp->guardband.xmin);
-   batch_out(brw, name, offset, 7, "guardband xmax = %f\n", vp->guardband.xmax);
-   batch_out(brw, name, offset, 8, "guardband ymin = %f\n", vp->guardband.ymin);
-   batch_out(brw, name, offset, 9, "guardband ymax = %f\n", vp->guardband.ymax);
+   batch_out(brw, name, offset, 8, "guardband xmin = %f\n", vp->guardband.xmin);
+   batch_out(brw, name, offset, 9, "guardband xmax = %f\n", vp->guardband.xmax);
+   batch_out(brw, name, offset, 10, "guardband ymin = %f\n", vp->guardband.ymin);
+   batch_out(brw, name, offset, 11, "guardband ymax = %f\n", vp->guardband.ymax);
+   if (brw->gen >= 8) { /* gen8+ also prints min/max extents, dwords 12-15 */
+      float *cc_vp = brw->batch.bo->virtual + offset;
+      batch_out(brw, name, offset, 12, "Min extents: %.2fx%.2f\n",
+                cc_vp[12], cc_vp[14]);
+      batch_out(brw, name, offset, 14, "Max extents: %.2fx%.2f\n",
+                cc_vp[13], cc_vp[15]);
+   }
 }
 
 
@@ -398,6 +566,92 @@ static void dump_blend_state(struct brw_context *brw, uint32_t offset)
 }
 
 static void
+gen8_dump_blend_state(struct brw_context *brw, uint32_t offset, uint32_t size)
+{
+   /* Decodes the gen8+ BLEND_STATE of size bytes at offset: dword 0 first,
+    * then one two-dword entry per loop iteration starting at dword 1.
+    */
+   const uint32_t *blend = brw->batch.bo->virtual + offset;
+   const char *logicop[] = {
+        "LOGICOP_CLEAR (BLACK)",
+        "LOGICOP_NOR",
+        "LOGICOP_AND_INVERTED",
+        "LOGICOP_COPY_INVERTED",
+        "LOGICOP_AND_REVERSE",
+        "LOGICOP_INVERT",
+        "LOGICOP_XOR",
+        "LOGICOP_NAND",
+        "LOGICOP_AND",
+        "LOGICOP_EQUIV",
+        "LOGICOP_NOOP",
+        "LOGICOP_OR_INVERTED",
+        "LOGICOP_COPY",
+        "LOGICOP_OR_REVERSE",
+        "LOGICOP_OR",
+        "LOGICOP_SET (WHITE)"
+   };
+
+   const char *blend_function[] =
+   { "ADD", "SUBTRACT", "REVERSE_SUBTRACT", "MIN", "MAX" };
+
+   const char *blend_factor[0x1b] = {
+      "RSVD",
+      "ONE",
+      "SRC_COLOR", "SRC_ALPHA",
+      "DST_ALPHA", "DST_COLOR",
+      "SRC_ALPHA_SATURATE",
+      "CONST_COLOR", "CONST_ALPHA",
+      "SRC1_COLOR", "SRC1_ALPHA",
+      "RSVD", "RSVD", "RSVD", "RSVD", "RSVD", "RSVD",
+      "ZERO",
+      "INV_SRC_COLOR", "INV_SRC_ALPHA",
+      "INV_DST_ALPHA", "INV_DST_COLOR",
+      "RSVD",
+      "INV_CONST_COLOR", "INV_CONST_ALPHA",
+      "INV_SRC1_COLOR", "INV_SRC1_ALPHA"
+   };
+
+   batch_out(brw, "BLEND", offset, 0, "Alpha blend/test\n");
+
+   if (((size) % 2) != 0)
+      fprintf(stderr, "Invalid blend state size %d\n", size);
+
+   for (int i = 1; i < size / 4; i += 2) {
+      char name[sizeof("BLEND_ENTRYXXX")];
+      snprintf(name, sizeof(name), "BLEND_ENTRY%02d", (i - 1) / 2);
+      if (blend[i + 1] & GEN8_BLEND_LOGIC_OP_ENABLE) {
+         batch_out(brw, name, offset, i + 1, "%s\n",
+                   logicop[GET_FIELD(blend[i + 1],
+                                     GEN8_BLEND_LOGIC_OP_FUNCTION)]);
+      } else if (blend[i] & GEN8_BLEND_COLOR_BUFFER_BLEND_ENABLE) {
+         batch_out64(brw, name, offset, i,
+                   "\n\t\t\tColor Buffer Blend factor %s,%s,%s,%s (src,dst,src alpha, dst alpha)"
+                   "\n\t\t\tfunction %s,%s (color, alpha), Disables: %c%c%c%c\n",
+                   blend_factor[GET_FIELD(blend[i],
+                                          GEN8_BLEND_SRC_BLEND_FACTOR)],
+                   blend_factor[GET_FIELD(blend[i],
+                                          GEN8_BLEND_DST_BLEND_FACTOR)],
+                   blend_factor[GET_FIELD(blend[i],
+                                          GEN8_BLEND_SRC_ALPHA_BLEND_FACTOR)],
+                   blend_factor[GET_FIELD(blend[i],
+                                          GEN8_BLEND_DST_ALPHA_BLEND_FACTOR)],
+                   blend_function[GET_FIELD(blend[i],
+                                            GEN8_BLEND_COLOR_BLEND_FUNCTION)],
+                   blend_function[GET_FIELD(blend[i],
+                                            GEN8_BLEND_ALPHA_BLEND_FUNCTION)],
+                   blend[i] & GEN8_BLEND_WRITE_DISABLE_RED ? 'R' : '-',
+                   blend[i] & GEN8_BLEND_WRITE_DISABLE_GREEN ? 'G' : '-',
+                   blend[i] & GEN8_BLEND_WRITE_DISABLE_BLUE ? 'B' : '-',
+                   blend[i] & GEN8_BLEND_WRITE_DISABLE_ALPHA ? 'A' : '-');
+      } else if (!blend[i] && (blend[i + 1] == 0xb)) {
+         batch_out64(brw, name, offset, i, "NOP blend state\n");
+      } else {
+         batch_out64(brw, name, offset, i, "????\n");
+      }
+   }
+}
+
+static void
 dump_scissor(struct brw_context *brw, uint32_t offset)
 {
    const char *name = "SCISSOR";
@@ -555,20 +809,29 @@ dump_state_batch(struct brw_context *brw)
 	    dump_cc_state_gen4(brw, offset);
 	 break;
       case AUB_TRACE_BLEND_STATE:
-	 dump_blend_state(brw, offset);
+         if (brw->gen >= 8)
+            gen8_dump_blend_state(brw, offset, size);
+         else
+            dump_blend_state(brw, offset);
 	 break;
       case AUB_TRACE_BINDING_TABLE:
 	 dump_binding_table(brw, offset, size);
 	 break;
       case AUB_TRACE_SURFACE_STATE:
-	 if (brw->gen < 7) {
-	    dump_surface_state(brw, offset);
-	 } else {
+         if (brw->gen >= 8) {
+            dump_gen8_surface_state(brw, offset,
+                                    brw->state_batch_list[i].index);
+         } else if (brw->gen >= 7) {
 	    dump_gen7_surface_state(brw, offset);
-	 }
+         } else {
+            dump_surface_state(brw, offset);
+         }
 	 break;
       case AUB_TRACE_SAMPLER_STATE:
-         dump_sampler_state(brw, offset, size);
+         if (brw->gen >= 7)
+            gen7_dump_sampler_state(brw, offset, size);
+         else
+            dump_sampler_state(brw, offset, size);
 	 break;
       case AUB_TRACE_SAMPLER_DEFAULT_COLOR:
 	 dump_sdc(brw, offset);
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 84b0861..08d1ac2 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -41,6 +41,7 @@
 #include "brw_gs.h"
 #include "brw_wm.h"
 #include "brw_cs.h"
+#include "main/framebuffer.h"
 
 static const struct brw_tracked_state *gen4_atoms[] =
 {
@@ -660,6 +661,7 @@ brw_upload_pipeline_state(struct brw_context *brw,
    int i;
    static int dirty_count = 0;
    struct brw_state_flags state = brw->state.pipelines[pipeline];
+   unsigned int fb_samples = _mesa_geometric_samples(ctx->DrawBuffer);
 
    brw_select_pipeline(brw, pipeline);
 
@@ -696,8 +698,8 @@ brw_upload_pipeline_state(struct brw_context *brw,
       brw->ctx.NewDriverState |= BRW_NEW_META_IN_PROGRESS;
    }
 
-   if (brw->num_samples != ctx->DrawBuffer->Visual.samples) {
-      brw->num_samples = ctx->DrawBuffer->Visual.samples;
+   if (brw->num_samples != fb_samples) {
+      brw->num_samples = fb_samples;
       brw->ctx.NewDriverState |= BRW_NEW_NUM_SAMPLES;
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_surface_formats.c b/src/mesa/drivers/dri/i965/brw_surface_formats.c
index 016f87a..0501606 100644
--- a/src/mesa/drivers/dri/i965/brw_surface_formats.c
+++ b/src/mesa/drivers/dri/i965/brw_surface_formats.c
@@ -39,13 +39,14 @@ struct surface_format_info {
    int input_vb;
    int streamed_output_vb;
    int color_processing;
+   const char *name;
 };
 
 /* This macro allows us to write the table almost as it appears in the PRM,
  * while restructuring it to turn it into the C code we want.
  */
 #define SF(sampl, filt, shad, ck, rt, ab, vb, so, color, sf) \
-   [sf] = { true, sampl, filt, shad, ck, rt, ab, vb, so, color },
+   [BRW_SURFACEFORMAT_##sf] = { true, sampl, filt, shad, ck, rt, ab, vb, so, color, #sf},
 
 #define Y 0
 #define x 999
@@ -73,6 +74,7 @@ struct surface_format_info {
  * VB    - Input Vertex Buffer
  * SO    - Steamed Output Vertex Buffers (transform feedback)
  * color - Color Processing
+ * sf    - Surface Format
  *
  * See page 88 of the Sandybridge PRM VOL4_Part1 PDF.
  *
@@ -85,230 +87,236 @@ struct surface_format_info {
  */
 const struct surface_format_info surface_formats[] = {
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32A32_FLOAT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32A32_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32A32_UINT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R64G64_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32G32B32X32_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32G32B32A32_SFIXED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R64G64_PASSTHRU)
-   SF( Y, 50,  x,  x,  x,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32_FLOAT)
-   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32_SINT)
-   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32B32_UINT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32B32_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32G32B32_SFIXED)
-   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x, 60, BRW_SURFACEFORMAT_R16G16B16A16_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_UINT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_FLOAT)
-   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32_FLOAT)
-   SF( Y, 70,  x,  x,  Y,  Y,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32_FLOAT_LD)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32G32_UINT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32_FLOAT_X8X24_TYPELESS)
-   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_X32_TYPELESS_G8X24_UINT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L32A32_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R64_FLOAT)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R16G16B16X16_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R16G16B16X16_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A32X32_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L32X32_FLOAT)
-   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I32X32_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16A16_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32G32_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32G32_SFIXED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R64_PASSTHRU)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  Y,  x, 60, BRW_SURFACEFORMAT_B8G8R8A8_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B8G8R8A8_UNORM_SRGB)
+   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x, R32G32B32A32_FLOAT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32G32B32A32_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32G32B32A32_UINT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32A32_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32A32_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R64G64_FLOAT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, R32G32B32X32_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32A32_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32A32_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R32G32B32A32_SFIXED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R64G64_PASSTHRU)
+   SF( Y, 50,  x,  x,  x,  x,  Y,  Y,  x, R32G32B32_FLOAT)
+   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x, R32G32B32_SINT)
+   SF( Y,  x,  x,  x,  x,  x,  Y,  Y,  x, R32G32B32_UINT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32B32_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R32G32B32_SFIXED)
+   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x, 60, R16G16B16A16_UNORM)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R16G16B16A16_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16G16B16A16_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16G16B16A16_UINT)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R16G16B16A16_FLOAT)
+   SF( Y, 50,  x,  x,  Y,  Y,  Y,  Y,  x, R32G32_FLOAT)
+   SF( Y, 70,  x,  x,  Y,  Y,  Y,  Y,  x, R32G32_FLOAT_LD)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32G32_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32G32_UINT)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, R32_FLOAT_X8X24_TYPELESS)
+   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, X32_TYPELESS_G8X24_UINT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, L32A32_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R64_FLOAT)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R16G16B16X16_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R16G16B16X16_FLOAT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, A32X32_FLOAT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, L32X32_FLOAT)
+   SF( Y, 50,  x,  x,  x,  x,  x,  x,  x, I32X32_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16A16_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16A16_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32G32_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R32G32_SFIXED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R64_PASSTHRU)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  Y,  x, 60, B8G8R8A8_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, B8G8R8A8_UNORM_SRGB)
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60, BRW_SURFACEFORMAT_R10G10B10A2_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x, 60, BRW_SURFACEFORMAT_R10G10B10A2_UNORM_SRGB)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_UINT)
-   SF( Y,  Y,  x,  x,  x,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R10G10B10_SNORM_A2_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60, BRW_SURFACEFORMAT_R8G8B8A8_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, BRW_SURFACEFORMAT_R8G8B8A8_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_UINT)
-   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_UINT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_FLOAT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, BRW_SURFACEFORMAT_B10G10R10A2_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, BRW_SURFACEFORMAT_B10G10R10A2_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R11G11B10_FLOAT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, BRW_SURFACEFORMAT_R32_UINT)
-   SF( Y, 50,  Y,  x,  Y,  Y,  Y,  Y,  x, BRW_SURFACEFORMAT_R32_FLOAT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R24_UNORM_X8_TYPELESS)
-   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_X24_TYPELESS_G8_UINT)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L16A16_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I24X8_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L24X8_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A24X8_UNORM)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I32_FLOAT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L32_FLOAT)
-   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A32_FLOAT)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x, 60, BRW_SURFACEFORMAT_B8G8R8X8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B8G8R8X8_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8X8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8X8_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R9G9B9E5_SHAREDEXP)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10X2_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L16A16_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32_SNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60, R10G10B10A2_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x, 60, R10G10B10A2_UNORM_SRGB)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R10G10B10A2_UINT)
+   SF( Y,  Y,  x,  x,  x,  Y,  Y,  x,  x, R10G10B10_SNORM_A2_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x, 60, R8G8B8A8_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, R8G8B8A8_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R8G8B8A8_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8G8B8A8_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8G8B8A8_UINT)
+   SF( Y,  Y,  x,  x,  Y, 45,  Y,  x,  x, R16G16_UNORM)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R16G16_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16G16_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16G16_UINT)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R16G16_FLOAT)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, B10G10R10A2_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x, 60, B10G10R10A2_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R11G11B10_FLOAT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  Y,  x, R32_UINT)
+   SF( Y, 50,  Y,  x,  Y,  Y,  Y,  Y,  x, R32_FLOAT)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, R24_UNORM_X8_TYPELESS)
+   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, X24_TYPELESS_G8_UINT)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, L16A16_UNORM)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, I24X8_UNORM)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, L24X8_UNORM)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, A24X8_UNORM)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, I32_FLOAT)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, L32_FLOAT)
+   SF( Y, 50,  Y,  x,  x,  x,  x,  x,  x, A32_FLOAT)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x, 60, B8G8R8X8_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, B8G8R8X8_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R8G8B8X8_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R8G8B8X8_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R9G9B9E5_SHAREDEXP)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, B10G10R10X2_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, L16A16_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32_SNORM)
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R10G10B10X2_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8A8_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R32_USCALED)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G6R5_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G6R5_UNORM_SRGB)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G5R5A1_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G5R5A1_UNORM_SRGB)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B4G4R4A4_UNORM)
-   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B4G4R4A4_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_UNORM)
-   SF( Y,  Y,  x,  Y,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_UINT)
-   SF( Y,  Y,  Y,  x,  Y, 45,  Y,  x, 70, BRW_SURFACEFORMAT_R16_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R16_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16_UINT)
-   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R16_FLOAT)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A8P8_UNORM_PALETTE0)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A8P8_UNORM_PALETTE1)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I16_UNORM)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L16_UNORM)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A16_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8A8_UNORM)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I16_FLOAT)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L16_FLOAT)
-   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A16_FLOAT)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8A8_UNORM_SRGB)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R5G5_SNORM_B6_UNORM)
-   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G5R5X1_UNORM)
-   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_B5G5R5X1_UNORM_SRGB)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R10G10B10X2_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8A8_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8A8_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R32_USCALED)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, B5G6R5_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, B5G6R5_UNORM_SRGB)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, B5G5R5A1_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, B5G5R5A1_UNORM_SRGB)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, B4G4R4A4_UNORM)
+   SF( Y,  Y,  x,  x,  Y,  Y,  x,  x,  x, B4G4R4A4_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R8G8_UNORM)
+   SF( Y,  Y,  x,  Y,  Y, 60,  Y,  x,  x, R8G8_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8G8_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8G8_UINT)
+   SF( Y,  Y,  Y,  x,  Y, 45,  Y,  x, 70, R16_UNORM)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R16_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R16_UINT)
+   SF( Y,  Y,  x,  x,  Y,  Y,  Y,  x,  x, R16_FLOAT)
+   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, A8P8_UNORM_PALETTE0)
+   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, A8P8_UNORM_PALETTE1)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, I16_UNORM)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, L16_UNORM)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, A16_UNORM)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, L8A8_UNORM)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, I16_FLOAT)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, L16_FLOAT)
+   SF( Y,  Y,  Y,  x,  x,  x,  x,  x,  x, A16_FLOAT)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, L8A8_UNORM_SRGB)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, R5G5_SNORM_B6_UNORM)
+   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x, B5G5R5X1_UNORM)
+   SF( x,  x,  x,  x,  Y,  Y,  x,  x,  x, B5G5R5X1_UNORM_SRGB)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8_USCALED)
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16_USCALED)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P8A8_UNORM_PALETTE0)
-   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P8A8_UNORM_PALETTE1)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A1B5G5R5_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A4B4G4R4_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8A8_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8A8_SINT)
-   SF( Y,  Y,  x, 45,  Y,  Y,  Y,  x,  x, BRW_SURFACEFORMAT_R8_UNORM)
-   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, BRW_SURFACEFORMAT_R8_SNORM)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8_SINT)
-   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8_UINT)
-   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, BRW_SURFACEFORMAT_A8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I8_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P4A4_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A4P4_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8_USCALED)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P8_UNORM_PALETTE0)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8_UNORM_SRGB)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P8_UNORM_PALETTE1)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P4A4_UNORM_PALETTE1)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_A4P4_UNORM_PALETTE1)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_Y8_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_L8_SINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I8_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_I8_SINT)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_DXT1_RGB_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R1_UINT)
-   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60, BRW_SURFACEFORMAT_YCRCB_NORMAL)
-   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60, BRW_SURFACEFORMAT_YCRCB_SWAPUVY)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P2_UNORM_PALETTE0)
-   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_P2_UNORM_PALETTE1)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC1_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC2_UNORM)
-   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC3_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC4_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC5_UNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC1_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC2_UNORM_SRGB)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC3_UNORM_SRGB)
-   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_MONO8)
-   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60, BRW_SURFACEFORMAT_YCRCB_SWAPUV)
-   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60, BRW_SURFACEFORMAT_YCRCB_SWAPY)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_DXT1_RGB)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16_USCALED)
+   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, P8A8_UNORM_PALETTE0)
+   SF(50, 50,  x,  x,  x,  x,  x,  x,  x, P8A8_UNORM_PALETTE1)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, A1B5G5R5_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, A4B4G4R4_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, L8A8_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, L8A8_SINT)
+   SF( Y,  Y,  x, 45,  Y,  Y,  Y,  x,  x, R8_UNORM)
+   SF( Y,  Y,  x,  x,  Y, 60,  Y,  x,  x, R8_SNORM)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8_SINT)
+   SF( Y,  x,  x,  x,  Y,  x,  Y,  x,  x, R8_UINT)
+   SF( Y,  Y,  x,  Y,  Y,  Y,  x,  x,  x, A8_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, I8_UNORM)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, L8_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, P4A4_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, A4P4_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8_USCALED)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P8_UNORM_PALETTE0)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, L8_UNORM_SRGB)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P8_UNORM_PALETTE1)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P4A4_UNORM_PALETTE1)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, A4P4_UNORM_PALETTE1)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, Y8_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, L8_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, L8_SINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, I8_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, I8_SINT)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, DXT1_RGB_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, R1_UINT)
+   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60, YCRCB_NORMAL)
+   SF( Y,  Y,  x,  Y,  Y,  x,  x,  x, 60, YCRCB_SWAPUVY)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P2_UNORM_PALETTE0)
+   SF(45, 45,  x,  x,  x,  x,  x,  x,  x, P2_UNORM_PALETTE1)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BC1_UNORM)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BC2_UNORM)
+   SF( Y,  Y,  x,  Y,  x,  x,  x,  x,  x, BC3_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC4_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC5_UNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC1_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC2_UNORM_SRGB)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC3_UNORM_SRGB)
+   SF( Y,  x,  x,  x,  x,  x,  x,  x,  x, MONO8)
+   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60, YCRCB_SWAPUV)
+   SF( Y,  Y,  x,  x,  Y,  x,  x,  x, 60, YCRCB_SWAPY)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, DXT1_RGB)
 /* smpl filt shad CK  RT  AB  VB  SO  color */
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_FXT1)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R8G8B8_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R64G64B64A64_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R64G64B64_FLOAT)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC4_SNORM)
-   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC5_SNORM)
-   SF(50, 50,  x,  x,  x,  x, 60,  x,  x, BRW_SURFACEFORMAT_R16G16B16_FLOAT)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16_UNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, BRW_SURFACEFORMAT_R16G16B16_USCALED)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC6H_SF16)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC7_UNORM)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC7_UNORM_SRGB)
-   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_BC6H_UF16)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_PLANAR_420_8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8_UNORM_SRGB)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC1_RGB8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_RGB8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_EAC_R11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_EAC_RG11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_EAC_SIGNED_R11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_EAC_SIGNED_RG11)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_SRGB8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R16G16B16_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R16G16B16_SINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R32_SFIXED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R10G10B10A2_SINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_SNORM)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_USCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_SSCALED)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_B10G10R10A2_SINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R64G64B64A64_PASSTHRU)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R64G64B64_PASSTHRU)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_RGB8_PTA)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_SRGB8_PTA)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_EAC_RGBA8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_ETC2_EAC_SRGB8_A8)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8_UINT)
-   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, BRW_SURFACEFORMAT_R8G8B8_SINT)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, FXT1)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R8G8B8_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R64G64B64A64_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R64G64B64_FLOAT)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC4_SNORM)
+   SF( Y,  Y,  x,  x,  x,  x,  x,  x,  x, BC5_SNORM)
+   SF(50, 50,  x,  x,  x,  x, 60,  x,  x, R16G16B16_FLOAT)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16_UNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  Y,  x,  x, R16G16B16_USCALED)
+   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BC6H_SF16)
+   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BC7_UNORM)
+   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BC7_UNORM_SRGB)
+   SF(70, 70,  x,  x,  x,  x,  x,  x,  x, BC6H_UF16)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, PLANAR_420_8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R8G8B8_UNORM_SRGB)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC1_RGB8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_RGB8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, EAC_R11)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, EAC_RG11)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, EAC_SIGNED_R11)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, EAC_SIGNED_RG11)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_SRGB8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R16G16B16_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R16G16B16_SINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R32_SFIXED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R10G10B10A2_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R10G10B10A2_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R10G10B10A2_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R10G10B10A2_SINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_SNORM)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_USCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_SSCALED)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, B10G10R10A2_SINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R64G64B64A64_PASSTHRU)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R64G64B64_PASSTHRU)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_RGB8_PTA)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_SRGB8_PTA)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_EAC_RGBA8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, ETC2_EAC_SRGB8_A8)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R8G8B8_UINT)
+   SF( x,  x,  x,  x,  x,  x,  x,  x,  x, R8G8B8_SINT)
 };
 #undef x
 #undef Y
 
+const char * /* Stringified format name from the SF() table; NULL if no entry. */
+brw_surface_format_name(unsigned format)
+{  /* Bounds-check first: an index past the end of the table must not be read. */
+   return format < ARRAY_SIZE(surface_formats) ? surface_formats[format].name : NULL;
+}
+
 uint32_t
 brw_format_for_mesa_format(mesa_format mesa_format)
 {
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 72b02a2..998d8c4 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -40,9 +40,88 @@
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
 static unsigned int
+tr_mode_horizontal_texture_alignment(const struct brw_context *brw, /* not read here */
+                                     const struct intel_mipmap_tree *mt)
+{  /* Returns the horizontal alignment, in elements, for mt's tiled-resource mode. */
+   const unsigned *align_yf, *align_ys; /* table pair chosen by mt->target below */
+   const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8; /* bytes * 8 = bits */
+   unsigned ret_align, divisor;
+
+   /* Horizontal alignment tables for TRMODE_{YF,YS}. Each value in the
+    * tables below is the horizontal alignment requirement, in elements,
+    * for the surface. An element is a pixel in uncompressed surface
+    * formats and a compression block in compressed surface formats; for
+    * MSFMT_DEPTH_STENCIL type multisampled surfaces an element is a
+    * sample. Entries are ordered by bpp: 8, 16, 32, 64, 128.
+    */
+   const unsigned align_1d_yf[] = {4096, 2048, 1024, 512, 256};
+   const unsigned align_1d_ys[] = {65536, 32768, 16384, 8192, 4096};
+   const unsigned align_2d_yf[] = {64, 64, 32, 32, 16};
+   const unsigned align_2d_ys[] = {256, 256, 128, 128, 64};
+   const unsigned align_3d_yf[] = {16, 8, 8, 8, 4};
+   const unsigned align_3d_ys[] = {64, 32, 32, 32, 16};
+   int i = 0; /* index into the tables above, computed from bpp further down */
+
+   /* Alignment computations below assume 8 <= bpp <= 128 and a power of 2. */
+   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp));
+
+   switch(mt->target) { /* pick the 1D, 2D or 3D table pair */
+   case GL_TEXTURE_1D:
+   case GL_TEXTURE_1D_ARRAY:
+      align_yf = align_1d_yf;
+      align_ys = align_1d_ys;
+      break;
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_RECTANGLE:
+   case GL_TEXTURE_2D_ARRAY:
+   case GL_TEXTURE_CUBE_MAP:
+   case GL_TEXTURE_CUBE_MAP_ARRAY:
+   case GL_TEXTURE_2D_MULTISAMPLE:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      align_yf = align_2d_yf;
+      align_ys = align_2d_ys;
+      break;
+   case GL_TEXTURE_3D:
+      align_yf = align_3d_yf;
+      align_ys = align_3d_ys;
+      break;
+   default:
+      unreachable("not reached"); /* no table exists for any other target */
+   }
+
+   /* Compute array index: log2(bpp / 8), i.e. 0..4 given the assert above. */
+   i = ffs(bpp/8) - 1;
+
+   ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
+               align_yf[i] : align_ys[i]; /* any mode other than YF uses the YS table */
+
+   assert(is_power_of_two(mt->num_samples));
+
+   switch (mt->num_samples) { /* multisampling shrinks the alignment by a divisor */
+   case 2:
+   case 4:
+      divisor = 2;
+      break;
+   case 8:
+   case 16:
+      divisor = 4;
+      break;
+   default:
+      divisor = 1; /* every other sample count: alignment is used as-is */
+      break;
+   }
+   return ret_align / divisor;
+}
+
+
+static unsigned int
 intel_horizontal_texture_alignment_unit(struct brw_context *brw,
-                                        struct intel_mipmap_tree *mt)
+                                        struct intel_mipmap_tree *mt,
+                                        uint32_t layout_flags)
 {
+   if (layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16)
+      return 16;
+
    /**
     * From the "Alignment Unit Size" section of various specs, namely:
     * - Gen3 Spec: "Memory Data Formats" Volume,         Section 1.20.1.4
@@ -88,18 +167,85 @@ intel_horizontal_texture_alignment_unit(struct brw_context *brw,
    if (mt->format == MESA_FORMAT_S_UINT8)
       return 8;
 
+   if (brw->gen >= 9 && mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
+      uint32_t align = tr_mode_horizontal_texture_alignment(brw, mt);
+      /* XY_FAST_COPY_BLT doesn't support horizontal alignment < 32. */
+      return align < 32 ? 32 : align;
+   }
+
    if (brw->gen >= 7 && mt->format == MESA_FORMAT_Z_UNORM16)
       return 8;
 
-   if (brw->gen == 8 && mt->mcs_mt && mt->num_samples <= 1)
-      return 16;
-
    return 4;
 }
 
 static unsigned int
+tr_mode_vertical_texture_alignment(const struct brw_context *brw,
+                                   const struct intel_mipmap_tree *mt)
+{
+   const unsigned *align_yf, *align_ys;
+   const unsigned bpp = _mesa_get_format_bytes(mt->format) * 8;
+   unsigned ret_align, divisor;
+
+   /* TRMODE_YF / TRMODE_YS vertical alignment, indexed by log2(bpp / 8). */
+   const unsigned align_2d_yf[] = {64, 32, 32, 16, 16};
+   const unsigned align_2d_ys[] = {256, 128, 128, 64, 64};
+   const unsigned align_3d_yf[] = {16, 16, 16, 8, 8};
+   const unsigned align_3d_ys[] = {32, 32, 32, 16, 16};
+   int i = 0;
+
+   assert(brw->gen >= 9 &&
+          mt->target != GL_TEXTURE_1D &&
+          mt->target != GL_TEXTURE_1D_ARRAY);
+
+   /* The five-entry tables only cover power-of-two bpp from 8 to 128. */
+   assert (bpp >= 8 && bpp <= 128 && is_power_of_two(bpp)) ;
+
+   switch(mt->target) {
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_RECTANGLE:
+   case GL_TEXTURE_2D_ARRAY:
+   case GL_TEXTURE_CUBE_MAP:
+   case GL_TEXTURE_CUBE_MAP_ARRAY:
+   case GL_TEXTURE_2D_MULTISAMPLE:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      align_yf = align_2d_yf;
+      align_ys = align_2d_ys;
+      break;
+   case GL_TEXTURE_3D:
+      align_yf = align_3d_yf;
+      align_ys = align_3d_ys;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   /* Table index: 8 bpp -> 0, 16 bpp -> 1, ..., 128 bpp -> 4. */
+   i = ffs(bpp / 8) - 1;
+
+   ret_align = mt->tr_mode == INTEL_MIPTREE_TRMODE_YF ?
+               align_yf[i] : align_ys[i];
+
+   assert(is_power_of_two(mt->num_samples));
+
+   switch (mt->num_samples) {
+   case 4:
+   case 8:
+      divisor = 2;
+      break;
+   case 16:
+      divisor = 4;
+      break;
+   default:
+      divisor = 1;
+      break;
+   }
+   return ret_align / divisor;
+}
+
+static unsigned int
 intel_vertical_texture_alignment_unit(struct brw_context *brw,
-                                      mesa_format format, bool multisampled)
+                                      const struct intel_mipmap_tree *mt)
 {
    /**
     * From the "Alignment Unit Size" section of various specs, namely:
@@ -124,23 +270,29 @@ intel_vertical_texture_alignment_unit(struct brw_context *brw,
     * Where "*" means either VALIGN_2 or VALIGN_4 depending on the setting of
     * the SURFACE_STATE "Surface Vertical Alignment" field.
     */
-   if (_mesa_is_format_compressed(format))
+   if (_mesa_is_format_compressed(mt->format))
       /* See comment above for the horizontal alignment */
       return brw->gen >= 9 ? 16 : 4;
 
-   if (format == MESA_FORMAT_S_UINT8)
+   if (mt->format == MESA_FORMAT_S_UINT8)
       return brw->gen >= 7 ? 8 : 4;
 
+   if (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE) {
+      uint32_t align = tr_mode_vertical_texture_alignment(brw, mt);
+      /* XY_FAST_COPY_BLT doesn't support vertical alignment < 64 */
+      return align < 64 ? 64 : align;
+   }
+
    /* Broadwell only supports VALIGN of 4, 8, and 16.  The BSpec says 4
     * should always be used, except for stencil buffers, which should be 8.
     */
    if (brw->gen >= 8)
       return 4;
 
-   if (multisampled)
+   if (mt->num_samples > 1)
       return 4;
 
-   GLenum base_format = _mesa_get_format_base_format(format);
+   GLenum base_format = _mesa_get_format_base_format(mt->format);
 
    if (brw->gen >= 6 &&
        (base_format == GL_DEPTH_COMPONENT ||
@@ -161,7 +313,7 @@ intel_vertical_texture_alignment_unit(struct brw_context *brw,
        *
        *     VALIGN_4 is not supported for surface format R32G32B32_FLOAT.
        */
-      if (base_format == GL_YCBCR_MESA || format == MESA_FORMAT_RGB_FLOAT32)
+      if (base_format == GL_YCBCR_MESA || mt->format == MESA_FORMAT_RGB_FLOAT32)
          return 2;
 
       return 4;
@@ -348,9 +500,9 @@ align_cube(struct intel_mipmap_tree *mt)
       mt->total_height += 2;
 }
 
-static bool
-use_linear_1d_layout(struct brw_context *brw,
-                     struct intel_mipmap_tree *mt)
+bool
+gen9_use_linear_1d_layout(const struct brw_context *brw,
+                          const struct intel_mipmap_tree *mt)
 {
    /* On Gen9+ the mipmap levels of a 1D surface are all laid out in a
     * horizontal line. This isn't done for depth/stencil buffers however
@@ -375,7 +527,7 @@ brw_miptree_layout_texture_array(struct brw_context *brw,
 				 struct intel_mipmap_tree *mt)
 {
    unsigned height = mt->physical_height0;
-   bool layout_1d = use_linear_1d_layout(brw, mt);
+   bool layout_1d = gen9_use_linear_1d_layout(brw, mt);
    int physical_qpitch;
 
    if (layout_1d)
@@ -458,46 +610,111 @@ brw_miptree_layout_texture_3d(struct brw_context *brw,
    align_cube(mt);
 }
 
-void
-brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt)
+/**
+ * \brief Choose the tiling for a miptree; helper for brw_miptree_layout().
+ */
+static uint32_t
+brw_miptree_choose_tiling(struct brw_context *brw,
+                          enum intel_miptree_tiling_mode requested,
+                          const struct intel_mipmap_tree *mt)
 {
-   bool multisampled = mt->num_samples > 1;
-   bool gen6_hiz_or_stencil = false;
+   if (mt->format == MESA_FORMAT_S_UINT8) {
+      /* The stencil buffer is W tiled. However, we request from the kernel a
+       * non-tiled buffer because the GTT is incapable of W fencing.
+       */
+      return I915_TILING_NONE;
+   }
 
-   if (brw->gen == 6 && mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
-      const GLenum base_format = _mesa_get_format_base_format(mt->format);
-      gen6_hiz_or_stencil = _mesa_is_depth_or_stencil_format(base_format);
+   /* Some usages may want only one type of tiling, like depth miptrees (Y
+    * tiled), or temporary BOs for uploading data once (linear).
+    */
+   switch (requested) {
+   case INTEL_MIPTREE_TILING_ANY:
+      break;
+   case INTEL_MIPTREE_TILING_Y:
+      return I915_TILING_Y;
+   case INTEL_MIPTREE_TILING_NONE:
+      return I915_TILING_NONE;
    }
 
-   if (gen6_hiz_or_stencil) {
-      /* On gen6, we use ALL_SLICES_AT_EACH_LOD for stencil/hiz because the
-       * hardware doesn't support multiple mip levels on stencil/hiz.
+   if (mt->num_samples > 1) {
+      /* From p82 of the Sandy Bridge PRM, dw3[1] of SURFACE_STATE ("Tiled
+       * Surface"):
        *
-       * PRM Vol 2, Part 1, 7.5.3 Hierarchical Depth Buffer:
-       * "The hierarchical depth buffer does not support the LOD field"
+       *   [DevSNB+]: For multi-sample render targets, this field must be
+       *   1. MSRTs can only be tiled.
        *
-       * PRM Vol 2, Part 1, 7.5.4.1 Separate Stencil Buffer:
-       * "The stencil depth buffer does not support the LOD field"
+       * Our usual reason for preferring X tiling (fast blits using the
+       * blitting engine) doesn't apply to MSAA, since we'll generally be
+       * downsampling or upsampling when blitting between the MSAA buffer
+       * and another buffer, and the blitting engine doesn't support that.
+       * So use Y tiling, since it makes better use of the cache.
        */
-      if (mt->format == MESA_FORMAT_S_UINT8) {
-         /* Stencil uses W tiling, so we force W tiling alignment for the
-          * ALL_SLICES_AT_EACH_LOD miptree layout.
-          */
-         mt->align_w = 64;
-         mt->align_h = 64;
-      } else {
-         /* Depth uses Y tiling, so we force need Y tiling alignment for the
-          * ALL_SLICES_AT_EACH_LOD miptree layout.
-          */
-         mt->align_w = 128 / mt->cpp;
-         mt->align_h = 32;
-      }
-   } else {
-      mt->align_w = intel_horizontal_texture_alignment_unit(brw, mt);
-      mt->align_h =
-         intel_vertical_texture_alignment_unit(brw, mt->format, multisampled);
+      return I915_TILING_Y;
+   }
+
+   GLenum base_format = _mesa_get_format_base_format(mt->format);
+   if (base_format == GL_DEPTH_COMPONENT ||
+       base_format == GL_DEPTH_STENCIL_EXT)
+      return I915_TILING_Y;
+
+   /* 1D textures (and 1D array textures) don't get any benefit from tiling,
+    * in fact it leads to a less efficient use of memory space and bandwidth
+    * due to tile alignment.
+    */
+   if (mt->logical_height0 == 1)
+      return I915_TILING_NONE;
+
+   int minimum_pitch = mt->total_width * mt->cpp;
+
+   /* If the width is much smaller than a tile, don't bother tiling. */
+   if (minimum_pitch < 64)
+      return I915_TILING_NONE;
+
+   if (ALIGN(minimum_pitch, 512) >= 32768 ||
+       mt->total_width >= 32768 || mt->total_height >= 32768) {
+      perf_debug("%dx%d miptree too large to blit, falling back to untiled",
+                 mt->total_width, mt->total_height);
+      return I915_TILING_NONE;
+   }
+
+   /* Pre-gen6 doesn't have BLORP to handle Y-tiling, so use X-tiling. */
+   if (brw->gen < 6)
+      return I915_TILING_X;
+
+   /* From the Sandybridge PRM, Volume 1, Part 2, page 32:
+    * "NOTE: 128BPE Format Color Buffer ( render target ) MUST be either TileX
+    *  or Linear."
+    * 128 bits per pixel translates to 16 bytes per pixel. This is necessary
+    * all the way back to 965, but is permitted on Gen7+.
+    */
+   if (brw->gen < 7 && mt->cpp >= 16)
+      return I915_TILING_X;
+
+   /* From the Ivy Bridge PRM, Vol4 Part1 2.12.2.1 (SURFACE_STATE for most
+    * messages), on p64, under the heading "Surface Vertical Alignment":
+    *
+    *     This field must be set to VALIGN_4 for all tiled Y Render Target
+    *     surfaces.
+    *
+    * So if the surface is renderable and uses a vertical alignment of 2,
+    * force it to be X tiled.  This is somewhat conservative (it's possible
+    * that the client won't ever render to this surface), but it's difficult
+    * to know that ahead of time.  And besides, since we use a vertical
+    * alignment of 4 as often as we can, this shouldn't happen very often.
+    */
+   if (brw->gen == 7 && mt->align_h == 2 &&
+       brw->format_supported_as_render_target[mt->format]) {
+      return I915_TILING_X;
    }
 
+   return I915_TILING_Y | I915_TILING_X;
+}
+
+static void
+intel_miptree_set_total_width_height(struct brw_context *brw,
+                                     struct intel_mipmap_tree *mt)
+{
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
       if (brw->gen == 4) {
@@ -532,7 +749,7 @@ brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt)
          break;
       case INTEL_MSAA_LAYOUT_NONE:
       case INTEL_MSAA_LAYOUT_IMS:
-         if (use_linear_1d_layout(brw, mt))
+         if (gen9_use_linear_1d_layout(brw, mt))
             gen9_miptree_layout_1d(mt);
          else
             brw_miptree_layout_2d(mt);
@@ -540,8 +757,62 @@ brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt)
       }
       break;
    }
+
    DBG("%s: %dx%dx%d\n", __func__,
        mt->total_width, mt->total_height, mt->cpp);
+}
+
+void
+brw_miptree_layout(struct brw_context *brw,
+                   struct intel_mipmap_tree *mt,
+                   enum intel_miptree_tiling_mode requested,
+                   uint32_t layout_flags)
+{
+   bool gen6_hiz_or_stencil = false;
+
+   mt->tr_mode = INTEL_MIPTREE_TRMODE_NONE;
+
+   if (brw->gen == 6 && mt->array_layout == ALL_SLICES_AT_EACH_LOD) {
+      const GLenum base_format = _mesa_get_format_base_format(mt->format);
+      gen6_hiz_or_stencil = _mesa_is_depth_or_stencil_format(base_format);
+   }
+
+   if (gen6_hiz_or_stencil) {
+      /* On gen6, we use ALL_SLICES_AT_EACH_LOD for stencil/hiz because the
+       * hardware doesn't support multiple mip levels on stencil/hiz.
+       *
+       * PRM Vol 2, Part 1, 7.5.3 Hierarchical Depth Buffer:
+       * "The hierarchical depth buffer does not support the LOD field"
+       *
+       * PRM Vol 2, Part 1, 7.5.4.1 Separate Stencil Buffer:
+       * "The stencil depth buffer does not support the LOD field"
+       */
+      if (mt->format == MESA_FORMAT_S_UINT8) {
+         /* Stencil uses W tiling, so we force W tiling alignment for the
+          * ALL_SLICES_AT_EACH_LOD miptree layout.
+          */
+         mt->align_w = 64;
+         mt->align_h = 64;
+         assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
+      } else {
+         /* Depth uses Y tiling, so we force Y tiling alignment for the
+          * ALL_SLICES_AT_EACH_LOD miptree layout.
+          */
+         mt->align_w = 128 / mt->cpp;
+         mt->align_h = 32;
+      }
+   } else {
+      mt->align_w =
+         intel_horizontal_texture_alignment_unit(brw, mt, layout_flags);
+      mt->align_h = intel_vertical_texture_alignment_unit(brw, mt);
+   }
+
+   intel_miptree_set_total_width_height(brw, mt);
+
+   if (!mt->total_width || !mt->total_height) {
+      intel_miptree_release(&mt);
+      return;
+   }
 
    /* On Gen9+ the alignment values are expressed in multiples of the block
     * size
@@ -552,5 +823,8 @@ brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt)
       mt->align_w /= i;
       mt->align_h /= j;
    }
+
+   if ((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0)
+      mt->tiling = brw_miptree_choose_tiling(brw, requested, mt);
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_util.h b/src/mesa/drivers/dri/i965/brw_util.h
index b548d23..04e4e94 100644
--- a/src/mesa/drivers/dri/i965/brw_util.h
+++ b/src/mesa/drivers/dri/i965/brw_util.h
@@ -35,9 +35,47 @@
 
 #include "main/mtypes.h"
 #include "main/imports.h"
+#include "brw_context.h"
 
 extern GLuint brw_translate_blend_factor( GLenum factor );
 extern GLuint brw_translate_blend_equation( GLenum mode );
 extern GLenum brw_fix_xRGB_alpha(GLenum function);
 
+static inline uint32_t
+brw_get_line_width(struct brw_context *brw)
+{
+   /* Round only for non-antialiased, non-MSAA lines.  OpenGL 4.4 spec:
+    *
+    * "The actual width of non-antialiased lines is determined by rounding
+    * the supplied width to the nearest integer, then clamping it to the
+    * implementation-dependent maximum non-antialiased line width."
+    */
+   float line_width =
+      CLAMP(!brw->ctx.Multisample._Enabled && !brw->ctx.Line.SmoothFlag
+            ? roundf(brw->ctx.Line.Width) : brw->ctx.Line.Width,
+            0.0, brw->ctx.Const.MaxLineWidth);
+   uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
+
+   /* Line width of 0 is not allowed when MSAA enabled; bump it to 1. */
+   if (brw->ctx.Multisample._Enabled) {
+      if (line_width_u3_7 == 0)
+         line_width_u3_7 = 1;
+   } else if (brw->ctx.Line.SmoothFlag && line_width < 1.5) {
+      /* For 1 pixel line thickness or less, the general
+       * anti-aliasing algorithm gives up, and a garbage line is
+       * generated.  Setting a Line Width of 0.0 specifies the
+       * rasterization of the "thinnest" (one-pixel-wide),
+       * non-antialiased lines.
+       *
+       * Lines rendered with zero Line Width are rasterized using
+       * Grid Intersection Quantization rules as specified by
+       * bspec section 6.3.12.1 Zero-Width (Cosmetic) Line
+       * Rasterization.
+       */
+      line_width_u3_7 = 0;
+   }
+
+   return line_width_u3_7;
+}
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 2841d98..a5c686c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -35,6 +35,7 @@ extern "C" {
 #include "program/prog_print.h"
 #include "program/prog_parameter.h"
 }
+#include "main/context.h"
 
 #define MAX_INSTRUCTION (1 << 30)
 
@@ -1676,20 +1677,16 @@ vec4_visitor::emit_shader_time_end()
     */
    emit(ADD(diff, src_reg(diff), src_reg(-2u)));
 
-   emit_shader_time_write(st_base, src_reg(diff));
-   emit_shader_time_write(st_written, src_reg(1u));
+   emit_shader_time_write(0, src_reg(diff));
+   emit_shader_time_write(1, src_reg(1u));
    emit(BRW_OPCODE_ELSE);
-   emit_shader_time_write(st_reset, src_reg(1u));
+   emit_shader_time_write(2, src_reg(1u));
    emit(BRW_OPCODE_ENDIF);
 }
 
 void
-vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
-                                     src_reg value)
+vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
 {
-   int shader_time_index =
-      brw_get_shader_time_index(brw, shader_prog, prog, type);
-
    dst_reg dst =
       dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
 
@@ -1698,7 +1695,8 @@ vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
    time.reg_offset++;
 
    offset.type = BRW_REGISTER_TYPE_UD;
-   emit(MOV(offset, src_reg(shader_time_index * SHADER_TIME_STRIDE)));
+   int index = shader_time_index * 3 + shader_time_subindex;
+   emit(MOV(offset, src_reg(index * SHADER_TIME_STRIDE)));
 
    time.type = BRW_REGISTER_TYPE_UD;
    emit(MOV(time, src_reg(value)));
@@ -1709,11 +1707,11 @@ vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
 }
 
 bool
-vec4_visitor::run()
+vec4_visitor::run(gl_clip_plane *clip_planes)
 {
    sanity_param_count = prog->Parameters->NumParameters;
 
-   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+   if (shader_time_index >= 0)
       emit_shader_time_begin();
 
    assign_binding_table_offsets();
@@ -1731,7 +1729,7 @@ vec4_visitor::run()
    base_ir = NULL;
 
    if (key->userclip_active && !prog->UsesClipDistanceOut)
-      setup_uniform_clipplane_values();
+      setup_uniform_clipplane_values(clip_planes);
 
    emit_thread_end();
 
@@ -1768,7 +1766,7 @@ vec4_visitor::run()
          snprintf(filename, 64, "%s-%04d-%02d-%02d-" #pass,            \
                   stage_abbrev, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                        \
-         backend_visitor::dump_instructions(filename);                 \
+         backend_shader::dump_instructions(filename);                  \
       }                                                                \
                                                                        \
       progress = progress || this_progress;                            \
@@ -1781,7 +1779,7 @@ vec4_visitor::run()
       snprintf(filename, 64, "%s-%04d-00-start",
                stage_abbrev, shader_prog ? shader_prog->Name : 0);
 
-      backend_visitor::dump_instructions(filename);
+      backend_shader::dump_instructions(filename);
    }
 
    bool progress;
@@ -1868,8 +1866,6 @@ brw_vs_emit(struct brw_context *brw,
    bool start_busy = false;
    double start_time = 0;
    const unsigned *assembly = NULL;
-   bool use_nir =
-      brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions != NULL;
 
    if (unlikely(brw->perf_debug)) {
       start_busy = (brw->batch.last_bo &&
@@ -1881,22 +1877,33 @@ brw_vs_emit(struct brw_context *brw,
    if (prog)
       shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
 
+   int st_index = -1;
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      st_index = brw_get_shader_time_index(brw, prog, &c->vp->program.Base,
+                                           ST_VS);
+
    if (unlikely(INTEL_DEBUG & DEBUG_VS))
       brw_dump_ir("vertex", prog, &shader->base, &c->vp->program.Base);
 
-   if (use_nir && !c->vp->program.Base.nir) {
-      /* Normally we generate NIR in LinkShader() or ProgramStringNotify(), but
-       * Mesa's fixed-function vertex program handling doesn't notify the driver
-       * at all.  Just do it here, at the last minute, even though it's lame.
-       */
-      assert(c->vp->program.Base.Id == 0 && prog == NULL);
-      c->vp->program.Base.nir =
-         brw_create_nir(brw, NULL, &c->vp->program.Base, MESA_SHADER_VERTEX);
-   }
+   if (brw->intelScreen->compiler->scalar_vs) {
+      if (!c->vp->program.Base.nir) {
+         /* Normally we generate NIR in LinkShader() or
+          * ProgramStringNotify(), but Mesa's fixed-function vertex program
+          * handling doesn't notify the driver at all.  Just do it here, at
+          * the last minute, even though it's lame.
+          */
+         assert(c->vp->program.Base.Id == 0 && prog == NULL);
+         c->vp->program.Base.nir =
+            brw_create_nir(brw, NULL, &c->vp->program.Base, MESA_SHADER_VERTEX);
+      }
 
-   if (brw->scalar_vs && (prog || use_nir)) {
-      fs_visitor v(brw, mem_ctx, &c->key, prog_data, prog, &c->vp->program, 8);
-      if (!v.run_vs()) {
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_visitor v(brw->intelScreen->compiler, brw,
+                   mem_ctx, MESA_SHADER_VERTEX, &c->key,
+                   &prog_data->base.base, prog, &c->vp->program.Base,
+                   8, st_index);
+      if (!v.run_vs(brw_select_clip_planes(&brw->ctx))) {
          if (prog) {
             prog->LinkStatus = false;
             ralloc_strcat(&prog->InfoLog, v.fail_msg);
@@ -1908,7 +1915,8 @@ brw_vs_emit(struct brw_context *brw,
          return NULL;
       }
 
-      fs_generator g(brw, mem_ctx, (void *) &c->key, &prog_data->base.base,
+      fs_generator g(brw->intelScreen->compiler, brw,
+                     mem_ctx, (void *) &c->key, &prog_data->base.base,
                      &c->vp->program.Base, v.promoted_constants,
                      v.runtime_check_aads_emit, "VS");
       if (INTEL_DEBUG & DEBUG_VS) {
@@ -1926,13 +1934,16 @@ brw_vs_emit(struct brw_context *brw,
       g.generate_code(v.cfg, 8);
       assembly = g.get_assembly(final_assembly_size);
 
-      prog_data->base.simd8 = true;
       c->base.last_scratch = v.last_scratch;
    }
 
    if (!assembly) {
-      vec4_vs_visitor v(brw, c, prog_data, prog, mem_ctx);
-      if (!v.run()) {
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+
+      vec4_vs_visitor v(brw->intelScreen->compiler,
+                        c, prog_data, prog, mem_ctx, st_index,
+                        !_mesa_is_gles3(&brw->ctx));
+      if (!v.run(brw_select_clip_planes(&brw->ctx))) {
          if (prog) {
             prog->LinkStatus = false;
             ralloc_strcat(&prog->InfoLog, v.fail_msg);
@@ -1944,7 +1955,8 @@ brw_vs_emit(struct brw_context *brw,
          return NULL;
       }
 
-      vec4_generator g(brw, prog, &c->vp->program.Base, &prog_data->base,
+      vec4_generator g(brw->intelScreen->compiler, brw,
+                       prog, &c->vp->program.Base, &prog_data->base,
                        mem_ctx, INTEL_DEBUG & DEBUG_VS, "vertex", "VS");
       assembly = g.generate_assembly(v.cfg, final_assembly_size);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h
index 628c631..2ac1693 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4.h
@@ -73,10 +73,10 @@ class vec4_live_variables;
  * Translates either GLSL IR or Mesa IR (for ARB_vertex_program and
  * fixed-function) into VS IR.
  */
-class vec4_visitor : public backend_visitor
+class vec4_visitor : public backend_shader, public ir_visitor
 {
 public:
-   vec4_visitor(struct brw_context *brw,
+   vec4_visitor(const struct brw_compiler *compiler,
                 struct brw_vec4_compile *c,
                 struct gl_program *prog,
                 const struct brw_vue_prog_key *key,
@@ -85,9 +85,7 @@ public:
                 gl_shader_stage stage,
 		void *mem_ctx,
                 bool no_spills,
-                shader_time_shader_type st_base,
-                shader_time_shader_type st_written,
-                shader_time_shader_type st_reset);
+                int shader_time_index);
    ~vec4_visitor();
 
    dst_reg dst_null_f()
@@ -160,6 +158,7 @@ public:
    virtual void visit(ir_if *);
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void visit(ir_barrier *);
    /*@}*/
 
    src_reg result;
@@ -178,10 +177,10 @@ public:
 
    struct hash_table *variable_ht;
 
-   bool run(void);
+   bool run(gl_clip_plane *clip_planes);
    void fail(const char *msg, ...);
 
-   void setup_uniform_clipplane_values();
+   void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
    void setup_uniform_values(ir_variable *ir);
    void setup_builtin_uniform_values(ir_variable *ir);
    int setup_uniforms(int payload_reg);
@@ -344,8 +343,7 @@ public:
 
    void emit_shader_time_begin();
    void emit_shader_time_end();
-   void emit_shader_time_write(enum shader_time_shader_type type,
-                               src_reg value);
+   void emit_shader_time_write(int shader_time_subindex, src_reg value);
 
    void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                             dst_reg dst, src_reg offset, src_reg src0,
@@ -412,9 +410,7 @@ private:
     */
    const bool no_spills;
 
-   const shader_time_shader_type st_base;
-   const shader_time_shader_type st_written;
-   const shader_time_shader_type st_reset;
+   int shader_time_index;
 };
 
 
@@ -426,7 +422,7 @@ private:
 class vec4_generator
 {
 public:
-   vec4_generator(struct brw_context *brw,
+   vec4_generator(const struct brw_compiler *compiler, void *log_data,
                   struct gl_shader_program *shader_prog,
                   struct gl_program *prog,
                   struct brw_vue_prog_data *prog_data,
@@ -508,7 +504,9 @@ private:
                                          struct brw_reg dst);
    void generate_unpack_flags(struct brw_reg dst);
 
-   struct brw_context *brw;
+   const struct brw_compiler *compiler;
+   void *log_data; /* Passed to compiler->*_log functions */
+
    const struct brw_device_info *devinfo;
 
    struct brw_codegen *p;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 9147c3c..c9fe0ce 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -114,8 +114,16 @@ instructions_match(vec4_instruction *a, vec4_instruction *b)
 {
    return a->opcode == b->opcode &&
           a->saturate == b->saturate &&
+          a->predicate == b->predicate &&
+          a->predicate_inverse == b->predicate_inverse &&
           a->conditional_mod == b->conditional_mod &&
+          a->flag_subreg == b->flag_subreg &&
           a->dst.type == b->dst.type &&
+          a->offset == b->offset &&
+          a->mlen == b->mlen &&
+          a->base_mrf == b->base_mrf &&
+          a->header_size == b->header_size &&
+          a->shadow_compare == b->shadow_compare &&
           a->dst.writemask == b->dst.writemask &&
           a->force_writemask_all == b->force_writemask_all &&
           a->regs_written == b->regs_written &&
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index ef77b8d..d2de2f0 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -134,7 +134,8 @@ vec4_instruction::get_src(const struct brw_vue_prog_data *prog_data, int i)
    return brw_reg;
 }
 
-vec4_generator::vec4_generator(struct brw_context *brw,
+vec4_generator::vec4_generator(const struct brw_compiler *compiler,
+                               void *log_data,
                                struct gl_shader_program *shader_prog,
                                struct gl_program *prog,
                                struct brw_vue_prog_data *prog_data,
@@ -142,13 +143,13 @@ vec4_generator::vec4_generator(struct brw_context *brw,
                                bool debug_flag,
                                const char *stage_name,
                                const char *stage_abbrev)
-   : brw(brw), devinfo(brw->intelScreen->devinfo),
+   : compiler(compiler), log_data(log_data), devinfo(compiler->devinfo),
      shader_prog(shader_prog), prog(prog), prog_data(prog_data),
      mem_ctx(mem_ctx), stage_name(stage_name), stage_abbrev(stage_abbrev),
      debug_flag(debug_flag)
 {
    p = rzalloc(mem_ctx, struct brw_codegen);
-   brw_init_codegen(brw->intelScreen->devinfo, p, mem_ctx);
+   brw_init_codegen(devinfo, p, mem_ctx);
 }
 
 vec4_generator::~vec4_generator()
@@ -398,30 +399,25 @@ vec4_generator::generate_tex(vec4_instruction *inst,
       brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
    } else {
       /* Non-constant sampler index. */
-      /* Note: this clobbers `dst` as a temporary before emitting the send */
 
       struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
-      struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));
-
       struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
 
       brw_push_insn_state(p);
       brw_set_default_mask_control(p, BRW_MASK_DISABLE);
       brw_set_default_access_mode(p, BRW_ALIGN_1);
 
-      /* Some care required: `sampler` and `temp` may alias:
-       *    addr = sampler & 0xff
-       *    temp = (sampler << 8) & 0xf00
-       *    addr = addr | temp
-       */
-      brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
-      brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
-      brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
-      brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
-      brw_OR(p, addr, addr, temp);
+      /* addr = ((sampler * 0x101) + base_binding_table_index) & 0xfff */
+      brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      if (base_binding_table_index)
+         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
+      brw_AND(p, addr, addr, brw_imm_ud(0xfff));
 
       brw_pop_insn_state(p);
 
+      if (inst->base_mrf != -1)
+         gen6_resolve_implied_move(p, &src, inst->base_mrf);
+
       /* dst = send(offset, a0.0 | <descriptor>) */
       brw_inst *insn = brw_send_indirect_message(
          p, BRW_SFID_SAMPLER, dst, src, addr);
@@ -1631,16 +1627,11 @@ vec4_generator::generate_code(const cfg_t *cfg)
       ralloc_free(annotation.ann);
    }
 
-   static GLuint msg_id = 0;
-   _mesa_gl_debug(&brw->ctx, &msg_id,
-                  MESA_DEBUG_SOURCE_SHADER_COMPILER,
-                  MESA_DEBUG_TYPE_OTHER,
-                  MESA_DEBUG_SEVERITY_NOTIFICATION,
-                  "%s vec4 shader: %d inst, %d loops, "
-                  "compacted %d to %d bytes.\n",
-                  stage_abbrev,
-                  before_size / 16, loop_count,
-                  before_size, after_size);
+   compiler->shader_debug_log(log_data,
+                              "%s vec4 shader: %d inst, %d loops, "
+                              "compacted %d to %d bytes.\n",
+                              stage_abbrev, before_size / 16, loop_count,
+                              before_size, after_size);
 }
 
 const unsigned *
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 363e30e..69bcf5a 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -34,15 +34,15 @@ const unsigned MAX_GS_INPUT_VERTICES = 6;
 
 namespace brw {
 
-vec4_gs_visitor::vec4_gs_visitor(struct brw_context *brw,
+vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
                                  struct brw_gs_compile *c,
                                  struct gl_shader_program *prog,
                                  void *mem_ctx,
-                                 bool no_spills)
-   : vec4_visitor(brw, &c->base, &c->gp->program.Base, &c->key.base,
+                                 bool no_spills,
+                                 int shader_time_index)
+   : vec4_visitor(compiler, &c->base, &c->gp->program.Base, &c->key.base,
                   &c->prog_data.base, prog, MESA_SHADER_GEOMETRY, mem_ctx,
-                  no_spills,
-                  ST_GS, ST_GS_WRITTEN, ST_GS_RESET),
+                  no_spills, shader_time_index),
      c(c)
 {
 }
@@ -106,7 +106,7 @@ vec4_gs_visitor::setup_payload()
     * to be interleaved, so one register contains two attribute slots.
     */
    int attributes_per_reg =
-      c->prog_data.dispatch_mode == GEN7_GS_DISPATCH_MODE_DUAL_OBJECT ? 1 : 2;
+      c->prog_data.base.dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
 
    /* If a geometry shader tries to read from an input that wasn't written by
     * the vertex shader, that produces undefined results, but it shouldn't
@@ -629,7 +629,8 @@ generate_assembly(struct brw_context *brw,
                   const cfg_t *cfg,
                   unsigned *final_assembly_size)
 {
-   vec4_generator g(brw, shader_prog, prog, prog_data, mem_ctx,
+   vec4_generator g(brw->intelScreen->compiler, brw,
+                    shader_prog, prog, prog_data, mem_ctx,
                     INTEL_DEBUG & DEBUG_GS, "geometry", "GS");
    return g.generate_assembly(cfg, final_assembly_size);
 }
@@ -648,6 +649,10 @@ brw_gs_emit(struct brw_context *brw,
       brw_dump_ir("geometry", prog, &shader->base, NULL);
    }
 
+   int st_index = -1;
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      st_index = brw_get_shader_time_index(brw, prog, NULL, ST_GS);
+
    if (brw->gen >= 7) {
       /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
        * so without spilling. If the GS invocations count > 1, then we can't use
@@ -655,10 +660,11 @@ brw_gs_emit(struct brw_context *brw,
        */
       if (c->prog_data.invocations <= 1 &&
           likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
-         c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_OBJECT;
+         c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
 
-         vec4_gs_visitor v(brw, c, prog, mem_ctx, true /* no_spills */);
-         if (v.run()) {
+         vec4_gs_visitor v(brw->intelScreen->compiler,
+                           c, prog, mem_ctx, true /* no_spills */, st_index);
+         if (v.run(NULL /* clip planes */)) {
             return generate_assembly(brw, prog, &c->gp->program.Base,
                                      &c->prog_data.base, mem_ctx, v.cfg,
                                      final_assembly_size);
@@ -690,19 +696,23 @@ brw_gs_emit(struct brw_context *brw,
     * SINGLE mode.
     */
    if (c->prog_data.invocations <= 1 || brw->gen < 7)
-      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_SINGLE;
+      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
    else
-      c->prog_data.dispatch_mode = GEN7_GS_DISPATCH_MODE_DUAL_INSTANCE;
+      c->prog_data.base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
 
    vec4_gs_visitor *gs = NULL;
    const unsigned *ret = NULL;
 
    if (brw->gen >= 7)
-      gs = new vec4_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);
+      gs = new vec4_gs_visitor(brw->intelScreen->compiler,
+                               c, prog, mem_ctx, false /* no_spills */,
+                               st_index);
    else
-      gs = new gen6_gs_visitor(brw, c, prog, mem_ctx, false /* no_spills */);
+      gs = new gen6_gs_visitor(brw->intelScreen->compiler,
+                               c, prog, mem_ctx, false /* no_spills */,
+                               st_index);
 
-   if (!gs->run()) {
+   if (!gs->run(NULL /* clip planes */)) {
       prog->LinkStatus = false;
       ralloc_strcat(&prog->InfoLog, gs->fail_msg);
    } else {
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
index bcb5a2b..e693c56 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.h
@@ -68,11 +68,12 @@ namespace brw {
 class vec4_gs_visitor : public vec4_visitor
 {
 public:
-   vec4_gs_visitor(struct brw_context *brw,
+   vec4_gs_visitor(const struct brw_compiler *compiler,
                    struct brw_gs_compile *c,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
-                   bool no_spills);
+                   bool no_spills,
+                   int shader_time_index);
 
 protected:
    virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
index 5368a75..555c42e 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_reg_allocate.cpp
@@ -191,7 +191,6 @@ vec4_visitor::setup_payload_interference(struct ra_graph *g,
 bool
 vec4_visitor::reg_allocate()
 {
-   struct brw_compiler *compiler = brw->intelScreen->compiler;
    unsigned int hw_reg_mapping[alloc.count];
    int payload_reg_count = this->first_non_payload_grf;
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index e51c140..236fa51 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -684,9 +684,12 @@ vec4_visitor::setup_uniform_values(ir_variable *ir)
     * order we'd walk the type, so walk the list of storage and find anything
     * with our name, or the prefix of a component that starts with our name.
     */
-   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
+   for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
       struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
 
+      if (storage->builtin)
+         continue;
+
       if (strncmp(ir->name, storage->name, namelen) != 0 ||
           (storage->name[namelen] != 0 &&
            storage->name[namelen] != '.' &&
@@ -718,10 +721,8 @@ vec4_visitor::setup_uniform_values(ir_variable *ir)
 }
 
 void
-vec4_visitor::setup_uniform_clipplane_values()
+vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
 {
-   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
-
    for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
       assert(this->uniforms < uniform_array_size);
       this->uniform_vector_size[this->uniforms] = 4;
@@ -2461,11 +2462,27 @@ vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler
       new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
                                     dst_reg(this, glsl_type::uvec4_type));
    inst->base_mrf = 2;
-   inst->mlen = 1;
    inst->src[1] = sampler;
 
+   int param_base;
+
+   if (devinfo->gen >= 9) {
+      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
+      vec4_instruction *header_inst = new(mem_ctx)
+         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+                          dst_reg(MRF, inst->base_mrf));
+
+      emit(header_inst);
+
+      inst->mlen = 2;
+      inst->header_size = 1;
+      param_base = inst->base_mrf + 1;
+   } else {
+      inst->mlen = 1;
+      param_base = inst->base_mrf;
+   }
+
    /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
-   int param_base = inst->base_mrf;
    int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
    int zero_mask = 0xf & ~coord_mask;
 
@@ -2949,6 +2966,12 @@ vec4_visitor::visit(ir_end_primitive *)
 }
 
 void
+vec4_visitor::visit(ir_barrier *)
+{
+   unreachable("not reached");
+}
+
+void
 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                                   dst_reg dst, src_reg offset,
                                   src_reg src0, src_reg src1)
@@ -3655,7 +3678,7 @@ vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
    *reg = neg_result;
 }
 
-vec4_visitor::vec4_visitor(struct brw_context *brw,
+vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
                            struct brw_vec4_compile *c,
                            struct gl_program *prog,
                            const struct brw_vue_prog_key *key,
@@ -3664,10 +3687,9 @@ vec4_visitor::vec4_visitor(struct brw_context *brw,
                            gl_shader_stage stage,
 			   void *mem_ctx,
                            bool no_spills,
-                           shader_time_shader_type st_base,
-                           shader_time_shader_type st_written,
-                           shader_time_shader_type st_reset)
-   : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
+                           int shader_time_index)
+   : backend_shader(compiler, NULL, mem_ctx,
+                    shader_prog, prog, &prog_data->base, stage),
      c(c),
      key(key),
      prog_data(prog_data),
@@ -3676,11 +3698,8 @@ vec4_visitor::vec4_visitor(struct brw_context *brw,
      first_non_payload_grf(0),
      need_all_constants_in_pull_buffer(false),
      no_spills(no_spills),
-     st_base(st_base),
-     st_written(st_written),
-     st_reset(st_reset)
+     shader_time_index(shader_time_index)
 {
-   this->mem_ctx = mem_ctx;
    this->failed = false;
 
    this->base_ir = NULL;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
index 92d1085..dcbd240 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vp.cpp
@@ -381,8 +381,7 @@ vec4_vs_visitor::emit_program_code()
          break;
 
       default:
-         _mesa_problem(ctx, "Unsupported opcode %s in vertex program\n",
-                       _mesa_opcode_string(vpi->Opcode));
+         assert(!"Unsupported opcode in vertex program");
       }
 
       /* Copy the temporary back into the actual destination register. */
@@ -574,15 +573,13 @@ vec4_vs_visitor::get_vp_src_reg(const prog_src_register &src)
          break;
 
       default:
-         _mesa_problem(ctx, "bad uniform src register file: %s\n",
-                       _mesa_register_file_name((gl_register_file)src.File));
+         assert(!"Bad uniform in src register file");
          return src_reg(this, glsl_type::vec4_type);
       }
       break;
 
    default:
-      _mesa_problem(ctx, "bad src register file: %s\n",
-                    _mesa_register_file_name((gl_register_file)src.File));
+      assert(!"Bad src register file");
       return src_reg(this, glsl_type::vec4_type);
    }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
index 4baf73e..f93062b 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_vs_visitor.cpp
@@ -23,7 +23,6 @@
 
 
 #include "brw_vs.h"
-#include "main/context.h"
 
 
 namespace brw {
@@ -78,7 +77,7 @@ vec4_vs_visitor::emit_prolog()
             /* ES 3.0 has different rules for converting signed normalized
              * fixed-point numbers than desktop GL.
              */
-            if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
+            if ((wa_flags & BRW_ATTRIB_WA_SIGN) && !use_legacy_snorm_formula) {
                /* According to equation 2.2 of the ES 3.0 specification,
                 * signed normalization conversion is done by:
                 *
@@ -212,18 +211,21 @@ vec4_vs_visitor::emit_thread_end()
 }
 
 
-vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
+vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
                                  struct brw_vs_compile *vs_compile,
                                  struct brw_vs_prog_data *vs_prog_data,
                                  struct gl_shader_program *prog,
-                                 void *mem_ctx)
-   : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
+                                 void *mem_ctx,
+                                 int shader_time_index,
+                                 bool use_legacy_snorm_formula)
+   : vec4_visitor(compiler, &vs_compile->base, &vs_compile->vp->program.Base,
                   &vs_compile->key.base, &vs_prog_data->base, prog,
                   MESA_SHADER_VERTEX,
                   mem_ctx, false /* no_spills */,
-                  ST_VS, ST_VS_WRITTEN, ST_VS_RESET),
+                  shader_time_index),
      vs_compile(vs_compile),
-     vs_prog_data(vs_prog_data)
+     vs_prog_data(vs_prog_data),
+     use_legacy_snorm_formula(use_legacy_snorm_formula)
 {
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_vs.c b/src/mesa/drivers/dri/i965/brw_vs.c
index d03567e..6e9848f 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.c
+++ b/src/mesa/drivers/dri/i965/brw_vs.c
@@ -40,108 +40,6 @@
 
 #include "util/ralloc.h"
 
-static inline void assign_vue_slot(struct brw_vue_map *vue_map,
-                                   int varying)
-{
-   /* Make sure this varying hasn't been assigned a slot already */
-   assert (vue_map->varying_to_slot[varying] == -1);
-
-   vue_map->varying_to_slot[varying] = vue_map->num_slots;
-   vue_map->slot_to_varying[vue_map->num_slots++] = varying;
-}
-
-/**
- * Compute the VUE map for vertex shader program.
- */
-void
-brw_compute_vue_map(const struct brw_device_info *devinfo,
-                    struct brw_vue_map *vue_map,
-                    GLbitfield64 slots_valid)
-{
-   vue_map->slots_valid = slots_valid;
-   int i;
-
-   /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
-    * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
-    */
-   slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
-
-   /* Make sure that the values we store in vue_map->varying_to_slot and
-    * vue_map->slot_to_varying won't overflow the signed chars that are used
-    * to store them.  Note that since vue_map->slot_to_varying sometimes holds
-    * values equal to BRW_VARYING_SLOT_COUNT, we need to ensure that
-    * BRW_VARYING_SLOT_COUNT is <= 127, not 128.
-    */
-   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
-
-   vue_map->num_slots = 0;
-   for (i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
-      vue_map->varying_to_slot[i] = -1;
-      vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_COUNT;
-   }
-
-   /* VUE header: format depends on chip generation and whether clipping is
-    * enabled.
-    */
-   if (devinfo->gen < 6) {
-      /* There are 8 dwords in VUE header pre-Ironlake:
-       * dword 0-3 is indices, point width, clip flags.
-       * dword 4-7 is ndc position
-       * dword 8-11 is the first vertex data.
-       *
-       * On Ironlake the VUE header is nominally 20 dwords, but the hardware
-       * will accept the same header layout as Gen4 [and should be a bit faster]
-       */
-      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ);
-      assign_vue_slot(vue_map, BRW_VARYING_SLOT_NDC);
-      assign_vue_slot(vue_map, VARYING_SLOT_POS);
-   } else {
-      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
-       * dword 0-3 of the header is indices, point width, clip flags.
-       * dword 4-7 is the 4D space position
-       * dword 8-15 of the vertex header is the user clip distance if
-       * enabled.
-       * dword 8-11 or 16-19 is the first vertex element data we fill.
-       */
-      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ);
-      assign_vue_slot(vue_map, VARYING_SLOT_POS);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0))
-         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST0);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1))
-         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST1);
-
-      /* front and back colors need to be consecutive so that we can use
-       * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING to swizzle them when doing
-       * two-sided color.
-       */
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL0))
-         assign_vue_slot(vue_map, VARYING_SLOT_COL0);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC0))
-         assign_vue_slot(vue_map, VARYING_SLOT_BFC0);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL1))
-         assign_vue_slot(vue_map, VARYING_SLOT_COL1);
-      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC1))
-         assign_vue_slot(vue_map, VARYING_SLOT_BFC1);
-   }
-
-   /* The hardware doesn't care about the rest of the vertex outputs, so just
-    * assign them contiguously.  Don't reassign outputs that already have a
-    * slot.
-    *
-    * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
-    * since it's encoded as the clip distances by emit_clip_distances().
-    * However, it may be output by transform feedback, and we'd rather not
-    * recompute state when TF changes, so we just always include it.
-    */
-   for (int i = 0; i < VARYING_SLOT_MAX; ++i) {
-      if ((slots_valid & BITFIELD64_BIT(i)) &&
-          vue_map->varying_to_slot[i] == -1) {
-         assign_vue_slot(vue_map, i);
-      }
-   }
-}
-
-
 /**
  * Decide which set of clip planes should be used when clipping via
  * gl_Position or gl_ClipVertex.
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 6157ae6..61f9b00 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -90,11 +90,13 @@ namespace brw {
 class vec4_vs_visitor : public vec4_visitor
 {
 public:
-   vec4_vs_visitor(struct brw_context *brw,
+   vec4_vs_visitor(const struct brw_compiler *compiler,
                    struct brw_vs_compile *vs_compile,
                    struct brw_vs_prog_data *vs_prog_data,
                    struct gl_shader_program *prog,
-                   void *mem_ctx);
+                   void *mem_ctx,
+                   int shader_time_index,
+                   bool use_legacy_snorm_formula);
 
 protected:
    virtual dst_reg *make_reg_for_system_value(ir_variable *ir);
@@ -115,6 +117,8 @@ private:
    struct brw_vs_prog_data * const vs_prog_data;
    src_reg *vp_temp_regs;
    src_reg vp_addr_reg;
+
+   bool use_legacy_snorm_formula;
 };
 
 } /* namespace brw */
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
index f82a62b..b2f91bd 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -121,7 +121,7 @@ brw_upload_vs_pull_constants(struct brw_context *brw)
    /* BRW_NEW_VS_PROG_DATA */
    const struct brw_stage_prog_data *prog_data = &brw->vs.prog_data->base.base;
 
-   dword_pitch = brw->vs.prog_data->base.simd8;
+   dword_pitch = brw->vs.prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
 
    /* _NEW_PROGRAM_CONSTANTS */
    brw_upload_pull_constants(brw, BRW_NEW_VS_CONSTBUF, &vp->program.Base,
@@ -151,7 +151,7 @@ brw_upload_vs_ubo_surfaces(struct brw_context *brw)
       return;
 
    /* BRW_NEW_VS_PROG_DATA */
-   dword_pitch = brw->vs.prog_data->base.simd8;
+   dword_pitch = brw->vs.prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8;
    brw_upload_ubo_surfaces(brw, prog->_LinkedShaders[MESA_SHADER_VERTEX],
                            &brw->vs.base, &brw->vs.prog_data->base.base,
                            dword_pitch);
diff --git a/src/mesa/drivers/dri/i965/brw_vue_map.c b/src/mesa/drivers/dri/i965/brw_vue_map.c
new file mode 100644
index 0000000..7687578
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vue_map.c
@@ -0,0 +1,162 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file brw_vue_map.c
+ *
+ * This file computes the "VUE map" for a (non-fragment) shader stage, which
+ * describes the layout of its output varyings.  The VUE map is used to match
+ * outputs from one stage with the inputs of the next.
+ *
+ * Largely, varyings can be placed however we like - producers/consumers simply
+ * have to agree on the layout.  However, there is also a "VUE Header" that
+ * prescribes a fixed layout for items that interact with fixed function
+ * hardware, such as the clipper and rasterizer.
+ *
+ * Authors:
+ *   Paul Berry <stereotype441@gmail.com>
+ *   Chris Forbes <chrisf@ijw.co.nz>
+ *   Eric Anholt <eric@anholt.net>
+ */
+
+
+#include "main/compiler.h"
+#include "brw_context.h"
+
+/**
+ * Append a varying to the VUE map, giving it the next free slot.
+ *
+ * Records the mapping in both directions (varying -> slot and slot ->
+ * varying) and increments vue_map->num_slots.  The varying must not already
+ * have a slot; this is checked with an assert.
+ */
+static inline void
+assign_vue_slot(struct brw_vue_map *vue_map, int varying)
+{
+   /* Make sure this varying hasn't been assigned a slot already */
+   assert (vue_map->varying_to_slot[varying] == -1);
+
+   vue_map->varying_to_slot[varying] = vue_map->num_slots;
+   vue_map->slot_to_varying[vue_map->num_slots++] = varying;
+}
+
+/**
+ * Compute the VUE map for a shader stage.
+ *
+ * @param devinfo      Device description; only devinfo->gen is read, to pick
+ *                     between the pre-Gen6 and the Gen6+ VUE header layout.
+ * @param vue_map      Output.  slots_valid, num_slots and both lookup tables
+ *                     are overwritten, so prior contents do not matter.
+ * @param slots_valid  Bitfield of the varyings to lay out.  Stored unmodified
+ *                     in vue_map->slots_valid; the layer and viewport bits
+ *                     are masked off before slots are assigned.
+ */
+void
+brw_compute_vue_map(const struct brw_device_info *devinfo,
+                    struct brw_vue_map *vue_map,
+                    GLbitfield64 slots_valid)
+{
+   vue_map->slots_valid = slots_valid;
+
+   /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
+    * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
+    */
+   slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+
+   /* Make sure that the values we store in vue_map->varying_to_slot and
+    * vue_map->slot_to_varying won't overflow the signed chars that are used
+    * to store them.  Note that since vue_map->slot_to_varying sometimes holds
+    * values equal to BRW_VARYING_SLOT_COUNT, we need to ensure that
+    * BRW_VARYING_SLOT_COUNT is <= 127, not 128.
+    */
+   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
+
+   vue_map->num_slots = 0;
+   for (int i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
+      vue_map->varying_to_slot[i] = -1;
+      vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_COUNT;
+   }
+
+   /* VUE header: format depends on chip generation and whether clipping is
+    * enabled.
+    *
+    * See the Sandybridge PRM, Volume 2 Part 1, section 1.5.1 (page 30),
+    * "Vertex URB Entry (VUE) Formats" which describes the VUE header layout.
+    */
+   if (devinfo->gen < 6) {
+      /* There are 8 dwords in VUE header pre-Ironlake:
+       * dword 0-3 is indices, point width, clip flags.
+       * dword 4-7 is ndc position
+       * dword 8-11 is the first vertex data.
+       *
+       * On Ironlake the VUE header is nominally 20 dwords, but the hardware
+       * will accept the same header layout as Gen4 [and should be a bit faster]
+       */
+      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ);
+      assign_vue_slot(vue_map, BRW_VARYING_SLOT_NDC);
+      assign_vue_slot(vue_map, VARYING_SLOT_POS);
+   } else {
+      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
+       * dword 0-3 of the header is indices, point width, clip flags.
+       * dword 4-7 is the 4D space position
+       * dword 8-15 of the vertex header is the user clip distance if
+       * enabled.
+       * dword 8-11 or 16-19 is the first vertex element data we fill.
+       */
+      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ);
+      assign_vue_slot(vue_map, VARYING_SLOT_POS);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0))
+         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST0);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1))
+         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST1);
+
+      /* front and back colors need to be consecutive so that we can use
+       * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING to swizzle them when doing
+       * two-sided color.
+       */
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL0))
+         assign_vue_slot(vue_map, VARYING_SLOT_COL0);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC0))
+         assign_vue_slot(vue_map, VARYING_SLOT_BFC0);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL1))
+         assign_vue_slot(vue_map, VARYING_SLOT_COL1);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC1))
+         assign_vue_slot(vue_map, VARYING_SLOT_BFC1);
+   }
+
+   /* The hardware doesn't care about the rest of the vertex outputs, so just
+    * assign them contiguously.  Don't reassign outputs that already have a
+    * slot.
+    *
+    * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
+    * since it's encoded as the clip distances by emit_clip_distances().
+    * However, it may be output by transform feedback, and we'd rather not
+    * recompute state when TF changes, so we just always include it.
+    */
+   for (int i = 0; i < VARYING_SLOT_MAX; ++i) {
+      if ((slots_valid & BITFIELD64_BIT(i)) &&
+          vue_map->varying_to_slot[i] == -1) {
+         assign_vue_slot(vue_map, i);
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 5496225..4619ce1 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -36,6 +36,7 @@
 #include "main/formats.h"
 #include "main/fbobject.h"
 #include "main/samplerobj.h"
+#include "main/framebuffer.h"
 #include "program/prog_parameter.h"
 #include "program/program.h"
 #include "intel_mipmap_tree.h"
@@ -462,7 +463,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    GLuint lookup = 0;
    GLuint line_aa;
    bool program_uses_dfdy = fp->program.UsesDFdy;
-   bool multisample_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisample_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    memset(key, 0, sizeof(*key));
 
@@ -561,7 +562,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
     * drawable height in order to invert the Y axis.
     */
    if (fp->program.Base.InputsRead & VARYING_BIT_POS) {
-      key->drawable_height = ctx->DrawBuffer->Height;
+      key->drawable_height = _mesa_geometric_height(ctx->DrawBuffer);
    }
 
    if ((fp->program.Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
@@ -580,7 +581,7 @@ static void brw_wm_populate_key( struct brw_context *brw,
    key->persample_shading =
       _mesa_get_min_invocations_per_fragment(ctx, &fp->program, true) > 1;
    if (key->persample_shading)
-      key->persample_2x = ctx->DrawBuffer->Visual.samples == 2;
+      key->persample_2x = _mesa_geometric_samples(ctx->DrawBuffer) == 2;
 
    key->compute_pos_offset =
       _mesa_get_min_invocations_per_fragment(ctx, &fp->program, false) > 1 &&
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 160dd2f..72aad96 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -35,6 +35,7 @@
 #include "main/mtypes.h"
 #include "main/samplerobj.h"
 #include "program/prog_parameter.h"
+#include "main/framebuffer.h"
 
 #include "intel_mipmap_tree.h"
 #include "intel_batchbuffer.h"
@@ -738,6 +739,9 @@ brw_update_renderbuffer_surfaces(struct brw_context *brw,
                                  uint32_t *surf_offset)
 {
    GLuint i;
+   const unsigned int w = _mesa_geometric_width(fb);
+   const unsigned int h = _mesa_geometric_height(fb);
+   const unsigned int s = _mesa_geometric_samples(fb);
 
    /* Update surfaces for drawing buffers */
    if (fb->_NumColorDrawBuffers >= 1) {
@@ -748,17 +752,15 @@ brw_update_renderbuffer_surfaces(struct brw_context *brw,
             surf_offset[surf_index] = 
                brw->vtbl.update_renderbuffer_surface(
                   brw, fb->_ColorDrawBuffers[i],
-                  fb->MaxNumLayers > 0, i, surf_index);
+                  _mesa_geometric_layers(fb) > 0, i, surf_index);
 	 } else {
-            brw->vtbl.emit_null_surface_state(
-               brw, fb->Width, fb->Height, fb->Visual.samples,
+            brw->vtbl.emit_null_surface_state(brw, w, h, s,
                &surf_offset[surf_index]);
 	 }
       }
    } else {
       const uint32_t surf_index = render_target_start;
-      brw->vtbl.emit_null_surface_state(
-         brw, fb->Width, fb->Height, fb->Visual.samples,
+      brw->vtbl.emit_null_surface_state(brw, w, h, s,
          &surf_offset[surf_index]);
    }
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index aaf90df..9a29366 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -31,6 +31,7 @@
 #include "brw_util.h"
 #include "intel_batchbuffer.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 
 static void
 upload_clip_state(struct brw_context *brw)
@@ -145,11 +146,14 @@ upload_clip_state(struct brw_context *brw)
     * the viewport, so we can ignore this restriction.
     */
    if (brw->gen < 8) {
+      const float fb_width = (float)_mesa_geometric_width(fb);
+      const float fb_height = (float)_mesa_geometric_height(fb);
+
       for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
          if (ctx->ViewportArray[i].X != 0 ||
              ctx->ViewportArray[i].Y != 0 ||
-             ctx->ViewportArray[i].Width != (float) fb->Width ||
-             ctx->ViewportArray[i].Height != (float) fb->Height) {
+             ctx->ViewportArray[i].Width != fb_width ||
+             ctx->ViewportArray[i].Height != fb_height) {
             dw2 &= ~GEN6_CLIP_GB_TEST;
             break;
          }
@@ -179,7 +183,7 @@ upload_clip_state(struct brw_context *brw)
 	     dw2);
    OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
              U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
-             (fb->MaxNumLayers > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) |
+             (_mesa_geometric_layers(fb) > 0 ? 0 : GEN6_CLIP_FORCE_ZERO_RTAINDEX) |
              ((ctx->Const.MaxViewports - 1) & GEN6_CLIP_MAX_VP_INDEX_MASK));
    ADVANCE_BATCH();
 }
diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
index 28f23c9..27254eb 100644
--- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
+++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.h
@@ -35,12 +35,13 @@ namespace brw {
 class gen6_gs_visitor : public vec4_gs_visitor
 {
 public:
-   gen6_gs_visitor(struct brw_context *brw,
+   gen6_gs_visitor(const struct brw_compiler *comp,
                    struct brw_gs_compile *c,
                    struct gl_shader_program *prog,
                    void *mem_ctx,
-                   bool no_spills) :
-      vec4_gs_visitor(brw, c, prog, mem_ctx, no_spills) {}
+                   bool no_spills,
+                   int shader_time_index) :
+      vec4_gs_visitor(comp, c, prog, mem_ctx, no_spills, shader_time_index) {}
 
 protected:
    virtual void assign_binding_table_offsets();
diff --git a/src/mesa/drivers/dri/i965/gen6_multisample_state.c b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
index ec46479..36734f5 100644
--- a/src/mesa/drivers/dri/i965/gen6_multisample_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_multisample_state.c
@@ -26,6 +26,7 @@
 #include "brw_context.h"
 #include "brw_defines.h"
 #include "brw_multisample_state.h"
+#include "main/framebuffer.h"
 
 void
 gen6_get_sample_position(struct gl_context *ctx,
@@ -34,7 +35,7 @@ gen6_get_sample_position(struct gl_context *ctx,
 {
    uint8_t bits;
 
-   switch (fb->Visual.samples) {
+   switch (_mesa_geometric_samples(fb)) {
    case 1:
       result[0] = result[1] = 0.5f;
       return;
diff --git a/src/mesa/drivers/dri/i965/gen6_queryobj.c b/src/mesa/drivers/dri/i965/gen6_queryobj.c
index 6431ed5..ba5c944 100644
--- a/src/mesa/drivers/dri/i965/gen6_queryobj.c
+++ b/src/mesa/drivers/dri/i965/gen6_queryobj.c
@@ -246,7 +246,7 @@ gen6_queryobj_get_results(struct gl_context *ctx,
        * and correctly emitted the number of pixel shader invocations, but,
        * whomever forgot to undo the multiply by 4.
        */
-      if (brw->gen >= 8 || brw->is_haswell)
+      if (brw->gen == 8 || brw->is_haswell)
          query->Base.Result /= 4;
       break;
 
diff --git a/src/mesa/drivers/dri/i965/gen6_scissor_state.c b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
index 0111f15..17b4a7f 100644
--- a/src/mesa/drivers/dri/i965/gen6_scissor_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_scissor_state.c
@@ -39,6 +39,8 @@ gen6_upload_scissor_state(struct brw_context *brw)
    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
    struct gen6_scissor_rect *scissor;
    uint32_t scissor_state_offset;
+   const unsigned int fb_width= _mesa_geometric_width(ctx->DrawBuffer);
+   const unsigned int fb_height = _mesa_geometric_height(ctx->DrawBuffer);
 
    scissor = brw_state_batch(brw, AUB_TRACE_SCISSOR_STATE,
 			     sizeof(*scissor) * ctx->Const.MaxViewports, 32,
@@ -56,7 +58,11 @@ gen6_upload_scissor_state(struct brw_context *brw)
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
       int bbox[4];
 
-      _mesa_scissor_bounding_box(ctx, ctx->DrawBuffer, i, bbox);
+      bbox[0] = 0;
+      bbox[1] = fb_width;
+      bbox[2] = 0;
+      bbox[3] = fb_height;
+      _mesa_intersect_scissor_bounding_box(ctx, i, bbox);
 
       if (bbox[0] == bbox[1] || bbox[2] == bbox[3]) {
          /* If the scissor was out of bounds and got clamped to 0 width/height
@@ -80,8 +86,8 @@ gen6_upload_scissor_state(struct brw_context *brw)
          /* memory: Y=0=top */
          scissor[i].xmin = bbox[0];
          scissor[i].xmax = bbox[1] - 1;
-         scissor[i].ymin = ctx->DrawBuffer->Height - bbox[3];
-         scissor[i].ymax = ctx->DrawBuffer->Height - bbox[2] - 1;
+         scissor[i].ymin = fb_height - bbox[3];
+         scissor[i].ymax = fb_height - bbox[2] - 1;
       }
    }
    BEGIN_BATCH(2);
diff --git a/src/mesa/drivers/dri/i965/gen6_sf_state.c b/src/mesa/drivers/dri/i965/gen6_sf_state.c
index e445ce2..b00517e 100644
--- a/src/mesa/drivers/dri/i965/gen6_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_sf_state.c
@@ -31,6 +31,7 @@
 #include "brw_util.h"
 #include "main/macros.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "intel_batchbuffer.h"
 
 /**
@@ -273,7 +274,7 @@ upload_sf_state(struct brw_context *brw)
    int i;
    /* _NEW_BUFFER */
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    const int urb_entry_read_offset = BRW_SF_URB_ENTRY_READ_OFFSET;
    float point_size;
@@ -361,31 +362,7 @@ upload_sf_state(struct brw_context *brw)
 
    /* _NEW_LINE */
    {
-      /* OpenGL dictates that line width should be rounded to the nearest
-       * integer
-       */
-      float line_width =
-         roundf(CLAMP(ctx->Line.Width, 0.0, ctx->Const.MaxLineWidth));
-      uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
-
-      /* Line width of 0 is not allowed when MSAA enabled */
-      if (ctx->Multisample._Enabled) {
-         if (line_width_u3_7 == 0)
-             line_width_u3_7 = 1;
-      } else if (ctx->Line.SmoothFlag && ctx->Line.Width < 1.5) {
-         /* For 1 pixel line thickness or less, the general
-          * anti-aliasing algorithm gives up, and a garbage line is
-          * generated.  Setting a Line Width of 0.0 specifies the
-          * rasterization of the "thinnest" (one-pixel-wide),
-          * non-antialiased lines.
-          *
-          * Lines rendered with zero Line Width are rasterized using
-          * Grid Intersection Quantization rules as specified by
-          * bspec section 6.3.12.1 Zero-Width (Cosmetic) Line
-          * Rasterization.
-          */
-         line_width_u3_7 = 0;
-      }
+      uint32_t line_width_u3_7 = brw_get_line_width(brw);
       dw3 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT;
    }
    if (ctx->Line.SmoothFlag) {
diff --git a/src/mesa/drivers/dri/i965/gen6_viewport_state.c b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
index 2fb0182..7c8d884 100644
--- a/src/mesa/drivers/dri/i965/gen6_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_viewport_state.c
@@ -30,6 +30,7 @@
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "main/viewport.h"
 
 /* The clip VP defines the guardband region where expensive clipping is skipped
@@ -93,10 +94,10 @@ gen6_upload_sf_vp(struct brw_context *brw)
    /* _NEW_BUFFERS */
    if (render_to_fbo) {
       y_scale = 1.0;
-      y_bias = 0;
+      y_bias = 0.0;
    } else {
       y_scale = -1.0;
-      y_bias = ctx->DrawBuffer->Height;
+      y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
diff --git a/src/mesa/drivers/dri/i965/gen6_wm_state.c b/src/mesa/drivers/dri/i965/gen6_wm_state.c
index 7081eb7..d1748ba 100644
--- a/src/mesa/drivers/dri/i965/gen6_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_wm_state.c
@@ -33,6 +33,7 @@
 #include "program/program.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
+#include "main/framebuffer.h"
 #include "intel_batchbuffer.h"
 
 static void
@@ -284,7 +285,7 @@ upload_wm_state(struct brw_context *brw)
    const struct brw_wm_prog_data *prog_data = brw->wm.prog_data;
 
    /* _NEW_BUFFERS */
-   const bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    /* In case of non 1x per sample shading, only one of SIMD8 and SIMD16
     * should be enabled. We do 'SIMD16 only' dispatch if a SIMD16 shader
diff --git a/src/mesa/drivers/dri/i965/gen7_gs_state.c b/src/mesa/drivers/dri/i965/gen7_gs_state.c
index e1c4f8b..8d6d3fe 100644
--- a/src/mesa/drivers/dri/i965/gen7_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_gs_state.c
@@ -112,7 +112,7 @@ upload_gs_state(struct brw_context *brw)
           GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
          ((brw->gs.prog_data->invocations - 1) <<
           GEN7_GS_INSTANCE_CONTROL_SHIFT) |
-         brw->gs.prog_data->dispatch_mode |
+         SET_FIELD(prog_data->dispatch_mode, GEN7_GS_DISPATCH_MODE) |
          GEN6_GS_STATISTICS_ENABLE |
          (brw->gs.prog_data->include_primitive_id ?
           GEN7_GS_INCLUDE_PRIMITIVE_ID : 0) |
diff --git a/src/mesa/drivers/dri/i965/gen7_sf_state.c b/src/mesa/drivers/dri/i965/gen7_sf_state.c
index 58e3337..4fa46a8 100644
--- a/src/mesa/drivers/dri/i965/gen7_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sf_state.c
@@ -27,6 +27,7 @@
 #include "brw_util.h"
 #include "main/macros.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "intel_batchbuffer.h"
 
 static void
@@ -109,7 +110,7 @@ upload_sf_state(struct brw_context *brw)
    float point_size;
    /* _NEW_BUFFERS */
    bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
-   bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    dw1 = GEN6_SF_STATISTICS_ENABLE;
 
@@ -192,30 +193,7 @@ upload_sf_state(struct brw_context *brw)
 
    /* _NEW_LINE */
    {
-      /* OpenGL dictates that line width should be rounded to the nearest
-       * integer
-       */
-      float line_width =
-         roundf(CLAMP(ctx->Line.Width, 0.0, ctx->Const.MaxLineWidth));
-      uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
-      /* Line width of 0 is not allowed when MSAA enabled */
-      if (ctx->Multisample._Enabled) {
-         if (line_width_u3_7 == 0)
-             line_width_u3_7 = 1;
-      } else if (ctx->Line.SmoothFlag && ctx->Line.Width < 1.5) {
-         /* For 1 pixel line thickness or less, the general
-          * anti-aliasing algorithm gives up, and a garbage line is
-          * generated.  Setting a Line Width of 0.0 specifies the
-          * rasterization of the "thinnest" (one-pixel-wide),
-          * non-antialiased lines.
-          *
-          * Lines rendered with zero Line Width are rasterized using
-          * Grid Intersection Quantization rules as specified by
-          * bspec section 6.3.12.1 Zero-Width (Cosmetic) Line
-          * Rasterization.
-          */
-         line_width_u3_7 = 0;
-      }
+      uint32_t line_width_u3_7 = brw_get_line_width(brw);
       dw2 |= line_width_u3_7 << GEN6_SF_LINE_WIDTH_SHIFT;
    }
    if (ctx->Line.SmoothFlag) {
diff --git a/src/mesa/drivers/dri/i965/gen7_viewport_state.c b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
index eb59684..b655205 100644
--- a/src/mesa/drivers/dri/i965/gen7_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_viewport_state.c
@@ -26,6 +26,7 @@
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "main/viewport.h"
 
 static void
@@ -45,10 +46,10 @@ gen7_upload_sf_clip_viewport(struct brw_context *brw)
    /* _NEW_BUFFERS */
    if (render_to_fbo) {
       y_scale = 1.0;
-      y_bias = 0;
+      y_bias = 0.0;
    } else {
       y_scale = -1.0;
-      y_bias = ctx->DrawBuffer->Height;
+      y_bias = (float)_mesa_geometric_height(ctx->DrawBuffer);
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
diff --git a/src/mesa/drivers/dri/i965/gen7_vs_state.c b/src/mesa/drivers/dri/i965/gen7_vs_state.c
index 278b3ec..4b17d06 100644
--- a/src/mesa/drivers/dri/i965/gen7_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_vs_state.c
@@ -43,18 +43,52 @@ gen7_upload_constant_state(struct brw_context *brw,
    int dwords = brw->gen >= 8 ? 11 : 7;
    BEGIN_BATCH(dwords);
    OUT_BATCH(opcode << 16 | (dwords - 2));
-   OUT_BATCH(active ? stage_state->push_const_size : 0);
-   OUT_BATCH(0);
+
+   /* Workaround for SKL+ (we use option #2 until we have a need for more
+    * constant buffers). This comes from the documentation for 3DSTATE_CONSTANT_*
+    *
+    * The driver must ensure the following case does not occur without a flush
+    * to the 3D engine: 3DSTATE_CONSTANT_* with buffer 3 read length equal to
+    * zero committed followed by a 3DSTATE_CONSTANT_* with buffer 0 read length
+    * not equal to zero committed. Possible ways to avoid this condition
+    * include:
+    *     1. always force buffer 3 to have a non zero read length
+    *     2. always force buffer 0 to a zero read length
+    */
+   if (brw->gen >= 9 && active) {
+      OUT_BATCH(0);
+      OUT_BATCH(stage_state->push_const_size);
+   } else {
+      OUT_BATCH(active ? stage_state->push_const_size : 0);
+      OUT_BATCH(0);
+   }
    /* Pointer to the constant buffer.  Covered by the set of state flags
     * from gen6_prepare_wm_contants
     */
-   OUT_BATCH(active ? (stage_state->push_const_offset | mocs) : 0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   OUT_BATCH(0);
-   if (brw->gen >= 8) {
+   if (brw->gen >= 9 && active) {
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      /* XXX: When using buffers other than 0, you need to specify the
+       * graphics virtual address regardless of INSPM/debug bits
+       */
+      OUT_RELOC64(brw->batch.bo, I915_GEM_DOMAIN_RENDER, 0,
+                  stage_state->push_const_offset);
       OUT_BATCH(0);
       OUT_BATCH(0);
+   } else if (brw->gen>= 8) {
+      OUT_BATCH(active ? (stage_state->push_const_offset | mocs) : 0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+      OUT_BATCH(0);
+   } else {
+      OUT_BATCH(active ? (stage_state->push_const_offset | mocs) : 0);
+      OUT_BATCH(0);
       OUT_BATCH(0);
       OUT_BATCH(0);
    }
diff --git a/src/mesa/drivers/dri/i965/gen7_wm_state.c b/src/mesa/drivers/dri/i965/gen7_wm_state.c
index b918275..ea11ae8 100644
--- a/src/mesa/drivers/dri/i965/gen7_wm_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_wm_state.c
@@ -30,6 +30,7 @@
 #include "program/program.h"
 #include "program/prog_parameter.h"
 #include "program/prog_statevars.h"
+#include "main/framebuffer.h"
 #include "intel_batchbuffer.h"
 
 static void
@@ -45,7 +46,7 @@ upload_wm_state(struct brw_context *brw)
    uint32_t dw1, dw2;
 
    /* _NEW_BUFFERS */
-   bool multisampled_fbo = ctx->DrawBuffer->Visual.samples > 1;
+   const bool multisampled_fbo = _mesa_geometric_samples(ctx->DrawBuffer) > 1;
 
    dw1 = dw2 = 0;
    dw1 |= GEN7_WM_STATISTICS_ENABLE;
@@ -76,6 +77,10 @@ upload_wm_state(struct brw_context *brw)
       dw1 |= GEN7_WM_KILL_ENABLE;
    }
 
+   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx)) {
+      dw1 |= GEN7_WM_DISPATCH_ENABLE;
+   }
+
    /* _NEW_BUFFERS | _NEW_COLOR */
    if (brw_color_buffer_write_enabled(brw) || writes_depth ||
        dw1 & GEN7_WM_KILL_ENABLE) {
diff --git a/src/mesa/drivers/dri/i965/gen8_depth_state.c b/src/mesa/drivers/dri/i965/gen8_depth_state.c
index b502650..12ac97a 100644
--- a/src/mesa/drivers/dri/i965/gen8_depth_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_depth_state.c
@@ -417,6 +417,16 @@ gen8_hiz_exec(struct brw_context *brw, struct intel_mipmap_tree *mt,
    uint32_t surface_width  = ALIGN(mt->logical_width0,  level == 0 ? 8 : 1);
    uint32_t surface_height = ALIGN(mt->logical_height0, level == 0 ? 4 : 1);
 
+   /* From the documentation for 3DSTATE_WM_HZ_OP: "3DSTATE_MULTISAMPLE packet
+    * must be used prior to this packet to change the Number of Multisamples.
+    * This packet must not be used to change Number of Multisamples in a
+    * rendering sequence."
+    */
+   if (brw->num_samples != mt->num_samples) {
+      gen8_emit_3dstate_multisample(brw, mt->num_samples);
+      brw->NewGLState |= _NEW_MULTISAMPLE;
+   }
+
    /* The basic algorithm is:
     * - If needed, emit 3DSTATE_{DEPTH,HIER_DEPTH,STENCIL}_BUFFER and
     *   3DSTATE_CLEAR_PARAMS packets to set up the relevant buffers.
diff --git a/src/mesa/drivers/dri/i965/gen8_gs_state.c b/src/mesa/drivers/dri/i965/gen8_gs_state.c
index 46b9713..26a02d3 100644
--- a/src/mesa/drivers/dri/i965/gen8_gs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_gs_state.c
@@ -48,8 +48,7 @@ gen8_upload_gs_state(struct brw_context *brw)
       OUT_BATCH(_3DSTATE_GS << 16 | (10 - 2));
       OUT_BATCH(stage_state->prog_offset);
       OUT_BATCH(0);
-      OUT_BATCH(GEN6_GS_VECTOR_MASK_ENABLE |
-                brw->geometry_program->VerticesIn |
+      OUT_BATCH(brw->geometry_program->VerticesIn |
                 ((ALIGN(stage_state->sampler_count, 4)/4) <<
                  GEN6_GS_SAMPLER_COUNT_SHIFT) |
                 ((prog_data->base.binding_table.size_bytes / 4) <<
@@ -59,10 +58,6 @@ gen8_upload_gs_state(struct brw_context *brw)
          OUT_RELOC64(stage_state->scratch_bo,
                      I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
                      ffs(brw->gs.prog_data->base.base.total_scratch) - 11);
-         WARN_ONCE(true,
-                   "May need to implement a temporary workaround: GS Number of "
-                   "URB Entries must be less than or equal to the GS Maximum "
-                   "Number of Threads.\n");
       } else {
          OUT_BATCH(0);
          OUT_BATCH(0);
@@ -81,7 +76,8 @@ gen8_upload_gs_state(struct brw_context *brw)
 
       uint32_t dw7 = (brw->gs.prog_data->control_data_header_size_hwords <<
                       GEN7_GS_CONTROL_DATA_HEADER_SIZE_SHIFT) |
-                      brw->gs.prog_data->dispatch_mode |
+                     SET_FIELD(prog_data->dispatch_mode,
+                               GEN7_GS_DISPATCH_MODE) |
                      ((brw->gs.prog_data->invocations - 1) <<
                       GEN7_GS_INSTANCE_CONTROL_SHIFT) |
                       GEN6_GS_STATISTICS_ENABLE |
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 85ad3b6..a88f109 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -58,6 +58,9 @@ gen8_upload_ps_extra(struct brw_context *brw,
    if (prog_data->uses_omask)
       dw1 |= GEN8_PSX_OMASK_TO_RENDER_TARGET;
 
+   if (_mesa_active_fragment_shader_has_atomic_ops(&brw->ctx))
+      dw1 |= GEN8_PSX_SHADER_HAS_UAV;
+
    BEGIN_BATCH(2);
    OUT_BATCH(_3DSTATE_PS_EXTRA << 16 | (2 - 2));
    OUT_BATCH(dw1);
@@ -72,7 +75,7 @@ upload_ps_extra(struct brw_context *brw)
       brw_fragment_program_const(brw->fragment_program);
    /* BRW_NEW_FS_PROG_DATA */
    const struct brw_wm_prog_data *prog_data = brw->wm.prog_data;
-   /* BRW_NEW_NUM_SAMPLES | _NEW_MULTISAMPLE */
+   /* BRW_NEW_NUM_SAMPLES */
    const bool multisampled_fbo = brw->num_samples > 1;
 
    gen8_upload_ps_extra(brw, &fp->program, prog_data, multisampled_fbo);
@@ -80,7 +83,7 @@ upload_ps_extra(struct brw_context *brw)
 
 const struct brw_tracked_state gen8_ps_extra = {
    .dirty = {
-      .mesa  = _NEW_MULTISAMPLE,
+      .mesa  = 0,
       .brw   = BRW_NEW_CONTEXT |
                BRW_NEW_FRAGMENT_PROGRAM |
                BRW_NEW_FS_PROG_DATA |
diff --git a/src/mesa/drivers/dri/i965/gen8_sf_state.c b/src/mesa/drivers/dri/i965/gen8_sf_state.c
index 52a21b6..c2b585d 100644
--- a/src/mesa/drivers/dri/i965/gen8_sf_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sf_state.c
@@ -154,14 +154,7 @@ upload_sf(struct brw_context *brw)
        dw1 |= GEN6_SF_VIEWPORT_TRANSFORM_ENABLE;
 
    /* _NEW_LINE */
-   /* OpenGL dictates that line width should be rounded to the nearest
-    * integer
-    */
-   float line_width =
-      roundf(CLAMP(ctx->Line.Width, 0.0, ctx->Const.MaxLineWidth));
-   uint32_t line_width_u3_7 = U_FIXED(line_width, 7);
-   if (line_width_u3_7 == 0)
-      line_width_u3_7 = 1;
+   uint32_t line_width_u3_7 = brw_get_line_width(brw);
    if (brw->gen >= 9 || brw->is_cherryview) {
       dw1 |= line_width_u3_7 << GEN9_SF_LINE_WIDTH_SHIFT;
    } else {
diff --git a/src/mesa/drivers/dri/i965/gen8_surface_state.c b/src/mesa/drivers/dri/i965/gen8_surface_state.c
index d0c2d80..b2d1a57 100644
--- a/src/mesa/drivers/dri/i965/gen8_surface_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_surface_state.c
@@ -57,6 +57,19 @@ swizzle_to_scs(unsigned swizzle)
 }
 
 static uint32_t
+surface_tiling_resource_mode(uint32_t tr_mode)
+{
+   /* Translate a miptree TRMODE into the GEN9 surface-state TRMODE;
+    * every value other than YF and YS yields TRMODE_NONE.
+    */
+   if (tr_mode == INTEL_MIPTREE_TRMODE_YF)
+      return GEN9_SURFACE_TRMODE_TILEYF;
+   if (tr_mode == INTEL_MIPTREE_TRMODE_YS)
+      return GEN9_SURFACE_TRMODE_TILEYS;
+   return GEN9_SURFACE_TRMODE_NONE;
+}
+
+static uint32_t
 surface_tiling_mode(uint32_t tiling)
 {
    switch (tiling) {
@@ -70,8 +83,18 @@ surface_tiling_mode(uint32_t tiling)
 }
 
 static unsigned
-vertical_alignment(const struct intel_mipmap_tree *mt)
+vertical_alignment(const struct brw_context *brw,
+                   const struct intel_mipmap_tree *mt,
+                   uint32_t surf_type)
 {
+   /* On Gen9+ vertical alignment is ignored for 1D surfaces and when
+    * tr_mode is not TRMODE_NONE.
+    */
+   if (brw->gen > 8 &&
+       (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
+        surf_type == BRW_SURFACE_1D))
+      return 0;
+
    switch (mt->align_h) {
    case 4:
       return GEN8_SURFACE_VALIGN_4;
@@ -85,8 +108,18 @@ vertical_alignment(const struct intel_mipmap_tree *mt)
 }
 
 static unsigned
-horizontal_alignment(const struct intel_mipmap_tree *mt)
+horizontal_alignment(const struct brw_context *brw,
+                     const struct intel_mipmap_tree *mt,
+                     uint32_t surf_type)
 {
+   /* On Gen9+ horizontal alignment is ignored when tr_mode is not
+    * TRMODE_NONE or when gen9_use_linear_1d_layout() holds.
+    */
+   if (brw->gen > 8 &&
+       (mt->tr_mode != INTEL_MIPTREE_TRMODE_NONE ||
+        gen9_use_linear_1d_layout(brw, mt)))
+      return 0;
+
    switch (mt->align_w) {
    case 4:
       return GEN8_SURFACE_HALIGN_4;
@@ -100,11 +133,11 @@ horizontal_alignment(const struct intel_mipmap_tree *mt)
 }
 
 static uint32_t *
-allocate_surface_state(struct brw_context *brw, uint32_t *out_offset)
+allocate_surface_state(struct brw_context *brw, uint32_t *out_offset, int index)
 {
    int dwords = brw->gen >= 9 ? 16 : 13;
-   uint32_t *surf = brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
-                                    dwords * 4, 64, out_offset);
+   uint32_t *surf = __brw_state_batch(brw, AUB_TRACE_SURFACE_STATE,
+                                      dwords * 4, 64, index, out_offset);
    memset(surf, 0, dwords * 4);
    return surf;
 }
@@ -120,7 +153,7 @@ gen8_emit_buffer_surface_state(struct brw_context *brw,
                                bool rw)
 {
    const unsigned mocs = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
-   uint32_t *surf = allocate_surface_state(brw, out_offset);
+   uint32_t *surf = allocate_surface_state(brw, out_offset, -1);
 
    surf[0] = BRW_SURFACE_BUFFER << BRW_SURFACE_TYPE_SHIFT |
              surface_format << BRW_SURFACE_FORMAT_SHIFT |
@@ -164,7 +197,9 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
    struct intel_mipmap_tree *aux_mt = NULL;
    uint32_t aux_mode = 0;
    uint32_t mocs_wb = brw->gen >= 9 ? SKL_MOCS_WB : BDW_MOCS_WB;
+   int surf_index = surf_offset - &brw->wm.base.surf_offset[0];
    unsigned tiling_mode, pitch;
+   const unsigned tr_mode = surface_tiling_resource_mode(mt->tr_mode);
 
    if (mt->format == MESA_FORMAT_S_UINT8) {
       tiling_mode = GEN8_SURFACE_TILING_W;
@@ -177,18 +212,29 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
    if (mt->mcs_mt) {
       aux_mt = mt->mcs_mt;
       aux_mode = GEN8_SURFACE_AUX_MODE_MCS;
+
+      /*
+       * From the BDW PRM, Volume 2d, page 260 (RENDER_SURFACE_STATE):
+       * "When MCS is enabled for non-MSRT, HALIGN_16 must be used"
+       *
+       * From the hardware spec for GEN9:
+       * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN
+       *  16 must be used."
+       */
+      assert(brw->gen < 9 || mt->align_w == 16);
+      assert(brw->gen < 8 || mt->num_samples > 1 || mt->align_w == 16);
    }
 
-   uint32_t *surf = allocate_surface_state(brw, surf_offset);
+   const uint32_t surf_type = translate_tex_target(target);
+   uint32_t *surf = allocate_surface_state(brw, surf_offset, surf_index);
 
-   surf[0] = translate_tex_target(target) << BRW_SURFACE_TYPE_SHIFT |
+   surf[0] = SET_FIELD(surf_type, BRW_SURFACE_TYPE) |
              format << BRW_SURFACE_FORMAT_SHIFT |
-             vertical_alignment(mt) |
-             horizontal_alignment(mt) |
+             vertical_alignment(brw, mt, surf_type) |
+             horizontal_alignment(brw, mt, surf_type) |
              tiling_mode;
 
-   if (target == GL_TEXTURE_CUBE_MAP ||
-       target == GL_TEXTURE_CUBE_MAP_ARRAY) {
+   if (surf_type == BRW_SURFACE_CUBE) {
       surf[0] |= BRW_SURFACE_CUBEFACE_ENABLES;
    }
 
@@ -209,6 +255,12 @@ gen8_emit_texture_surface_state(struct brw_context *brw,
    surf[5] = SET_FIELD(min_level - mt->first_level, GEN7_SURFACE_MIN_LOD) |
              (max_level - min_level - 1); /* mip count */
 
+   if (brw->gen >= 9) {
+      surf[5] |= SET_FIELD(tr_mode, GEN9_SURFACE_TRMODE);
+      /* Disable Mip Tail by setting a large value. */
+      surf[5] |= SET_FIELD(15, GEN9_SURFACE_MIP_TAIL_START_LOD);
+   }
+
    if (aux_mt) {
       surf[6] = SET_FIELD(mt->qpitch / 4, GEN8_SURFACE_AUX_QPITCH) |
                 SET_FIELD((aux_mt->pitch / 128) - 1, GEN8_SURFACE_AUX_PITCH) |
@@ -310,7 +362,7 @@ gen8_emit_null_surface_state(struct brw_context *brw,
                              unsigned samples,
                              uint32_t *out_offset)
 {
-   uint32_t *surf = allocate_surface_state(brw, out_offset);
+   uint32_t *surf = allocate_surface_state(brw, out_offset, -1);
 
    surf[0] = BRW_SURFACE_NULL << BRW_SURFACE_TYPE_SHIFT |
              BRW_SURFACEFORMAT_B8G8R8A8_UNORM << BRW_SURFACE_FORMAT_SHIFT |
@@ -339,6 +391,7 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
    unsigned height = mt->logical_height0;
    unsigned pitch = mt->pitch;
    uint32_t tiling = mt->tiling;
+   unsigned tr_mode = surface_tiling_resource_mode(mt->tr_mode);
    uint32_t format = 0;
    uint32_t surf_type;
    uint32_t offset;
@@ -390,15 +443,26 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
    if (mt->mcs_mt) {
       aux_mt = mt->mcs_mt;
       aux_mode = GEN8_SURFACE_AUX_MODE_MCS;
+
+      /*
+       * From the BDW PRM, Volume 2d, page 260 (RENDER_SURFACE_STATE):
+       * "When MCS is enabled for non-MSRT, HALIGN_16 must be used"
+       *
+       * From the hardware spec for GEN9:
+       * "When Auxiliary Surface Mode is set to AUX_CCS_D or AUX_CCS_E, HALIGN
+       *  16 must be used."
+       */
+      assert(brw->gen < 9 || mt->align_w == 16);
+      assert(brw->gen < 8 || mt->num_samples > 1 || mt->align_w == 16);
    }
 
-   uint32_t *surf = allocate_surface_state(brw, &offset);
+   uint32_t *surf = allocate_surface_state(brw, &offset, surf_index);
 
    surf[0] = (surf_type << BRW_SURFACE_TYPE_SHIFT) |
              (is_array ? GEN7_SURFACE_IS_ARRAY : 0) |
              (format << BRW_SURFACE_FORMAT_SHIFT) |
-             vertical_alignment(mt) |
-             horizontal_alignment(mt) |
+             vertical_alignment(brw, mt, surf_type) |
+             horizontal_alignment(brw, mt, surf_type) |
              surface_tiling_mode(tiling);
 
    surf[1] = SET_FIELD(mocs, GEN8_SURFACE_MOCS) | mt->qpitch >> 2;
@@ -417,6 +481,12 @@ gen8_update_renderbuffer_surface(struct brw_context *brw,
 
    surf[5] = irb->mt_level - irb->mt->first_level;
 
+   if (brw->gen >= 9) {
+      surf[5] |= SET_FIELD(tr_mode, GEN9_SURFACE_TRMODE);
+      /* Disable Mip Tail by setting a large value. */
+      surf[5] |= SET_FIELD(15, GEN9_SURFACE_MIP_TAIL_START_LOD);
+   }
+
    if (aux_mt) {
       surf[6] = SET_FIELD(mt->qpitch / 4, GEN8_SURFACE_AUX_QPITCH) |
                 SET_FIELD((aux_mt->pitch / 128) - 1, GEN8_SURFACE_AUX_PITCH) |
diff --git a/src/mesa/drivers/dri/i965/gen8_viewport_state.c b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
index 322e466..2d8eeb1 100644
--- a/src/mesa/drivers/dri/i965/gen8_viewport_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_viewport_state.c
@@ -26,6 +26,7 @@
 #include "brw_defines.h"
 #include "intel_batchbuffer.h"
 #include "main/fbobject.h"
+#include "main/framebuffer.h"
 #include "main/viewport.h"
 
 static void
@@ -33,6 +34,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
 {
    struct gl_context *ctx = &brw->ctx;
    float y_scale, y_bias;
+   const float fb_height = (float)_mesa_geometric_height(ctx->DrawBuffer);
    const bool render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
 
    float *vp = brw_state_batch(brw, AUB_TRACE_SF_VP_STATE,
@@ -47,7 +49,7 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
       y_bias = 0;
    } else {
       y_scale = -1.0;
-      y_bias = ctx->DrawBuffer->Height;
+      y_bias = fb_height;
    }
 
    for (unsigned i = 0; i < ctx->Const.MaxViewports; i++) {
@@ -116,8 +118,8 @@ gen8_upload_sf_clip_viewport(struct brw_context *brw)
       } else {
          vp[12] = ctx->ViewportArray[i].X;
          vp[13] = viewport_Xmax - 1;
-         vp[14] = ctx->DrawBuffer->Height - viewport_Ymax;
-         vp[15] = ctx->DrawBuffer->Height - ctx->ViewportArray[i].Y - 1;
+         vp[14] = fb_height - viewport_Ymax;
+         vp[15] = fb_height - ctx->ViewportArray[i].Y - 1;
       }
 
       vp += 16;
diff --git a/src/mesa/drivers/dri/i965/gen8_vs_state.c b/src/mesa/drivers/dri/i965/gen8_vs_state.c
index f92af55..28f5add 100644
--- a/src/mesa/drivers/dri/i965/gen8_vs_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_vs_state.c
@@ -39,6 +39,9 @@ upload_vs_state(struct brw_context *brw)
    /* BRW_NEW_VS_PROG_DATA */
    const struct brw_vue_prog_data *prog_data = &brw->vs.prog_data->base;
 
+   assert(prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ||
+          prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT);
+
    if (prog_data->base.use_alt_mode)
       floating_point_mode = GEN6_VS_FLOATING_POINT_MODE_ALT;
 
@@ -66,7 +69,8 @@ upload_vs_state(struct brw_context *brw)
              (prog_data->urb_read_length << GEN6_VS_URB_READ_LENGTH_SHIFT) |
              (0 << GEN6_VS_URB_ENTRY_READ_OFFSET_SHIFT));
 
-   uint32_t simd8_enable = prog_data->simd8 ? GEN8_VS_SIMD8_ENABLE : 0;
+   uint32_t simd8_enable = prog_data->dispatch_mode == DISPATCH_MODE_SIMD8 ?
+      GEN8_VS_SIMD8_ENABLE : 0;
    OUT_BATCH(((brw->max_vs_threads - 1) << HSW_VS_MAX_THREADS_SHIFT) |
              GEN6_VS_STATISTICS_ENABLE |
              simd8_enable |
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index e522e4e..ed659ed 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -743,27 +743,54 @@ intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
    brw_render_cache_set_clear(brw);
 }
 
-void
-brw_load_register_mem(struct brw_context *brw,
-                      uint32_t reg,
-                      drm_intel_bo *bo,
-                      uint32_t read_domains, uint32_t write_domain,
-                      uint32_t offset)
+static void
+load_sized_register_mem(struct brw_context *brw,
+                        uint32_t reg,
+                        drm_intel_bo *bo,
+                        uint32_t read_domains, uint32_t write_domain,
+                        uint32_t offset,
+                        int size)
 {
+   int i;
+
    /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
    assert(brw->gen >= 7);
 
    if (brw->gen >= 8) {
-      BEGIN_BATCH(4);
-      OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
-      OUT_BATCH(reg);
-      OUT_RELOC64(bo, read_domains, write_domain, offset);
+      BEGIN_BATCH(4 * size);
+      for (i = 0; i < size; i++) {
+         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
+         OUT_BATCH(reg + i * 4);
+         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
+      }
       ADVANCE_BATCH();
    } else {
-      BEGIN_BATCH(3);
-      OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
-      OUT_BATCH(reg);
-      OUT_RELOC(bo, read_domains, write_domain, offset);
+      BEGIN_BATCH(3 * size);
+      for (i = 0; i < size; i++) {
+         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
+         OUT_BATCH(reg + i * 4);
+         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
+      }
       ADVANCE_BATCH();
    }
 }
+
+void
+brw_load_register_mem(struct brw_context *brw,
+                      uint32_t reg, drm_intel_bo *bo,
+                      uint32_t read_domains, uint32_t write_domain,
+                      uint32_t offset)
+{
+   /* size 1: one MI_LOAD_REGISTER_MEM command for reg at bo + offset. */
+   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
+}
+
+void
+brw_load_register_mem64(struct brw_context *brw,
+                        uint32_t reg, drm_intel_bo *bo,
+                        uint32_t read_domains, uint32_t write_domain,
+                        uint32_t offset)
+{
+   /* size 2: two commands, for reg and reg + 4 at offset and offset + 4. */
+   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
+}
diff --git a/src/mesa/drivers/dri/i965/intel_blit.c b/src/mesa/drivers/dri/i965/intel_blit.c
index 7680a40..d3ab769 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.c
+++ b/src/mesa/drivers/dri/i965/intel_blit.c
@@ -77,13 +77,10 @@ br13_for_cpp(int cpp)
    switch (cpp) {
    case 4:
       return BR13_8888;
-      break;
    case 2:
       return BR13_565;
-      break;
    case 1:
       return BR13_8;
-      break;
    default:
       unreachable("not reached");
    }
@@ -130,6 +127,40 @@ set_blitter_tiling(struct brw_context *brw,
       ADVANCE_BATCH();                                                  \
    } while (0)
 
+/* Pitch in the blitter's units: bytes when linear, dwords when tiled. */
+static int
+blt_pitch(struct intel_mipmap_tree *mt)
+{
+   const int pitch_bytes = mt->pitch;
+
+   return mt->tiling ? pitch_bytes / 4 : pitch_bytes;
+}
+
+bool
+intel_miptree_blit_compatible_formats(mesa_format src, mesa_format dst)
+{
+   /* Callers must pass linear formats: the BLT cannot convert sRGB. */
+   assert(src == _mesa_get_srgb_format_linear(src));
+   assert(dst == _mesa_get_srgb_format_linear(dst));
+   /* Same format is a plain copy; otherwise only A<->X within one channel
+    * order works (alpha discarded for A->X, filled with 0xff for X->A).
+    */
+   if (src == dst)
+      return true;
+   switch (src) {
+   case MESA_FORMAT_B8G8R8A8_UNORM:
+   case MESA_FORMAT_B8G8R8X8_UNORM:
+      return dst == MESA_FORMAT_B8G8R8A8_UNORM ||
+             dst == MESA_FORMAT_B8G8R8X8_UNORM;
+   case MESA_FORMAT_R8G8B8A8_UNORM:
+   case MESA_FORMAT_R8G8B8X8_UNORM:
+      return dst == MESA_FORMAT_R8G8B8A8_UNORM ||
+             dst == MESA_FORMAT_R8G8B8X8_UNORM;
+   default:
+      return false;
+   }
+}
+
 /**
  * Implements a rectangular block transfer (blit) of pixels between two
  * miptrees.
@@ -172,11 +203,7 @@ intel_miptree_blit(struct brw_context *brw,
     * the X channel don't matter), and XRGB8888 to ARGB8888 by setting the A
     * channel to 1.0 at the end.
     */
-   if (src_format != dst_format &&
-      ((src_format != MESA_FORMAT_B8G8R8A8_UNORM &&
-        src_format != MESA_FORMAT_B8G8R8X8_UNORM) ||
-       (dst_format != MESA_FORMAT_B8G8R8A8_UNORM &&
-        dst_format != MESA_FORMAT_B8G8R8X8_UNORM))) {
+   if (!intel_miptree_blit_compatible_formats(src_format, dst_format)) {
       perf_debug("%s: Can't use hardware blitter from %s to %s, "
                  "falling back.\n", __func__,
                  _mesa_get_format_name(src_format),
@@ -197,14 +224,14 @@ intel_miptree_blit(struct brw_context *brw,
     *
     * Furthermore, intelEmitCopyBlit (which is called below) uses a signed
     * 16-bit integer to represent buffer pitch, so it can only handle buffer
-    * pitches < 32k.
+    * pitches < 32k. However, the pitch is measured in bytes for linear buffers
+    * and dwords for tiled buffers.
     *
     * As a result of these two limitations, we can only use the blitter to do
-    * this copy when the miptree's pitch is less than 32k.
+    * this copy when the miptree's pitch is less than 32k linear or 128k tiled.
     */
-   if (src_mt->pitch >= 32768 ||
-       dst_mt->pitch >= 32768) {
-      perf_debug("Falling back due to >=32k pitch\n");
+   if (blt_pitch(src_mt) >= 32768 || blt_pitch(dst_mt) >= 32768) {
+      perf_debug("Falling back due to >= 32k/128k pitch\n");
       return false;
    }
 
@@ -261,8 +288,9 @@ intel_miptree_blit(struct brw_context *brw,
       return false;
    }
 
-   if (src_mt->format == MESA_FORMAT_B8G8R8X8_UNORM &&
-       dst_mt->format == MESA_FORMAT_B8G8R8A8_UNORM) {
+   /* XXX This could be done in a single pass using XY_FULL_MONO_PATTERN_BLT */
+   if (_mesa_get_format_bits(src_format, GL_ALPHA_BITS) == 0 &&
+       _mesa_get_format_bits(dst_format, GL_ALPHA_BITS) > 0) {
       intel_miptree_set_alpha_to_one(brw, dst_mt,
                                      dst_x, dst_y,
                                      width, height);
diff --git a/src/mesa/drivers/dri/i965/intel_blit.h b/src/mesa/drivers/dri/i965/intel_blit.h
index f563939..2287c37 100644
--- a/src/mesa/drivers/dri/i965/intel_blit.h
+++ b/src/mesa/drivers/dri/i965/intel_blit.h
@@ -46,6 +46,8 @@ intelEmitCopyBlit(struct brw_context *brw,
                               GLshort w, GLshort h,
 			      GLenum logicop );
 
+bool intel_miptree_blit_compatible_formats(mesa_format src, mesa_format dst);
+
 bool intel_miptree_blit(struct brw_context *brw,
                         struct intel_mipmap_tree *src_mt,
                         int src_level, int src_slice,
diff --git a/src/mesa/drivers/dri/i965/intel_debug.c b/src/mesa/drivers/dri/i965/intel_debug.c
index 33a0348..75cf785 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.c
+++ b/src/mesa/drivers/dri/i965/intel_debug.c
@@ -88,25 +88,22 @@ intel_debug_flag_for_shader_stage(gl_shader_stage stage)
 }
 
 void
-brw_process_intel_debug_variable(struct brw_context *brw)
+brw_process_intel_debug_variable(struct intel_screen *screen)
 {
    uint64_t intel_debug = driParseDebugString(getenv("INTEL_DEBUG"), debug_control);
    (void) p_atomic_cmpxchg(&INTEL_DEBUG, 0, intel_debug);
 
    if (INTEL_DEBUG & DEBUG_BUFMGR)
-      dri_bufmgr_set_debug(brw->bufmgr, true);
+      dri_bufmgr_set_debug(screen->bufmgr, true);
 
-   if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && brw->gen < 7) {
+   if ((INTEL_DEBUG & DEBUG_SHADER_TIME) && screen->devinfo->gen < 7) {
       fprintf(stderr,
               "shader_time debugging requires gen7 (Ivybridge) or better.\n");
       INTEL_DEBUG &= ~DEBUG_SHADER_TIME;
    }
 
-   if (INTEL_DEBUG & DEBUG_PERF)
-      brw->perf_debug = true;
-
    if (INTEL_DEBUG & DEBUG_AUB)
-      drm_intel_bufmgr_gem_set_aub_dump(brw->bufmgr, true);
+      drm_intel_bufmgr_gem_set_aub_dump(screen->bufmgr, true);
 }
 
 /**
diff --git a/src/mesa/drivers/dri/i965/intel_debug.h b/src/mesa/drivers/dri/i965/intel_debug.h
index f754be2..4689492 100644
--- a/src/mesa/drivers/dri/i965/intel_debug.h
+++ b/src/mesa/drivers/dri/i965/intel_debug.h
@@ -114,8 +114,8 @@ extern uint64_t INTEL_DEBUG;
 
 extern uint64_t intel_debug_flag_for_shader_stage(gl_shader_stage stage);
 
-struct brw_context;
+struct intel_screen;
 
-extern void brw_process_intel_debug_variable(struct brw_context *brw);
+extern void brw_process_intel_debug_variable(struct intel_screen *);
 
 extern bool brw_env_var_as_boolean(const char *var_name, bool default_value);
diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c
index d6da34c..c99677c 100644
--- a/src/mesa/drivers/dri/i965/intel_extensions.c
+++ b/src/mesa/drivers/dri/i965/intel_extensions.c
@@ -323,9 +323,12 @@ intelInitExtensions(struct gl_context *ctx)
       }
    }
 
+   brw->predicate.supported = false;
+
    if (brw->gen >= 7) {
       ctx->Extensions.ARB_conservative_depth = true;
       ctx->Extensions.ARB_derivative_control = true;
+      ctx->Extensions.ARB_framebuffer_no_attachments = true;
       ctx->Extensions.ARB_gpu_shader5 = true;
       ctx->Extensions.ARB_shader_atomic_counters = true;
       ctx->Extensions.ARB_texture_compression_bptc = true;
@@ -337,6 +340,9 @@ intelInitExtensions(struct gl_context *ctx)
          ctx->Extensions.ARB_transform_feedback2 = true;
          ctx->Extensions.ARB_transform_feedback3 = true;
          ctx->Extensions.ARB_transform_feedback_instanced = true;
+
+         if (brw->intelScreen->cmd_parser_version >= 2)
+            brw->predicate.supported = true;
       }
 
       /* Only enable this in core profile because other parts of Mesa behave
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index aebed72..1b3a72f 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -390,7 +390,7 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
                                          image->height,
                                          1,
                                          image->pitch,
-                                         true /*disable_aux_buffers*/);
+                                         MIPTREE_LAYOUT_DISABLE_AUX);
    if (!irb->mt)
       return;
 
@@ -1027,10 +1027,9 @@ intel_renderbuffer_move_to_temp(struct brw_context *brw,
                                  intel_image->base.Base.Level,
                                  intel_image->base.Base.Level,
                                  width, height, depth,
-                                 true,
                                  irb->mt->num_samples,
                                  INTEL_MIPTREE_TILING_ANY,
-                                 false);
+                                 MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
 
    if (intel_miptree_wants_hiz_buffer(brw, new_mt)) {
       intel_miptree_alloc_hiz(brw, new_mt);
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index 24a5c3d..6aa969a 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -158,15 +158,32 @@ intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
    }
 }
 
+bool
+intel_tiling_supports_non_msrt_mcs(struct brw_context *brw, unsigned tiling)
+{
+   /* The Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render Target(s)",
+    * under the "Fast Color Clear" bullet (p326), states:
+    *
+    *     - Support is limited to tiled render targets.
+    *
+    * Gen9 narrows that to Y-tiled surfaces; before Gen7 nothing qualifies.
+    */
+   if (brw->gen < 7)
+      return false;
+   if (brw->gen >= 9)
+      return tiling == I915_TILING_Y;
+
+   return tiling != I915_TILING_NONE;
+}
 
 /**
  * For a single-sampled render target ("non-MSRT"), determine if an MCS buffer
- * can be used.
+ * can be used. This doesn't (and should not) inspect any of the properties of
+ * the miptree's BO.
  *
  * From the Ivy Bridge PRM, Vol2 Part1 11.7 "MCS Buffer for Render Target(s)",
  * beneath the "Fast Color Clear" bullet (p326):
  *
- *     - Support is limited to tiled render targets.
  *     - Support is for non-mip-mapped and non-array surface types only.
  *
  * And then later, on p327:
@@ -175,8 +192,8 @@ intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
  *       64bpp, and 128bpp.
  */
 bool
-intel_is_non_msrt_mcs_buffer_supported(struct brw_context *brw,
-                                       struct intel_mipmap_tree *mt)
+intel_miptree_is_fast_clear_capable(struct brw_context *brw,
+                                    struct intel_mipmap_tree *mt)
 {
    /* MCS support does not exist prior to Gen7 */
    if (brw->gen < 7)
@@ -193,15 +210,25 @@ intel_is_non_msrt_mcs_buffer_supported(struct brw_context *brw,
       return false;
    }
 
-   if (mt->tiling != I915_TILING_X &&
-       mt->tiling != I915_TILING_Y)
-      return false;
    if (mt->cpp != 4 && mt->cpp != 8 && mt->cpp != 16)
       return false;
-   if (mt->first_level != 0 || mt->last_level != 0)
+   if (mt->first_level != 0 || mt->last_level != 0) {
+      if (brw->gen >= 8) {
+         perf_debug("Multi-LOD fast clear - giving up (%dx%dx%d).\n",
+                    mt->logical_width0, mt->logical_height0, mt->last_level);
+      }
+
       return false;
-   if (mt->physical_depth0 != 1)
+   }
+   if (mt->physical_depth0 != 1) {
+      if (brw->gen >= 8) {
+         perf_debug("Layered fast clear - giving up. (%dx%dx%d)\n",
+                    mt->logical_width0, mt->logical_height0,
+                    mt->physical_depth0);
+      }
+      /* Surfaces with more than one layer never get a non-MSRT MCS. */
       return false;
+   }
 
    /* There's no point in using an MCS buffer if the surface isn't in a
     * renderable format.
@@ -244,10 +271,9 @@ intel_miptree_create_layout(struct brw_context *brw,
                             GLuint width0,
                             GLuint height0,
                             GLuint depth0,
-                            bool for_bo,
                             GLuint num_samples,
-                            bool force_all_slices_at_each_lod,
-                            bool disable_aux_buffers)
+                            enum intel_miptree_tiling_mode requested,
+                            uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
    if (!mt)
@@ -286,7 +312,7 @@ intel_miptree_create_layout(struct brw_context *brw,
    mt->logical_height0 = height0;
    mt->logical_depth0 = depth0;
    mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_NO_MCS;
-   mt->disable_aux_buffers = disable_aux_buffers;
+   mt->disable_aux_buffers = (layout_flags & MIPTREE_LAYOUT_DISABLE_AUX) != 0;
    exec_list_make_empty(&mt->hiz_map);
 
    /* The cpp is bytes per (1, blockheight)-sized block for compressed
@@ -422,12 +448,15 @@ intel_miptree_create_layout(struct brw_context *brw,
    mt->physical_height0 = height0;
    mt->physical_depth0 = depth0;
 
-   if (!for_bo &&
+   if (!(layout_flags & MIPTREE_LAYOUT_FOR_BO) &&
        _mesa_get_format_base_format(format) == GL_DEPTH_STENCIL &&
        (brw->must_use_separate_stencil ||
 	(brw->has_separate_stencil &&
          intel_miptree_wants_hiz_buffer(brw, mt)))) {
-      const bool force_all_slices_at_each_lod = brw->gen == 6;
+      uint32_t stencil_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
+      if (brw->gen == 6)
+         stencil_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD;
+
       mt->stencil_mt = intel_miptree_create(brw,
                                             mt->target,
                                             MESA_FORMAT_S_UINT8,
@@ -436,10 +465,10 @@ intel_miptree_create_layout(struct brw_context *brw,
                                             mt->logical_width0,
                                             mt->logical_height0,
                                             mt->logical_depth0,
-                                            true,
                                             num_samples,
                                             INTEL_MIPTREE_TILING_ANY,
-                                            force_all_slices_at_each_lod);
+                                            stencil_flags);
+
       if (!mt->stencil_mt) {
 	 intel_miptree_release(&mt);
 	 return NULL;
@@ -457,119 +486,36 @@ intel_miptree_create_layout(struct brw_context *brw,
       }
    }
 
-   if (force_all_slices_at_each_lod)
+   if (layout_flags & MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD)
       mt->array_layout = ALL_SLICES_AT_EACH_LOD;
 
-   brw_miptree_layout(brw, mt);
-
-   if (mt->disable_aux_buffers)
-      assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS);
-
-   return mt;
-}
-
-/**
- * \brief Helper function for intel_miptree_create().
- */
-static uint32_t
-intel_miptree_choose_tiling(struct brw_context *brw,
-                            mesa_format format,
-                            uint32_t width0,
-                            uint32_t num_samples,
-                            enum intel_miptree_tiling_mode requested,
-                            struct intel_mipmap_tree *mt)
-{
-   if (format == MESA_FORMAT_S_UINT8) {
-      /* The stencil buffer is W tiled. However, we request from the kernel a
-       * non-tiled buffer because the GTT is incapable of W fencing.
-       */
-      return I915_TILING_NONE;
-   }
-
-   /* Some usages may want only one type of tiling, like depth miptrees (Y
-    * tiled), or temporary BOs for uploading data once (linear).
-    */
-   switch (requested) {
-   case INTEL_MIPTREE_TILING_ANY:
-      break;
-   case INTEL_MIPTREE_TILING_Y:
-      return I915_TILING_Y;
-   case INTEL_MIPTREE_TILING_NONE:
-      return I915_TILING_NONE;
-   }
-
-   if (num_samples > 1) {
-      /* From p82 of the Sandy Bridge PRM, dw3[1] of SURFACE_STATE ("Tiled
-       * Surface"):
-       *
-       *   [DevSNB+]: For multi-sample render targets, this field must be
-       *   1. MSRTs can only be tiled.
-       *
-       * Our usual reason for preferring X tiling (fast blits using the
-       * blitting engine) doesn't apply to MSAA, since we'll generally be
-       * downsampling or upsampling when blitting between the MSAA buffer
-       * and another buffer, and the blitting engine doesn't support that.
-       * So use Y tiling, since it makes better use of the cache.
-       */
-      return I915_TILING_Y;
-   }
-
-   GLenum base_format = _mesa_get_format_base_format(format);
-   if (base_format == GL_DEPTH_COMPONENT ||
-       base_format == GL_DEPTH_STENCIL_EXT)
-      return I915_TILING_Y;
-
-   /* 1D textures (and 1D array textures) don't get any benefit from tiling,
-    * in fact it leads to a less efficient use of memory space and bandwidth
-    * due to tile alignment.
+   /*
+    * Obey HALIGN_16 constraints for Gen8 and Gen9 buffers which are
+    * multisampled or have an AUX buffer attached to it.
+    *
+    * GEN  |    MSRT        | AUX_CCS_* or AUX_MCS
+    *  -------------------------------------------
+    *  9   |  HALIGN_16     |    HALIGN_16
+    *  8   |  HALIGN_ANY    |    HALIGN_16
+    *  7   |      ?         |        ?
+    *  6   |      ?         |        ?
     */
-   if (mt->logical_height0 == 1)
-      return I915_TILING_NONE;
-
-   int minimum_pitch = mt->total_width * mt->cpp;
-
-   /* If the width is much smaller than a tile, don't bother tiling. */
-   if (minimum_pitch < 64)
-      return I915_TILING_NONE;
-
-   if (ALIGN(minimum_pitch, 512) >= 32768 ||
-       mt->total_width >= 32768 || mt->total_height >= 32768) {
-      perf_debug("%dx%d miptree too large to blit, falling back to untiled",
-                 mt->total_width, mt->total_height);
-      return I915_TILING_NONE;
+   if (intel_miptree_is_fast_clear_capable(brw, mt)) {
+      if (brw->gen >= 9 || (brw->gen == 8 && num_samples <= 1))
+         layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
+   } else if (brw->gen >= 9 && num_samples > 1) {
+      layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
+   } else {
+      /* For now, nothing else has this requirement */
+      assert((layout_flags & MIPTREE_LAYOUT_FORCE_HALIGN16) == 0);
    }
 
-   /* Pre-gen6 doesn't have BLORP to handle Y-tiling, so use X-tiling. */
-   if (brw->gen < 6)
-      return I915_TILING_X;
+   brw_miptree_layout(brw, mt, requested, layout_flags);
 
-   /* From the Sandybridge PRM, Volume 1, Part 2, page 32:
-    * "NOTE: 128BPE Format Color Buffer ( render target ) MUST be either TileX
-    *  or Linear."
-    * 128 bits per pixel translates to 16 bytes per pixel. This is necessary
-    * all the way back to 965, but is permitted on Gen7+.
-    */
-   if (brw->gen < 7 && mt->cpp >= 16)
-      return I915_TILING_X;
-
-   /* From the Ivy Bridge PRM, Vol4 Part1 2.12.2.1 (SURFACE_STATE for most
-    * messages), on p64, under the heading "Surface Vertical Alignment":
-    *
-    *     This field must be set to VALIGN_4 for all tiled Y Render Target
-    *     surfaces.
-    *
-    * So if the surface is renderable and uses a vertical alignment of 2,
-    * force it to be X tiled.  This is somewhat conservative (it's possible
-    * that the client won't ever render to this surface), but it's difficult
-    * to know that ahead of time.  And besides, since we use a vertical
-    * alignment of 4 as often as we can, this shouldn't happen very often.
-    */
-   if (brw->gen == 7 && mt->align_h == 2 &&
-       brw->format_supported_as_render_target[format]) {
-      return I915_TILING_X;
-   }
+   if (mt->disable_aux_buffers)
+      assert(mt->msaa_layout != INTEL_MSAA_LAYOUT_CMS);
 
-   return I915_TILING_Y | I915_TILING_X;
+   return mt;
 }
 
 
@@ -615,33 +561,33 @@ intel_lower_compressed_format(struct brw_context *brw, mesa_format format)
 
 struct intel_mipmap_tree *
 intel_miptree_create(struct brw_context *brw,
-		     GLenum target,
-		     mesa_format format,
-		     GLuint first_level,
-		     GLuint last_level,
-		     GLuint width0,
-		     GLuint height0,
-		     GLuint depth0,
-		     bool expect_accelerated_upload,
+                     GLenum target,
+                     mesa_format format,
+                     GLuint first_level,
+                     GLuint last_level,
+                     GLuint width0,
+                     GLuint height0,
+                     GLuint depth0,
                      GLuint num_samples,
                      enum intel_miptree_tiling_mode requested_tiling,
-                     bool force_all_slices_at_each_lod)
+                     uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt;
    mesa_format tex_format = format;
    mesa_format etc_format = MESA_FORMAT_NONE;
    GLuint total_width, total_height;
+   uint32_t alloc_flags = 0;
 
    format = intel_lower_compressed_format(brw, format);
 
    etc_format = (format != tex_format) ? tex_format : MESA_FORMAT_NONE;
 
+   assert((layout_flags & MIPTREE_LAYOUT_DISABLE_AUX) == 0);
+   assert((layout_flags & MIPTREE_LAYOUT_FOR_BO) == 0);
    mt = intel_miptree_create_layout(brw, target, format,
-				      first_level, last_level, width0,
-				      height0, depth0,
-                                    false, num_samples,
-                                    force_all_slices_at_each_lod,
-                                    false /*disable_aux_buffers*/);
+                                    first_level, last_level, width0,
+                                    height0, depth0, num_samples,
+                                    requested_tiling, layout_flags);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -659,25 +605,21 @@ intel_miptree_create(struct brw_context *brw,
       total_height = ALIGN(total_height, 64);
    }
 
-   uint32_t tiling = intel_miptree_choose_tiling(brw, format, width0,
-                                                 num_samples, requested_tiling,
-                                                 mt);
    bool y_or_x = false;
 
-   if (tiling == (I915_TILING_Y | I915_TILING_X)) {
+   if (mt->tiling == (I915_TILING_Y | I915_TILING_X)) {
       y_or_x = true;
       mt->tiling = I915_TILING_Y;
-   } else {
-      mt->tiling = tiling;
    }
 
+   if (layout_flags & MIPTREE_LAYOUT_ACCELERATED_UPLOAD)
+      alloc_flags |= BO_ALLOC_FOR_RENDER;
+
    unsigned long pitch;
+   mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree", total_width,
+                                     total_height, mt->cpp, &mt->tiling,
+                                     &pitch, alloc_flags);
    mt->etc_format = etc_format;
-   mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
-                                     total_width, total_height, mt->cpp,
-                                     &mt->tiling, &pitch,
-                                     (expect_accelerated_upload ?
-                                      BO_ALLOC_FOR_RENDER : 0));
    mt->pitch = pitch;
 
    /* If the BO is too large to fit in the aperture, we need to use the
@@ -691,10 +633,8 @@ intel_miptree_create(struct brw_context *brw,
       mt->tiling = I915_TILING_X;
       drm_intel_bo_unreference(mt->bo);
       mt->bo = drm_intel_bo_alloc_tiled(brw->bufmgr, "miptree",
-                                        total_width, total_height, mt->cpp,
-                                        &mt->tiling, &pitch,
-                                        (expect_accelerated_upload ?
-                                         BO_ALLOC_FOR_RENDER : 0));
+                                  total_width, total_height, mt->cpp,
+                                  &mt->tiling, &pitch, alloc_flags);
       mt->pitch = pitch;
    }
 
@@ -707,6 +647,7 @@ intel_miptree_create(struct brw_context *brw,
 
 
    if (mt->msaa_layout == INTEL_MSAA_LAYOUT_CMS) {
+      assert(mt->num_samples > 1);
       if (!intel_miptree_alloc_mcs(brw, mt, num_samples)) {
          intel_miptree_release(&mt);
          return NULL;
@@ -718,8 +659,11 @@ intel_miptree_create(struct brw_context *brw,
     * Allocation of the MCS miptree will be deferred until the first fast
     * clear actually occurs.
     */
-   if (intel_is_non_msrt_mcs_buffer_supported(brw, mt))
+   if (intel_tiling_supports_non_msrt_mcs(brw, mt->tiling) &&
+       intel_miptree_is_fast_clear_capable(brw, mt)) {
       mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
+      assert(brw->gen < 8 || mt->align_w == 16 || num_samples <= 1);
+   }
 
    return mt;
 }
@@ -733,7 +677,7 @@ intel_miptree_create_for_bo(struct brw_context *brw,
                             uint32_t height,
                             uint32_t depth,
                             int pitch,
-                            bool disable_aux_buffers)
+                            uint32_t layout_flags)
 {
    struct intel_mipmap_tree *mt;
    uint32_t tiling, swizzle;
@@ -754,11 +698,18 @@ intel_miptree_create_for_bo(struct brw_context *brw,
 
    target = depth > 1 ? GL_TEXTURE_2D_ARRAY : GL_TEXTURE_2D;
 
+   /* The 'requested' parameter of intel_miptree_create_layout() only
+    * matters for miptrees that are not created from an existing BO; the
+    * tiling for this BO has already been determined above. The
+    * INTEL_MIPTREE_TILING_ANY passed below is therefore just a
+    * placeholder and does not change the miptree's tiling.
+    */
+   layout_flags |= MIPTREE_LAYOUT_FOR_BO;
    mt = intel_miptree_create_layout(brw, target, format,
                                     0, 0,
-                                    width, height, depth,
-                                    true, 0, false,
-                                    disable_aux_buffers);
+                                    width, height, depth, 0,
+                                    INTEL_MIPTREE_TILING_ANY,
+                                    layout_flags);
    if (!mt)
       return NULL;
 
@@ -808,7 +759,7 @@ intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
                                                  height,
                                                  1,
                                                  pitch,
-                                                 false);
+                                                 0);
    if (!singlesample_mt)
       goto fail;
 
@@ -817,7 +768,8 @@ intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
     * Allocation of the MCS miptree will be deferred until the first fast
     * clear actually occurs.
     */
-   if (intel_is_non_msrt_mcs_buffer_supported(intel, singlesample_mt))
+   if (intel_tiling_supports_non_msrt_mcs(intel, singlesample_mt->tiling) &&
+       intel_miptree_is_fast_clear_capable(intel, singlesample_mt))
       singlesample_mt->fast_clear_state = INTEL_FAST_CLEAR_STATE_RESOLVED;
 
    if (num_samples == 0) {
@@ -866,8 +818,9 @@ intel_miptree_create_for_renderbuffer(struct brw_context *brw,
    GLenum target = num_samples > 1 ? GL_TEXTURE_2D_MULTISAMPLE : GL_TEXTURE_2D;
 
    mt = intel_miptree_create(brw, target, format, 0, 0,
-			     width, height, depth, true, num_samples,
-                             INTEL_MIPTREE_TILING_ANY, false);
+                             width, height, depth, num_samples,
+                             INTEL_MIPTREE_TILING_ANY,
+                             MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
    if (!mt)
       goto fail;
 
@@ -1258,8 +1211,10 @@ intel_miptree_copy_slice(struct brw_context *brw,
    assert(src_mt->format == dst_mt->format);
 
    if (dst_mt->compressed) {
-      height = ALIGN(height, dst_mt->align_h) / dst_mt->align_h;
-      width = ALIGN(width, dst_mt->align_w);
+      unsigned int i, j;
+      _mesa_get_format_block_size(dst_mt->format, &i, &j);
+      height = ALIGN(height, j) / j;
+      width = ALIGN(width, i);
    }
 
    /* If it's a packed depth/stencil buffer with separate stencil, the blit
@@ -1378,10 +1333,9 @@ intel_miptree_alloc_mcs(struct brw_context *brw,
                                      mt->logical_width0,
                                      mt->logical_height0,
                                      mt->logical_depth0,
-                                     true,
                                      0 /* num_samples */,
                                      INTEL_MIPTREE_TILING_Y,
-                                     false);
+                                     MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
 
    /* From the Ivy Bridge PRM, Vol 2 Part 1 p326:
     *
@@ -1429,6 +1383,9 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
    unsigned mcs_height =
       ALIGN(mt->logical_height0, height_divisor) / height_divisor;
    assert(mt->logical_depth0 == 1);
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
+   if (brw->gen >= 8)
+      layout_flags |= MIPTREE_LAYOUT_FORCE_HALIGN16;
    mt->mcs_mt = intel_miptree_create(brw,
                                      mt->target,
                                      format,
@@ -1437,10 +1394,9 @@ intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
                                      mcs_width,
                                      mcs_height,
                                      mt->logical_depth0,
-                                     true,
                                      0 /* num_samples */,
                                      INTEL_MIPTREE_TILING_Y,
-                                     false);
+                                     layout_flags);
 
    return mt->mcs_mt;
 }
@@ -1682,7 +1638,10 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
                              struct intel_mipmap_tree *mt)
 {
    struct intel_miptree_aux_buffer *buf = calloc(sizeof(*buf), 1);
-   const bool force_all_slices_at_each_lod = brw->gen == 6;
+   uint32_t layout_flags = MIPTREE_LAYOUT_ACCELERATED_UPLOAD;
+
+   if (brw->gen == 6)
+      layout_flags |= MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD;
 
    if (!buf)
       return NULL;
@@ -1695,10 +1654,9 @@ intel_hiz_miptree_buf_create(struct brw_context *brw,
                                   mt->logical_width0,
                                   mt->logical_height0,
                                   mt->logical_depth0,
-                                  true,
                                   mt->num_samples,
                                   INTEL_MIPTREE_TILING_ANY,
-                                  force_all_slices_at_each_lod);
+                                  layout_flags);
    if (!buf->mt) {
       free(buf);
       return NULL;
@@ -2128,9 +2086,8 @@ intel_miptree_map_blit(struct brw_context *brw,
    map->mt = intel_miptree_create(brw, GL_TEXTURE_2D, mt->format,
                                   0, 0,
                                   map->w, map->h, 1,
-                                  false, 0,
-                                  INTEL_MIPTREE_TILING_NONE,
-                                  false);
+                                  0, INTEL_MIPTREE_TILING_NONE, 0);
+
    if (!map->mt) {
       fprintf(stderr, "Failed to allocate blit temporary\n");
       goto fail;
@@ -2675,7 +2632,9 @@ intel_miptree_map(struct brw_context *brw,
    } else if (use_intel_mipree_map_blit(brw, mt, mode, level, slice)) {
       intel_miptree_map_blit(brw, mt, map, level, slice);
 #if defined(USE_SSE41)
-   } else if (!(mode & GL_MAP_WRITE_BIT) && !mt->compressed && cpu_has_sse4_1) {
+   } else if (!(mode & GL_MAP_WRITE_BIT) &&
+              !mt->compressed && cpu_has_sse4_1 &&
+              (mt->pitch % 16 == 0)) {
       intel_miptree_map_movntdqa(brw, mt, map, level, slice);
 #endif
    } else {
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 8b42e4a..bde6daa 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -330,6 +330,13 @@ struct intel_miptree_aux_buffer
    struct intel_mipmap_tree *mt; /**< hiz miptree used with Gen6 */
 };
 
+/* Tile resource modes */
+enum intel_miptree_tr_mode {
+   INTEL_MIPTREE_TRMODE_NONE,
+   INTEL_MIPTREE_TRMODE_YF,
+   INTEL_MIPTREE_TRMODE_YS
+};
+
 struct intel_mipmap_tree
 {
    /** Buffer object containing the pixel data. */
@@ -338,6 +345,7 @@ struct intel_mipmap_tree
    uint32_t pitch; /**< pitch in bytes. */
 
    uint32_t tiling; /**< One of the I915_TILING_* flags */
+   enum intel_miptree_tr_mode tr_mode;
 
    /* Effectively the key:
     */
@@ -514,19 +522,27 @@ enum intel_miptree_tiling_mode {
    INTEL_MIPTREE_TILING_NONE,
 };
 
-bool
-intel_is_non_msrt_mcs_buffer_supported(struct brw_context *brw,
-                                       struct intel_mipmap_tree *mt);
-
 void
 intel_get_non_msrt_mcs_alignment(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt,
                                  unsigned *width_px, unsigned *height);
-
+bool
+intel_tiling_supports_non_msrt_mcs(struct brw_context *brw, unsigned tiling);
+bool
+intel_miptree_is_fast_clear_capable(struct brw_context *brw,
+                                    struct intel_mipmap_tree *mt);
 bool
 intel_miptree_alloc_non_msrt_mcs(struct brw_context *brw,
                                  struct intel_mipmap_tree *mt);
 
+enum { /* bit flags for the uint32_t flags / layout_flags miptree arguments */
+   MIPTREE_LAYOUT_ACCELERATED_UPLOAD       = 1 << 0, /* replaces bool expect_accelerated_upload */
+   MIPTREE_LAYOUT_FORCE_ALL_SLICE_AT_LOD   = 1 << 1, /* presumably replaces bool force_all_slices_at_each_lod -- confirm */
+   MIPTREE_LAYOUT_FOR_BO                   = 1 << 2,
+   MIPTREE_LAYOUT_DISABLE_AUX              = 1 << 3, /* replaces bool disable_aux_buffers */
+   MIPTREE_LAYOUT_FORCE_HALIGN16           = 1 << 4,
+};
+
 struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLenum target,
 					       mesa_format format,
@@ -535,10 +551,9 @@ struct intel_mipmap_tree *intel_miptree_create(struct brw_context *brw,
                                                GLuint width0,
                                                GLuint height0,
                                                GLuint depth0,
-					       bool expect_accelerated_upload,
                                                GLuint num_samples,
                                                enum intel_miptree_tiling_mode,
-                                               bool force_all_slices_at_each_lod);
+                                               uint32_t flags);
 
 struct intel_mipmap_tree *
 intel_miptree_create_for_bo(struct brw_context *brw,
@@ -549,7 +564,7 @@ intel_miptree_create_for_bo(struct brw_context *brw,
                             uint32_t height,
                             uint32_t depth,
                             int pitch,
-                            bool disable_aux_buffers);
+                            uint32_t layout_flags);
 
 void
 intel_update_winsys_renderbuffer_miptree(struct brw_context *intel,
@@ -753,7 +768,11 @@ brw_miptree_get_vertical_slice_pitch(const struct brw_context *brw,
                                      const struct intel_mipmap_tree *mt,
                                      unsigned level);
 
-void brw_miptree_layout(struct brw_context *brw, struct intel_mipmap_tree *mt);
+void
+brw_miptree_layout(struct brw_context *brw,
+                   struct intel_mipmap_tree *mt,
+                   enum intel_miptree_tiling_mode requested,
+                   uint32_t layout_flags);
 
 void *intel_miptree_map_raw(struct brw_context *brw,
                             struct intel_mipmap_tree *mt);
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_draw.c b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
index 4ecefc8..6c6bd86 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
@@ -28,6 +28,7 @@
 #include "main/glheader.h"
 #include "main/enums.h"
 #include "main/image.h"
+#include "main/glformats.h"
 #include "main/mtypes.h"
 #include "main/condrender.h"
 #include "main/fbobject.h"
@@ -76,8 +77,16 @@ do_blit_drawpixels(struct gl_context * ctx,
    struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[0];
    struct intel_renderbuffer *irb = intel_renderbuffer(rb);
 
-   if (!_mesa_format_matches_format_and_type(irb->mt->format, format, type,
-                                             false)) {
+   mesa_format src_format = _mesa_format_from_format_and_type(format, type);
+   if (_mesa_format_is_mesa_array_format(src_format))
+      src_format = _mesa_format_from_array_format(src_format);
+   mesa_format dst_format = irb->mt->format;
+
+   /* We can safely discard sRGB encode/decode for the DrawPixels interface */
+   src_format = _mesa_get_srgb_format_linear(src_format);
+   dst_format = _mesa_get_srgb_format_linear(dst_format);
+
+   if (!intel_miptree_blit_compatible_formats(src_format, dst_format)) {
       DBG("%s: bad format for blit\n", __func__);
       return false;
    }
@@ -112,7 +121,7 @@ do_blit_drawpixels(struct gl_context * ctx,
                                   src_offset,
                                   width, height, 1,
                                   src_stride,
-                                  false /*disable_aux_buffers*/);
+                                  0);
    if (!pbo_mt)
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index d3ca38b..3038057 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -226,8 +226,30 @@ intelReadPixels(struct gl_context * ctx,
 
    if (_mesa_is_bufferobj(pack->BufferObj)) {
       if (_mesa_meta_pbo_GetTexSubImage(ctx, 2, NULL, x, y, 0, width, height, 1,
-                                        format, type, pixels, pack))
+                                        format, type, pixels, pack)) {
+         /* _mesa_meta_pbo_GetTexSubImage() implements PBO transfers by
+          * binding the user-provided BO as a fake framebuffer and rendering
+          * to it.  This breaks the invariant of the GL that nothing is able
+          * to render to a BO, causing nondeterministic corruption issues
+          * because the render cache is not coherent with a number of other
+          * caches that the BO could potentially be bound to afterwards.
+          *
+          * This could be solved in the same way that we guarantee texture
+          * coherency after a texture is attached to a framebuffer and
+          * rendered to, but that would involve checking *all* BOs bound to
+          * the pipeline for the case we need to emit a cache flush due to
+          * previous rendering to any of them -- including vertex, index,
+          * uniform, atomic counter, shader image, transform feedback,
+          * indirect draw buffers, etc.
+          *
+          * That would increase the per-draw call overhead even though it's
+          * very unlikely that any of the BOs bound to the pipeline has been
+          * rendered to via a PBO at any point, so it seems better to just
+          * flush here unconditionally.
+          */
+         intel_batchbuffer_emit_mi_flush(brw);
          return;
+      }
 
       perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
    }
diff --git a/src/mesa/drivers/dri/i965/intel_reg.h b/src/mesa/drivers/dri/i965/intel_reg.h
index 488fb5b..bd14e18 100644
--- a/src/mesa/drivers/dri/i965/intel_reg.h
+++ b/src/mesa/drivers/dri/i965/intel_reg.h
@@ -48,6 +48,20 @@
 #define GEN7_MI_LOAD_REGISTER_MEM	(CMD_MI | (0x29 << 23))
 # define MI_LOAD_REGISTER_MEM_USE_GGTT		(1 << 22)
 
+/* Manipulate the predicate bit based on some register values. Only on Gen7+ */
+#define GEN7_MI_PREDICATE		(CMD_MI | (0xC << 23))
+# define MI_PREDICATE_LOADOP_KEEP		(0 << 6)
+# define MI_PREDICATE_LOADOP_LOAD		(2 << 6)
+# define MI_PREDICATE_LOADOP_LOADINV		(3 << 6)
+# define MI_PREDICATE_COMBINEOP_SET		(0 << 3)
+# define MI_PREDICATE_COMBINEOP_AND		(1 << 3)
+# define MI_PREDICATE_COMBINEOP_OR		(2 << 3)
+# define MI_PREDICATE_COMBINEOP_XOR		(3 << 3)
+# define MI_PREDICATE_COMPAREOP_TRUE		(0 << 0)
+# define MI_PREDICATE_COMPAREOP_FALSE		(1 << 0)
+# define MI_PREDICATE_COMPAREOP_SRCS_EQUAL	(2 << 0)
+# define MI_PREDICATE_COMPAREOP_DELTAS_EQUAL	(3 << 0)
+
 /** @{
  *
  * PIPE_CONTROL operation, a combination MI_FLUSH and register write with
@@ -69,6 +83,7 @@
 #define PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE	(1 << 10) /* GM45+ only */
 #define PIPE_CONTROL_ISP_DIS		(1 << 9)
 #define PIPE_CONTROL_INTERRUPT_ENABLE	(1 << 8)
+#define PIPE_CONTROL_FLUSH_ENABLE	(1 << 7) /* Gen7+ only */
 /* GT */
 #define PIPE_CONTROL_DATA_CACHE_INVALIDATE	(1 << 5)
 #define PIPE_CONTROL_VF_CACHE_INVALIDATE	(1 << 4)
@@ -147,3 +162,11 @@
 # define GEN9_PARTIAL_RESOLVE_DISABLE_IN_VC (1 << 1)
 # define GEN8_HIZ_PMA_MASK_BITS \
    ((GEN8_HIZ_NP_PMA_FIX_ENABLE | GEN8_HIZ_NP_EARLY_Z_FAILS_DISABLE) << 16)
+
+/* Predicate registers */
+#define MI_PREDICATE_SRC0               0x2400
+#define MI_PREDICATE_SRC1               0x2408
+#define MI_PREDICATE_DATA               0x2410
+#define MI_PREDICATE_RESULT             0x2418
+#define MI_PREDICATE_RESULT_1           0x241C
+#define MI_PREDICATE_RESULT_2           0x2214
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index 4860a16..de14696 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -39,6 +39,7 @@
 #include "swrast/s_renderbuffer.h"
 #include "util/ralloc.h"
 #include "brw_shader.h"
+#include "glsl/nir/nir.h"
 
 #include "utils.h"
 #include "xmlpool.h"
@@ -1372,6 +1373,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
    if (!intelScreen->devinfo)
       return false;
 
+   brw_process_intel_debug_variable(intelScreen);
+
    intelScreen->hw_must_use_separate_stencil = intelScreen->devinfo->gen >= 7;
 
    intelScreen->hw_has_swizzling = intel_detect_swizzling(intelScreen);
@@ -1407,6 +1410,13 @@ __DRIconfig **intelInitScreen2(__DRIscreen *psp)
          (ret != -1 || errno != EINVAL);
    }
 
+   struct drm_i915_getparam getparam;
+   getparam.param = I915_PARAM_CMD_PARSER_VERSION;
+   getparam.value = &intelScreen->cmd_parser_version;
+   const int ret = drmIoctl(psp->fd, DRM_IOCTL_I915_GETPARAM, &getparam);
+   if (ret == -1)
+      intelScreen->cmd_parser_version = 0;
+
    psp->extensions = !intelScreen->has_context_reset_notification
       ? intelScreenExtensions : intelRobustScreenExtensions;
 
diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h
index e7a1490..742b3d3 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.h
+++ b/src/mesa/drivers/dri/i965/intel_screen.h
@@ -72,7 +72,13 @@ struct intel_screen
    * Configuration cache with default values for all contexts
    */
    driOptionCache optionCache;
-};
+
+   /**
+    * Version of the command parser reported by the
+    * I915_PARAM_CMD_PARSER_VERSION parameter
+    */
+   int cmd_parser_version;
+ };
 
 extern void intelDestroyContext(__DRIcontext * driContextPriv);
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index 777a682..b0181ad 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -93,7 +93,7 @@ intel_alloc_texture_image_buffer(struct gl_context *ctx,
    } else {
       intel_image->mt = intel_miptree_create_for_teximage(brw, intel_texobj,
                                                           intel_image,
-                                                          false);
+                                                          0);
 
       /* Even if the object currently has a mipmap tree associated
        * with it, this one is a more likely candidate to represent the
@@ -144,10 +144,8 @@ intel_alloc_texture_storage(struct gl_context *ctx,
                                               first_image->TexFormat,
                                               0, levels - 1,
                                               width, height, depth,
-                                              false, /* expect_accelerated */
                                               num_samples,
-                                              INTEL_MIPTREE_TILING_ANY,
-                                              false);
+                                              INTEL_MIPTREE_TILING_ANY, 0);
 
       if (intel_texobj->mt == NULL) {
          return false;
@@ -341,7 +339,7 @@ intel_set_texture_storage_for_buffer_object(struct gl_context *ctx,
                                   buffer_offset,
                                   image->Width, image->Height, image->Depth,
                                   row_stride,
-                                  false /*disable_aux_buffers*/);
+                                  0);
    if (!intel_texobj->mt)
       return false;
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex.h b/src/mesa/drivers/dri/i965/intel_tex.h
index f048e84..402a389 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.h
+++ b/src/mesa/drivers/dri/i965/intel_tex.h
@@ -53,7 +53,7 @@ struct intel_mipmap_tree *
 intel_miptree_create_for_teximage(struct brw_context *brw,
 				  struct intel_texture_object *intelObj,
 				  struct intel_texture_image *intelImage,
-				  bool expect_accelerated_upload);
+                                  uint32_t layout_flags);
 
 GLuint intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit);
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c
index 7952ee5..ebe84b6 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_image.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_image.c
@@ -36,7 +36,7 @@ struct intel_mipmap_tree *
 intel_miptree_create_for_teximage(struct brw_context *brw,
 				  struct intel_texture_object *intelObj,
 				  struct intel_texture_image *intelImage,
-				  bool expect_accelerated_upload)
+                                  uint32_t layout_flags)
 {
    GLuint lastLevel;
    int width, height, depth;
@@ -79,10 +79,9 @@ intel_miptree_create_for_teximage(struct brw_context *brw,
 			       width,
 			       height,
 			       depth,
-			       expect_accelerated_upload,
                                intelImage->base.Base.NumSamples,
                                INTEL_MIPTREE_TILING_ANY,
-                               false);
+                               layout_flags);
 }
 
 static void
@@ -155,7 +154,7 @@ intel_set_texture_image_bo(struct gl_context *ctx,
                            GLuint width, GLuint height,
                            GLuint pitch,
                            GLuint tile_x, GLuint tile_y,
-                           bool disable_aux_buffers)
+                           uint32_t layout_flags)
 {
    struct brw_context *brw = brw_context(ctx);
    struct intel_texture_image *intel_image = intel_texture_image(image);
@@ -171,7 +170,7 @@ intel_set_texture_image_bo(struct gl_context *ctx,
 
    intel_image->mt = intel_miptree_create_for_bo(brw, bo, image->TexFormat,
                                                  0, width, height, 1, pitch,
-                                                 disable_aux_buffers);
+                                                 layout_flags);
    if (intel_image->mt == NULL)
        return;
    intel_image->mt->target = target;
@@ -255,8 +254,7 @@ intelSetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
                               rb->Base.Base.Width,
                               rb->Base.Base.Height,
                               rb->mt->pitch,
-                              0, 0,
-                              false /*disable_aux_buffers*/);
+                              0, 0, 0);
    _mesa_unlock_texture(&brw->ctx, texObj);
 }
 
@@ -349,7 +347,7 @@ intel_image_target_texture_2d(struct gl_context *ctx, GLenum target,
                               image->width,  image->height,
                               image->pitch,
                               image->tile_x, image->tile_y,
-                              true /*disable_aux_buffers*/);
+                              MIPTREE_LAYOUT_DISABLE_AUX);
 }
 
 /**
@@ -486,8 +484,15 @@ intel_get_tex_image(struct gl_context *ctx,
       if (_mesa_meta_pbo_GetTexSubImage(ctx, 3, texImage, 0, 0, 0,
                                         texImage->Width, texImage->Height,
                                         texImage->Depth, format, type,
-                                        pixels, &ctx->Pack))
+                                        pixels, &ctx->Pack)) {
+         /* Flush to guarantee coherency between the render cache and other
+          * caches the PBO could potentially be bound to after this point.
+          * See the related comment in intelReadPixels() for a more detailed
+          * explanation.
+          */
+         intel_batchbuffer_emit_mi_flush(brw);
          return;
+      }
 
       perf_debug("%s: fallback to CPU mapping in PBO case\n", __func__);
    }
diff --git a/src/mesa/drivers/dri/i965/intel_tex_validate.c b/src/mesa/drivers/dri/i965/intel_tex_validate.c
index 1d82768..4991c29 100644
--- a/src/mesa/drivers/dri/i965/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/i965/intel_tex_validate.c
@@ -47,8 +47,10 @@ intel_update_max_level(struct intel_texture_object *intelObj,
 {
    struct gl_texture_object *tObj = &intelObj->base;
 
-   if (sampler->MinFilter == GL_NEAREST ||
-       sampler->MinFilter == GL_LINEAR) {
+   if (!tObj->_MipmapComplete ||
+       (tObj->_RenderToTexture &&
+        (sampler->MinFilter == GL_NEAREST ||
+         sampler->MinFilter == GL_LINEAR))) {
       intelObj->_MaxLevel = tObj->BaseLevel;
    } else {
       intelObj->_MaxLevel = tObj->_MaxLevel;
@@ -142,10 +144,9 @@ intel_finalize_mipmap_tree(struct brw_context *brw, GLuint unit)
                                           width,
                                           height,
                                           depth,
-					  true,
                                           0 /* num_samples */,
                                           INTEL_MIPTREE_TILING_ANY,
-                                          false);
+                                          MIPTREE_LAYOUT_ACCELERATED_UPLOAD);
       if (!intelObj->mt)
          return false;
    }
diff --git a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
index 206a76e..8010fb4 100644
--- a/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_cmod_propagation.cpp
@@ -26,11 +26,13 @@
 #include "brw_cfg.h"
 #include "program/program.h"
 
+using namespace brw;
+
 class cmod_propagation_test : public ::testing::Test {
    virtual void SetUp();
 
 public:
-   struct brw_context *brw;
+   struct brw_compiler *compiler;
    struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct brw_wm_prog_data *prog_data;
@@ -42,30 +44,31 @@ public:
 class cmod_propagation_fs_visitor : public fs_visitor
 {
 public:
-   cmod_propagation_fs_visitor(struct brw_context *brw,
+   cmod_propagation_fs_visitor(struct brw_compiler *compiler,
                                struct brw_wm_prog_data *prog_data,
                                struct gl_shader_program *shader_prog)
-      : fs_visitor(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
+      : fs_visitor(compiler, NULL, NULL, MESA_SHADER_FRAGMENT, NULL,
+                   &prog_data->base, shader_prog,
+                   (struct gl_program *) NULL, 8, -1) {} /* body is empty; ctor only forwards */
 };
 
 
 void cmod_propagation_test::SetUp()
 {
-   brw = (struct brw_context *)calloc(1, sizeof(*brw));
-   devinfo = (struct brw_device_info *)calloc(1, sizeof(*brw));
-   brw->intelScreen = (struct intel_screen *)calloc(1, sizeof(*brw->intelScreen));
-   brw->intelScreen->devinfo = devinfo;
-   ctx = &brw->ctx;
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); /* zero-filled, standalone */
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo; /* only compiler is handed to the visitor */
 
    fp = ralloc(NULL, struct brw_fragment_program);
    prog_data = ralloc(NULL, struct brw_wm_prog_data);
    shader_prog = ralloc(NULL, struct gl_shader_program);
 
-   v = new cmod_propagation_fs_visitor(brw, prog_data, shader_prog);
+   v = new cmod_propagation_fs_visitor(compiler, prog_data, shader_prog);
 
    _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
 
-   brw->gen = devinfo->gen = 4;
+   devinfo->gen = 4; /* NOTE(review): assigned after v is built -- confirm the ctor does not read gen */
 }
 
 static fs_inst *
@@ -100,13 +103,13 @@ cmod_propagation(fs_visitor *v)
 
 TEST_F(cmod_propagation_test, basic)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -132,13 +135,13 @@ TEST_F(cmod_propagation_test, basic)
 
 TEST_F(cmod_propagation_test, cmp_nonzero)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg nonzero(1.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest, nonzero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), dest, nonzero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -165,12 +168,12 @@ TEST_F(cmod_propagation_test, cmp_nonzero)
 
 TEST_F(cmod_propagation_test, non_cmod_instruction)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::uint_type);
    fs_reg src0 = v->vgrf(glsl_type::uint_type);
    fs_reg zero(0u);
-   v->emit(BRW_OPCODE_FBL, dest, src0);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_ud, dest, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.FBL(dest, src0);
+   bld.CMP(bld.null_reg_ud(), dest, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -197,16 +200,15 @@ TEST_F(cmod_propagation_test, non_cmod_instruction)
 
 TEST_F(cmod_propagation_test, intervening_flag_write)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, src2, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE);
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -236,17 +238,16 @@ TEST_F(cmod_propagation_test, intervening_flag_write)
 
 TEST_F(cmod_propagation_test, intervening_flag_read)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest0 = v->vgrf(glsl_type::float_type);
    fs_reg dest1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest0, src0, src1);
-   v->emit(BRW_OPCODE_SEL, dest1, src2, zero)
-      ->predicate = BRW_PREDICATE_NORMAL;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest0, src0, src1);
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+   bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -276,16 +277,16 @@ TEST_F(cmod_propagation_test, intervening_flag_read)
 
 TEST_F(cmod_propagation_test, intervening_dest_write)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::vec4_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::vec2_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, offset(dest, 2), src0, src1);
-   v->emit(SHADER_OPCODE_TEX, dest, src2)
+   bld.ADD(offset(dest, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dest, src2)
       ->regs_written = 4;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, offset(dest, 2), zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.CMP(bld.null_reg_f(), offset(dest, 2), zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -316,18 +317,16 @@ TEST_F(cmod_propagation_test, intervening_dest_write)
 
 TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest0 = v->vgrf(glsl_type::float_type);
    fs_reg dest1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest0, src0, src1)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
-   v->emit(BRW_OPCODE_SEL, dest1, src2, zero)
-      ->predicate = BRW_PREDICATE_NORMAL;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1));
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+   bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -357,14 +356,14 @@ TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
 
 TEST_F(cmod_propagation_test, negate)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
+   bld.ADD(dest, src0, src1);
    dest.negate = true;
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, dest, zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -390,13 +389,13 @@ TEST_F(cmod_propagation_test, negate)
 
 TEST_F(cmod_propagation_test, movnz)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_CMP, dest, src0, src1)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
-   v->emit(BRW_OPCODE_MOV, v->reg_null_f, dest)
-      ->conditional_mod = BRW_CONDITIONAL_NZ;
+   bld.CMP(dest, src0, src1, BRW_CONDITIONAL_GE);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.MOV(bld.null_reg_f(), dest));
 
    /* = Before =
     *
@@ -422,14 +421,14 @@ TEST_F(cmod_propagation_test, movnz)
 
 TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::int_type);
    fs_reg src0 = v->vgrf(glsl_type::int_type);
    fs_reg src1 = v->vgrf(glsl_type::int_type);
    fs_reg zero(0.0f);
-   v->emit(BRW_OPCODE_ADD, dest, src0, src1);
-   v->emit(BRW_OPCODE_CMP, v->reg_null_f, retype(dest, BRW_REGISTER_TYPE_F),
-                                          zero)
-      ->conditional_mod = BRW_CONDITIONAL_GE;
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), retype(dest, BRW_REGISTER_TYPE_F), zero,
+           BRW_CONDITIONAL_GE);
 
    /* = Before =
     *
@@ -456,15 +455,15 @@ TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
 
 TEST_F(cmod_propagation_test, andnz_one)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::int_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
    fs_reg one(1);
 
-   v->emit(BRW_OPCODE_CMP, retype(dest, BRW_REGISTER_TYPE_F), src0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_L;
-   v->emit(BRW_OPCODE_AND, v->reg_null_d, dest, one)
-      ->conditional_mod = BRW_CONDITIONAL_NZ;
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.AND(bld.null_reg_d(), dest, one));
 
    /* = Before =
     * 0: cmp.l.f0(8)     dest:F  src0:F  0F
@@ -491,15 +490,15 @@ TEST_F(cmod_propagation_test, andnz_one)
 
 TEST_F(cmod_propagation_test, andnz_non_one)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::int_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
    fs_reg nonone(38);
 
-   v->emit(BRW_OPCODE_CMP, retype(dest, BRW_REGISTER_TYPE_F), src0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_L;
-   v->emit(BRW_OPCODE_AND, v->reg_null_d, dest, nonone)
-      ->conditional_mod = BRW_CONDITIONAL_NZ;
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.AND(bld.null_reg_d(), dest, nonone));
 
    /* = Before =
     * 0: cmp.l.f0(8)     dest:F  src0:F  0F
@@ -526,15 +525,15 @@ TEST_F(cmod_propagation_test, andnz_non_one)
 
 TEST_F(cmod_propagation_test, andz_one)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dest = v->vgrf(glsl_type::int_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg zero(0.0f);
    fs_reg one(1);
 
-   v->emit(BRW_OPCODE_CMP, retype(dest, BRW_REGISTER_TYPE_F), src0, zero)
-      ->conditional_mod = BRW_CONDITIONAL_L;
-   v->emit(BRW_OPCODE_AND, v->reg_null_d, dest, one)
-      ->conditional_mod = BRW_CONDITIONAL_Z;
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_Z,
+               bld.AND(bld.null_reg_d(), dest, one));
 
    /* = Before =
     * 0: cmp.l.f0(8)     dest:F  src0:F  0F
diff --git a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
index 4c91af3..3ef0cb3 100644
--- a/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_fs_saturate_propagation.cpp
@@ -26,11 +26,13 @@
 #include "brw_cfg.h"
 #include "program/program.h"
 
+using namespace brw;
+
 class saturate_propagation_test : public ::testing::Test {
    virtual void SetUp();
 
 public:
-   struct brw_context *brw;
+   struct brw_compiler *compiler;
    struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct brw_wm_prog_data *prog_data;
@@ -42,30 +44,31 @@ public:
 class saturate_propagation_fs_visitor : public fs_visitor
 {
 public:
-   saturate_propagation_fs_visitor(struct brw_context *brw,
+   saturate_propagation_fs_visitor(struct brw_compiler *compiler,
                                    struct brw_wm_prog_data *prog_data,
                                    struct gl_shader_program *shader_prog)
-      : fs_visitor(brw, NULL, NULL, prog_data, shader_prog, NULL, 8) {}
+      : fs_visitor(compiler, NULL, NULL, MESA_SHADER_FRAGMENT, NULL,
+                   &prog_data->base, shader_prog,
+                   (struct gl_program *) NULL, 8, -1) {} /* body is empty; ctor only forwards */
 };
 
 
 void saturate_propagation_test::SetUp()
 {
-   brw = (struct brw_context *)calloc(1, sizeof(*brw));
-   devinfo = (struct brw_device_info *)calloc(1, sizeof(*brw));
-   brw->intelScreen = (struct intel_screen *)calloc(1, sizeof(*brw->intelScreen));
-   brw->intelScreen->devinfo = devinfo;
-   ctx = &brw->ctx;
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx)); /* zero-filled, standalone */
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo; /* only compiler is handed to the visitor */
 
    fp = ralloc(NULL, struct brw_fragment_program);
    prog_data = ralloc(NULL, struct brw_wm_prog_data);
    shader_prog = ralloc(NULL, struct gl_shader_program);
 
-   v = new saturate_propagation_fs_visitor(brw, prog_data, shader_prog);
+   v = new saturate_propagation_fs_visitor(compiler, prog_data, shader_prog);
 
    _mesa_init_fragment_program(ctx, &fp->program, GL_FRAGMENT_SHADER, 0);
 
-   brw->gen = devinfo->gen = 4;
+   devinfo->gen = 4; /* NOTE(review): assigned after v is built -- confirm the ctor does not read gen */
 }
 
 static fs_inst *
@@ -100,13 +103,13 @@ saturate_propagation(fs_visitor *v)
 
 TEST_F(saturate_propagation_test, basic)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   bld.ADD(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
 
    /* = Before =
     *
@@ -135,15 +138,15 @@ TEST_F(saturate_propagation_test, basic)
 
 TEST_F(saturate_propagation_test, other_non_saturated_use)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg dst2 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
-   v->emit(BRW_OPCODE_ADD, dst2, dst0, src0);
+   bld.ADD(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
+   bld.ADD(dst2, dst0, src0);
 
    /* = Before =
     *
@@ -173,14 +176,14 @@ TEST_F(saturate_propagation_test, other_non_saturated_use)
 
 TEST_F(saturate_propagation_test, predicated_instruction)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1)
+   bld.ADD(dst0, src0, src1)
       ->predicate = BRW_PREDICATE_NORMAL;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, dst0));
 
    /* = Before =
     *
@@ -208,14 +211,14 @@ TEST_F(saturate_propagation_test, predicated_instruction)
 
 TEST_F(saturate_propagation_test, neg_mov_sat)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
+   bld.ADD(dst0, src0, src1);
    dst0.negate = true;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, dst0));
 
    /* = Before =
     *
@@ -243,14 +246,14 @@ TEST_F(saturate_propagation_test, neg_mov_sat)
 
 TEST_F(saturate_propagation_test, abs_mov_sat)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
+   bld.ADD(dst0, src0, src1);
    dst0.abs = true;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, dst0));
 
    /* = Before =
     *
@@ -278,16 +281,15 @@ TEST_F(saturate_propagation_test, abs_mov_sat)
 
 TEST_F(saturate_propagation_test, producer_saturates)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg dst2 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1)
-      ->saturate = true;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
-   v->emit(BRW_OPCODE_MOV, dst2, dst0);
+   set_saturate(true, bld.ADD(dst0, src0, src1));
+   set_saturate(true, bld.MOV(dst1, dst0));
+   bld.MOV(dst2, dst0);
 
    /* = Before =
     *
@@ -318,16 +320,15 @@ TEST_F(saturate_propagation_test, producer_saturates)
 
 TEST_F(saturate_propagation_test, intervening_saturating_copy)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg dst2 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_ADD, dst0, src0, src1);
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
-   v->emit(BRW_OPCODE_MOV, dst2, dst0)
-      ->saturate = true;
+   bld.ADD(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
+   set_saturate(true, bld.MOV(dst2, dst0));
 
    /* = Before =
     *
@@ -360,16 +361,16 @@ TEST_F(saturate_propagation_test, intervening_saturating_copy)
 
 TEST_F(saturate_propagation_test, intervening_dest_write)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::vec4_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
    fs_reg src2 = v->vgrf(glsl_type::vec2_type);
-   v->emit(BRW_OPCODE_ADD, offset(dst0, 2), src0, src1);
-   v->emit(SHADER_OPCODE_TEX, dst0, src2)
+   bld.ADD(offset(dst0, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dst0, src2)
       ->regs_written = 4;
-   v->emit(BRW_OPCODE_MOV, dst1, offset(dst0, 2))
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, offset(dst0, 2)));
 
    /* = Before =
     *
@@ -400,18 +401,17 @@ TEST_F(saturate_propagation_test, intervening_dest_write)
 
 TEST_F(saturate_propagation_test, mul_neg_mov_sat_mov_sat)
 {
+   const fs_builder &bld = v->bld;
    fs_reg dst0 = v->vgrf(glsl_type::float_type);
    fs_reg dst1 = v->vgrf(glsl_type::float_type);
    fs_reg dst2 = v->vgrf(glsl_type::float_type);
    fs_reg src0 = v->vgrf(glsl_type::float_type);
    fs_reg src1 = v->vgrf(glsl_type::float_type);
-   v->emit(BRW_OPCODE_MUL, dst0, src0, src1);
+   bld.MUL(dst0, src0, src1);
    dst0.negate = true;
-   v->emit(BRW_OPCODE_MOV, dst1, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst1, dst0));
    dst0.negate = false;
-   v->emit(BRW_OPCODE_MOV, dst2, dst0)
-      ->saturate = true;
+   set_saturate(true, bld.MOV(dst2, dst0));
 
    /* = Before =
     *
diff --git a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
index 2ef52e9..84e43fa 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_copy_propagation.cpp
@@ -33,7 +33,7 @@ class copy_propagation_test : public ::testing::Test {
    virtual void SetUp();
 
 public:
-   struct brw_context *brw;
+   struct brw_compiler *compiler;
    struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct gl_shader_program *shader_prog;
@@ -44,12 +44,11 @@ public:
 class copy_propagation_vec4_visitor : public vec4_visitor
 {
 public:
-   copy_propagation_vec4_visitor(struct brw_context *brw,
+   copy_propagation_vec4_visitor(struct brw_compiler *compiler,
                                   struct gl_shader_program *shader_prog)
-      : vec4_visitor(brw, NULL, NULL, NULL, NULL, shader_prog,
+      : vec4_visitor(compiler, NULL, NULL, NULL, NULL, shader_prog,
                      MESA_SHADER_VERTEX, NULL,
-                     false /* no_spills */,
-                     ST_NONE, ST_NONE, ST_NONE)
+                     false /* no_spills */, -1)
    {
    }
 
@@ -93,21 +92,20 @@ protected:
 
 void copy_propagation_test::SetUp()
 {
-   brw = (struct brw_context *)calloc(1, sizeof(*brw));
-   devinfo = (struct brw_device_info *)calloc(1, sizeof(*brw));
-   brw->intelScreen = (struct intel_screen *)calloc(1, sizeof(*brw->intelScreen));
-   brw->intelScreen->devinfo = devinfo;
-   ctx = &brw->ctx;
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
 
    vp = ralloc(NULL, struct brw_vertex_program);
 
    shader_prog = ralloc(NULL, struct gl_shader_program);
 
-   v = new copy_propagation_vec4_visitor(brw, shader_prog);
+   v = new copy_propagation_vec4_visitor(compiler, shader_prog);
 
    _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
 
-   brw->gen = devinfo->gen = 4;
+   devinfo->gen = 4;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
index c8c6757..de2afd3 100644
--- a/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
+++ b/src/mesa/drivers/dri/i965/test_vec4_register_coalesce.cpp
@@ -35,7 +35,7 @@ class register_coalesce_test : public ::testing::Test {
    virtual void SetUp();
 
 public:
-   struct brw_context *brw;
+   struct brw_compiler *compiler;
    struct brw_device_info *devinfo;
    struct gl_context *ctx;
    struct gl_shader_program *shader_prog;
@@ -47,12 +47,11 @@ public:
 class register_coalesce_vec4_visitor : public vec4_visitor
 {
 public:
-   register_coalesce_vec4_visitor(struct brw_context *brw,
+   register_coalesce_vec4_visitor(struct brw_compiler *compiler,
                                   struct gl_shader_program *shader_prog)
-      : vec4_visitor(brw, NULL, NULL, NULL, NULL, shader_prog,
+      : vec4_visitor(compiler, NULL, NULL, NULL, NULL, shader_prog,
                      MESA_SHADER_VERTEX, NULL,
-                     false /* no_spills */,
-                     ST_NONE, ST_NONE, ST_NONE)
+                     false /* no_spills */, -1)
    {
    }
 
@@ -96,21 +95,20 @@ protected:
 
 void register_coalesce_test::SetUp()
 {
-   brw = (struct brw_context *)calloc(1, sizeof(*brw));
-   devinfo = (struct brw_device_info *)calloc(1, sizeof(*brw));
-   brw->intelScreen = (struct intel_screen *)calloc(1, sizeof(*brw->intelScreen));
-   brw->intelScreen->devinfo = devinfo;
-   ctx = &brw->ctx;
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct brw_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
 
    vp = ralloc(NULL, struct brw_vertex_program);
 
    shader_prog = ralloc(NULL, struct gl_shader_program);
 
-   v = new register_coalesce_vec4_visitor(brw, shader_prog);
+   v = new register_coalesce_vec4_visitor(compiler, shader_prog);
 
    _mesa_init_vertex_program(ctx, &vp->program, GL_VERTEX_SHADER, 0);
 
-   brw->gen = devinfo->gen = 4;
+   devinfo->gen = 4;
 }
 
 static void
diff --git a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
index 6c479f5..c78d4ba 100644
--- a/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
+++ b/src/mesa/drivers/dri/nouveau/nouveau_fbo.c
@@ -242,7 +242,7 @@ static void
 nouveau_framebuffer_renderbuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
 				 GLenum attachment, struct gl_renderbuffer *rb)
 {
-	_mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb);
+	_mesa_FramebufferRenderbuffer_sw(ctx, fb, attachment, rb);
 
 	context_dirty(ctx, FRAMEBUFFER);
 }
diff --git a/src/mesa/drivers/dri/nouveau/nv10_state_tnl.c b/src/mesa/drivers/dri/nouveau/nv10_state_tnl.c
index c0c7b26..1398385 100644
--- a/src/mesa/drivers/dri/nouveau/nv10_state_tnl.c
+++ b/src/mesa/drivers/dri/nouveau/nv10_state_tnl.c
@@ -31,6 +31,8 @@
 #include "nv10_3d.xml.h"
 #include "nv10_driver.h"
 
+#include "util/simple_list.h"
+
 void
 nv10_emit_clip_plane(struct gl_context *ctx, int emit)
 {
diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_tnl.c b/src/mesa/drivers/dri/nouveau/nv20_state_tnl.c
index f0acbed..4139551 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_state_tnl.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_state_tnl.c
@@ -32,6 +32,8 @@
 #include "nv10_driver.h"
 #include "nv20_driver.h"
 
+#include "util/simple_list.h"
+
 #define LIGHT_MODEL_AMBIENT_R(side)			\
 	((side) ? NV20_3D_LIGHT_MODEL_BACK_AMBIENT_R :	\
 	 NV20_3D_LIGHT_MODEL_FRONT_AMBIENT_R)
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index b0a6bd5..6fe70b5 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -2215,9 +2215,9 @@ GLboolean r200ValidateState( struct gl_context *ctx )
    GLuint new_state = rmesa->radeon.NewGLState;
 
    if (new_state & _NEW_BUFFERS) {
-      _mesa_update_framebuffer(ctx);
+      _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
-      _mesa_update_draw_buffer_bounds(ctx);
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
 
       R200_STATECHANGE(rmesa, ctx);
    }
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
index 0ca526d..2a8bd6c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_common.c
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -220,9 +220,9 @@ void radeon_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb)
 	 */
 	if (ctx->NewState & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
 		/* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
-		_mesa_update_framebuffer(ctx);
+		_mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
 		/* this updates the DrawBuffer's Width/Height if it's a FBO */
-		_mesa_update_draw_buffer_bounds(ctx);
+		_mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
 	}
 
 	if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
index 97022f9..ef62d09 100644
--- a/src/mesa/drivers/dri/radeon/radeon_fbo.c
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -723,7 +723,7 @@ radeon_framebuffer_renderbuffer(struct gl_context * ctx,
 		"%s(%p, fb %p, rb %p) \n",
 		__func__, ctx, fb, rb);
 
-   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb);
+   _mesa_FramebufferRenderbuffer_sw(ctx, fb, attachment, rb);
    radeon_draw_buffer(ctx, fb);
 }
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index c45bb51..cba3d9c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -1994,9 +1994,9 @@ GLboolean radeonValidateState( struct gl_context *ctx )
    GLuint new_state = rmesa->radeon.NewGLState;
 
    if (new_state & _NEW_BUFFERS) {
-     _mesa_update_framebuffer(ctx);
+     _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
      /* this updates the DrawBuffer's Width/Height if it's a FBO */
-     _mesa_update_draw_buffer_bounds(ctx);
+     _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
      RADEON_STATECHANGE(rmesa, ctx);
    }
 
diff --git a/src/mesa/drivers/dri/swrast/swrast.c b/src/mesa/drivers/dri/swrast/swrast.c
index 2ddb474..2d4bb70 100644
--- a/src/mesa/drivers/dri/swrast/swrast.c
+++ b/src/mesa/drivers/dri/swrast/swrast.c
@@ -62,7 +62,9 @@
 #include "swrast/s_context.h"
 
 #include <sys/types.h>
-#include <sys/sysctl.h>
+#ifdef HAVE_SYS_SYSCTL_H
+# include <sys/sysctl.h>
+#endif
 
 const __DRIextension **__driDriverGetExtensions_swrast(void);
 
@@ -958,6 +960,7 @@ static const __DRIextension *swrast_driver_extensions[] = {
     &driCoreExtension.base,
     &driSWRastExtension.base,
     &driCopySubBufferExtension.base,
+    &dri2ConfigQueryExtension.base,
     &swrast_vtable.base,
     NULL
 };
diff --git a/src/mesa/drivers/haiku/swrast/SConscript b/src/mesa/drivers/haiku/swrast/SConscript
deleted file mode 100644
index 907325e..0000000
--- a/src/mesa/drivers/haiku/swrast/SConscript
+++ /dev/null
@@ -1,33 +0,0 @@
-Import('*')
-
-env = env.Clone()
-
-env.Append(CPPPATH = [
-    '#/src',
-    '#/src/mapi',
-    '#/src/mesa',
-    '#/src/mesa/main',
-    '#/include/HaikuGL',
-    '/boot/system/develop/headers/private',
-    Dir('../../../mapi'), # src/mapi build path for python-generated GL API files/headers
-])
-
-env.Prepend(LIBS = [
-    mesautil,
-    glsl,
-    mesa,
-])
-
-env.Prepend(LIBS = [libgl])
-
-sources = [
-	'SoftwareRast.cpp'
-]
-
-# Disallow undefined symbols
-#env.Append(SHLINKFLAGS = ['-Wl,-z,defs'])
-
-libswrast = env.SharedLibrary(
-    target = 'swrast',
-    source = sources
-)
diff --git a/src/mesa/drivers/haiku/swrast/SoftwareRast.cpp b/src/mesa/drivers/haiku/swrast/SoftwareRast.cpp
deleted file mode 100644
index 813ad1f..0000000
--- a/src/mesa/drivers/haiku/swrast/SoftwareRast.cpp
+++ /dev/null
@@ -1,697 +0,0 @@
-/*
- * Copyright 2006-2012, Haiku, Inc. All rights reserved.
- * Distributed under the terms of the MIT License.
- *
- * Authors:
- *		Jérôme Duval, korli@users.berlios.de
- *		Philippe Houdoin, philippe.houdoin@free.fr
- *		Artur Wyszynski, harakash@gmail.com
- *		Alexander von Gluck, kallisti5@unixzen.com
- */
-
-
-#include <kernel/image.h>
-#include "SoftwareRast.h"
-
-#include <Autolock.h>
-#include <interface/DirectWindowPrivate.h>
-#include <GraphicsDefs.h>
-#include <Screen.h>
-#include <stdio.h>
-#include <string.h>
-
-extern "C" {
-#include "extensions.h"
-#include "drivers/common/driverfuncs.h"
-#include "drivers/common/meta.h"
-#include "main/api_exec.h"
-#include "main/colormac.h"
-#include "main/cpuinfo.h"
-#include "main/buffers.h"
-#include "main/formats.h"
-#include "main/framebuffer.h"
-#include "main/renderbuffer.h"
-#include "main/version.h"
-#include "main/vtxfmt.h"
-#include "swrast/swrast.h"
-#include "swrast/s_renderbuffer.h"
-#include "swrast_setup/swrast_setup.h"
-#include "tnl/tnl.h"
-#include "tnl/t_context.h"
-#include "tnl/t_pipeline.h"
-#include "vbo/vbo.h"
-
-
-#ifdef DEBUG
-#	define TRACE(x...) printf("MesaSoftwareRast: " x)
-#	define CALLED() printf("MesaSoftwareRast: %s\n", __PRETTY_FUNCTION__)
-#else
-#	define TRACE(x...)
-#	define CALLED()
-#endif
-
-#define ERROR(x...) printf("MesaSoftwareRast: " x)
-}
-
-
-extern const char* color_space_name(color_space space);
-
-
-extern "C" _EXPORT BGLRenderer*
-instantiate_gl_renderer(BGLView* view, ulong options,
-	BGLDispatcher* dispatcher)
-{
-	return new MesaSoftwareRast(view, options, dispatcher);
-}
-
-
-MesaSoftwareRast::MesaSoftwareRast(BGLView* view, ulong options,
-	BGLDispatcher* dispatcher)
-	: BGLRenderer(view, options, dispatcher),
-	fBitmap(NULL),
-	fDirectModeEnabled(false),
-	fInfo(NULL),
-	fInfoLocker("info locker"),
-	fVisual(NULL),
-	fFrameBuffer(NULL),
-	fFrontRenderBuffer(NULL),
-	fBackRenderBuffer(NULL),
-	fColorSpace(B_NO_COLOR_SPACE)
-{
-	CALLED();
-
-	fColorSpace = BScreen(GLView()->Window()).ColorSpace();
-
-	// We force single buffering for the time being
-	options &= ~BGL_DOUBLE;
-
-	const GLboolean rgbFlag = ((options & BGL_INDEX) == 0);
-	const GLboolean alphaFlag = ((options & BGL_ALPHA) == BGL_ALPHA);
-	const GLboolean dblFlag = ((options & BGL_DOUBLE) == BGL_DOUBLE);
-	const GLboolean stereoFlag = false;
-	const GLint depth = (options & BGL_DEPTH) ? 16 : 0;
-	const GLint stencil = (options & BGL_STENCIL) ? 8 : 0;
-	const GLint accum = (options & BGL_ACCUM) ? 16 : 0;
-	const GLint red = rgbFlag ? 8 : 0;
-	const GLint green = rgbFlag ? 8 : 0;
-	const GLint blue = rgbFlag ? 8 : 0;
-	const GLint alpha = alphaFlag ? 8 : 0;
-
-	fOptions = options; // | BGL_INDIRECT;
-	struct dd_function_table functions;
-
-	fVisual = _mesa_create_visual(dblFlag, stereoFlag, red, green,
-		blue, alpha, depth, stencil, accum, accum, accum,
-		alpha ? accum : 0, 1);
-
-	// Initialize device driver function table
-	_mesa_init_driver_functions(&functions);
-
-	functions.GetString = _GetString;
-	functions.UpdateState = _UpdateState;
-	functions.MapRenderbuffer = _RenderBufferMap;
-	functions.Flush = _Flush;
-
-	// create core context
-	// We inherit gl_context to this class
-	_mesa_initialize_context(this, API_OPENGL_COMPAT, fVisual, NULL,
-		&functions);
-
-	/* Initialize the software rasterizer and helper modules. */
-	_swrast_CreateContext(this);
-	_vbo_CreateContext(this);
-	_tnl_CreateContext(this);
-	_swsetup_CreateContext(this);
-	_swsetup_Wakeup(this);
-
-	// Use default TCL pipeline
-	TNL_CONTEXT(this)->Driver.RunPipeline = _tnl_run_pipeline;
-
-	_mesa_meta_init(this);
-	_mesa_enable_sw_extensions(this);
-
-	_mesa_compute_version(this);
-
-	_mesa_initialize_dispatch_tables(this);
-	_mesa_initialize_vbo_vtxfmt(this);
-
-	// create core framebuffer
-	fFrameBuffer = _mesa_create_framebuffer(fVisual);
-	if (fFrameBuffer == NULL) {
-		ERROR("%s: Unable to calloc GL FrameBuffer!\n", __func__);
-		_mesa_destroy_visual(fVisual);
-		return;
-	}
-
-	// Setup front render buffer
-	fFrontRenderBuffer = _NewRenderBuffer(true);
-	if (fFrontRenderBuffer == NULL) {
-		ERROR("%s: FrontRenderBuffer is requested but unallocated!\n",
-			__func__);
-		_mesa_destroy_visual(fVisual);
-		free(fFrameBuffer);
-		return;
-	}
-	_mesa_add_renderbuffer(fFrameBuffer, BUFFER_FRONT_LEFT,
-		&fFrontRenderBuffer->Base);
-
-	// Setup back render buffer (if requested)
-	if (fVisual->doubleBufferMode) {
-		fBackRenderBuffer = _NewRenderBuffer(false);
-		if (fBackRenderBuffer == NULL) {
-			ERROR("%s: BackRenderBuffer is requested but unallocated!\n",
-				__func__);
-			_mesa_destroy_visual(fVisual);
-			free(fFrameBuffer);
-			return;
-		}
-		_mesa_add_renderbuffer(fFrameBuffer, BUFFER_BACK_LEFT,
-			&fBackRenderBuffer->Base);
-	}
-
-	_swrast_add_soft_renderbuffers(fFrameBuffer, GL_FALSE,
-		fVisual->haveDepthBuffer, fVisual->haveStencilBuffer,
-		fVisual->haveAccumBuffer, alphaFlag, GL_FALSE);
-
-	BRect bounds = view->Bounds();
-	fWidth = (GLint)bounds.Width();
-	fHeight = (GLint)bounds.Height();
-
-	// some stupid applications (Quake2) don't even think about calling LockGL()
-	// before using glGetString and its glGet*() friends...
-	// so make sure there is at least a valid context.
-
-	if (!_mesa_get_current_context()) {
-		LockGL();
-		// not needed, we don't have a looper yet: UnlockLooper();
-	}
-}
-
-
-MesaSoftwareRast::~MesaSoftwareRast()
-{
-	CALLED();
-	_swsetup_DestroyContext(this);
-	_swrast_DestroyContext(this);
-	_tnl_DestroyContext(this);
-	_vbo_DestroyContext(this);
-	_mesa_destroy_visual(fVisual);
-	_mesa_destroy_framebuffer(fFrameBuffer);
-	_mesa_destroy_context(this);
-
-	free(fInfo);
-	free(fFrameBuffer);
-
-	delete fBitmap;
-}
-
-
-void
-MesaSoftwareRast::LockGL()
-{
-	CALLED();
-	BGLRenderer::LockGL();
-
-	_mesa_make_current(this, fFrameBuffer, fFrameBuffer);
-
-	color_space colorSpace = BScreen(GLView()->Window()).ColorSpace();
-
-	GLuint width = fWidth;
-	GLuint height = fHeight;
-
-	BAutolock lock(fInfoLocker);
-	if (fDirectModeEnabled && fInfo != NULL) {
-		width = fInfo->window_bounds.right
-			- fInfo->window_bounds.left + 1;
-		height = fInfo->window_bounds.bottom
-			- fInfo->window_bounds.top + 1;
-	}
-
-	if (fColorSpace != colorSpace) {
-		fColorSpace = colorSpace;
-		_SetupRenderBuffer(&fFrontRenderBuffer->Base, fColorSpace);
-		if (fVisual->doubleBufferMode)
-			_SetupRenderBuffer(&fBackRenderBuffer->Base, fColorSpace);
-	}
-
-	_CheckResize(width, height);
-}
-
-
-void
-MesaSoftwareRast::UnlockGL()
-{
-	CALLED();
-	_mesa_make_current(this, NULL, NULL);
-	BGLRenderer::UnlockGL();
-}
-
-
-void
-MesaSoftwareRast::SwapBuffers(bool VSync)
-{
-	CALLED();
-
-	if (!fBitmap)
-		return;
-
-	if (fVisual->doubleBufferMode)
-		_mesa_notifySwapBuffers(this);
-
-	if (!fDirectModeEnabled || fInfo == NULL) {
-		if (GLView()->LockLooperWithTimeout(1000) == B_OK) {
-			GLView()->DrawBitmap(fBitmap, B_ORIGIN);
-			GLView()->UnlockLooper();
-		}
-	} else {
-		// TODO: Here the BGLView needs to be drawlocked.
-		_CopyToDirect();
-	}
-
-	if (VSync) {
-		BScreen screen(GLView()->Window());
-		screen.WaitForRetrace();
-	}
-}
-
-
-void
-MesaSoftwareRast::Draw(BRect updateRect)
-{
-	CALLED();
-	if (fBitmap && (!fDirectModeEnabled || (fInfo == NULL)))
-		GLView()->DrawBitmap(fBitmap, updateRect, updateRect);
-}
-
-
-status_t
-MesaSoftwareRast::CopyPixelsOut(BPoint location, BBitmap* bitmap)
-{
-	CALLED();
-	color_space scs = fBitmap->ColorSpace();
-	color_space dcs = bitmap->ColorSpace();
-
-	if (scs != dcs && (scs != B_RGBA32 || dcs != B_RGB32)) {
-		fprintf(stderr, "CopyPixelsOut(): incompatible color space: %s != %s\n",
-			color_space_name(scs),
-			color_space_name(dcs));
-		return B_BAD_TYPE;
-	}
-
-	BRect sr = fBitmap->Bounds();
-	BRect dr = bitmap->Bounds();
-
-	sr = sr & dr.OffsetBySelf(location);
-	dr = sr.OffsetByCopy(-location.x, -location.y);
-
-	uint8* ps = (uint8*)fBitmap->Bits();
-	uint8* pd = (uint8*)bitmap->Bits();
-	uint32* s;
-	uint32* d;
-	uint32 y;
-	for (y = (uint32)sr.top; y <= (uint32)sr.bottom; y++) {
-		s = (uint32*)(ps + y * fBitmap->BytesPerRow());
-		s += (uint32)sr.left;
-
-		d = (uint32*)(pd + (y + (uint32)(dr.top - sr.top))
-			* bitmap->BytesPerRow());
-		d += (uint32)dr.left;
-
-		memcpy(d, s, dr.IntegerWidth() * 4);
-	}
-	return B_OK;
-}
-
-
-status_t
-MesaSoftwareRast::CopyPixelsIn(BBitmap* bitmap, BPoint location)
-{
-	CALLED();
-	color_space scs = bitmap->ColorSpace();
-	color_space dcs = fBitmap->ColorSpace();
-
-	if (scs != dcs && (dcs != B_RGBA32 || scs != B_RGB32)) {
-		fprintf(stderr, "CopyPixelsIn(): incompatible color space: %s != %s\n",
-			color_space_name(scs),
-			color_space_name(dcs));
-		return B_BAD_TYPE;
-	}
-
-	BRect sr = bitmap->Bounds();
-	BRect dr = fBitmap->Bounds();
-
-	sr = sr & dr.OffsetBySelf(location);
-	dr = sr.OffsetByCopy(-location.x, -location.y);
-
-	uint8* ps = (uint8*)bitmap->Bits();
-	uint8* pd = (uint8*)fBitmap->Bits();
-	uint32* s;
-	uint32* d;
-	uint32 y;
-	for (y = (uint32)sr.top; y <= (uint32)sr.bottom; y++) {
-		s = (uint32*)(ps + y * bitmap->BytesPerRow());
-		s += (uint32)sr.left;
-
-		d = (uint32*)(pd + (y + (uint32)(dr.top - sr.top))
-			* fBitmap->BytesPerRow());
-		d += (uint32)dr.left;
-
-		memcpy(d, s, dr.IntegerWidth() * 4);
-	}
-	return B_OK;
-}
-
-
-void
-MesaSoftwareRast::EnableDirectMode(bool enabled)
-{
-	fDirectModeEnabled = enabled;
-}
-
-
-void
-MesaSoftwareRast::DirectConnected(direct_buffer_info* info)
-{
-	// TODO: I'm not sure we need to do this: BGLView already
-	// keeps a local copy of the direct_buffer_info passed by
-	// BDirectWindow::DirectConnected().
-	BAutolock lock(fInfoLocker);
-	if (info) {
-		if (!fInfo) {
-			fInfo = (direct_buffer_info*)malloc(DIRECT_BUFFER_INFO_AREA_SIZE);
-			if (!fInfo)
-				return;
-		}
-		memcpy(fInfo, info, DIRECT_BUFFER_INFO_AREA_SIZE);
-	} else if (fInfo) {
-		free(fInfo);
-		fInfo = NULL;
-	}
-}
-
-
-void
-MesaSoftwareRast::FrameResized(float width, float height)
-{
-	BAutolock lock(fInfoLocker);
-	_CheckResize((GLuint)width, (GLuint)height);
-}
-
-
-void
-MesaSoftwareRast::_CheckResize(GLuint newWidth, GLuint newHeight)
-{
-	CALLED();
-
-	if (fBitmap && newWidth == fWidth
-		&& newHeight == fHeight) {
-		return;
-	}
-
-	_mesa_resize_framebuffer(this, fFrameBuffer, newWidth, newHeight);
-	fHeight = newHeight;
-	fWidth = newWidth;
-
-	_AllocateBitmap();
-}
-
-
-void
-MesaSoftwareRast::_AllocateBitmap()
-{
-	CALLED();
-
-	// allocate new size of back buffer bitmap
-	delete fBitmap;
-	fBitmap = NULL;
-
-	if (fWidth < 1 || fHeight < 1) {
-		TRACE("%s: Cannot allocate bitmap < 1x1!\n", __func__);
-		return;
-	}
-
-	BRect rect(0.0, 0.0, fWidth - 1, fHeight - 1);
-	fBitmap = new BBitmap(rect, fColorSpace);
-
-	#if 0
-	// Used for platform optimized drawing
-	for (uint i = 0; i < fHeight; i++) {
-		fRowAddr[fHeight - i - 1] = (GLvoid *)((GLubyte *)fBitmap->Bits()
-			+ i * fBitmap->BytesPerRow());
-	}
-	#endif
-
-	fFrameBuffer->Width = fWidth;
-	fFrameBuffer->Height = fHeight;
-	TRACE("%s: Bitmap Size: %" B_PRIu32 "\n", __func__, fBitmap->BitsLength());
-
-	fFrontRenderBuffer->Buffer = (GLubyte*)fBitmap->Bits();
-}
-
-
-// #pragma mark - static
-
-
-const GLubyte*
-MesaSoftwareRast::_GetString(gl_context* ctx, GLenum name)
-{
-	switch (name) {
-		case GL_VENDOR:
-			return (const GLubyte*) "Mesa Project";
-		case GL_RENDERER:
-			return (const GLubyte*) "Software Rasterizer";
-		default:
-			// Let core library handle all other cases
-			return NULL;
-	}
-}
-
-
-void
-MesaSoftwareRast::_UpdateState(gl_context* ctx, GLuint new_state)
-{
-	if (!ctx)
-		return;
-
-	CALLED();
-	_swrast_InvalidateState(ctx, new_state);
-	_swsetup_InvalidateState(ctx, new_state);
-	_vbo_InvalidateState(ctx, new_state);
-	_tnl_InvalidateState(ctx, new_state);
-}
-
-
-GLboolean
-MesaSoftwareRast::_RenderBufferStorage(gl_context* ctx,
-	struct gl_renderbuffer* render, GLenum internalFormat,
-	GLuint width, GLuint height)
-{
-	CALLED();
-
-	render->Width = width;
-	render->Height = height;
-
-	struct swrast_renderbuffer *swRenderBuffer = swrast_renderbuffer(render);
-
-	swRenderBuffer->RowStride = width * _mesa_get_format_bytes(render->Format);
-
-	return GL_TRUE;
-}
-
-
-GLboolean
-MesaSoftwareRast::_RenderBufferStorageMalloc(gl_context* ctx,
-	struct gl_renderbuffer* render, GLenum internalFormat,
-	GLuint width, GLuint height)
-{
-	CALLED();
-
-	render->Width = width;
-	render->Height = height;
-
-	struct swrast_renderbuffer *swRenderBuffer = swrast_renderbuffer(render);
-
-	if (swRenderBuffer != NULL) {
-		free(swRenderBuffer->Buffer);
-		swRenderBuffer->RowStride
-			= width * _mesa_get_format_bytes(render->Format);
-
-		uint32 size = swRenderBuffer->RowStride * height;
-		TRACE("%s: Allocate %" B_PRIu32 " bytes for RenderBuffer\n",
-			__func__, size);
-		swRenderBuffer->Buffer = (GLubyte*)malloc(size);
-		if (!swRenderBuffer->Buffer) {
-			ERROR("%s: Memory allocation failure!\n", __func__);
-			return GL_FALSE;
-		}
-	} else {
-		ERROR("%s: Couldn't obtain software renderbuffer!\n",
-			__func__);
-		return GL_FALSE;
-	}
-
-	return GL_TRUE;
-}
-
-
-void
-MesaSoftwareRast::_Flush(gl_context* ctx)
-{
-	CALLED();
-	MesaSoftwareRast* driverContext = static_cast<MesaSoftwareRast*>(ctx);
-
-	//MesaSoftwareRast* driverContext = (MesaSoftwareRast*)ctx->DriverCtx;
-	if ((driverContext->fOptions & BGL_DOUBLE) == 0) {
-		// TODO: SwapBuffers() can call _CopyToDirect(), which should
-		// be always called with with the BGLView drawlocked.
-		// This is not always the case if called from here.
-		driverContext->SwapBuffers();
-	}
-}
-
-
-struct swrast_renderbuffer*
-MesaSoftwareRast::_NewRenderBuffer(bool front)
-{
-	CALLED();
-	struct swrast_renderbuffer *swRenderBuffer
-		= (struct swrast_renderbuffer*)calloc(1, sizeof *swRenderBuffer);
-
-	if (!swRenderBuffer) {
-		ERROR("%s: Failed calloc RenderBuffer\n", __func__);
-		return NULL;
-	}
-
-	_mesa_init_renderbuffer(&swRenderBuffer->Base, 0);
-
-	swRenderBuffer->Base.ClassID = HAIKU_SWRAST_RENDERBUFFER_CLASS;
-	swRenderBuffer->Base.RefCount = 1;
-	swRenderBuffer->Base.Delete = _RenderBufferDelete;
-
-	if (!front)
-		swRenderBuffer->Base.AllocStorage = _RenderBufferStorageMalloc;
-	else
-		swRenderBuffer->Base.AllocStorage = _RenderBufferStorage;
-
-	if (_SetupRenderBuffer(&swRenderBuffer->Base, fColorSpace) != B_OK) {
-		free(swRenderBuffer);
-		return NULL;
-	}
-
-	return swRenderBuffer;
-}
-
-
-status_t
-MesaSoftwareRast::_SetupRenderBuffer(struct gl_renderbuffer* rb,
-	color_space colorSpace)
-{
-	CALLED();
-
-	rb->InternalFormat = GL_RGBA;
-
-	switch (colorSpace) {
-		case B_RGBA32:
-			rb->_BaseFormat = GL_RGBA;
-			rb->Format = MESA_FORMAT_B8G8R8A8_UNORM;
-			break;
-		case B_RGB32:
-			rb->_BaseFormat = GL_RGB;
-			rb->Format = MESA_FORMAT_B8G8R8X8_UNORM;
-			break;
-		case B_RGB24:
-			rb->_BaseFormat = GL_RGB;
-			rb->Format = MESA_FORMAT_BGR_UNORM8;
-			break;
-		case B_RGB16:
-			rb->_BaseFormat = GL_RGB;
-			rb->Format = MESA_FORMAT_B5G6R5_UNORM;
-			break;
-		case B_RGB15:
-			rb->_BaseFormat = GL_RGB;
-			rb->Format = MESA_FORMAT_B5G5R5A1_UNORM;
-			break;
-		default:
-			fprintf(stderr, "Unsupported screen color space %s\n",
-				color_space_name(fColorSpace));
-			debugger("Unsupported OpenGL color space");
-			return B_ERROR;
-	}
-	return B_OK;
-}
-
-
-/*!	Y inverted Map RenderBuffer function
-	We use a BBitmap for storage which has Y inverted.
-	If the Mesa provided Map function ever allows external
-	control of this we can omit this function.
-*/
-void
-MesaSoftwareRast::_RenderBufferMap(gl_context *ctx,
-	struct gl_renderbuffer *rb, GLuint x, GLuint y, GLuint w, GLuint h,
-	GLbitfield mode, GLubyte **mapOut, GLint *rowStrideOut)
-{
-	if (rb->ClassID == HAIKU_SWRAST_RENDERBUFFER_CLASS) {
-		struct swrast_renderbuffer *srb = swrast_renderbuffer(rb);
-		const GLuint bpp = _mesa_get_format_bytes(rb->Format);
-		GLint rowStride = rb->Width * bpp; // in Bytes
-
-		y = rb->Height - y - 1;
-
-		*rowStrideOut = -rowStride;
-		*mapOut = (GLubyte *) srb->Buffer + y * rowStride + x * bpp;
-	} else {
-		_swrast_map_soft_renderbuffer(ctx, rb, x, y, w, h, mode,
-			mapOut, rowStrideOut);
-	}
-}
-
-
-void
-MesaSoftwareRast::_RenderBufferDelete(struct gl_context *ctx,
-	struct gl_renderbuffer* rb)
-{
-	CALLED();
-	if (rb != NULL) {
-		struct swrast_renderbuffer *swRenderBuffer
-			= swrast_renderbuffer(rb);
-		if (swRenderBuffer != NULL)
-			free(swRenderBuffer->Buffer);
-	}
-	free(rb);
-}
-
-
-void
-MesaSoftwareRast::_CopyToDirect()
-{
-	BAutolock lock(fInfoLocker);
-
-	// check the bitmap size still matches the size
-	if (fInfo->window_bounds.bottom - fInfo->window_bounds.top
-		!= fBitmap->Bounds().IntegerHeight()
-		|| fInfo->window_bounds.right - fInfo->window_bounds.left
-			!= fBitmap->Bounds().IntegerWidth())
-		return;
-
-	uint8 bytesPerPixel = fInfo->bits_per_pixel / 8;
-	uint32 bytesPerRow = fBitmap->BytesPerRow();
-	for (uint32 i = 0; i < fInfo->clip_list_count; i++) {
-		clipping_rect *clip = &fInfo->clip_list[i];
-		int32 height = clip->bottom - clip->top + 1;
-		int32 bytesWidth
-			= (clip->right - clip->left + 1) * bytesPerPixel;
-		uint8* p = (uint8*)fInfo->bits + clip->top
-			* fInfo->bytes_per_row + clip->left * bytesPerPixel;
-		uint8* b = (uint8*)fBitmap->Bits()
-			+ (clip->top - fInfo->window_bounds.top) * bytesPerRow
-			+ (clip->left - fInfo->window_bounds.left)
-				* bytesPerPixel;
-
-		for (int y = 0; y < height; y++) {
-			memcpy(p, b, bytesWidth);
-			p += fInfo->bytes_per_row;
-			b += bytesPerRow;
-		}
-	}
-}
diff --git a/src/mesa/drivers/haiku/swrast/SoftwareRast.h b/src/mesa/drivers/haiku/swrast/SoftwareRast.h
deleted file mode 100644
index 8f0f018..0000000
--- a/src/mesa/drivers/haiku/swrast/SoftwareRast.h
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright 2006-2012, Haiku, Inc. All rights reserved.
- * Distributed under the terms of the MIT License.
- *
- * Authors:
- *		Jérôme Duval, korli@users.berlios.de
- * 		Philippe Houdoin, philippe.houdoin@free.fr
- *		Artur Wyszynski, harakash@gmail.com
- */
-#ifndef MESASOFTWARERENDERER_H
-#define MESASOFTWARERENDERER_H
-
-
-#define HAIKU_SWRAST_RENDERBUFFER_CLASS 0x737752 // swR
-
-
-#include "GLRenderer.h"
-
-extern "C" {
-#include "context.h"
-#include "main/version.h"
-#include "swrast/s_chan.h"
-#include "swrast/s_context.h"
-}
-
-
-class MesaSoftwareRast : public BGLRenderer, public gl_context {
-public:
-							MesaSoftwareRast(BGLView* view,
-								ulong bgl_options,
-								BGLDispatcher* dispatcher);
-	virtual					~MesaSoftwareRast();
-
-	virtual	void			LockGL();
-	virtual	void 			UnlockGL();
-
-	virtual	void 			SwapBuffers(bool VSync = false);
-	virtual	void			Draw(BRect updateRect);
-	virtual	status_t		CopyPixelsOut(BPoint source, BBitmap* dest);
-	virtual	status_t		CopyPixelsIn(BBitmap* source, BPoint dest);
-	virtual void			FrameResized(float width, float height);
-
-	virtual	void			EnableDirectMode(bool enabled);
-	virtual	void			DirectConnected(direct_buffer_info* info);
-
-private:
-	static	const GLubyte*	_GetString(gl_context* ctx, GLenum name);
-			void			_CheckResize(GLuint newWidth, GLuint newHeight);
-	static	void			_UpdateState(gl_context* ctx, GLuint newState);
-	static	void			_Flush(gl_context *ctx);
-
-	struct	swrast_renderbuffer* _NewRenderBuffer(bool front);
-			status_t		_SetupRenderBuffer(struct gl_renderbuffer* rb,
-								color_space colorSpace);
-
-/* Mesa callbacks */
-	static	void			_RenderBufferDelete(struct gl_context *ctx,
-								struct gl_renderbuffer* rb);
-	static	GLboolean		_RenderBufferStorage(gl_context* ctx,
-								struct gl_renderbuffer* render,
-								GLenum internalFormat,
-								GLuint width, GLuint height);
-	static	GLboolean		_RenderBufferStorageMalloc(gl_context* ctx,
-								struct gl_renderbuffer* render,
-								GLenum internalFormat,
-								GLuint width, GLuint height);
-	static	void			_RenderBufferMap(gl_context *ctx,
-								struct gl_renderbuffer *rb,
-								GLuint x, GLuint y, GLuint w, GLuint h,
-								GLbitfield mode, GLubyte **mapOut,
-								GLint *rowStrideOut);
-
-			void			_AllocateBitmap();
-			void			_CopyToDirect();
-
-			BBitmap*		fBitmap;
-			bool			fDirectModeEnabled;
-			direct_buffer_info* fInfo;
-			BLocker			fInfoLocker;
-			ulong			fOptions;
-
-			gl_config*		fVisual;
-
-			struct gl_framebuffer* fFrameBuffer;
-			struct swrast_renderbuffer* fFrontRenderBuffer;
-			struct swrast_renderbuffer* fBackRenderBuffer;
-
-			GLuint			fWidth;
-			GLuint			fHeight;
-			color_space		fColorSpace;
-
-			void*			fRowAddr[SWRAST_MAX_HEIGHT];
-};
-
-#endif	// MESASOFTWARERENDERER_H
diff --git a/src/mesa/drivers/haiku/swrast/SoftwareRast.rdef b/src/mesa/drivers/haiku/swrast/SoftwareRast.rdef
deleted file mode 100644
index cb60332..0000000
--- a/src/mesa/drivers/haiku/swrast/SoftwareRast.rdef
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Copyright 2012, Haiku, Inc. All rights reserved.
- * Distributed under the terms of the MIT License.
- */
-
-resource app_signature "application/x-vnd.Haiku-swrast";
-
-resource app_version {
-    major  = 9,
-    middle = 0,
-    minor  = 0,
-    variety = 0,
-    internal = 0,
-    short_info = "Software Rasterizer",
-    long_info = "Haiku Mesa Software GL Rasterizer"
-};
-
-resource vector_icon {
-	$"6E6369660A0200140294A9FF18020014028DFFFF97058C0500020006023B10B7"
-	$"37F036BA1A993D466848C719BEBE2000919292FFD5D5D5020016023900000000"
-	$"000000003EE0004AE00048E0005EF884C702000203392E8D383001BAD97F3C12"
-	$"8B4786BD48B8AD0D97BBFFFF7B4168DBE9FF4168DB97020002023A0C1238D099"
-	$"BE44203F4BD14B38844678240DF56A7D9FE1EA064CC704016B0500090A044024"
-	$"2438404C5C380A044028243C40505C3C0A042438243B5C3C5C380608BFBE4D59"
-	$"4D59515957575659585560406044603C5E3A5C3CCB4FBFBA5E3ECA9DC11F564B"
-	$"584A544C504C0606AF0F2F3D2F3D393D4034BF593542324130432F42364432C0"
-	$"3FBC5A2F48354A2F480608AE9A22303EB5BD3AB42542B755422E412F3C29322D"
-	$"32223C0204263726372538263F253E263F304430443143303C313D303C02043D"
-	$"423D423C433D4A3C493D4A495049504A4F49474A484947060DAEAAAE014E445A"
-	$"3456365E325E3D5D3F5A3A5542544E4D573A4E364439463342324A2242310A0A"
-	$"0002020102403CA00C88888C8CC1401673C40D6544F2950A01010002403CA000"
-	$"0000000000401673C40D65446CF80A08020304023EC16A0000000000003EC16A"
-	$"45DD1844C6550A030105123EC16A0000000000003EC16A45DD1844C655011784"
-	$"22040A040105023EC16A0000000000003EC16A45DD1844C6550A030108123EC1"
-	$"6A0000000000003EC16A45DD1844C65501178422040A0503080706023EC16A00"
-	$"00000000003EC16A45DD1844C6550A030206071A3EC16A0000000000003EC16A"
-	$"45DD1844C65510FF0215810004178222040A060106023EC16A0000000000003E"
-	$"C16A45DD1844C6550A070107023EC16A0000000000003EC16A45DD1844C655"
-};
diff --git a/src/mesa/drivers/osmesa/Makefile.am b/src/mesa/drivers/osmesa/Makefile.am
index 9a388d6..46332e1 100644
--- a/src/mesa/drivers/osmesa/Makefile.am
+++ b/src/mesa/drivers/osmesa/Makefile.am
@@ -39,7 +39,6 @@ nodist_EXTRA_lib@OSMESA_LIB@_la_SOURCES = dummy.cpp
 lib@OSMESA_LIB@_la_SOURCES = osmesa.c
 
 lib@OSMESA_LIB@_la_LDFLAGS = \
-	-module \
 	-no-undefined \
 	-version-number @OSMESA_VERSION@ \
 	$(GC_SECTIONS) \
diff --git a/src/mesa/drivers/x11/Makefile.am b/src/mesa/drivers/x11/Makefile.am
index c0596f8..ba79f69 100644
--- a/src/mesa/drivers/x11/Makefile.am
+++ b/src/mesa/drivers/x11/Makefile.am
@@ -25,6 +25,11 @@
 
 EXTRA_DIST = SConscript
 
+if HAVE_SHARED_GLAPI
+SHARED_GLAPI_CFLAGS = -DGLX_SHARED_GLAPI
+SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
+endif
+
 AM_CPPFLAGS = \
 	-I$(top_srcdir)/include \
 	-I$(top_srcdir)/src/mapi \
@@ -34,11 +39,10 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/mesa/main \
 	$(X11_INCLUDES) \
+	$(SHARED_GLAPI_CFLAGS) \
 	$(DEFINES)
 
-if HAVE_X11_DRIVER
 lib_LTLIBRARIES = lib@GL_LIB@.la
-endif
 
 lib@GL_LIB@_la_SOURCES = \
 	glxapi.h \
@@ -66,6 +70,7 @@ GL_PATCH = 0
 lib@GL_LIB@_la_LIBADD = \
 	$(top_builddir)/src/mesa/libmesa.la \
 	$(top_builddir)/src/mapi/glapi/libglapi.la \
+	$(SHARED_GLAPI_LIB) \
 	$(GL_LIB_DEPS)
 
 lib@GL_LIB@_la_LDFLAGS = \
diff --git a/src/mesa/main/api_exec.h b/src/mesa/main/api_exec.h
index 12249fe..655cb32 100644
--- a/src/mesa/main/api_exec.h
+++ b/src/mesa/main/api_exec.h
@@ -38,6 +38,9 @@ _mesa_initialize_exec_table(struct gl_context *ctx);
 extern void
 _mesa_initialize_dispatch_tables(struct gl_context *ctx);
 
+extern struct _glapi_table *
+_mesa_new_nop_table(unsigned numEntries);
+
 #ifdef __cplusplus
 } // extern "C"
 #endif
diff --git a/src/mesa/main/api_loopback.c b/src/mesa/main/api_loopback.c
index 9932a83..a7fd82c 100644
--- a/src/mesa/main/api_loopback.c
+++ b/src/mesa/main/api_loopback.c
@@ -1772,7 +1772,9 @@ _mesa_loopback_init_api_table(const struct gl_context *ctx,
       SET_VertexAttribI4sv(dest, _mesa_VertexAttribI4sv);
       SET_VertexAttribI4ubv(dest, _mesa_VertexAttribI4ubv);
       SET_VertexAttribI4usv(dest, _mesa_VertexAttribI4usv);
+   }
 
+   if (ctx->API == API_OPENGL_CORE) {
       /* GL 4.1 / GL_ARB_vertex_attrib_64bit */
       SET_VertexAttribL1d(dest, _mesa_VertexAttribL1d);
       SET_VertexAttribL2d(dest, _mesa_VertexAttribL2d);
diff --git a/src/mesa/main/attrib.c b/src/mesa/main/attrib.c
index b163c0a..53626e3 100644
--- a/src/mesa/main/attrib.c
+++ b/src/mesa/main/attrib.c
@@ -177,6 +177,10 @@ struct texture_state
 };
 
 
+/** An unused GL_*_BIT value */
+#define DUMMY_BIT 0x10000000
+
+
 /**
  * Allocate new attribute node of given type/kind.  Attach payload data.
  * Insert it into the linked list named by 'head'.
@@ -253,6 +257,15 @@ _mesa_PushAttrib(GLbitfield mask)
    /* groups specified by the mask. */
    head = NULL;
 
+   if (mask == 0) {
+      /* if mask is zero we still need to push something so that we
+       * don't get a GL_STACK_UNDERFLOW error in glPopAttrib().
+       */
+      GLuint dummy = 0;
+      if (!push_attrib(ctx, &head, DUMMY_BIT, sizeof(dummy), &dummy))
+         goto end;
+   }
+
    if (mask & GL_ACCUM_BUFFER_BIT) {
       if (!push_attrib(ctx, &head, GL_ACCUM_BUFFER_BIT,
                        sizeof(struct gl_accum_attrib),
@@ -928,6 +941,10 @@ _mesa_PopAttrib(void)
       }
 
       switch (attr->kind) {
+         case DUMMY_BIT:
+            /* do nothing */
+            break;
+
          case GL_ACCUM_BUFFER_BIT:
             {
                const struct gl_accum_attrib *accum;
@@ -1074,6 +1091,11 @@ _mesa_PopAttrib(void)
                _mesa_ClearDepth(depth->Clear);
                _mesa_set_enable(ctx, GL_DEPTH_TEST, depth->Test);
                _mesa_DepthMask(depth->Mask);
+               if (ctx->Extensions.EXT_depth_bounds_test) {
+                  _mesa_set_enable(ctx, GL_DEPTH_BOUNDS_TEST_EXT,
+                                   depth->BoundsTest);
+                  _mesa_DepthBoundsEXT(depth->BoundsMin, depth->BoundsMax);
+               }
             }
             break;
          case GL_ENABLE_BIT:
diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index 774fc88..d869fa2 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -769,7 +769,7 @@ _mesa_ClampColor(GLenum target, GLenum clamp)
       }
       FLUSH_VERTICES(ctx, _NEW_LIGHT);
       ctx->Light.ClampVertexColor = clamp;
-      _mesa_update_clamp_vertex_color(ctx);
+      _mesa_update_clamp_vertex_color(ctx, ctx->DrawBuffer);
       break;
    case GL_CLAMP_FRAGMENT_COLOR_ARB:
       if (ctx->API == API_OPENGL_CORE &&
@@ -778,7 +778,7 @@ _mesa_ClampColor(GLenum target, GLenum clamp)
       }
       FLUSH_VERTICES(ctx, _NEW_FRAG_CLAMP);
       ctx->Color.ClampFragmentColor = clamp;
-      _mesa_update_clamp_fragment_color(ctx);
+      _mesa_update_clamp_fragment_color(ctx, ctx->DrawBuffer);
       break;
    case GL_CLAMP_READ_COLOR_ARB:
       ctx->Color.ClampReadColor = clamp;
@@ -807,50 +807,55 @@ get_clamp_color(const struct gl_framebuffer *fb, GLenum clamp)
 }
 
 GLboolean
-_mesa_get_clamp_fragment_color(const struct gl_context *ctx)
+_mesa_get_clamp_fragment_color(const struct gl_context *ctx,
+                               const struct gl_framebuffer *drawFb)
 {
-   return get_clamp_color(ctx->DrawBuffer,
-                                ctx->Color.ClampFragmentColor);
+   return get_clamp_color(drawFb, ctx->Color.ClampFragmentColor);
 }
 
 GLboolean
-_mesa_get_clamp_vertex_color(const struct gl_context *ctx)
+_mesa_get_clamp_vertex_color(const struct gl_context *ctx,
+                             const struct gl_framebuffer *drawFb)
 {
-   return get_clamp_color(ctx->DrawBuffer, ctx->Light.ClampVertexColor);
+   return get_clamp_color(drawFb, ctx->Light.ClampVertexColor);
 }
 
 GLboolean
-_mesa_get_clamp_read_color(const struct gl_context *ctx)
+_mesa_get_clamp_read_color(const struct gl_context *ctx,
+                           const struct gl_framebuffer *readFb)
 {
-   return get_clamp_color(ctx->ReadBuffer, ctx->Color.ClampReadColor);
+   return get_clamp_color(readFb, ctx->Color.ClampReadColor);
 }
 
 /**
  * Update the ctx->Color._ClampFragmentColor field
  */
 void
-_mesa_update_clamp_fragment_color(struct gl_context *ctx)
+_mesa_update_clamp_fragment_color(struct gl_context *ctx,
+                                  const struct gl_framebuffer *drawFb)
 {
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-
    /* Don't clamp if:
     * - there is no colorbuffer
     * - all colorbuffers are unsigned normalized, so clamping has no effect
     * - there is an integer colorbuffer
     */
-   if (!fb || !fb->_HasSNormOrFloatColorBuffer || fb->_IntegerColor)
+   if (!drawFb || !drawFb->_HasSNormOrFloatColorBuffer ||
+       drawFb->_IntegerColor)
       ctx->Color._ClampFragmentColor = GL_FALSE;
    else
-      ctx->Color._ClampFragmentColor = _mesa_get_clamp_fragment_color(ctx);
+      ctx->Color._ClampFragmentColor =
+         _mesa_get_clamp_fragment_color(ctx, drawFb);
 }
 
 /**
  * Update the ctx->Color._ClampVertexColor field
  */
 void
-_mesa_update_clamp_vertex_color(struct gl_context *ctx)
+_mesa_update_clamp_vertex_color(struct gl_context *ctx,
+                                const struct gl_framebuffer *drawFb)
 {
-   ctx->Light._ClampVertexColor = _mesa_get_clamp_vertex_color(ctx);
+   ctx->Light._ClampVertexColor =
+         _mesa_get_clamp_vertex_color(ctx, drawFb);
 }
 
 /**
diff --git a/src/mesa/main/blend.h b/src/mesa/main/blend.h
index fe31a74..8ab9e02 100644
--- a/src/mesa/main/blend.h
+++ b/src/mesa/main/blend.h
@@ -37,6 +37,7 @@
 #include "formats.h"
 
 struct gl_context;
+struct gl_framebuffer;
 
 
 extern void GLAPIENTRY
@@ -101,19 +102,24 @@ extern void GLAPIENTRY
 _mesa_ClampColor(GLenum target, GLenum clamp);
 
 extern GLboolean
-_mesa_get_clamp_fragment_color(const struct gl_context *ctx);
+_mesa_get_clamp_fragment_color(const struct gl_context *ctx,
+                               const struct gl_framebuffer *drawFb);
 
 extern GLboolean
-_mesa_get_clamp_vertex_color(const struct gl_context *ctx);
+_mesa_get_clamp_vertex_color(const struct gl_context *ctx,
+                             const struct gl_framebuffer *drawFb);
 
 extern GLboolean
-_mesa_get_clamp_read_color(const struct gl_context *ctx);
+_mesa_get_clamp_read_color(const struct gl_context *ctx,
+                           const struct gl_framebuffer *readFb);
 
 extern void
-_mesa_update_clamp_fragment_color(struct gl_context *ctx);
+_mesa_update_clamp_fragment_color(struct gl_context *ctx,
+                                  const struct gl_framebuffer *drawFb);
 
 extern void
-_mesa_update_clamp_vertex_color(struct gl_context *ctx);
+_mesa_update_clamp_vertex_color(struct gl_context *ctx,
+                                const struct gl_framebuffer *drawFb);
 
 extern mesa_format
 _mesa_get_render_format(const struct gl_context *ctx, mesa_format format);
diff --git a/src/mesa/main/blit.c b/src/mesa/main/blit.c
index 0694466..db8fee5 100644
--- a/src/mesa/main/blit.c
+++ b/src/mesa/main/blit.c
@@ -34,6 +34,7 @@
 #include "enums.h"
 #include "blit.h"
 #include "fbobject.h"
+#include "framebuffer.h"
 #include "glformats.h"
 #include "mtypes.h"
 #include "state.h"
@@ -148,38 +149,25 @@ is_valid_blit_filter(const struct gl_context *ctx, GLenum filter)
 }
 
 
-/**
- * Blit rectangular region, optionally from one framebuffer to another.
- *
- * Note, if the src buffer is multisampled and the dest is not, this is
- * when the samples must be resolved to a single color.
- */
-void GLAPIENTRY
-_mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
-                         GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
-                         GLbitfield mask, GLenum filter)
+void
+_mesa_blit_framebuffer(struct gl_context *ctx,
+                       struct gl_framebuffer *readFb,
+                       struct gl_framebuffer *drawFb,
+                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                       GLbitfield mask, GLenum filter, const char *func)
 {
    const GLbitfield legalMaskBits = (GL_COLOR_BUFFER_BIT |
                                      GL_DEPTH_BUFFER_BIT |
                                      GL_STENCIL_BUFFER_BIT);
-   const struct gl_framebuffer *readFb, *drawFb;
-   GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx,
-                  "glBlitFramebuffer(%d, %d, %d, %d,  %d, %d, %d, %d, 0x%x, %s)\n",
-                  srcX0, srcY0, srcX1, srcY1,
-                  dstX0, dstY0, dstX1, dstY1,
-                  mask, _mesa_lookup_enum_by_nr(filter));
-
-   if (ctx->NewState) {
-      _mesa_update_state(ctx);
-   }
+   if (readFb && drawFb) { /* NULL is only rejected by the check below */
+      _mesa_update_framebuffer(ctx, readFb, drawFb); /* completeness */
 
-   readFb = ctx->ReadBuffer;
-   drawFb = ctx->DrawBuffer;
+      _mesa_update_draw_buffer_bounds(ctx, drawFb); /* bounding box */
+   }
 
    if (!readFb || !drawFb) {
       /* This will normally never happen but someday we may want to
@@ -192,12 +180,12 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
    if (drawFb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT ||
        readFb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
       _mesa_error(ctx, GL_INVALID_FRAMEBUFFER_OPERATION_EXT,
-                  "glBlitFramebufferEXT(incomplete draw/read buffers)");
+                  "%s(incomplete draw/read buffers)", func);
       return;
    }
 
    if (!is_valid_blit_filter(ctx, filter)) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glBlitFramebufferEXT(%s)",
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid filter %s)", func,
                   _mesa_lookup_enum_by_nr(filter));
       return;
    }
@@ -205,13 +193,13 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
    if ((filter == GL_SCALED_RESOLVE_FASTEST_EXT ||
         filter == GL_SCALED_RESOLVE_NICEST_EXT) &&
         (readFb->Visual.samples == 0 || drawFb->Visual.samples > 0)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glBlitFramebufferEXT(%s)",
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(%s: invalid samples)", func,
                   _mesa_lookup_enum_by_nr(filter));
       return;
    }
 
    if (mask & ~legalMaskBits) {
-      _mesa_error( ctx, GL_INVALID_VALUE, "glBlitFramebufferEXT(mask)");
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(invalid mask bits set)", func);
       return;
    }
 
@@ -219,13 +207,13 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
    if ((mask & (GL_DEPTH_BUFFER_BIT | GL_STENCIL_BUFFER_BIT))
         && filter != GL_NEAREST) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-             "glBlitFramebufferEXT(depth/stencil requires GL_NEAREST filter)");
+             "%s(depth/stencil requires GL_NEAREST filter)", func);
       return;
    }
 
    /* get color read/draw renderbuffers */
    if (mask & GL_COLOR_BUFFER_BIT) {
-      const GLuint numColorDrawBuffers = ctx->DrawBuffer->_NumColorDrawBuffers;
+      const GLuint numColorDrawBuffers = drawFb->_NumColorDrawBuffers;
       const struct gl_renderbuffer *colorReadRb = readFb->_ColorReadBuffer;
       const struct gl_renderbuffer *colorDrawRb = NULL;
       GLuint i;
@@ -241,7 +229,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
       }
       else {
          for (i = 0; i < numColorDrawBuffers; i++) {
-            colorDrawRb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+            colorDrawRb = drawFb->_ColorDrawBuffers[i];
             if (!colorDrawRb)
                continue;
 
@@ -257,15 +245,15 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
              */
             if (_mesa_is_gles3(ctx) && (colorDrawRb == colorReadRb)) {
                _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "glBlitFramebuffer(source and destination color "
-                           "buffer cannot be the same)");
+                           "%s(source and destination color "
+                           "buffer cannot be the same)", func);
                return;
             }
 
             if (!compatible_color_datatypes(colorReadRb->Format,
                                             colorDrawRb->Format)) {
                _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "glBlitFramebufferEXT(color buffer datatypes mismatch)");
+                           "%s(color buffer datatypes mismatch)", func);
                return;
             }
             /* extra checks for multisample copies... */
@@ -273,7 +261,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                /* color formats must match */
                if (!compatible_resolve_formats(colorReadRb, colorDrawRb)) {
                   _mesa_error(ctx, GL_INVALID_OPERATION,
-                         "glBlitFramebufferEXT(bad src/dst multisample pixel formats)");
+                         "%s(bad src/dst multisample pixel formats)", func);
                   return;
                }
             }
@@ -286,7 +274,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
             GLenum type = _mesa_get_format_datatype(colorReadRb->Format);
             if (type == GL_INT || type == GL_UNSIGNED_INT) {
                _mesa_error(ctx, GL_INVALID_OPERATION,
-                           "glBlitFramebufferEXT(integer color type)");
+                           "%s(integer color type)", func);
                return;
             }
          }
@@ -306,15 +294,15 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
        *     ignored."
        */
       if ((readRb == NULL) || (drawRb == NULL)) {
-	 mask &= ~GL_STENCIL_BUFFER_BIT;
+         mask &= ~GL_STENCIL_BUFFER_BIT;
       }
       else {
          int read_z_bits, draw_z_bits;
 
          if (_mesa_is_gles3(ctx) && (drawRb == readRb)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebuffer(source and destination stencil "
-                        "buffer cannot be the same)");
+                        "%s(source and destination stencil "
+                        "buffer cannot be the same)", func);
             return;
          }
 
@@ -324,7 +312,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
              * there is only one: GL_UNSIGNED_INT.
              */
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebuffer(stencil attachment format mismatch)");
+                        "%s(stencil attachment format mismatch)", func);
             return;
          }
 
@@ -340,8 +328,8 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
               _mesa_get_format_datatype(readRb->Format) !=
               _mesa_get_format_datatype(drawRb->Format))) {
 
-            _mesa_error(ctx, GL_INVALID_OPERATION, "glBlitFramebuffer"
-                        "(stencil attachment depth format mismatch)");
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(stencil attachment depth format mismatch)", func);
             return;
          }
       }
@@ -360,15 +348,15 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
        *     ignored."
        */
       if ((readRb == NULL) || (drawRb == NULL)) {
-	 mask &= ~GL_DEPTH_BUFFER_BIT;
+         mask &= ~GL_DEPTH_BUFFER_BIT;
       }
       else {
          int read_s_bit, draw_s_bit;
 
          if (_mesa_is_gles3(ctx) && (drawRb == readRb)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebuffer(source and destination depth "
-                        "buffer cannot be the same)");
+                        "%s(source and destination depth "
+                        "buffer cannot be the same)", func);
             return;
          }
 
@@ -377,7 +365,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
              (_mesa_get_format_datatype(readRb->Format) !=
               _mesa_get_format_datatype(drawRb->Format))) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebuffer(depth attachment format mismatch)");
+                        "%s(depth attachment format mismatch)", func);
             return;
          }
 
@@ -389,8 +377,8 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
           * we should ignore the stencil format check.
           */
          if (read_s_bit > 0 && draw_s_bit > 0 && read_s_bit != draw_s_bit) {
-            _mesa_error(ctx, GL_INVALID_OPERATION, "glBlitFramebuffer"
-                        "(depth attachment stencil bits mismatch)");
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(depth attachment stencil bits mismatch)", func);
             return;
          }
       }
@@ -406,7 +394,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
        */
       if (drawFb->Visual.samples > 0) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glBlitFramebuffer(destination samples must be 0)");
+                     "%s(destination samples must be 0)", func);
          return;
       }
 
@@ -426,7 +414,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
           && (srcX0 != dstX0 || srcY0 != dstY0
               || srcX1 != dstX1 || srcY1 != dstY1)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glBlitFramebuffer(bad src/dst multisample region)");
+                     "%s(bad src/dst multisample region)", func);
          return;
       }
    } else {
@@ -434,7 +422,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
           drawFb->Visual.samples > 0 &&
           readFb->Visual.samples != drawFb->Visual.samples) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glBlitFramebufferEXT(mismatched samples)");
+                     "%s(mismatched samples)", func);
          return;
       }
 
@@ -445,7 +433,7 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
          if (abs(srcX1 - srcX0) != abs(dstX1 - dstX0) ||
              abs(srcY1 - srcY0) != abs(dstY1 - dstY0)) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glBlitFramebufferEXT(bad src/dst multisample region sizes)");
+                        "%s(bad src/dst multisample region sizes)", func);
             return;
          }
       }
@@ -457,43 +445,44 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
       const struct gl_renderbuffer *colorDrawRb = NULL;
       GLuint i = 0;
 
-      printf("glBlitFramebuffer(%d, %d, %d, %d,  %d, %d, %d, %d,"
-	     " 0x%x, 0x%x)\n",
-	     srcX0, srcY0, srcX1, srcY1,
-	     dstX0, dstY0, dstX1, dstY1,
-	     mask, filter);
+      printf("%s(%d, %d, %d, %d,  %d, %d, %d, %d,"
+             " 0x%x, 0x%x)\n", func,
+             srcX0, srcY0, srcX1, srcY1,
+             dstX0, dstY0, dstX1, dstY1,
+             mask, filter);
+
       if (colorReadRb) {
          const struct gl_renderbuffer_attachment *att;
 
          att = find_attachment(readFb, colorReadRb);
          printf("  Src FBO %u  RB %u (%dx%d)  ",
-		readFb->Name, colorReadRb->Name,
-		colorReadRb->Width, colorReadRb->Height);
+                readFb->Name, colorReadRb->Name,
+                colorReadRb->Width, colorReadRb->Height);
          if (att && att->Texture) {
             printf("Tex %u  tgt 0x%x  level %u  face %u",
-		   att->Texture->Name,
-		   att->Texture->Target,
-		   att->TextureLevel,
-		   att->CubeMapFace);
+                   att->Texture->Name,
+                   att->Texture->Target,
+                   att->TextureLevel,
+                   att->CubeMapFace);
          }
          printf("\n");
 
          /* Print all active color render buffers */
-         for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
-            colorDrawRb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+         for (i = 0; i < drawFb->_NumColorDrawBuffers; i++) {
+            colorDrawRb = drawFb->_ColorDrawBuffers[i];
             if (!colorDrawRb)
                continue;
 
             att = find_attachment(drawFb, colorDrawRb);
             printf("  Dst FBO %u  RB %u (%dx%d)  ",
-		   drawFb->Name, colorDrawRb->Name,
-		   colorDrawRb->Width, colorDrawRb->Height);
+                   drawFb->Name, colorDrawRb->Name,
+                   colorDrawRb->Width, colorDrawRb->Height);
             if (att && att->Texture) {
                printf("Tex %u  tgt 0x%x  level %u  face %u",
-		      att->Texture->Name,
-		      att->Texture->Target,
-		      att->TextureLevel,
-		      att->CubeMapFace);
+                      att->Texture->Name,
+                      att->Texture->Target,
+                      att->TextureLevel,
+                      att->CubeMapFace);
             }
             printf("\n");
          }
@@ -507,8 +496,95 @@ _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
    }
 
    assert(ctx->Driver.BlitFramebuffer);
-   ctx->Driver.BlitFramebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
+   ctx->Driver.BlitFramebuffer(ctx, readFb, drawFb,
                                srcX0, srcY0, srcX1, srcY1,
                                dstX0, dstY0, dstX1, dstY1,
                                mask, filter);
 }
+
+
+/**
+ * Blit rectangular region, optionally from one framebuffer to another.
+ *
+ * Note, if the src buffer is multisampled and the dest is not, this is
+ * when the samples must be resolved to a single color.
+ */
+void GLAPIENTRY
+_mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                      GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                      GLbitfield mask, GLenum filter)
+{
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx,
+                  "glBlitFramebuffer(%d, %d, %d, %d, "
+                  " %d, %d, %d, %d, 0x%x, %s)\n",
+                  srcX0, srcY0, srcX1, srcY1,
+                  dstX0, dstY0, dstX1, dstY1,
+                  mask, _mesa_lookup_enum_by_nr(filter));
+
+   _mesa_blit_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer,
+                          srcX0, srcY0, srcX1, srcY1,
+                          dstX0, dstY0, dstX1, dstY1,
+                          mask, filter, "glBlitFramebuffer");
+}
+
+
+/**
+ * Blit a rectangular region between two framebuffers given by name rather
+ * than by the current read/draw bindings.
+ *
+ * A name of zero selects ctx->WinSysReadBuffer or ctx->WinSysDrawBuffer
+ * respectively.  A non-zero name is resolved with
+ * _mesa_lookup_framebuffer_err(); if that returns NULL nothing is blitted.
+ */
+void GLAPIENTRY
+_mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer,
+                           GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                           GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                           GLbitfield mask, GLenum filter)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *readFb, *drawFb;
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx,
+                  "glBlitNamedFramebuffer(%u, %u, %d, %d, %d, %d, "
+                  " %d, %d, %d, %d, 0x%x, %s)\n",
+                  readFramebuffer, drawFramebuffer,
+                  srcX0, srcY0, srcX1, srcY1,
+                  dstX0, dstY0, dstX1, dstY1,
+                  mask, _mesa_lookup_enum_by_nr(filter));
+
+   /*
+    * According to PDF page 533 of the OpenGL 4.5 core spec (30.10.2014,
+    * Section 18.3 Copying Pixels):
+    *   "... if readFramebuffer or drawFramebuffer is zero (for
+    *   BlitNamedFramebuffer), then the default read or draw framebuffer is
+    *   used as the corresponding source or destination framebuffer,
+    *   respectively."
+    */
+   if (readFramebuffer) {
+      readFb = _mesa_lookup_framebuffer_err(ctx, readFramebuffer,
+                                            "glBlitNamedFramebuffer");
+      if (!readFb)
+         return;
+   }
+   else
+      readFb = ctx->WinSysReadBuffer;
+
+   if (drawFramebuffer) {
+      drawFb = _mesa_lookup_framebuffer_err(ctx, drawFramebuffer,
+                                            "glBlitNamedFramebuffer");
+      if (!drawFb)
+         return;
+   }
+   else
+      drawFb = ctx->WinSysDrawBuffer;
+
+   _mesa_blit_framebuffer(ctx, readFb, drawFb,
+                          srcX0, srcY0, srcX1, srcY1,
+                          dstX0, dstY0, dstX1, dstY1,
+                          mask, filter, "glBlitNamedFramebuffer");
+}
diff --git a/src/mesa/main/blit.h b/src/mesa/main/blit.h
index 01a958a..54b946e 100644
--- a/src/mesa/main/blit.h
+++ b/src/mesa/main/blit.h
@@ -28,11 +28,24 @@
 
 #include "glheader.h"
 
+extern void
+_mesa_blit_framebuffer(struct gl_context *ctx,
+                       struct gl_framebuffer *readFb,
+                       struct gl_framebuffer *drawFb,
+                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                       GLbitfield mask, GLenum filter, const char *func);
 
 extern void GLAPIENTRY
 _mesa_BlitFramebuffer(GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
                          GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
                          GLbitfield mask, GLenum filter);
 
+extern void GLAPIENTRY
+_mesa_BlitNamedFramebuffer(GLuint readFramebuffer, GLuint drawFramebuffer,
+                           GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                           GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                           GLbitfield mask, GLenum filter);
+
 
 #endif /* BLIT_H */
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 37a9790..0536266 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -242,16 +242,16 @@ read_buffer_enum_to_index(GLenum buffer)
  *
  * See the GL_EXT_framebuffer_object spec for more info.
  */
-void GLAPIENTRY
-_mesa_DrawBuffer(GLenum buffer)
+void
+_mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLenum buffer, const char *caller)
 {
    GLbitfield destMask;
-   GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API) {
-      _mesa_debug(ctx, "glDrawBuffer %s\n", _mesa_lookup_enum_by_nr(buffer));
+      _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer));
    }
 
    if (buffer == GL_NONE) {
@@ -259,33 +259,60 @@ _mesa_DrawBuffer(GLenum buffer)
    }
    else {
       const GLbitfield supportedMask
-         = supported_buffer_bitmask(ctx, ctx->DrawBuffer);
+         = supported_buffer_bitmask(ctx, fb);
       destMask = draw_buffer_enum_to_bitmask(ctx, buffer);
       if (destMask == BAD_MASK) {
          /* totally bogus buffer */
-         _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glDrawBuffer(buffer=0x%x)", buffer);
+         _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)", caller,
+                     _mesa_lookup_enum_by_nr(buffer));
          return;
       }
       destMask &= supportedMask;
       if (destMask == 0x0) {
          /* none of the named color buffers exist! */
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glDrawBuffer(buffer=0x%x)", buffer);
+         _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid buffer %s)",
+                     caller, _mesa_lookup_enum_by_nr(buffer));
          return;
       }
    }
 
    /* if we get here, there's no error so set new state */
-   _mesa_drawbuffers(ctx, 1, &buffer, &destMask);
+   _mesa_drawbuffers(ctx, fb, 1, &buffer, &destMask);
+
+   /* Call device driver function only if fb is the bound draw buffer */
+   if (fb == ctx->DrawBuffer) {
+      if (ctx->Driver.DrawBuffers)
+         ctx->Driver.DrawBuffers(ctx, 1, &buffer);
+      else if (ctx->Driver.DrawBuffer)
+         ctx->Driver.DrawBuffer(ctx, buffer);
+   }
+}
 
-   /*
-    * Call device driver function.
-    */
-   if (ctx->Driver.DrawBuffers)
-      ctx->Driver.DrawBuffers(ctx, 1, &buffer);
-   else if (ctx->Driver.DrawBuffer)
-      ctx->Driver.DrawBuffer(ctx, buffer);
+/* glDrawBuffer entry point: targets the context's bound draw framebuffer. */
+void GLAPIENTRY
+_mesa_DrawBuffer(GLenum buffer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_draw_buffer(ctx, ctx->DrawBuffer, buffer, "glDrawBuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferDrawBuffer(GLuint framebuffer, GLenum buf)
+{
+   const char *const func = "glNamedFramebufferDrawBuffer";
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   /* Name zero selects the window-system draw framebuffer. */
+   if (framebuffer == 0) {
+      fb = ctx->WinSysDrawBuffer;
+   } else {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
+      if (!fb)
+         return;   /* lookup returned NULL */
+   }
+   _mesa_draw_buffer(ctx, fb, buf, func);
 }
 
 
@@ -298,13 +325,13 @@ _mesa_DrawBuffer(GLenum buffer)
  *                 names cannot specify more than one buffer.  For example,
  *                 GL_FRONT_AND_BACK is illegal.
  */
-void GLAPIENTRY
-_mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
+void
+_mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
+                   GLsizei n, const GLenum *buffers, const char *caller)
 {
    GLuint output;
    GLbitfield usedBufferMask, supportedMask;
    GLbitfield destMask[MAX_DRAW_BUFFERS];
-   GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
@@ -315,12 +342,18 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
     * "An INVALID_VALUE error is generated if n is greater than
     *  MAX_DRAW_BUFFERS."
     */
-   if (n < 0 || n > (GLsizei) ctx->Const.MaxDrawBuffers) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glDrawBuffersARB(n)");
+   if (n < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n < 0)", caller);
+      return;
+   }
+
+   if (n > (GLsizei) ctx->Const.MaxDrawBuffers) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(n > maximum number of draw buffers)", caller);
       return;
    }
 
-   supportedMask = supported_buffer_bitmask(ctx, ctx->DrawBuffer);
+   supportedMask = supported_buffer_bitmask(ctx, fb);
    usedBufferMask = 0x0;
 
    /* From the ES 3.0 specification, page 180:
@@ -328,9 +361,9 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
     *  and the constant must be BACK or NONE."
     * (same restriction applies with GL_EXT_draw_buffers specification)
     */
-   if (ctx->API == API_OPENGLES2 && _mesa_is_winsys_fbo(ctx->DrawBuffer) &&
+   if (ctx->API == API_OPENGLES2 && _mesa_is_winsys_fbo(fb) &&
        (n != 1 || (buffers[0] != GL_NONE && buffers[0] != GL_BACK))) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawBuffers(buffer)");
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(invalid buffers)", caller);
       return;
    }
 
@@ -362,9 +395,11 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           *     or equal to the value of MAX_COLOR_ATTACHMENTS, then the error
           *     INVALID_OPERATION results."
           */
-         if (_mesa_is_user_fbo(ctx->DrawBuffer) && buffers[output] >=
+         if (_mesa_is_user_fbo(fb) && buffers[output] >=
              GL_COLOR_ATTACHMENT0 + ctx->Const.MaxDrawBuffers) {
-            _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawBuffersARB(buffer)");
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(buffers[%d] >= maximum number of draw buffers)",
+                        caller, output);
             return;
          }
 
@@ -375,9 +410,10 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           *  4.5 or 4.6.  Otherwise, an INVALID_ENUM error is generated.
           */
          if (destMask[output] == BAD_MASK) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "glDrawBuffersARB(buffer)");
+            _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
-         }         
+         }
 
          /* From the OpenGL 4.0 specification, page 256:
           * "For both the default framebuffer and framebuffer objects, the
@@ -390,7 +426,8 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           *  but the Khronos conformance tests expect INVALID_ENUM.
           */
          if (_mesa_bitcount(destMask[output]) > 1) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "glDrawBuffersARB(buffer)");
+            _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
          }
 
@@ -407,7 +444,8 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
          destMask[output] &= supportedMask;
          if (destMask[output] == 0) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glDrawBuffersARB(unsupported buffer)");
+                        "%s(unsupported buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
          }
 
@@ -416,10 +454,12 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           *  in bufs must be COLOR_ATTACHMENTi or NONE. [...] INVALID_OPERATION."
           * (same restriction applies with GL_EXT_draw_buffers specification)
           */
-         if (ctx->API == API_OPENGLES2 && _mesa_is_user_fbo(ctx->DrawBuffer) &&
+         if (ctx->API == API_OPENGLES2 && _mesa_is_user_fbo(fb) &&
              buffers[output] != GL_NONE &&
              buffers[output] != GL_COLOR_ATTACHMENT0 + output) {
-            _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawBuffers(buffer)");
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "%s(unsupported buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
          }
 
@@ -430,7 +470,8 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
           */
          if (destMask[output] & usedBufferMask) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glDrawBuffersARB(duplicated buffer)");
+                        "%s(duplicated buffer %s)",
+                        caller, _mesa_lookup_enum_by_nr(buffers[output]));
             return;
          }
 
@@ -440,17 +481,48 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
    }
 
    /* OK, if we get here, there were no errors so set the new state */
-   _mesa_drawbuffers(ctx, n, buffers, destMask);
+   _mesa_drawbuffers(ctx, fb, n, buffers, destMask);
 
    /*
-    * Call device driver function.  Note that n can be equal to 0,
+    * Call device driver function if fb is the bound draw buffer.
+    * Note that n can be equal to 0,
     * in which case we don't want to reference buffers[0], which
     * may not be valid.
     */
-   if (ctx->Driver.DrawBuffers)
-      ctx->Driver.DrawBuffers(ctx, n, buffers);
-   else if (ctx->Driver.DrawBuffer)
-      ctx->Driver.DrawBuffer(ctx, n > 0 ? buffers[0] : GL_NONE);
+   if (fb == ctx->DrawBuffer) {
+      if (ctx->Driver.DrawBuffers)
+         ctx->Driver.DrawBuffers(ctx, n, buffers);
+      else if (ctx->Driver.DrawBuffer)
+         ctx->Driver.DrawBuffer(ctx, n > 0 ? buffers[0] : GL_NONE);
+   }
+}
+
+/* glDrawBuffers entry point: targets the context's bound draw framebuffer. */
+void GLAPIENTRY
+_mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_draw_buffers(ctx, ctx->DrawBuffer, n, buffers, "glDrawBuffers");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferDrawBuffers(GLuint framebuffer, GLsizei n,
+                                  const GLenum *bufs)
+{
+   const char *const func = "glNamedFramebufferDrawBuffers";
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   /* Name zero selects the window-system draw framebuffer. */
+   if (framebuffer == 0) {
+      fb = ctx->WinSysDrawBuffer;
+   } else {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
+      if (!fb)
+         return;   /* lookup returned NULL */
+   }
+   _mesa_draw_buffers(ctx, fb, n, bufs, func);
 }
 
 
@@ -459,13 +531,11 @@ _mesa_DrawBuffers(GLsizei n, const GLenum *buffers)
  * actual change.
  */
 static void
-updated_drawbuffers(struct gl_context *ctx)
+updated_drawbuffers(struct gl_context *ctx, struct gl_framebuffer *fb)
 {
    FLUSH_VERTICES(ctx, _NEW_BUFFERS);
 
    if (ctx->API == API_OPENGL_COMPAT && !ctx->Extensions.ARB_ES2_compatibility) {
-      struct gl_framebuffer *fb = ctx->DrawBuffer;
-
       /* Flag the FBO as requiring validation. */
       if (_mesa_is_user_fbo(fb)) {
 	 fb->_Status = 0;
@@ -482,6 +552,7 @@ updated_drawbuffers(struct gl_context *ctx)
  * so nothing should go wrong at this point.
  *
  * \param ctx  current context
+ * \param fb   the desired draw buffer
  * \param n    number of color outputs to set
  * \param buffers  array[n] of colorbuffer names, like GL_LEFT.
  * \param destMask  array[n] of BUFFER_BIT_* bitmasks which correspond to the
@@ -489,10 +560,9 @@ updated_drawbuffers(struct gl_context *ctx)
  *                  BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT).
  */
 void
-_mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
-                  const GLbitfield *destMask)
+_mesa_drawbuffers(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLuint n, const GLenum *buffers, const GLbitfield *destMask)
 {
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
    GLbitfield mask[MAX_DRAW_BUFFERS];
    GLuint buf;
 
@@ -518,7 +588,7 @@ _mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
       while (destMask0) {
          GLint bufIndex = ffs(destMask0) - 1;
          if (fb->_ColorDrawBufferIndexes[count] != bufIndex) {
-            updated_drawbuffers(ctx);
+            updated_drawbuffers(ctx, fb);
             fb->_ColorDrawBufferIndexes[count] = bufIndex;
          }
          count++;
@@ -535,14 +605,14 @@ _mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
             /* only one bit should be set in the destMask[buf] field */
             assert(_mesa_bitcount(destMask[buf]) == 1);
             if (fb->_ColorDrawBufferIndexes[buf] != bufIndex) {
-	       updated_drawbuffers(ctx);
+	       updated_drawbuffers(ctx, fb);
                fb->_ColorDrawBufferIndexes[buf] = bufIndex;
             }
             count = buf + 1;
          }
          else {
             if (fb->_ColorDrawBufferIndexes[buf] != -1) {
-	       updated_drawbuffers(ctx);
+	       updated_drawbuffers(ctx, fb);
                fb->_ColorDrawBufferIndexes[buf] = -1;
             }
          }
@@ -554,7 +624,7 @@ _mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
    /* set remaining outputs to -1 (GL_NONE) */
    for (buf = fb->_NumColorDrawBuffers; buf < ctx->Const.MaxDrawBuffers; buf++) {
       if (fb->_ColorDrawBufferIndexes[buf] != -1) {
-         updated_drawbuffers(ctx);
+         updated_drawbuffers(ctx, fb);
          fb->_ColorDrawBufferIndexes[buf] = -1;
       }
    }
@@ -566,7 +636,7 @@ _mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
       /* also set context drawbuffer state */
       for (buf = 0; buf < ctx->Const.MaxDrawBuffers; buf++) {
          if (ctx->Color.DrawBuffer[buf] != fb->ColorDrawBuffer[buf]) {
-	    updated_drawbuffers(ctx);
+	    updated_drawbuffers(ctx, fb);
             ctx->Color.DrawBuffer[buf] = fb->ColorDrawBuffer[buf];
          }
       }
@@ -585,7 +655,7 @@ _mesa_update_draw_buffers(struct gl_context *ctx)
    /* should be a window system FBO */
    assert(_mesa_is_winsys_fbo(ctx->DrawBuffer));
 
-   _mesa_drawbuffers(ctx, ctx->Const.MaxDrawBuffers,
+   _mesa_drawbuffers(ctx, ctx->DrawBuffer, ctx->Const.MaxDrawBuffers,
                      ctx->Color.DrawBuffer, NULL);
 }
 
@@ -598,11 +668,10 @@ _mesa_update_draw_buffers(struct gl_context *ctx)
  * \param bufferIndex  the numerical index corresponding to 'buffer'
  */
 void
-_mesa_readbuffer(struct gl_context *ctx, GLenum buffer, GLint bufferIndex)
+_mesa_readbuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                 GLenum buffer, GLint bufferIndex)
 {
-   struct gl_framebuffer *fb = ctx->ReadBuffer;
-
-   if (_mesa_is_winsys_fbo(fb)) {
+   if ((fb == ctx->ReadBuffer) && _mesa_is_winsys_fbo(fb)) {
       /* Only update the per-context READ_BUFFER state if we're bound to
        * a window-system framebuffer.
        */
@@ -621,23 +690,17 @@ _mesa_readbuffer(struct gl_context *ctx, GLenum buffer, GLint bufferIndex)
  * Called by glReadBuffer to set the source renderbuffer for reading pixels.
  * \param mode color buffer such as GL_FRONT, GL_BACK, etc.
  */
-void GLAPIENTRY
-_mesa_ReadBuffer(GLenum buffer)
+void
+_mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLenum buffer, const char *caller)
 {
-   struct gl_framebuffer *fb;
    GLbitfield supportedMask;
    GLint srcBuffer;
-   GET_CURRENT_CONTEXT(ctx);
 
    FLUSH_VERTICES(ctx, 0);
 
    if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glReadBuffer %s\n", _mesa_lookup_enum_by_nr(buffer));
-
-   fb = ctx->ReadBuffer;
-
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glReadBuffer %s\n", _mesa_lookup_enum_by_nr(buffer));
+      _mesa_debug(ctx, "%s %s\n", caller, _mesa_lookup_enum_by_nr(buffer));
 
    if (buffer == GL_NONE) {
       /* This is legal--it means that no buffer should be bound for reading. */
@@ -648,24 +711,53 @@ _mesa_ReadBuffer(GLenum buffer)
       srcBuffer = read_buffer_enum_to_index(buffer);
       if (srcBuffer == -1) {
          _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glReadBuffer(buffer=0x%x)", buffer);
+                     "%s(invalid buffer %s)", caller,
+                     _mesa_lookup_enum_by_nr(buffer));
          return;
       }
       supportedMask = supported_buffer_bitmask(ctx, fb);
       if (((1 << srcBuffer) & supportedMask) == 0) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glReadBuffer(buffer=0x%x)", buffer);
+                     "%s(invalid buffer %s)", caller,
+                     _mesa_lookup_enum_by_nr(buffer));
          return;
       }
    }
 
    /* OK, all error checking has been completed now */
 
-   _mesa_readbuffer(ctx, buffer, srcBuffer);
+   _mesa_readbuffer(ctx, fb, buffer, srcBuffer);
 
-   /*
-    * Call device driver function.
-    */
-   if (ctx->Driver.ReadBuffer)
-      (*ctx->Driver.ReadBuffer)(ctx, buffer);
+   /* Call the device driver function only if fb is the bound read buffer */
+   if (fb == ctx->ReadBuffer) {
+      if (ctx->Driver.ReadBuffer)
+         (*ctx->Driver.ReadBuffer)(ctx, buffer);
+   }
+}
+
+/* glReadBuffer entry point: targets the context's bound read framebuffer. */
+void GLAPIENTRY
+_mesa_ReadBuffer(GLenum buffer)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_read_buffer(ctx, ctx->ReadBuffer, buffer, "glReadBuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferReadBuffer(GLuint framebuffer, GLenum src)
+{
+   const char *const func = "glNamedFramebufferReadBuffer";
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   /* Name zero selects the window-system read framebuffer. */
+   if (framebuffer == 0) {
+      fb = ctx->WinSysReadBuffer;
+   } else {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
+      if (!fb)
+         return;   /* lookup returned NULL */
+   }
+   _mesa_read_buffer(ctx, fb, src, func);
 }
diff --git a/src/mesa/main/buffers.h b/src/mesa/main/buffers.h
index ebcfa1c..5aa79fd 100644
--- a/src/mesa/main/buffers.h
+++ b/src/mesa/main/buffers.h
@@ -36,26 +36,51 @@
 #include "glheader.h"
 
 struct gl_context;
+struct gl_framebuffer;
+
+extern void
+_mesa_draw_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLenum buffer, const char *caller);
 
 extern void GLAPIENTRY
 _mesa_DrawBuffer( GLenum mode );
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferDrawBuffer(GLuint framebuffer, GLenum buf);
+
+extern void
+_mesa_draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
+                   GLsizei n, const GLenum *buffers, const char *caller);
+
+extern void GLAPIENTRY
 _mesa_DrawBuffers(GLsizei n, const GLenum *buffers);
 
+extern void GLAPIENTRY
+_mesa_NamedFramebufferDrawBuffers(GLuint framebuffer, GLsizei n,
+                                  const GLenum *bufs);
+
 extern void
-_mesa_drawbuffers(struct gl_context *ctx, GLuint n, const GLenum *buffers,
+_mesa_drawbuffers(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLuint n, const GLenum *buffers,
                   const GLbitfield *destMask);
 
 extern void
-_mesa_readbuffer(struct gl_context *ctx, GLenum buffer, GLint bufferIndex);
+_mesa_readbuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                 GLenum buffer, GLint bufferIndex);
 
 extern void
 _mesa_update_draw_buffers(struct gl_context *ctx);
 
 
+extern void
+_mesa_read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
+                  GLenum buffer, const char *caller);
+
 extern void GLAPIENTRY
 _mesa_ReadBuffer( GLenum mode );
 
+extern void GLAPIENTRY
+_mesa_NamedFramebufferReadBuffer(GLuint framebuffer, GLenum src);
+
 
 #endif
diff --git a/src/mesa/main/clear.c b/src/mesa/main/clear.c
index 8d707bc..426caea 100644
--- a/src/mesa/main/clear.c
+++ b/src/mesa/main/clear.c
@@ -34,6 +34,8 @@
 #include "clear.h"
 #include "context.h"
 #include "enums.h"
+#include "fbobject.h"
+#include "get.h"
 #include "macros.h"
 #include "mtypes.h"
 #include "state.h"
@@ -400,6 +402,24 @@ _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value)
 
 
 /**
+ * Direct-state-access form of glClearBufferiv.  The ClearBuffer code assumes
+ * the framebuffer is bound, so for now this is faked: bind \p framebuffer as
+ * the draw framebuffer, clear, then restore the previous binding.
+ */
+void GLAPIENTRY
+_mesa_ClearNamedFramebufferiv(GLuint framebuffer, GLenum buffer,
+                              GLint drawbuffer, const GLint *value)
+{
+   GLint savedDrawFb;   /* draw framebuffer binding to put back afterwards */
+
+   _mesa_GetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &savedDrawFb);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
+   _mesa_ClearBufferiv(buffer, drawbuffer, value);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, (GLuint) savedDrawFb);
+}
+
+
+/**
  * New in GL 3.0
  * Clear unsigned integer color buffer (not depth, not stencil).
  */
@@ -472,6 +492,24 @@ _mesa_ClearBufferuiv(GLenum buffer, GLint drawbuffer, const GLuint *value)
 
 
 /**
+ * Direct-state-access form of glClearBufferuiv.  The ClearBuffer code assumes
+ * the framebuffer is bound, so for now this is faked: bind \p framebuffer as
+ * the draw framebuffer, clear, then restore the previous binding.
+ */
+void GLAPIENTRY
+_mesa_ClearNamedFramebufferuiv(GLuint framebuffer, GLenum buffer,
+                               GLint drawbuffer, const GLuint *value)
+{
+   GLint savedDrawFb;   /* draw framebuffer binding to put back afterwards */
+
+   _mesa_GetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &savedDrawFb);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
+   _mesa_ClearBufferuiv(buffer, drawbuffer, value);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, (GLuint) savedDrawFb);
+}
+
+
+/**
  * New in GL 3.0
  * Clear fixed-pt or float color buffer or depth buffer (not stencil).
  */
@@ -565,6 +603,24 @@ _mesa_ClearBufferfv(GLenum buffer, GLint drawbuffer, const GLfloat *value)
 
 
 /**
+ * Direct-state-access form of glClearBufferfv.  The ClearBuffer code assumes
+ * the framebuffer is bound, so for now this is faked: bind \p framebuffer as
+ * the draw framebuffer, clear, then restore the previous binding.
+ */
+void GLAPIENTRY
+_mesa_ClearNamedFramebufferfv(GLuint framebuffer, GLenum buffer,
+                              GLint drawbuffer, const GLfloat *value)
+{
+   GLint savedDrawFb;   /* draw framebuffer binding to put back afterwards */
+
+   _mesa_GetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &savedDrawFb);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
+   _mesa_ClearBufferfv(buffer, drawbuffer, value);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, (GLuint) savedDrawFb);
+}
+
+
+/**
  * New in GL 3.0
  * Clear depth/stencil buffer only.
  */
@@ -626,3 +682,21 @@ _mesa_ClearBufferfi(GLenum buffer, GLint drawbuffer,
       ctx->Stencil.Clear = clearStencilSave;
    }
 }
+
+
+/**
+ * Direct-state-access form of glClearBufferfi, faked for now by binding
+ * \p framebuffer as the draw framebuffer around the clear, because the
+ * ClearBuffer code assumes a bound framebuffer.  Drawbuffer 0 is always used.
+ */
+void GLAPIENTRY
+_mesa_ClearNamedFramebufferfi(GLuint framebuffer, GLenum buffer,
+                              GLfloat depth, GLint stencil)
+{
+   GLint savedDrawFb;   /* draw framebuffer binding to put back afterwards */
+
+   _mesa_GetIntegerv(GL_DRAW_FRAMEBUFFER_BINDING, &savedDrawFb);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer);
+   _mesa_ClearBufferfi(buffer, 0, depth, stencil);
+   _mesa_BindFramebuffer(GL_DRAW_FRAMEBUFFER, (GLuint) savedDrawFb);
+}
diff --git a/src/mesa/main/clear.h b/src/mesa/main/clear.h
index 96ce47b..c298506 100644
--- a/src/mesa/main/clear.h
+++ b/src/mesa/main/clear.h
@@ -52,13 +52,29 @@ extern void GLAPIENTRY
 _mesa_ClearBufferiv(GLenum buffer, GLint drawbuffer, const GLint *value);
 
 extern void GLAPIENTRY
+_mesa_ClearNamedFramebufferiv(GLuint framebuffer, GLenum buffer,
+                              GLint drawbuffer, const GLint *value);
+
+extern void GLAPIENTRY
 _mesa_ClearBufferuiv(GLenum buffer, GLint drawbuffer, const GLuint *value);
 
 extern void GLAPIENTRY
+_mesa_ClearNamedFramebufferuiv(GLuint framebuffer, GLenum buffer,
+                               GLint drawbuffer, const GLuint *value);
+
+extern void GLAPIENTRY
 _mesa_ClearBufferfv(GLenum buffer, GLint drawbuffer, const GLfloat *value);
 
 extern void GLAPIENTRY
+_mesa_ClearNamedFramebufferfv(GLuint framebuffer, GLenum buffer,
+                              GLint drawbuffer, const GLfloat *value);
+
+extern void GLAPIENTRY
 _mesa_ClearBufferfi(GLenum buffer, GLint drawbuffer,
                     GLfloat depth, GLint stencil);
 
+extern void GLAPIENTRY
+_mesa_ClearNamedFramebufferfi(GLuint framebuffer, GLenum buffer,
+                              GLfloat depth, GLint stencil);
+
 #endif
diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h
index 5a66a4e..9c3baf4 100644
--- a/src/mesa/main/config.h
+++ b/src/mesa/main/config.h
@@ -213,19 +213,10 @@
 /** For GL_ARB_fragment_program */
 /*@{*/
 #define MAX_FRAGMENT_PROGRAM_ADDRESS_REGS 0
+#define MAX_FRAGMENT_PROGRAM_PARAMS       64
+#define MAX_FRAGMENT_PROGRAM_INPUTS       12
 /*@}*/
 
-/** For GL_NV_fragment_program */
-/*@{*/
-#define MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS 1024 /* 72 for GL_ARB_f_p */
-#define MAX_NV_FRAGMENT_PROGRAM_TEMPS         96
-#define MAX_NV_FRAGMENT_PROGRAM_PARAMS        64
-#define MAX_NV_FRAGMENT_PROGRAM_INPUTS        12
-#define MAX_NV_FRAGMENT_PROGRAM_OUTPUTS        3
-#define MAX_NV_FRAGMENT_PROGRAM_WRITE_ONLYS    2
-/*@}*/
-
-
 /** For GL_ARB_vertex_shader */
 /*@{*/
 #define MAX_VERTEX_GENERIC_ATTRIBS 16
diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c
index 0a192de..79fa018 100644
--- a/src/mesa/main/context.c
+++ b/src/mesa/main/context.c
@@ -489,8 +489,8 @@ init_program_limits(struct gl_constants *consts, gl_shader_stage stage,
       prog->MaxOutputComponents = 16 * 4; /* old limit not to break tnl and swrast */
       break;
    case MESA_SHADER_FRAGMENT:
-      prog->MaxParameters = MAX_NV_FRAGMENT_PROGRAM_PARAMS;
-      prog->MaxAttribs = MAX_NV_FRAGMENT_PROGRAM_INPUTS;
+      prog->MaxParameters = MAX_FRAGMENT_PROGRAM_PARAMS;
+      prog->MaxAttribs = MAX_FRAGMENT_PROGRAM_INPUTS;
       prog->MaxAddressRegs = MAX_FRAGMENT_PROGRAM_ADDRESS_REGS;
       prog->MaxUniformComponents = 4 * MAX_UNIFORMS;
       prog->MaxInputComponents = 16 * 4; /* old limit not to break tnl and swrast */
@@ -883,6 +883,19 @@ update_default_objects(struct gl_context *ctx)
 }
 
 
+/* XXX this is temporary and should be removed at some point in the
+ * future when there's a reasonable expectation that the libGL library
+ * contains the _glapi_new_nop_table() and _glapi_set_nop_handler()
+ * functions which were added in Mesa 10.6.
+ */
+#if !defined(_WIN32)
+/* Avoid libGL / driver ABI break */
+#define USE_GLAPI_NOP_FEATURES 0
+#else
+#define USE_GLAPI_NOP_FEATURES 1
+#endif
+
+
 /**
  * This function is called by the glapi no-op functions.  For each OpenGL
  * function/entrypoint there's a simple no-op function.  These "no-op"
@@ -898,6 +911,7 @@ update_default_objects(struct gl_context *ctx)
  *
  * \param name  the name of the OpenGL function
  */
+#if USE_GLAPI_NOP_FEATURES
 static void
 nop_handler(const char *name)
 {
@@ -914,6 +928,7 @@ nop_handler(const char *name)
    }
 #endif
 }
+#endif
 
 
 /**
@@ -923,12 +938,52 @@ nop_handler(const char *name)
 static void GLAPIENTRY
 nop_glFlush(void)
 {
-   /* don't record an error like we do in _mesa_generic_nop() */
+   /* don't record an error like we do in nop_handler() */
+}
+#endif
+
+
+#if !USE_GLAPI_NOP_FEATURES
+/* Catch-all dispatch entry: records GL_INVALID_OPERATION and returns 0. */
+static int
+generic_nop(void)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   _mesa_error(ctx, GL_INVALID_OPERATION, "unsupported function called "
+               "(unsupported extension or deprecated function?)");
+   return 0;
 }
 #endif
 
 
 /**
+ * Create a new API dispatch table in which every entry is a no-op.
+ * Without the glapi no-op features each slot points to generic_nop();
+ * that fallback will not work on Windows because the __stdcall convention
+ * requires the callee to clean up the call stack, which is impossible
+ * with one generic no-op function.
+ * \param numEntries  number of slots in the table
+ * \return the new table; NULL if the fallback's malloc() fails
+ */
+struct _glapi_table *
+_mesa_new_nop_table(unsigned numEntries)
+{
+#if USE_GLAPI_NOP_FEATURES
+   return _glapi_new_nop_table(numEntries);
+#else
+   _glapi_proc *slots = malloc(numEntries * sizeof(_glapi_proc));
+   unsigned idx;
+
+   /* A failed allocation skips the fill loop and NULL is returned. */
+   for (idx = 0; slots && idx < numEntries; idx++)
+      slots[idx] = (_glapi_proc) generic_nop;
+
+   return (struct _glapi_table *) slots;
+#endif
+}
+
+
+/**
  * Allocate and initialize a new dispatch table.  The table will be
  * populated with pointers to "no-op" functions.  In turn, the no-op
  * functions will call nop_handler() above.
@@ -941,8 +996,9 @@ alloc_dispatch_table(void)
     * Mesa we do this to accommodate different versions of libGL and various
     * DRI drivers.
     */
-   GLint numEntries = MAX2(_glapi_get_dispatch_table_size(), _gloffset_COUNT);
-   struct _glapi_table *table = _glapi_new_nop_table(numEntries);
+   int numEntries = MAX2(_glapi_get_dispatch_table_size(), _gloffset_COUNT);
+
+   struct _glapi_table *table = _mesa_new_nop_table(numEntries);
 
 #if defined(_WIN32)
    if (table) {
@@ -966,7 +1022,9 @@ alloc_dispatch_table(void)
    }
 #endif
 
+#if USE_GLAPI_NOP_FEATURES
    _glapi_set_nop_handler(nop_handler);
+#endif
 
    return table;
 }
@@ -1111,9 +1169,7 @@ _mesa_initialize_context(struct gl_context *ctx,
       ctx->HasConfig = GL_FALSE;
    }
 
-   if (_mesa_is_desktop_gl(ctx)) {
-      _mesa_override_gl_version(ctx);
-   }
+   _mesa_override_gl_version(ctx);
 
    /* misc one-time initializations */
    one_time_init(ctx);
@@ -1275,7 +1331,6 @@ _mesa_free_context_data( struct gl_context *ctx )
    _mesa_reference_vertprog(ctx, &ctx->VertexProgram._Current, NULL);
    _mesa_reference_vertprog(ctx, &ctx->VertexProgram._TnlProgram, NULL);
 
-   _mesa_reference_geomprog(ctx, &ctx->GeometryProgram.Current, NULL);
    _mesa_reference_geomprog(ctx, &ctx->GeometryProgram._Current, NULL);
 
    _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, NULL);
@@ -1565,7 +1620,8 @@ handle_first_current(struct gl_context *ctx)
          else
             buffer = GL_FRONT;
 
-         _mesa_drawbuffers(ctx, 1, &buffer, NULL /* destMask */);
+         _mesa_drawbuffers(ctx, ctx->DrawBuffer, 1, &buffer,
+                           NULL /* destMask */);
       }
 
       if (ctx->ReadBuffer != _mesa_get_incomplete_framebuffer()) {
@@ -1578,7 +1634,7 @@ handle_first_current(struct gl_context *ctx)
             bufferIndex = BUFFER_FRONT_LEFT;
          }
 
-         _mesa_readbuffer(ctx, buffer, bufferIndex);
+         _mesa_readbuffer(ctx, ctx->ReadBuffer, buffer, bufferIndex);
       }
    }
 
diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c
index fd22f28..e8732c6 100644
--- a/src/mesa/main/copyimage.c
+++ b/src/mesa/main/copyimage.c
@@ -40,14 +40,25 @@ enum mesa_block_class {
    BLOCK_CLASS_64_BITS
 };
 
+/**
+ * Prepare the source or destination resource, including:
+ * - Error checking
+ * - Creating texture wrappers for renderbuffers
+ * \param name  the texture or renderbuffer name
+ * \param target  GL_TEXTURE target or GL_RENDERBUFFER.  For the latter, will
+ *                be changed to a compatible GL_TEXTURE target.
+ * \param level  mipmap level
+ * \param tex_obj  returns a pointer to a texture object
+ * \param tex_image  returns a pointer to a texture image
+ * \param tmp_tex  returns temporary texture object name
+ * \return true if success, false if error
+ */
 static bool
 prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
                struct gl_texture_object **tex_obj,
                struct gl_texture_image **tex_image, GLuint *tmp_tex,
                const char *dbg_prefix)
 {
-   struct gl_renderbuffer *rb;
-
    if (name == 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glCopyImageSubData(%sName = %d)", dbg_prefix, name);
@@ -87,7 +98,7 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
    }
 
    if (*target == GL_RENDERBUFFER) {
-      rb = _mesa_lookup_renderbuffer(ctx, name);
+      struct gl_renderbuffer *rb = _mesa_lookup_renderbuffer(ctx, name);
       if (!rb) {
          _mesa_error(ctx, GL_INVALID_VALUE,
                      "glCopyImageSubData(%sName = %u)", dbg_prefix, name);
@@ -169,8 +180,15 @@ prepare_target(struct gl_context *ctx, GLuint name, GLenum *target, int level,
    return true;
 }
 
+
+/**
+ * Check that the region given by x, y, z, width, height and depth lies
+ * within the texture image dimensions.
+ * \return true if bounds OK, false if the region is out of bounds
+ */
 static bool
-check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
+check_region_bounds(struct gl_context *ctx,
+                    const struct gl_texture_image *tex_image,
                     int x, int y, int z, int width, int height, int depth,
                     const char *dbg_prefix)
 {
@@ -188,6 +206,7 @@ check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
       return false;
    }
 
+   /* Check X direction */
    if (x + width > tex_image->Width) {
       _mesa_error(ctx, GL_INVALID_VALUE,
                   "glCopyImageSubData(%sX or %sWidth exceeds image bounds)",
@@ -195,6 +214,7 @@ check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
       return false;
    }
 
+   /* Check Y direction */
    switch (tex_image->TexObject->Target) {
    case GL_TEXTURE_1D:
    case GL_TEXTURE_1D_ARRAY:
@@ -215,6 +235,7 @@ check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
       break;
    }
 
+   /* Check Z direction */
    switch (tex_image->TexObject->Target) {
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
@@ -260,7 +281,7 @@ check_region_bounds(struct gl_context *ctx, struct gl_texture_image *tex_image,
 }
 
 static bool
-compressed_format_compatible(struct gl_context *ctx,
+compressed_format_compatible(const struct gl_context *ctx,
                              GLenum compressedFormat, GLenum otherFormat)
 {
    enum mesa_block_class compressedClass, otherClass;
@@ -348,8 +369,8 @@ compressed_format_compatible(struct gl_context *ctx,
 }
 
 static bool
-copy_format_compatible(struct gl_context *ctx,
-                                GLenum srcFormat, GLenum dstFormat)
+copy_format_compatible(const struct gl_context *ctx,
+                       GLenum srcFormat, GLenum dstFormat)
 {
    /*
     * From ARB_copy_image spec:
@@ -389,7 +410,7 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
    struct gl_texture_object *srcTexObj, *dstTexObj;
    struct gl_texture_image *srcTexImage, *dstTexImage;
    GLuint src_bw, src_bh, dst_bw, dst_bh;
-   int i, srcNewZ, dstNewZ;
+   int i;
 
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glCopyImageSubData(%u, %s, %d, %d, %d, %d, "
@@ -447,6 +468,8 @@ _mesa_CopyImageSubData(GLuint srcName, GLenum srcTarget, GLint srcLevel,
    }
 
    for (i = 0; i < srcDepth; ++i) {
+      int srcNewZ, dstNewZ;
+
       if (srcTexObj->Target == GL_TEXTURE_CUBE_MAP) {
          srcTexImage = srcTexObj->Image[i + srcZ][srcLevel];
          srcNewZ = 0;
diff --git a/src/mesa/main/depth.c b/src/mesa/main/depth.c
index 29851ec..bb4591c 100644
--- a/src/mesa/main/depth.c
+++ b/src/mesa/main/depth.c
@@ -65,6 +65,9 @@ _mesa_DepthFunc( GLenum func )
    if (MESA_VERBOSE & VERBOSE_API)
       _mesa_debug(ctx, "glDepthFunc %s\n", _mesa_lookup_enum_by_nr(func));
 
+   if (ctx->Depth.Func == func)
+      return;
+
    switch (func) {
    case GL_LESS:    /* (default) pass if incoming z < stored z */
    case GL_GEQUAL:
@@ -80,9 +83,6 @@ _mesa_DepthFunc( GLenum func )
       return;
    }
 
-   if (ctx->Depth.Func == func)
-      return;
-
    FLUSH_VERTICES(ctx, _NEW_DEPTH);
    ctx->Depth.Func = func;
 
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index 431c4b4..aafe486 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -7592,28 +7592,6 @@ save_FramebufferTexture(GLenum target, GLenum attachment,
    }
 }
 
-static void GLAPIENTRY
-save_FramebufferTextureFace(GLenum target, GLenum attachment,
-                            GLuint texture, GLint level, GLenum face)
-{
-   Node *n;
-   GET_CURRENT_CONTEXT(ctx);
-   ASSERT_OUTSIDE_SAVE_BEGIN_END_AND_FLUSH(ctx);
-   n = alloc_instruction(ctx, OPCODE_FRAMEBUFFER_TEXTURE_FACE, 5);
-   if (n) {
-      n[1].e = target;
-      n[2].e = attachment;
-      n[3].ui = texture;
-      n[4].i = level;
-      n[5].e = face;
-   }
-   if (ctx->ExecuteFlag) {
-      CALL_FramebufferTextureFaceARB(ctx->Exec, (target, attachment, texture,
-                                                 level, face));
-   }
-}
-
-
 
 static void GLAPIENTRY
 save_WaitSync(GLsync sync, GLbitfield flags, GLuint64 timeout)
@@ -8873,11 +8851,6 @@ execute_list(struct gl_context *ctx, GLuint list)
             CALL_FramebufferTexture(ctx->Exec, (n[1].e, n[2].e,
                                                    n[3].ui, n[4].i));
             break;
-         case OPCODE_FRAMEBUFFER_TEXTURE_FACE:
-            CALL_FramebufferTextureFaceARB(ctx->Exec, (n[1].e, n[2].e,
-                                                       n[3].ui, n[4].i, n[5].e));
-            break;
-
          /* GL_ARB_sync */
          case OPCODE_WAIT_SYNC:
             {
@@ -9644,10 +9617,9 @@ _mesa_initialize_save_table(const struct gl_context *ctx)
    SET_BlendEquationiARB(table, save_BlendEquationi);
    SET_BlendEquationSeparateiARB(table, save_BlendEquationSeparatei);
 
-   /* GL_ARB_geometry_shader4 */
+   /* OpenGL 3.2 */
    SET_ProgramParameteri(table, save_ProgramParameteri);
    SET_FramebufferTexture(table, save_FramebufferTexture);
-   SET_FramebufferTextureFaceARB(table, save_FramebufferTextureFace);
 
    /* GL_NV_conditional_render */
    SET_BeginConditionalRender(table, save_BeginConditionalRender);
diff --git a/src/mesa/main/errors.c b/src/mesa/main/errors.c
index 2aa1deb..b340666 100644
--- a/src/mesa/main/errors.c
+++ b/src/mesa/main/errors.c
@@ -39,6 +39,7 @@
 #include "mtypes.h"
 #include "version.h"
 #include "util/hash_table.h"
+#include "util/simple_list.h"
 
 static mtx_t DynamicIDMutex = _MTX_INITIALIZER_NP;
 static GLuint NextDynamicID = 1;
@@ -1412,6 +1413,26 @@ should_output(struct gl_context *ctx, GLenum error, const char *fmtString)
 
 
 void
+_mesa_gl_vdebug(struct gl_context *ctx, GLuint *id,
+                enum mesa_debug_source source,
+                enum mesa_debug_type type,
+                enum mesa_debug_severity severity,
+                const char *fmtString,
+                va_list args)
+{
+   char s[MAX_DEBUG_MESSAGE_LENGTH];
+   int len;
+
+   debug_get_id(id);
+   /* Format into s[]; the returned length may exceed what actually fit. */
+   len = _mesa_vsnprintf(s, MAX_DEBUG_MESSAGE_LENGTH, fmtString, args);
+   if (len >= MAX_DEBUG_MESSAGE_LENGTH)
+      len = MAX_DEBUG_MESSAGE_LENGTH - 1; /* report the truncated length */
+   log_msg(ctx, source, type, *id, severity, len, s);
+}
+
+
+void
 _mesa_gl_debug(struct gl_context *ctx,
                GLuint *id,
                enum mesa_debug_source source,
@@ -1419,17 +1440,10 @@ _mesa_gl_debug(struct gl_context *ctx,
                enum mesa_debug_severity severity,
                const char *fmtString, ...)
 {
-   char s[MAX_DEBUG_MESSAGE_LENGTH];
-   int len;
    va_list args;
-
-   debug_get_id(id);
-
    va_start(args, fmtString);
-   len = _mesa_vsnprintf(s, MAX_DEBUG_MESSAGE_LENGTH, fmtString, args);
+   _mesa_gl_vdebug(ctx, id, source, type, severity, fmtString, args);
    va_end(args);
-
-   log_msg(ctx, source, type, *id, severity, len, s);
 }
 
 
diff --git a/src/mesa/main/errors.h b/src/mesa/main/errors.h
index e6dc9b5..24f234f 100644
--- a/src/mesa/main/errors.h
+++ b/src/mesa/main/errors.h
@@ -76,6 +76,15 @@ extern FILE *
 _mesa_get_log_file(void);
 
 extern void
+_mesa_gl_vdebug(struct gl_context *ctx,
+                GLuint *id,
+                enum mesa_debug_source source,
+                enum mesa_debug_type type,
+                enum mesa_debug_severity severity,
+                const char *fmtString,
+                va_list args);
+
+extern void
 _mesa_gl_debug(struct gl_context *ctx,
                GLuint *id,
                enum mesa_debug_source source,
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index f7ce064..4176a69 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -104,7 +104,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_depth_clamp",                         o(ARB_depth_clamp),                         GL,             2003 },
    { "GL_ARB_depth_texture",                       o(ARB_depth_texture),                       GLL,            2001 },
    { "GL_ARB_derivative_control",                  o(ARB_derivative_control),                  GL,             2014 },
-   { "GL_ARB_direct_state_access",                 o(dummy_false),                             GL,             2014 },
+   { "GL_ARB_direct_state_access",                 o(dummy_true),                              GLC,            2014 },
    { "GL_ARB_draw_buffers",                        o(dummy_true),                              GL,             2002 },
    { "GL_ARB_draw_buffers_blend",                  o(ARB_draw_buffers_blend),                  GL,             2009 },
    { "GL_ARB_draw_elements_base_vertex",           o(ARB_draw_elements_base_vertex),           GL,             2009 },
@@ -117,6 +117,7 @@ static const struct extension extension_table[] = {
    { "GL_ARB_fragment_program",                    o(ARB_fragment_program),                    GLL,            2002 },
    { "GL_ARB_fragment_program_shadow",             o(ARB_fragment_program_shadow),             GLL,            2003 },
    { "GL_ARB_fragment_shader",                     o(ARB_fragment_shader),                     GL,             2002 },
+   { "GL_ARB_framebuffer_no_attachments",          o(ARB_framebuffer_no_attachments),          GL,             2012 },
    { "GL_ARB_framebuffer_object",                  o(ARB_framebuffer_object),                  GL,             2005 },
    { "GL_ARB_framebuffer_sRGB",                    o(EXT_framebuffer_sRGB),                    GL,             1998 },
    { "GL_ARB_get_program_binary",                  o(dummy_true),                              GL,             2010 },
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index 27cf97f..f8dcf12 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -121,6 +121,27 @@ _mesa_lookup_renderbuffer(struct gl_context *ctx, GLuint id)
 
 
 /**
+ * Direct-state-access lookup of the renderbuffer named by id.  Records
+ * GL_INVALID_OPERATION (message prefixed by func) and returns NULL on failure.
+ */
+struct gl_renderbuffer *
+_mesa_lookup_renderbuffer_err(struct gl_context *ctx, GLuint id,
+                              const char *func)
+{
+   struct gl_renderbuffer *rb;
+
+   rb = _mesa_lookup_renderbuffer(ctx, id);
+   if (!rb || rb == &DummyRenderbuffer) { /* no entry, or the Dummy sentinel */
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(non-existent renderbuffer %u)", func, id);
+      return NULL;
+   }
+
+   return rb;
+}
+
+
+/**
  * Helper routine for getting a gl_framebuffer.
  */
 struct gl_framebuffer *
@@ -138,6 +159,27 @@ _mesa_lookup_framebuffer(struct gl_context *ctx, GLuint id)
 
 
 /**
+ * Direct-state-access lookup of the framebuffer named by id.  Records
+ * GL_INVALID_OPERATION (message prefixed by func) and returns NULL on failure.
+ */
+struct gl_framebuffer *
+_mesa_lookup_framebuffer_err(struct gl_context *ctx, GLuint id,
+                             const char *func)
+{
+   struct gl_framebuffer *fb;
+
+   fb = _mesa_lookup_framebuffer(ctx, id);
+   if (!fb || fb == &DummyFramebuffer) { /* no entry, or the Dummy sentinel */
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(non-existent framebuffer %u)", func, id);
+      return NULL;
+   }
+
+   return fb;
+}
+
+
+/**
  * Mark the given framebuffer as invalid.  This will force the
  * test for framebuffer completeness to be done before the framebuffer
  * is used.
@@ -423,7 +465,7 @@ set_texture_attachment(struct gl_context *ctx,
                        struct gl_framebuffer *fb,
                        struct gl_renderbuffer_attachment *att,
                        struct gl_texture_object *texObj,
-                       GLenum texTarget, GLuint level, GLuint zoffset,
+                       GLenum texTarget, GLuint level, GLuint layer,
                        GLboolean layered)
 {
    struct gl_renderbuffer *rb = att->Renderbuffer;
@@ -447,7 +489,7 @@ set_texture_attachment(struct gl_context *ctx,
    /* always update these fields */
    att->TextureLevel = level;
    att->CubeMapFace = _mesa_tex_target_to_face(texTarget);
-   att->Zoffset = zoffset;
+   att->Zoffset = layer;
    att->Layered = layered;
    att->Complete = GL_FALSE;
 
@@ -479,9 +521,10 @@ set_renderbuffer_attachment(struct gl_context *ctx,
  * Attach a renderbuffer object to a framebuffer object.
  */
 void
-_mesa_framebuffer_renderbuffer(struct gl_context *ctx,
-                               struct gl_framebuffer *fb,
-                               GLenum attachment, struct gl_renderbuffer *rb)
+_mesa_FramebufferRenderbuffer_sw(struct gl_context *ctx,
+                                 struct gl_framebuffer *fb,
+                                 GLenum attachment,
+                                 struct gl_renderbuffer *rb)
 {
    struct gl_renderbuffer_attachment *att;
 
@@ -914,6 +957,7 @@ _mesa_test_framebuffer_completeness(struct gl_context *ctx,
    fb->Height = 0;
    fb->_AllColorBuffersFixedPoint = GL_TRUE;
    fb->_HasSNormOrFloatColorBuffer = GL_FALSE;
+   fb->_HasAttachments = true;
 
    /* Start at -2 to more easily loop over all attachment points.
     *  -2: depth buffer
@@ -1112,14 +1156,48 @@ _mesa_test_framebuffer_completeness(struct gl_context *ctx,
       } else if (att_layer_count > max_layer_count) {
          max_layer_count = att_layer_count;
       }
+
+      /*
+       * The extension GL_ARB_framebuffer_no_attachments places additional
+       * requirements on each attachment. Those additional requirements are
+       * tighter than those of previous versions of GL. In the interest of
+       * better compatibility, we will not enforce these restrictions. For the
+       * record, those additional restrictions are quoted below:
+       *
+       * "The width and height of image are greater than zero and less than or
+       *  equal to the values of the implementation-dependent limits
+       *  MAX_FRAMEBUFFER_WIDTH and MAX_FRAMEBUFFER_HEIGHT, respectively."
+       *
+       * "If <image> is a three-dimensional texture or a one- or two-dimensional
+       *  array texture and the attachment is layered, the depth or layer count
+       *  of the texture is less than or equal to the implementation-dependent
+       *  limit MAX_FRAMEBUFFER_LAYERS."
+       *
+       * "If image has multiple samples, its sample count is less than or equal
+       *  to the value of the implementation-dependent limit
+       *  MAX_FRAMEBUFFER_SAMPLES."
+       *
+       * The same requirements are also in place for GL 4.5,
+       * Section 9.4.1 "Framebuffer Attachment Completeness", pg 310-311
+       */
    }
 
    fb->MaxNumLayers = max_layer_count;
 
    if (numImages == 0) {
-      fb->_Status = GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT;
-      fbo_incomplete(ctx, "no attachments", -1);
-      return;
+      fb->_HasAttachments = false;
+
+      if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+         fb->_Status = GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT;
+         fbo_incomplete(ctx, "no attachments", -1);
+         return;
+      }
+
+      if (fb->DefaultGeometry.Width == 0 || fb->DefaultGeometry.Height == 0) {
+         fb->_Status = GL_FRAMEBUFFER_INCOMPLETE_MISSING_ATTACHMENT_EXT;
+         fbo_incomplete(ctx, "no attachments and default width or height is 0", -1);
+         return;
+      }
    }
 
    if (_mesa_is_desktop_gl(ctx) && !ctx->Extensions.ARB_ES2_compatibility) {
@@ -1184,8 +1262,10 @@ _mesa_test_framebuffer_completeness(struct gl_context *ctx,
        * renderbuffers/textures are different sizes, the framebuffer
        * width/height will be set to the smallest width/height.
        */
-      fb->Width = minWidth;
-      fb->Height = minHeight;
+      if (numImages != 0) {
+         fb->Width = minWidth;
+         fb->Height = minHeight;
+      }
 
       /* finally, update the visual info for the framebuffer */
       _mesa_update_framebuffer_visual(ctx, fb);
@@ -1291,6 +1371,131 @@ _mesa_BindRenderbufferEXT(GLenum target, GLuint renderbuffer)
    bind_renderbuffer(target, renderbuffer, true);
 }
 
+static void
+framebuffer_parameteri(struct gl_context *ctx, struct gl_framebuffer *fb,
+                       GLenum pname, GLint param, const char *func)
+{  /* Store one default-geometry value in fb; func labels error messages. */
+   switch (pname) {
+   case GL_FRAMEBUFFER_DEFAULT_WIDTH:
+      if (param < 0 || param > ctx->Const.MaxFramebufferWidth)
+        _mesa_error(ctx, GL_INVALID_VALUE, "%s", func);
+      else
+         fb->DefaultGeometry.Width = param;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_HEIGHT:
+      if (param < 0 || param > ctx->Const.MaxFramebufferHeight)
+        _mesa_error(ctx, GL_INVALID_VALUE, "%s", func);
+      else
+         fb->DefaultGeometry.Height = param;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_LAYERS:
+      if (param < 0 || param > ctx->Const.MaxFramebufferLayers)
+        _mesa_error(ctx, GL_INVALID_VALUE, "%s", func);
+      else
+         fb->DefaultGeometry.Layers = param;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_SAMPLES:
+      if (param < 0 || param > ctx->Const.MaxFramebufferSamples)
+        _mesa_error(ctx, GL_INVALID_VALUE, "%s", func);
+      else
+        fb->DefaultGeometry.NumSamples = param;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_FIXED_SAMPLE_LOCATIONS:
+      fb->DefaultGeometry.FixedSampleLocations = param; /* no range check */
+      break;
+   default: /* unrecognized pname: nothing is stored */
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "%s(pname=0x%x)", func, pname);
+   } /* NOTE(review): fb->_Status is left as-is here; confirm re-validation */
+}
+
+/** glFramebufferParameteri entry point (GL_ARB_framebuffer_no_attachments). */
+void GLAPIENTRY
+_mesa_FramebufferParameteri(GLenum target, GLenum pname, GLint param)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glFramebufferParameteri not supported "
+                  "(ARB_framebuffer_no_attachments not implemented)");
+      return;
+   }
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glFramebufferParameteri(target=0x%x)", target);
+      return;
+   }
+
+   /* the window-system framebuffer is rejected with GL_INVALID_OPERATION */
+   if (_mesa_is_winsys_fbo(fb)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "glFramebufferParameteri");
+      return;
+   }
+
+   framebuffer_parameteri(ctx, fb, pname, param, "glFramebufferParameteri");
+}
+
+static void
+get_framebuffer_parameteriv(struct gl_context *ctx, struct gl_framebuffer *fb,
+                            GLenum pname, GLint *params, const char *func)
+{  /* Read one default-geometry value of fb into *params. */
+   switch (pname) {
+   case GL_FRAMEBUFFER_DEFAULT_WIDTH:
+      *params = fb->DefaultGeometry.Width;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_HEIGHT:
+      *params = fb->DefaultGeometry.Height;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_LAYERS:
+      *params = fb->DefaultGeometry.Layers;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_SAMPLES:
+      *params = fb->DefaultGeometry.NumSamples;
+      break;
+   case GL_FRAMEBUFFER_DEFAULT_FIXED_SAMPLE_LOCATIONS:
+      *params = fb->DefaultGeometry.FixedSampleLocations;
+      break;
+   default: /* unknown pname: *params is left unwritten */
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "%s(pname=0x%x)", func, pname);
+   }
+}
+
+void GLAPIENTRY
+_mesa_GetFramebufferParameteriv(GLenum target, GLenum pname, GLint *params)
+{  /* glGetFramebufferParameteriv entry point; queries the FBO bound to target. */
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetFramebufferParameteriv not supported "
+                  "(ARB_framebuffer_no_attachments not implemented)");
+      return;
+   }
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glGetFramebufferParameteriv(target=0x%x)", target);
+      return;
+   }
+
+   /* the window-system framebuffer is rejected with GL_INVALID_OPERATION */
+   if (_mesa_is_winsys_fbo(fb)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetFramebufferParameteriv");
+      return;
+   }
+
+   get_framebuffer_parameteriv(ctx, fb, pname, params,
+                               "glGetFramebufferParameteriv");
+}
+
 
 /**
  * Remove the specified renderbuffer or texture from any attachment point in
@@ -2396,15 +2601,23 @@ _mesa_DeleteFramebuffers(GLsizei n, const GLuint *framebuffers)
 }
 
 
-void GLAPIENTRY
-_mesa_GenFramebuffers(GLsizei n, GLuint *framebuffers)
+/**
+ * This is the implementation for glGenFramebuffers and glCreateFramebuffers.
+ * It is not exposed to the rest of Mesa to encourage the use of
+ * nameless buffers in driver internals.
+ */
+static void
+create_framebuffers(GLsizei n, GLuint *framebuffers, bool dsa)
 {
    GET_CURRENT_CONTEXT(ctx);
    GLuint first;
    GLint i;
+   struct gl_framebuffer *fb;
+
+   const char *func = dsa ? "glCreateFramebuffers" : "glGenFramebuffers";
 
    if (n < 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glGenFramebuffersEXT(n)");
+      _mesa_error(ctx, GL_INVALID_VALUE, "%s(n < 0)", func);
       return;
    }
 
@@ -2416,31 +2629,43 @@ _mesa_GenFramebuffers(GLsizei n, GLuint *framebuffers)
    for (i = 0; i < n; i++) {
       GLuint name = first + i;
       framebuffers[i] = name;
-      /* insert dummy placeholder into hash table */
+
+      if (dsa) {
+         fb = ctx->Driver.NewFramebuffer(ctx, framebuffers[i]);
+         if (!fb) {
+            _mesa_error(ctx, GL_OUT_OF_MEMORY, "%s", func);
+            return;
+         }
+      }
+      else
+         fb = &DummyFramebuffer;
+
       mtx_lock(&ctx->Shared->Mutex);
-      _mesa_HashInsert(ctx->Shared->FrameBuffers, name, &DummyFramebuffer);
+      _mesa_HashInsert(ctx->Shared->FrameBuffers, name, fb);
       mtx_unlock(&ctx->Shared->Mutex);
    }
 }
 
 
-GLenum GLAPIENTRY
-_mesa_CheckFramebufferStatus(GLenum target)
+void GLAPIENTRY
+_mesa_GenFramebuffers(GLsizei n, GLuint *framebuffers)
 {
-   struct gl_framebuffer *buffer;
-   GET_CURRENT_CONTEXT(ctx);
+   create_framebuffers(n, framebuffers, false);
+}
 
-   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
 
-   if (MESA_VERBOSE & VERBOSE_API)
-      _mesa_debug(ctx, "glCheckFramebufferStatus(%s)\n",
-                  _mesa_lookup_enum_by_nr(target));
+void GLAPIENTRY
+_mesa_CreateFramebuffers(GLsizei n, GLuint *framebuffers)
+{
+   create_framebuffers(n, framebuffers, true);
+}
 
-   buffer = get_framebuffer_target(ctx, target);
-   if (!buffer) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "glCheckFramebufferStatus(target)");
-      return 0;
-   }
+
+GLenum
+_mesa_check_framebuffer_status(struct gl_context *ctx,
+                               struct gl_framebuffer *buffer)
+{
+   ASSERT_OUTSIDE_BEGIN_END_WITH_RETVAL(ctx, 0);
 
    if (_mesa_is_winsys_fbo(buffer)) {
       /* EGL_KHR_surfaceless_context allows the winsys FBO to be incomplete. */
@@ -2461,6 +2686,67 @@ _mesa_CheckFramebufferStatus(GLenum target)
 }
 
 
+GLenum GLAPIENTRY
+_mesa_CheckFramebufferStatus(GLenum target)
+{  /* glCheckFramebufferStatus: map target to a framebuffer, then check it. */
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   if (MESA_VERBOSE & VERBOSE_API)
+      _mesa_debug(ctx, "glCheckFramebufferStatus(%s)\n",
+                  _mesa_lookup_enum_by_nr(target));
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glCheckFramebufferStatus(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return 0; /* error already recorded; 0 is returned in place of a status */
+   }
+
+   return _mesa_check_framebuffer_status(ctx, fb);
+}
+
+
+GLenum GLAPIENTRY
+_mesa_CheckNamedFramebufferStatus(GLuint framebuffer, GLenum target)
+{
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* Validate the target (for conformance's sake) and grab a reference to the
+    * default framebuffer in case framebuffer = 0.
+    * Section 9.4 Framebuffer Completeness of the OpenGL 4.5 core spec
+    * (30.10.2014, PDF page 336) says:
+    *    "If framebuffer is zero, then the status of the default read or
+    *    draw framebuffer (as determined by target) is returned."
+    */
+   switch (target) {
+      case GL_DRAW_FRAMEBUFFER:
+      case GL_FRAMEBUFFER:
+         fb = ctx->WinSysDrawBuffer;
+         break;
+      case GL_READ_FRAMEBUFFER:
+         fb = ctx->WinSysReadBuffer;
+         break;
+      default:
+         _mesa_error(ctx, GL_INVALID_ENUM,
+                     "glCheckNamedFramebufferStatus(invalid target %s)",
+                     _mesa_lookup_enum_by_nr(target));
+         return 0;
+   }
+
+   if (framebuffer) { /* nonzero name: check the named object instead */
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glCheckNamedFramebufferStatus");
+      if (!fb)
+         return 0; /* the lookup helper has already recorded the error */
+   }
+
+   return _mesa_check_framebuffer_status(ctx, fb);
+}
+
+
 /**
  * Replicate the src attachment point. Used by framebuffer_texture() when
  * the same texture is attached at GL_DEPTH_ATTACHMENT and
@@ -2487,144 +2773,308 @@ reuse_framebuffer_texture_attachment(struct gl_framebuffer *fb,
 
 
 /**
- * Common code called by glFramebufferTexture1D/2D/3D() and
- * glFramebufferTextureLayer().
+ * Common code called by gl*FramebufferTexture*() to retrieve the correct
+ * texture object pointer.
  *
- * \param textarget is the textarget that was passed to the
- * glFramebufferTexture...() function, or 0 if the corresponding function
- * doesn't have a textarget parameter.
+ * \param texObj where the pointer to the texture object is returned.  Note
+ * that a successful call may return texObj = NULL.
  *
- * \param layered is true if this function was called from
- * glFramebufferTexture(), false otherwise.
+ * \return true if no errors, false if errors
  */
-static void
-framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
-                    GLenum attachment, GLenum textarget, GLuint texture,
-                    GLint level, GLuint zoffset, GLboolean layered)
+static bool
+get_texture_for_framebuffer(struct gl_context *ctx, GLuint texture,
+                            bool layered, const char *caller,
+                            struct gl_texture_object **texObj)
 {
-   struct gl_renderbuffer_attachment *att;
-   struct gl_texture_object *texObj = NULL;
-   struct gl_framebuffer *fb;
-   GLenum maxLevelsTarget;
+   *texObj = NULL; /* This will get returned if texture = 0. */
 
-   fb = get_framebuffer_target(ctx, target);
-   if (!fb) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferTexture%s(target=0x%x)", caller, target);
-      return;
+   if (!texture)
+      return true;
+
+   *texObj = _mesa_lookup_texture(ctx, texture);
+   if (*texObj == NULL || (*texObj)->Target == 0) {
+      /* Can't render to a non-existent texture object.
+       *
+       * The OpenGL 4.5 core spec (02.02.2015) in Section 9.2 Binding and
+       * Managing Framebuffer Objects specifies a different error
+       * depending upon the calling function (PDF pages 325-328).
+       * *FramebufferTexture (where layered = GL_TRUE) throws invalid
+       * value, while the other commands throw invalid operation (where
+       * layered = GL_FALSE).
+       */
+      const GLenum error = layered ? GL_INVALID_VALUE :
+                           GL_INVALID_OPERATION;
+      _mesa_error(ctx, error,
+                  "%s(non-existent texture %u)", caller, texture);
+      return false;
    }
 
-   /* check framebuffer binding */
-   if (_mesa_is_winsys_fbo(fb)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glFramebufferTexture%s", caller);
-      return;
+   return true;
+}
+
+
+/**
+ * Common code called by gl*FramebufferTexture() to verify the texture target
+ * and decide whether or not the attachment should truly be considered
+ * layered.
+ *
+ * \param layered true if attachment should be considered layered, false if
+ * not
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_layered_texture_target(struct gl_context *ctx, GLenum target,
+                             const char *caller, GLboolean *layered)
+{
+   *layered = GL_TRUE;
+
+   switch (target) {
+   case GL_TEXTURE_3D:
+   case GL_TEXTURE_1D_ARRAY_EXT:
+   case GL_TEXTURE_2D_ARRAY_EXT:
+   case GL_TEXTURE_CUBE_MAP:
+   case GL_TEXTURE_CUBE_MAP_ARRAY:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      return true;
+   case GL_TEXTURE_1D:
+   case GL_TEXTURE_2D:
+   case GL_TEXTURE_RECTANGLE:
+   case GL_TEXTURE_2D_MULTISAMPLE:
+      /* These texture types are valid to pass to
+       * glFramebufferTexture(), but since they aren't layered, it
+       * is equivalent to calling glFramebufferTexture{1D,2D}().
+       */
+      *layered = GL_FALSE;
+      return true;
    }
 
-   /* The textarget, level, and zoffset parameters are only validated if
-    * texture is non-zero.
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "%s(invalid texture target %s)", caller,
+               _mesa_lookup_enum_by_nr(target));
+   return false;
+}
+
+
+/**
+ * Common code called by gl*FramebufferTextureLayer() to verify the texture
+ * target.
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_texture_target(struct gl_context *ctx, GLenum target,
+                     const char *caller)
+{
+   /* We're being called by glFramebufferTextureLayer().
+    * The only legal texture types for that function are 3D,
+    * cube-map, and 1D/2D/cube-map array textures.
+    *
+    * We don't need to check for GL_ARB_texture_cube_map_array because the
+    * application wouldn't have been able to create a texture with a
+    * GL_TEXTURE_CUBE_MAP_ARRAY target if the extension were not enabled.
     */
-   if (texture) {
-      GLboolean err = GL_TRUE;
-
-      texObj = _mesa_lookup_texture(ctx, texture);
-      if (texObj != NULL) {
-         if (textarget == 0) {
-            if (layered) {
-               /* We're being called by glFramebufferTexture() and textarget
-                * is not used.
-                */
-               switch (texObj->Target) {
-               case GL_TEXTURE_3D:
-               case GL_TEXTURE_1D_ARRAY_EXT:
-               case GL_TEXTURE_2D_ARRAY_EXT:
-               case GL_TEXTURE_CUBE_MAP:
-               case GL_TEXTURE_CUBE_MAP_ARRAY:
-               case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-                  err = false;
-                  break;
-               case GL_TEXTURE_1D:
-               case GL_TEXTURE_2D:
-               case GL_TEXTURE_RECTANGLE:
-               case GL_TEXTURE_2D_MULTISAMPLE:
-                  /* These texture types are valid to pass to
-                   * glFramebufferTexture(), but since they aren't layered, it
-                   * is equivalent to calling glFramebufferTexture{1D,2D}().
-                   */
-                  err = false;
-                  layered = false;
-                  textarget = texObj->Target;
-                  break;
-               default:
-                  err = true;
-                  break;
-               }
-            } else {
-               /* We're being called by glFramebufferTextureLayer() and
-                * textarget is not used.  The only legal texture types for
-                * that function are 3D and 1D/2D arrays textures.
-                */
-               err = (texObj->Target != GL_TEXTURE_3D) &&
-                  (texObj->Target != GL_TEXTURE_1D_ARRAY_EXT) &&
-                  (texObj->Target != GL_TEXTURE_2D_ARRAY_EXT) &&
-                  (texObj->Target != GL_TEXTURE_CUBE_MAP_ARRAY) &&
-                  (texObj->Target != GL_TEXTURE_2D_MULTISAMPLE_ARRAY);
-            }
-         }
-         else {
-            /* Make sure textarget is consistent with the texture's type */
-            err = (texObj->Target == GL_TEXTURE_CUBE_MAP)
-                ? !_mesa_is_cube_face(textarget)
-                : (texObj->Target != textarget);
-         }
+   switch (target) {
+   case GL_TEXTURE_3D:
+   case GL_TEXTURE_1D_ARRAY:
+   case GL_TEXTURE_2D_ARRAY:
+   case GL_TEXTURE_CUBE_MAP_ARRAY:
+   case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+      return true;
+   case GL_TEXTURE_CUBE_MAP:
+      /* DSA is always enabled in core profile, so no extension or version
+       * check is needed there.  In compatibility profile (reachable via
+       * _mesa_FramebufferTextureLayer) fall out to the error below.
+       */
+      if (ctx->API == API_OPENGL_CORE)
+         return true;
+      break;
+   }
+
+   _mesa_error(ctx, GL_INVALID_OPERATION,
+               "%s(invalid texture target %s)", caller,
+               _mesa_lookup_enum_by_nr(target));
+   return false;
+}
+
+
+/**
+ * Common code called by glFramebufferTexture*D() to verify the texture
+ * target.
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_textarget(struct gl_context *ctx, int dims, GLenum target,
+                GLenum textarget, const char *caller)
+{
+   bool err = false;
+
+   switch (dims) {
+   case 1:
+      switch (textarget) {
+      case GL_TEXTURE_1D:
+         break;
+      case GL_TEXTURE_1D_ARRAY:
+         err = !ctx->Extensions.EXT_texture_array;
+         break;
+      default:
+         err = true;
       }
-      else {
-         /* can't render to a non-existant texture */
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferTexture%s(non existant texture)",
-                     caller);
-         return;
+      break;
+   case 2:
+      switch (textarget) {
+      case GL_TEXTURE_2D:
+         break;
+      case GL_TEXTURE_RECTANGLE:
+         err = _mesa_is_gles(ctx)
+            || !ctx->Extensions.NV_texture_rectangle;
+         break;
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+      case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+         err = !ctx->Extensions.ARB_texture_cube_map;
+         break;
+      case GL_TEXTURE_2D_ARRAY:
+         err = (_mesa_is_gles(ctx) && ctx->Version < 30)
+               || !ctx->Extensions.EXT_texture_array;
+         break;
+      case GL_TEXTURE_2D_MULTISAMPLE:
+      case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
+         err = _mesa_is_gles(ctx)
+               || !ctx->Extensions.ARB_texture_multisample;
+         break;
+      default:
+         err = true;
       }
+      break;
+   case 3:
+      if (textarget != GL_TEXTURE_3D)
+         err = true;
+      break;
+   default:
+      err = true;
+   }
 
-      if (err) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferTexture%s(texture target mismatch)",
-                     caller);
-         return;
-      }
+   if (err) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(invalid textarget %s)",
+                  caller, _mesa_lookup_enum_by_nr(textarget));
+      return false;
+   }
 
-      if (texObj->Target == GL_TEXTURE_3D) {
-         const GLuint maxSize = 1 << (ctx->Const.Max3DTextureLevels - 1);
-         if (zoffset >= maxSize) {
-            _mesa_error(ctx, GL_INVALID_VALUE,
-                        "glFramebufferTexture%s(zoffset)", caller);
-            return;
-         }
+   /* Make sure textarget is consistent with the texture's type */
+   err = (target == GL_TEXTURE_CUBE_MAP) ?
+          !_mesa_is_cube_face(textarget): (target != textarget);
+
+   if (err) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(mismatched texture target)", caller);
+      return false;
+   }
+
+   return true;
+}
+
+
+/**
+ * Common code called by gl*FramebufferTextureLayer() and
+ * glFramebufferTexture3D() to validate the layer.
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_layer(struct gl_context *ctx, GLenum target, GLint layer,
+            const char *caller)
+{
+   /* Page 306 (page 328 of the PDF) of the OpenGL 4.5 (Core Profile)
+    * spec says:
+    *
+    *    "An INVALID_VALUE error is generated if texture is non-zero
+    *     and layer is negative."
+    */
+   if (layer < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(layer %d < 0)", caller, layer);
+      return false;
+   }
+
+   if (target == GL_TEXTURE_3D) {
+      const GLuint maxSize = 1 << (ctx->Const.Max3DTextureLevels - 1);
+      if (layer >= maxSize) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(invalid layer %d)", caller, layer);
+         return false;
       }
-      else if ((texObj->Target == GL_TEXTURE_1D_ARRAY_EXT) ||
-               (texObj->Target == GL_TEXTURE_2D_ARRAY_EXT) ||
-               (texObj->Target == GL_TEXTURE_CUBE_MAP_ARRAY) ||
-               (texObj->Target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY)) {
-         if (zoffset >= ctx->Const.MaxArrayTextureLayers) {
-            _mesa_error(ctx, GL_INVALID_VALUE,
-                        "glFramebufferTexture%s(layer)", caller);
-            return;
-         }
+   }
+   else if ((target == GL_TEXTURE_1D_ARRAY) ||
+            (target == GL_TEXTURE_2D_ARRAY) ||
+            (target == GL_TEXTURE_CUBE_MAP_ARRAY) ||
+            (target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY)) {
+      if (layer >= ctx->Const.MaxArrayTextureLayers) {
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "%s(layer %d >= GL_MAX_ARRAY_TEXTURE_LAYERS)",
+                     caller, layer);
+         return false;
       }
-
-      maxLevelsTarget = textarget ? textarget : texObj->Target;
-      if ((level < 0) ||
-          (level >= _mesa_max_texture_levels(ctx, maxLevelsTarget))) {
+   }
+   else if (target == GL_TEXTURE_CUBE_MAP) {
+      if (layer >= 6) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glFramebufferTexture%s(level)", caller);
-         return;
+                     "%s(layer %d >= 6)", caller, layer);
+         return false;
       }
    }
 
+   return true;
+}
+
+
+/**
+ * Common code called by all gl*FramebufferTexture*() entry points to verify
+ * the level.
+ *
+ * \return true if no errors, false if errors
+ */
+static bool
+check_level(struct gl_context *ctx, GLenum target, GLint level,
+            const char *caller)
+{
+   if ((level < 0) ||
+       (level >= _mesa_max_texture_levels(ctx, target))) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(invalid level %d)", caller, level);
+      return false;
+   }
+
+   return true;
+}
+
+
+void
+_mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb,
+                          GLenum attachment,
+                          struct gl_texture_object *texObj, GLenum textarget,
+                          GLint level, GLuint layer, GLboolean layered,
+                          const char *caller)
+{
+   struct gl_renderbuffer_attachment *att;
+
+   /* The window-system framebuffer object is immutable */
+   if (_mesa_is_winsys_fbo(fb)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION, "%s(window-system framebuffer)",
+                  caller);
+      return;
+   }
+
+   /* Not a hash lookup, so we can afford to get the attachment here. */
    att = get_attachment(ctx, fb, attachment);
    if (att == NULL) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferTexture%s(attachment)", caller);
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller,
+                  _mesa_lookup_enum_by_nr(attachment));
       return;
    }
 
@@ -2637,7 +3087,7 @@ framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
           level == fb->Attachment[BUFFER_STENCIL].TextureLevel &&
           _mesa_tex_target_to_face(textarget) ==
           fb->Attachment[BUFFER_STENCIL].CubeMapFace &&
-          zoffset == fb->Attachment[BUFFER_STENCIL].Zoffset) {
+          layer == fb->Attachment[BUFFER_STENCIL].Zoffset) {
          /* The texture object is already attached to the stencil attachment
           * point. Don't create a new renderbuffer; just reuse the stencil
           * attachment's. This is required to prevent a GL error in
@@ -2650,13 +3100,14 @@ framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
                  level == fb->Attachment[BUFFER_DEPTH].TextureLevel &&
                  _mesa_tex_target_to_face(textarget) ==
                  fb->Attachment[BUFFER_DEPTH].CubeMapFace &&
-                 zoffset == fb->Attachment[BUFFER_DEPTH].Zoffset) {
+                 layer == fb->Attachment[BUFFER_DEPTH].Zoffset) {
          /* As above, but with depth and stencil transposed. */
          reuse_framebuffer_texture_attachment(fb, BUFFER_STENCIL,
                                               BUFFER_DEPTH);
       } else {
          set_texture_attachment(ctx, fb, att, texObj, textarget,
-                                      level, zoffset, layered);
+                                level, layer, layered);
+
          if (attachment == GL_DEPTH_STENCIL_ATTACHMENT) {
             /* Above we created a new renderbuffer and attached it to the
              * depth attachment point. Now attach it to the stencil attachment
@@ -2692,116 +3143,157 @@ framebuffer_texture(struct gl_context *ctx, const char *caller, GLenum target,
 }
 
 
-void GLAPIENTRY
-_mesa_FramebufferTexture1D(GLenum target, GLenum attachment,
-                           GLenum textarget, GLuint texture, GLint level)
+static void
+framebuffer_texture_with_dims(int dims, GLenum target,
+                              GLenum attachment, GLenum textarget,
+                              GLuint texture, GLint level, GLint layer,
+                              const char *caller)
 {
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+
+   /* Get the framebuffer object */
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid target %s)", caller,
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
 
-   if (texture != 0) {
-      GLboolean error;
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, false, caller, &texObj))
+      return;
 
-      switch (textarget) {
-      case GL_TEXTURE_1D:
-         error = GL_FALSE;
-         break;
-      case GL_TEXTURE_1D_ARRAY:
-         error = !ctx->Extensions.EXT_texture_array;
-         break;
-      default:
-         error = GL_TRUE;
-      }
+   if (texObj) {
+      if (!check_textarget(ctx, dims, texObj->Target, textarget, caller))
+         return;
 
-      if (error) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferTexture1D(textarget=%s)",
-                     _mesa_lookup_enum_by_nr(textarget));
+      if ((dims == 3) && !check_layer(ctx, texObj->Target, layer, caller))
          return;
+      /* level, like textarget and layer, is only checked if texture != 0 */
+      if (!check_level(ctx, textarget, level, caller))
+         return;
-      }
    }
 
-   framebuffer_texture(ctx, "1D", target, attachment, textarget, texture,
-                       level, 0, GL_FALSE);
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, textarget, level,
+                             layer, GL_FALSE, caller);
 }
 
 
 void GLAPIENTRY
-_mesa_FramebufferTexture2D(GLenum target, GLenum attachment,
+_mesa_FramebufferTexture1D(GLenum target, GLenum attachment,
                            GLenum textarget, GLuint texture, GLint level)
 {
-   GET_CURRENT_CONTEXT(ctx);
-
-   if (texture != 0) {
-      GLboolean error;
-
-      switch (textarget) {
-      case GL_TEXTURE_2D:
-         error = GL_FALSE;
-         break;
-      case GL_TEXTURE_RECTANGLE:
-         error = _mesa_is_gles(ctx)
-            || !ctx->Extensions.NV_texture_rectangle;
-         break;
-      case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-      case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-      case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-      case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-      case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-         error = !ctx->Extensions.ARB_texture_cube_map;
-         break;
-      case GL_TEXTURE_2D_ARRAY:
-         error = (_mesa_is_gles(ctx) && ctx->Version < 30)
-            || !ctx->Extensions.EXT_texture_array;
-         break;
-      case GL_TEXTURE_2D_MULTISAMPLE:
-      case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
-         error = _mesa_is_gles(ctx)
-            || !ctx->Extensions.ARB_texture_multisample;
-         break;
-      default:
-         error = GL_TRUE;
-      }
+   framebuffer_texture_with_dims(1, target, attachment, textarget, texture,
+                                 level, 0, "glFramebufferTexture1D");
+}
 
-      if (error) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferTexture2D(textarget=%s)",
-                     _mesa_lookup_enum_by_nr(textarget));
-         return;
-      }
-   }
 
-   framebuffer_texture(ctx, "2D", target, attachment, textarget, texture,
-                       level, 0, GL_FALSE);
+void GLAPIENTRY
+_mesa_FramebufferTexture2D(GLenum target, GLenum attachment,
+                           GLenum textarget, GLuint texture, GLint level)
+{
+   framebuffer_texture_with_dims(2, target, attachment, textarget, texture,
+                                 level, 0, "glFramebufferTexture2D");
 }
 
 
 void GLAPIENTRY
 _mesa_FramebufferTexture3D(GLenum target, GLenum attachment,
                            GLenum textarget, GLuint texture,
-                           GLint level, GLint zoffset)
+                           GLint level, GLint layer)
+{
+   framebuffer_texture_with_dims(3, target, attachment, textarget, texture,
+                                 level, layer, "glFramebufferTexture3D");
+}
+
+
+void GLAPIENTRY
+_mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
+                              GLuint texture, GLint level, GLint layer)
 {
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+   GLenum textarget = 0;
 
-   if ((texture != 0) && (textarget != GL_TEXTURE_3D)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glFramebufferTexture3D(textarget)");
+   const char *func = "glFramebufferTextureLayer";
+
+   /* Get the framebuffer object */
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glFramebufferTextureLayer(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
       return;
    }
 
-   framebuffer_texture(ctx, "3D", target, attachment, textarget, texture,
-                       level, zoffset, GL_FALSE);
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, false, func, &texObj))
+      return;
+
+   if (texObj) {
+      if (!check_texture_target(ctx, texObj->Target, func))
+         return;
+
+      if (!check_layer(ctx, texObj->Target, layer, func))
+         return;
+
+      if (!check_level(ctx, texObj->Target, level, func))
+         return;
+
+      if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
+         assert(layer >= 0 && layer < 6);
+         textarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X + layer;
+         layer = 0;
+      }
+   }
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, textarget, level,
+                             layer, GL_FALSE, func);
 }
 
 
 void GLAPIENTRY
-_mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
-                              GLuint texture, GLint level, GLint layer)
+_mesa_NamedFramebufferTextureLayer(GLuint framebuffer, GLenum attachment,
+                                   GLuint texture, GLint level, GLint layer)
 {
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+   GLenum textarget = 0;
+
+   const char *func = "glNamedFramebufferTextureLayer";
+
+   /* Get the framebuffer object */
+   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
+   if (!fb)
+      return;
+
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, false, func, &texObj))
+      return;
+
+   if (texObj) {
+      if (!check_texture_target(ctx, texObj->Target, func))
+         return;
 
-   framebuffer_texture(ctx, "Layer", target, attachment, 0, texture,
-                       level, layer, GL_FALSE);
+      if (!check_layer(ctx, texObj->Target, layer, func))
+         return;
+
+      if (!check_level(ctx, texObj->Target, level, func))
+         return;
+
+      if (texObj->Target == GL_TEXTURE_CUBE_MAP) {
+         assert(layer >= 0 && layer < 6);
+         textarget = GL_TEXTURE_CUBE_MAP_POSITIVE_X + layer;
+         layer = 0;
+      }
+   }
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, textarget, level,
+                             layer, GL_FALSE, func);
 }
 
 
@@ -2810,82 +3302,115 @@ _mesa_FramebufferTexture(GLenum target, GLenum attachment,
                          GLuint texture, GLint level)
 {
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+   GLboolean layered = GL_FALSE; /* stays false when texture == 0 */
 
-   if (_mesa_has_geometry_shaders(ctx)) {
-      framebuffer_texture(ctx, "", target, attachment, 0, texture,
-                          level, 0, GL_TRUE);
-   } else {
+   const char *func = "glFramebufferTexture";
+
+   if (!_mesa_has_geometry_shaders(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "unsupported function (glFramebufferTexture) called");
+      return;
    }
+
+   /* Get the framebuffer object */
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glFramebufferTexture(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
+
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, true, func, &texObj))
+      return;
+
+   if (texObj) {
+      if (!check_layered_texture_target(ctx, texObj->Target, func, &layered))
+         return;
+
+      if (!check_level(ctx, texObj->Target, level, func))
+         return;
+   }
+
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, 0, level,
+                             0, layered, func);
 }
 
 
 void GLAPIENTRY
-_mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
-                              GLenum renderbufferTarget,
-                              GLuint renderbuffer)
+_mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment,
+                              GLuint texture, GLint level)
 {
-   struct gl_renderbuffer_attachment *att;
-   struct gl_framebuffer *fb;
-   struct gl_renderbuffer *rb;
    GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+   struct gl_texture_object *texObj;
+   GLboolean layered = GL_FALSE; /* stays false when texture == 0 */
 
-   fb = get_framebuffer_target(ctx, target);
-   if (!fb) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferRenderbuffer(target)");
+   const char *func = "glNamedFramebufferTexture";
+
+   if (!_mesa_has_geometry_shaders(ctx)) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "unsupported function (glNamedFramebufferTexture) called");
       return;
    }
 
-   if (renderbufferTarget != GL_RENDERBUFFER_EXT) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferRenderbuffer(renderbufferTarget)");
+   /* Get the framebuffer object */
+   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer, func);
+   if (!fb)
       return;
+
+   /* Get the texture object */
+   if (!get_texture_for_framebuffer(ctx, texture, true, func, &texObj))
+      return;
+
+   if (texObj) {
+      if (!check_layered_texture_target(ctx, texObj->Target, func,
+                                        &layered))
+         return;
+
+      if (!check_level(ctx, texObj->Target, level, func))
+         return;
    }
 
+   _mesa_framebuffer_texture(ctx, fb, attachment, texObj, 0, level,
+                             0, layered, func);
+}
+
+
+void
+_mesa_framebuffer_renderbuffer(struct gl_context *ctx,
+                               struct gl_framebuffer *fb,
+                               GLenum attachment,
+                               struct gl_renderbuffer *rb,
+                               const char *func)
+{
+   struct gl_renderbuffer_attachment *att;
+
    if (_mesa_is_winsys_fbo(fb)) {
       /* Can't attach new renderbuffers to a window system framebuffer */
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glFramebufferRenderbuffer");
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "%s(window-system framebuffer)", func);
       return;
    }
 
    att = get_attachment(ctx, fb, attachment);
    if (att == NULL) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glFramebufferRenderbuffer(invalid attachment %s)",
+                  "%s(invalid attachment %s)", func,
                   _mesa_lookup_enum_by_nr(attachment));
       return;
    }
 
-   if (renderbuffer) {
-      rb = _mesa_lookup_renderbuffer(ctx, renderbuffer);
-      if (!rb) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferRenderbuffer(non-existant"
-                     " renderbuffer %u)", renderbuffer);
-         return;
-      }
-      else if (rb == &DummyRenderbuffer) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferRenderbuffer(renderbuffer %u)",
-                     renderbuffer);
-         return;
-      }
-   }
-   else {
-      /* remove renderbuffer attachment */
-      rb = NULL;
-   }
-
    if (attachment == GL_DEPTH_STENCIL_ATTACHMENT &&
        rb && rb->Format != MESA_FORMAT_NONE) {
       /* make sure the renderbuffer is a depth/stencil format */
       const GLenum baseFormat = _mesa_get_format_base_format(rb->Format);
       if (baseFormat != GL_DEPTH_STENCIL) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glFramebufferRenderbuffer(renderbuffer"
-                     " is not DEPTH_STENCIL format)");
+                     "%s(renderbuffer is not DEPTH_STENCIL format)", func);
          return;
       }
    }
@@ -2903,24 +3428,94 @@ _mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
 
 
 void GLAPIENTRY
-_mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
-                                          GLenum pname, GLint *params)
+_mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
+                              GLenum renderbuffertarget,
+                              GLuint renderbuffer)
 {
-   const struct gl_renderbuffer_attachment *att;
-   struct gl_framebuffer *buffer;
-   GLenum err;
+   struct gl_framebuffer *fb;
+   struct gl_renderbuffer *rb;
    GET_CURRENT_CONTEXT(ctx);
 
-   /* The error differs in GL and GLES. */
-   err = _mesa_is_desktop_gl(ctx) ? GL_INVALID_OPERATION : GL_INVALID_ENUM;
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glFramebufferRenderbuffer(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
 
-   buffer = get_framebuffer_target(ctx, target);
-   if (!buffer) {
+   if (renderbuffertarget != GL_RENDERBUFFER) {
       _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glGetFramebufferAttachmentParameteriv(target)");
+                  "glFramebufferRenderbuffer(renderbuffertarget is not "
+                  "GL_RENDERBUFFER)");
       return;
    }
 
+   if (renderbuffer) {
+      rb = _mesa_lookup_renderbuffer_err(ctx, renderbuffer,
+                                         "glFramebufferRenderbuffer");
+      if (!rb)
+         return;
+   }
+   else {
+      /* remove renderbuffer attachment */
+      rb = NULL;
+   }
+
+   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb,
+                                  "glFramebufferRenderbuffer");
+}
+
+
+void GLAPIENTRY
+_mesa_NamedFramebufferRenderbuffer(GLuint framebuffer, GLenum attachment,
+                                   GLenum renderbuffertarget,
+                                   GLuint renderbuffer)
+{
+   struct gl_framebuffer *fb;
+   struct gl_renderbuffer *rb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                     "glNamedFramebufferRenderbuffer");
+   if (!fb)
+      return;
+
+   if (renderbuffertarget != GL_RENDERBUFFER) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glNamedFramebufferRenderbuffer(renderbuffertarget is not "
+                  "GL_RENDERBUFFER)");
+      return;
+   }
+
+   if (renderbuffer) {
+      rb = _mesa_lookup_renderbuffer_err(ctx, renderbuffer,
+                                         "glNamedFramebufferRenderbuffer");
+      if (!rb)
+         return;
+   }
+   else {
+      /* remove renderbuffer attachment */
+      rb = NULL;
+   }
+
+   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb,
+                                  "glNamedFramebufferRenderbuffer");
+}
+
+
+void
+_mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
+                                           struct gl_framebuffer *buffer,
+                                           GLenum attachment, GLenum pname,
+                                           GLint *params, const char *caller)
+{
+   const struct gl_renderbuffer_attachment *att;
+   GLenum err;
+
+   /* The error differs in GL and GLES. */
+   err = _mesa_is_desktop_gl(ctx) ? GL_INVALID_OPERATION : GL_INVALID_ENUM;
+
    if (_mesa_is_winsys_fbo(buffer)) {
       /* Page 126 (page 136 of the PDF) of the OpenGL ES 2.0.25 spec
        * says:
@@ -2936,14 +3531,15 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
            !ctx->Extensions.ARB_framebuffer_object)
           && !_mesa_is_gles3(ctx)) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetFramebufferAttachmentParameteriv(bound FBO = 0)");
+                     "%s(window-system framebuffer)", caller);
          return;
       }
 
       if (_mesa_is_gles3(ctx) && attachment != GL_BACK &&
           attachment != GL_DEPTH && attachment != GL_STENCIL) {
          _mesa_error(ctx, GL_INVALID_ENUM,
-                     "glGetFramebufferAttachmentParameteriv(attachment)");
+                     "%s(invalid attachment %s)", caller,
+                     _mesa_lookup_enum_by_nr(attachment));
          return;
       }
       /* the default / window-system FBO */
@@ -2955,8 +3551,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
    }
 
    if (att == NULL) {
-      _mesa_error(ctx, GL_INVALID_ENUM,
-                  "glGetFramebufferAttachmentParameteriv(attachment)");
+      _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", caller,
+                  _mesa_lookup_enum_by_nr(attachment));
       return;
    }
 
@@ -2970,9 +3566,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
           *    attachment, since it does not have a single format."
           */
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetFramebufferAttachmentParameteriv("
-                     "GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE"
-                     " is invalid for depth+stencil attachment)");
+                     "%s(GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE"
+                     " is invalid for depth+stencil attachment)", caller);
          return;
       }
       /* the depth and stencil attachments must point to the same buffer */
@@ -2980,8 +3575,7 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
       stencilAtt = get_attachment(ctx, buffer, GL_STENCIL_ATTACHMENT);
       if (depthAtt->Renderbuffer != stencilAtt->Renderbuffer) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glGetFramebufferAttachmentParameteriv(DEPTH/STENCIL"
-                     " attachments differ)");
+                     "%s(DEPTH/STENCIL attachments differ)", caller);
          return;
       }
    }
@@ -3014,8 +3608,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          *params = att->TextureLevel;
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else {
          goto invalid_pname_enum;
@@ -3031,8 +3625,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          }
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else {
          goto invalid_pname_enum;
@@ -3042,8 +3636,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
       if (ctx->API == API_OPENGLES) {
          goto invalid_pname_enum;
       } else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       } else if (att->Type == GL_TEXTURE) {
          if (att->Texture && (att->Texture->Target == GL_TEXTURE_3D ||
              att->Texture->Target == GL_TEXTURE_2D_ARRAY)) {
@@ -3064,8 +3658,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          goto invalid_pname_enum;
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else {
          if (ctx->Extensions.EXT_framebuffer_sRGB) {
@@ -3087,8 +3681,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          goto invalid_pname_enum;
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else {
          mesa_format format = att->Renderbuffer->Format;
@@ -3103,9 +3697,9 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          if (_mesa_is_gles3(ctx) &&
              attachment == GL_DEPTH_STENCIL_ATTACHMENT) {
             _mesa_error(ctx, GL_INVALID_OPERATION,
-                        "glGetFramebufferAttachmentParameteriv(cannot query "
+                        "%s(cannot query "
                         "GL_FRAMEBUFFER_ATTACHMENT_COMPONENT_TYPE of "
-                        "GL_DEPTH_STENCIL_ATTACHMENT");
+                        "GL_DEPTH_STENCIL_ATTACHMENT)", caller);
             return;
          }
 
@@ -3139,8 +3733,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
          goto invalid_pname_enum;
       }
       else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       }
       else if (att->Texture) {
          const struct gl_texture_image *texImage =
@@ -3159,8 +3753,7 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
                                       att->Renderbuffer->Format);
       }
       else {
-         _mesa_problem(ctx, "glGetFramebufferAttachmentParameterivEXT:"
-                       " invalid FBO attachment structure");
+         _mesa_problem(ctx, "%s: invalid FBO attachment structure", caller);
       }
       return;
    case GL_FRAMEBUFFER_ATTACHMENT_LAYERED:
@@ -3169,8 +3762,8 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
       } else if (att->Type == GL_TEXTURE) {
          *params = att->Layered;
       } else if (att->Type == GL_NONE) {
-         _mesa_error(ctx, err,
-                     "glGetFramebufferAttachmentParameteriv(pname)");
+         _mesa_error(ctx, err, "%s(invalid pname %s)", caller,
+                     _mesa_lookup_enum_by_nr(pname));
       } else {
          goto invalid_pname_enum;
       }
@@ -3182,30 +3775,144 @@ _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
    return;
 
 invalid_pname_enum:
-   _mesa_error(ctx, GL_INVALID_ENUM,
-               "glGetFramebufferAttachmentParameteriv(pname)");
+   _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid pname %s)", caller,
+               _mesa_lookup_enum_by_nr(pname));
    return;
 }
 
 
+void GLAPIENTRY
+_mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
+                                          GLenum pname, GLint *params)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *buffer;
+
+   buffer = get_framebuffer_target(ctx, target);
+   if (!buffer) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glGetFramebufferAttachmentParameteriv(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
+
+   _mesa_get_framebuffer_attachment_parameter(ctx, buffer, attachment, pname,
+                                              params,
+                                    "glGetFramebufferAttachmentParameteriv");
+}
+
+
+void GLAPIENTRY
+_mesa_GetNamedFramebufferAttachmentParameteriv(GLuint framebuffer,
+                                               GLenum attachment,
+                                               GLenum pname, GLint *params)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *buffer;
+
+   if (framebuffer) {
+      buffer = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                              "glGetNamedFramebufferAttachmentParameteriv");
+      if (!buffer)
+         return;
+   }
+   else {
+      /*
+       * Section 9.2 Binding and Managing Framebuffer Objects of the OpenGL
+       * 4.5 core spec (30.10.2014, PDF page 314):
+       *    "If framebuffer is zero, then the default draw framebuffer is
+       *    queried."
+       */
+      buffer = ctx->WinSysDrawBuffer;
+   }
+
+   _mesa_get_framebuffer_attachment_parameter(ctx, buffer, attachment, pname,
+                                              params,
+                              "glGetNamedFramebufferAttachmentParameteriv");
+}
+
+/* glNamedFramebufferParameteri: resolve the FBO by name, then forward. */
+void GLAPIENTRY
+_mesa_NamedFramebufferParameteri(GLuint framebuffer, GLenum pname,
+                                 GLint param)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb = NULL;
+
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glNamedFramebufferParameteri("
+                  "ARB_framebuffer_no_attachments not implemented)");
+      return;
+   }
+
+   fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                     "glNamedFramebufferParameteri");
+
+   if (fb) {
+      framebuffer_parameteri(ctx, fb, pname, param,
+                             "glNamedFramebufferParameteri");
+   }
+}
+
+/* glGetNamedFramebufferParameteriv: name 0 selects WinSysDrawBuffer. */
+void GLAPIENTRY
+_mesa_GetNamedFramebufferParameteriv(GLuint framebuffer, GLenum pname,
+                                     GLint *param)
+{
+   GET_CURRENT_CONTEXT(ctx);
+   struct gl_framebuffer *fb;
+
+   if (!ctx->Extensions.ARB_framebuffer_no_attachments) {
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glGetNamedFramebufferParameteriv("
+                  "ARB_framebuffer_no_attachments not implemented)");
+      return;
+   }
+
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glGetNamedFramebufferParameteriv");
+   } else {
+      fb = ctx->WinSysDrawBuffer;
+   }
+
+   if (fb) {
+      get_framebuffer_parameteriv(ctx, fb, pname, param,
+                                  "glGetNamedFramebufferParameteriv");
+   }
+}
+
+
 static void
-invalidate_framebuffer_storage(GLenum target, GLsizei numAttachments,
+invalidate_framebuffer_storage(struct gl_context *ctx,
+                               struct gl_framebuffer *fb,
+                               GLsizei numAttachments,
                                const GLenum *attachments, GLint x, GLint y,
                                GLsizei width, GLsizei height, const char *name)
 {
    int i;
-   struct gl_framebuffer *fb;
-   GET_CURRENT_CONTEXT(ctx);
 
-   fb = get_framebuffer_target(ctx, target);
-   if (!fb) {
-      _mesa_error(ctx, GL_INVALID_ENUM, "%s(target)", name);
+   /* Section 17.4 Whole Framebuffer Operations of the OpenGL 4.5 Core
+    * Spec (2.2.2015, PDF page 522) says:
+    *    "An INVALID_VALUE error is generated if numAttachments, width, or
+    *    height is negative."
+    */
+   if (numAttachments < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(numAttachments < 0)", name);
       return;
    }
 
-   if (numAttachments < 0) {
+   if (width < 0) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "%s(numAttachments < 0)", name);
+                  "%s(width < 0)", name);
+      return;
+   }
+
+   if (height < 0) {
+      _mesa_error(ctx, GL_INVALID_VALUE,
+                  "%s(height < 0)", name);
       return;
    }
 
@@ -3301,7 +4008,8 @@ invalidate_framebuffer_storage(GLenum target, GLsizei numAttachments,
    return;
 
 invalid_enum:
-   _mesa_error(ctx, GL_INVALID_ENUM, "%s(attachment)", name);
+   _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid attachment %s)", name,
+               _mesa_lookup_enum_by_nr(attachments[i]));
    return;
 }
 
@@ -3311,16 +4019,67 @@ _mesa_InvalidateSubFramebuffer(GLenum target, GLsizei numAttachments,
                                const GLenum *attachments, GLint x, GLint y,
                                GLsizei width, GLsizei height)
 {
-   invalidate_framebuffer_storage(target, numAttachments, attachments,
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glInvalidateSubFramebuffer(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
+
+   invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments,
                                   x, y, width, height,
                                   "glInvalidateSubFramebuffer");
 }
 
 
 void GLAPIENTRY
+_mesa_InvalidateNamedFramebufferSubData(GLuint framebuffer,
+                                        GLsizei numAttachments,
+                                        const GLenum *attachments,
+                                        GLint x, GLint y,
+                                        GLsizei width, GLsizei height)
+{
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* The OpenGL 4.5 core spec (02.02.2015) says (in Section 17.4 Whole
+    * Framebuffer Operations, PDF page 522): "If framebuffer is zero, the
+    * default draw framebuffer is affected."
+    */
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glInvalidateNamedFramebufferSubData");
+      if (!fb)
+         return;
+   }
+   else
+      fb = ctx->WinSysDrawBuffer;
+
+   invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments,
+                                  x, y, width, height,
+                                  "glInvalidateNamedFramebufferSubData");
+}
+
+
+void GLAPIENTRY
 _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
                             const GLenum *attachments)
 {
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   fb = get_framebuffer_target(ctx, target);
+   if (!fb) {
+      _mesa_error(ctx, GL_INVALID_ENUM,
+                  "glInvalidateFramebuffer(invalid target %s)",
+                  _mesa_lookup_enum_by_nr(target));
+      return;
+   }
+
    /* The GL_ARB_invalidate_subdata spec says:
     *
     *     "The command
@@ -3333,7 +4092,7 @@ _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
     *     <width>, <height> equal to 0, 0, <MAX_VIEWPORT_DIMS[0]>,
     *     <MAX_VIEWPORT_DIMS[1]> respectively."
     */
-   invalidate_framebuffer_storage(target, numAttachments, attachments,
+   invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments,
                                   0, 0,
                                   MAX_VIEWPORT_WIDTH, MAX_VIEWPORT_HEIGHT,
                                   "glInvalidateFramebuffer");
@@ -3341,6 +4100,46 @@ _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
 
 
 void GLAPIENTRY
+_mesa_InvalidateNamedFramebufferData(GLuint framebuffer,
+                                     GLsizei numAttachments,
+                                     const GLenum *attachments)
+{
+   struct gl_framebuffer *fb;
+   GET_CURRENT_CONTEXT(ctx);
+
+   /* The OpenGL 4.5 core spec (02.02.2015) says (in Section 17.4 Whole
+    * Framebuffer Operations, PDF page 522): "If framebuffer is zero, the
+    * default draw framebuffer is affected."
+    */
+   if (framebuffer) {
+      fb = _mesa_lookup_framebuffer_err(ctx, framebuffer,
+                                        "glInvalidateNamedFramebufferData");
+      if (!fb)
+         return;
+   }
+   else
+      fb = ctx->WinSysDrawBuffer;
+
+   /* The GL_ARB_invalidate_subdata spec says:
+    *
+    *     "The command
+    *
+    *        void InvalidateFramebuffer(enum target,
+    *                                   sizei numAttachments,
+    *                                   const enum *attachments);
+    *
+    *     is equivalent to the command InvalidateSubFramebuffer with <x>, <y>,
+    *     <width>, <height> equal to 0, 0, <MAX_VIEWPORT_DIMS[0]>,
+    *     <MAX_VIEWPORT_DIMS[1]> respectively."
+    */
+   invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments,
+                                  0, 0,
+                                  MAX_VIEWPORT_WIDTH, MAX_VIEWPORT_HEIGHT,
+                                  "glInvalidateNamedFramebufferData");
+}
+
+
+void GLAPIENTRY
 _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments,
                             const GLenum *attachments)
 {
diff --git a/src/mesa/main/fbobject.h b/src/mesa/main/fbobject.h
index 61aa1f5..8dad0ff 100644
--- a/src/mesa/main/fbobject.h
+++ b/src/mesa/main/fbobject.h
@@ -64,9 +64,17 @@ _mesa_get_incomplete_framebuffer(void);
 extern struct gl_renderbuffer *
 _mesa_lookup_renderbuffer(struct gl_context *ctx, GLuint id);
 
+extern struct gl_renderbuffer *
+_mesa_lookup_renderbuffer_err(struct gl_context *ctx, GLuint id,
+                              const char *func);
+
 extern struct gl_framebuffer *
 _mesa_lookup_framebuffer(struct gl_context *ctx, GLuint id);
 
+extern struct gl_framebuffer *
+_mesa_lookup_framebuffer_err(struct gl_context *ctx, GLuint id,
+                             const char *func);
+
 
 void
 _mesa_update_texture_renderbuffer(struct gl_context *ctx,
@@ -74,9 +82,17 @@ _mesa_update_texture_renderbuffer(struct gl_context *ctx,
                                   struct gl_renderbuffer_attachment *att);
 
 extern void
+_mesa_FramebufferRenderbuffer_sw(struct gl_context *ctx,
+                                 struct gl_framebuffer *fb,
+                                 GLenum attachment,
+                                 struct gl_renderbuffer *rb);
+
+extern void
 _mesa_framebuffer_renderbuffer(struct gl_context *ctx,
                                struct gl_framebuffer *fb,
-                               GLenum attachment, struct gl_renderbuffer *rb);
+                               GLenum attachment,
+                               struct gl_renderbuffer *rb,
+                               const char *func);
 
 extern void
 _mesa_validate_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb);
@@ -99,6 +115,24 @@ _mesa_detach_renderbuffer(struct gl_context *ctx,
                           struct gl_framebuffer *fb,
                           const void *att);
 
+extern void
+_mesa_framebuffer_texture(struct gl_context *ctx, struct gl_framebuffer *fb,
+                          GLenum attachment,
+                          struct gl_texture_object *texObj, GLenum textarget,
+                          GLint level, GLuint layer, GLboolean layered,
+                          const char *caller);
+
+extern GLenum
+_mesa_check_framebuffer_status(struct gl_context *ctx,
+                               struct gl_framebuffer *fb);
+
+extern void
+_mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx,
+                                           struct gl_framebuffer *buffer,
+                                           GLenum attachment, GLenum pname,
+                                           GLint *params, const char *caller);
+
+
 extern GLboolean GLAPIENTRY
 _mesa_IsRenderbuffer(GLuint renderbuffer);
 
@@ -165,9 +199,15 @@ _mesa_DeleteFramebuffers(GLsizei n, const GLuint *framebuffers);
 extern void GLAPIENTRY
 _mesa_GenFramebuffers(GLsizei n, GLuint *framebuffers);
 
+extern void GLAPIENTRY
+_mesa_CreateFramebuffers(GLsizei n, GLuint *framebuffers);
+
 extern GLenum GLAPIENTRY
 _mesa_CheckFramebufferStatus(GLenum target);
 
+extern GLenum GLAPIENTRY
+_mesa_CheckNamedFramebufferStatus(GLuint framebuffer, GLenum target);
+
 extern void GLAPIENTRY
 _mesa_FramebufferTexture1D(GLenum target, GLenum attachment,
                               GLenum textarget, GLuint texture, GLint level);
@@ -179,24 +219,49 @@ _mesa_FramebufferTexture2D(GLenum target, GLenum attachment,
 extern void GLAPIENTRY
 _mesa_FramebufferTexture3D(GLenum target, GLenum attachment,
                               GLenum textarget, GLuint texture,
-                              GLint level, GLint zoffset);
+                              GLint level, GLint layer);
 
 extern void GLAPIENTRY
 _mesa_FramebufferTextureLayer(GLenum target, GLenum attachment,
                                  GLuint texture, GLint level, GLint layer);
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferTextureLayer(GLuint framebuffer, GLenum attachment,
+                                   GLuint texture, GLint level, GLint layer);
+
+extern void GLAPIENTRY
 _mesa_FramebufferTexture(GLenum target, GLenum attachment,
                          GLuint texture, GLint level);
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferTexture(GLuint framebuffer, GLenum attachment,
+                              GLuint texture, GLint level);
+
+extern void GLAPIENTRY
 _mesa_FramebufferRenderbuffer(GLenum target, GLenum attachment,
                                  GLenum renderbuffertarget,
                                  GLuint renderbuffer);
 
 extern void GLAPIENTRY
+_mesa_NamedFramebufferRenderbuffer(GLuint framebuffer, GLenum attachment,
+                                   GLenum renderbuffertarget,
+                                   GLuint renderbuffer);
+
+extern void GLAPIENTRY
 _mesa_GetFramebufferAttachmentParameteriv(GLenum target, GLenum attachment,
                                              GLenum pname, GLint *params);
+extern void GLAPIENTRY
+_mesa_GetNamedFramebufferAttachmentParameteriv(GLuint framebuffer,
+                                               GLenum attachment,
+                                               GLenum pname, GLint *params);
+
+extern void GLAPIENTRY
+_mesa_NamedFramebufferParameteri(GLuint framebuffer, GLenum pname,
+                                 GLint param);
+
+extern void GLAPIENTRY
+_mesa_GetNamedFramebufferParameteriv(GLuint framebuffer, GLenum pname,
+                                     GLint *param);
 
 extern void GLAPIENTRY
 _mesa_InvalidateSubFramebuffer(GLenum target, GLsizei numAttachments,
@@ -204,11 +269,29 @@ _mesa_InvalidateSubFramebuffer(GLenum target, GLsizei numAttachments,
                                GLsizei width, GLsizei height);
 
 extern void GLAPIENTRY
+_mesa_InvalidateNamedFramebufferSubData(GLuint framebuffer,
+                                        GLsizei numAttachments,
+                                        const GLenum *attachments,
+                                        GLint x, GLint y,
+                                        GLsizei width, GLsizei height);
+
+extern void GLAPIENTRY
 _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments,
                             const GLenum *attachments);
 
 extern void GLAPIENTRY
+_mesa_InvalidateNamedFramebufferData(GLuint framebuffer,
+                                     GLsizei numAttachments,
+                                     const GLenum *attachments);
+
+extern void GLAPIENTRY
 _mesa_DiscardFramebufferEXT(GLenum target, GLsizei numAttachments,
                             const GLenum *attachments);
 
+extern void GLAPIENTRY
+_mesa_FramebufferParameteri(GLenum target, GLenum pname, GLint param);
+
+extern void GLAPIENTRY
+_mesa_GetFramebufferParameteriv(GLenum target, GLenum pname, GLint *params);
+
 #endif /* FBOBJECT_H */
diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index 8af44e9..baeb1bf 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -397,6 +397,11 @@ format_array_format_table_init(void)
    format_array_format_table = _mesa_hash_table_create(NULL, NULL,
                                                        array_formats_equal);
 
+   if (!format_array_format_table) {
+      _mesa_error_no_memory(__func__);
+      return;
+   }
+
    for (f = 1; f < MESA_FORMAT_COUNT; ++f) {
       info = _mesa_get_format_info(f);
       if (!info->ArrayFormat)
@@ -432,6 +437,12 @@ _mesa_format_from_array_format(uint32_t array_format)
 
    call_once(&format_array_format_table_exists, format_array_format_table_init);
 
+   if (!format_array_format_table) {
+      static const once_flag once_flag_init = ONCE_FLAG_INIT;
+      format_array_format_table_exists = once_flag_init;
+      return MESA_FORMAT_NONE;
+   }
+
    entry = _mesa_hash_table_search_pre_hashed(format_array_format_table,
                                               array_format,
                                               (void *)(intptr_t)array_format);
diff --git a/src/mesa/main/framebuffer.c b/src/mesa/main/framebuffer.c
index 4f7736a..77c04b8 100644
--- a/src/mesa/main/framebuffer.c
+++ b/src/mesa/main/framebuffer.c
@@ -157,6 +157,7 @@ _mesa_initialize_window_framebuffer(struct gl_framebuffer *fb,
    fb->_Status = GL_FRAMEBUFFER_COMPLETE_EXT;
    fb->_AllColorBuffersFixedPoint = !visual->floatMode;
    fb->_HasSNormOrFloatColorBuffer = visual->floatMode;
+   fb->_HasAttachments = true;
 
    compute_depth_max(fb);
 }
@@ -312,7 +313,7 @@ _mesa_resize_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb,
 
    if (ctx) {
       /* update scissor / window bounds */
-      _mesa_update_draw_buffer_bounds(ctx);
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
       /* Signal new buffer state so that swrast will update its clipping
        * info (the CLIP_BIT flag).
        */
@@ -356,30 +357,20 @@ update_framebuffer_size(struct gl_context *ctx, struct gl_framebuffer *fb)
 }
 
 
+
 /**
- * Calculate the inclusive bounding box for the scissor of a specific viewport
+ * Given a bounding box, intersect the bounding box with the scissor of
+ * a specified viewport.
  *
  * \param ctx     GL context.
- * \param buffer  Framebuffer to be checked against
  * \param idx     Index of the desired viewport
  * \param bbox    Bounding box for the scissored viewport.  Stored as xmin,
  *                xmax, ymin, ymax.
- *
- * \warning This function assumes that the framebuffer dimensions are up to
- * date (e.g., update_framebuffer_size has been recently called on \c buffer).
- *
- * \sa _mesa_clip_to_region
  */
 void
-_mesa_scissor_bounding_box(const struct gl_context *ctx,
-                           const struct gl_framebuffer *buffer,
-                           unsigned idx, int *bbox)
+_mesa_intersect_scissor_bounding_box(const struct gl_context *ctx,
+                                     unsigned idx, int *bbox)
 {
-   bbox[0] = 0;
-   bbox[2] = 0;
-   bbox[1] = buffer->Width;
-   bbox[3] = buffer->Height;
-
    if (ctx->Scissor.EnableFlags & (1u << idx)) {
       if (ctx->Scissor.ScissorArray[idx].X > bbox[0]) {
          bbox[0] = ctx->Scissor.ScissorArray[idx].X;
@@ -401,6 +392,33 @@ _mesa_scissor_bounding_box(const struct gl_context *ctx,
          bbox[2] = bbox[3];
       }
    }
+}
+
+/**
+ * Calculate the inclusive bounding box for the scissor of a specific viewport
+ *
+ * \param ctx     GL context.
+ * \param buffer  Framebuffer to be checked against
+ * \param idx     Index of the desired viewport
+ * \param bbox    Bounding box for the scissored viewport.  Stored as xmin,
+ *                xmax, ymin, ymax.
+ *
+ * \warning This function assumes that the framebuffer dimensions are up to
+ * date (e.g., update_framebuffer_size has been recently called on \c buffer).
+ *
+ * \sa _mesa_clip_to_region
+ */
+void
+_mesa_scissor_bounding_box(const struct gl_context *ctx,
+                           const struct gl_framebuffer *buffer,
+                           unsigned idx, int *bbox)
+{
+   bbox[0] = 0;
+   bbox[2] = 0;
+   bbox[1] = buffer->Width;
+   bbox[3] = buffer->Height;
+
+   _mesa_intersect_scissor_bounding_box(ctx, idx, bbox);
 
    assert(bbox[0] <= bbox[1]);
    assert(bbox[2] <= bbox[3]);
@@ -413,9 +431,9 @@ _mesa_scissor_bounding_box(const struct gl_context *ctx,
  * \param ctx  the GL context.
  */
 void
-_mesa_update_draw_buffer_bounds(struct gl_context *ctx)
+_mesa_update_draw_buffer_bounds(struct gl_context *ctx,
+                                struct gl_framebuffer *buffer)
 {
-   struct gl_framebuffer *buffer = ctx->DrawBuffer;
    int bbox[4];
 
    if (!buffer)
@@ -652,7 +670,7 @@ update_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb)
        * context state (GL_READ_BUFFER too).
        */
       if (fb->ColorDrawBuffer[0] != ctx->Color.DrawBuffer[0]) {
-         _mesa_drawbuffers(ctx, ctx->Const.MaxDrawBuffers,
+         _mesa_drawbuffers(ctx, fb, ctx->Const.MaxDrawBuffers,
                            ctx->Color.DrawBuffer, NULL);
       }
    }
@@ -678,24 +696,21 @@ update_framebuffer(struct gl_context *ctx, struct gl_framebuffer *fb)
 
 
 /**
- * Update state related to the current draw/read framebuffers.
+ * Update state related to the draw/read framebuffers.
  */
 void
-_mesa_update_framebuffer(struct gl_context *ctx)
+_mesa_update_framebuffer(struct gl_context *ctx,
+                         struct gl_framebuffer *readFb,
+                         struct gl_framebuffer *drawFb)
 {
-   struct gl_framebuffer *drawFb;
-   struct gl_framebuffer *readFb;
-
    assert(ctx);
-   drawFb = ctx->DrawBuffer;
-   readFb = ctx->ReadBuffer;
 
    update_framebuffer(ctx, drawFb);
    if (readFb != drawFb)
       update_framebuffer(ctx, readFb);
 
-   _mesa_update_clamp_vertex_color(ctx);
-   _mesa_update_clamp_fragment_color(ctx);
+   _mesa_update_clamp_vertex_color(ctx, drawFb);
+   _mesa_update_clamp_fragment_color(ctx, drawFb);
 }
 
 
diff --git a/src/mesa/main/framebuffer.h b/src/mesa/main/framebuffer.h
index a427421..08e4322 100644
--- a/src/mesa/main/framebuffer.h
+++ b/src/mesa/main/framebuffer.h
@@ -75,16 +75,50 @@ extern void
 _mesa_scissor_bounding_box(const struct gl_context *ctx,
                            const struct gl_framebuffer *buffer,
                            unsigned idx, int *bbox);
+extern void
+_mesa_intersect_scissor_bounding_box(const struct gl_context *ctx,
+                                     unsigned idx, int *bbox);
+
+static inline GLuint
+_mesa_geometric_width(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->Width : buffer->DefaultGeometry.Width;
+}
+
+static inline GLuint
+_mesa_geometric_height(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->Height : buffer->DefaultGeometry.Height;
+}
+
+static inline GLuint
+_mesa_geometric_samples(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->Visual.samples : buffer->DefaultGeometry.NumSamples;
+}
+
+static inline GLuint
+_mesa_geometric_layers(const struct gl_framebuffer *buffer)
+{
+   return buffer->_HasAttachments ?
+      buffer->MaxNumLayers : buffer->DefaultGeometry.Layers;
+}
 
 extern void 
-_mesa_update_draw_buffer_bounds(struct gl_context *ctx);
+_mesa_update_draw_buffer_bounds(struct gl_context *ctx,
+                                struct gl_framebuffer *drawFb);
 
 extern void
 _mesa_update_framebuffer_visual(struct gl_context *ctx,
 				struct gl_framebuffer *fb);
 
 extern void
-_mesa_update_framebuffer(struct gl_context *ctx);
+_mesa_update_framebuffer(struct gl_context *ctx,
+                         struct gl_framebuffer *readFb,
+                         struct gl_framebuffer *drawFb);
 
 extern GLboolean
 _mesa_source_buffer_exists(struct gl_context *ctx, GLenum format);
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index a881bc5..3d6d639 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -138,6 +138,7 @@ enum value_extra {
    EXTRA_API_GL_CORE,
    EXTRA_API_ES2,
    EXTRA_API_ES3,
+   EXTRA_API_ES31,
    EXTRA_NEW_BUFFERS, 
    EXTRA_NEW_FRAG_CLAMP,
    EXTRA_VALID_DRAW_BUFFER,
@@ -348,6 +349,12 @@ static const int extra_ARB_shader_image_load_store_and_geometry_shader[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_draw_indirect_es31[] = {
+   EXT(ARB_draw_indirect), /* extension entry for GL_ARB_draw_indirect */
+   EXTRA_API_ES31,         /* check_extra() tests _mesa_is_gles31(ctx) */
+   EXTRA_END               /* NOTE(review): presumably the list terminator */
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
@@ -393,6 +400,7 @@ EXTRA_EXT(INTEL_performance_query);
 EXTRA_EXT(ARB_explicit_uniform_location);
 EXTRA_EXT(ARB_clip_control);
 EXTRA_EXT(EXT_polygon_offset_clamp);
+EXTRA_EXT(ARB_framebuffer_no_attachments);
 
 static const int
 extra_ARB_color_buffer_float_or_glcore[] = {
@@ -909,13 +917,13 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
       break;
 
    case GL_FOG_COLOR:
-      if (_mesa_get_clamp_fragment_color(ctx))
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
          COPY_4FV(v->value_float_4, ctx->Fog.Color);
       else
          COPY_4FV(v->value_float_4, ctx->Fog.ColorUnclamped);
       break;
    case GL_COLOR_CLEAR_VALUE:
-      if (_mesa_get_clamp_fragment_color(ctx)) {
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer)) {
          v->value_float_4[0] = CLAMP(ctx->Color.ClearColor.f[0], 0.0F, 1.0F);
          v->value_float_4[1] = CLAMP(ctx->Color.ClearColor.f[1], 0.0F, 1.0F);
          v->value_float_4[2] = CLAMP(ctx->Color.ClearColor.f[2], 0.0F, 1.0F);
@@ -924,13 +932,13 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu
          COPY_4FV(v->value_float_4, ctx->Color.ClearColor.f);
       break;
    case GL_BLEND_COLOR_EXT:
-      if (_mesa_get_clamp_fragment_color(ctx))
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
          COPY_4FV(v->value_float_4, ctx->Color.BlendColor);
       else
          COPY_4FV(v->value_float_4, ctx->Color.BlendColorUnclamped);
       break;
    case GL_ALPHA_TEST_REF:
-      if (_mesa_get_clamp_fragment_color(ctx))
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
          v->value_float = ctx->Color.AlphaRef;
       else
          v->value_float = ctx->Color.AlphaRefUnclamped;
@@ -1078,6 +1086,11 @@ check_extra(struct gl_context *ctx, const char *func, const struct value_desc *d
          if (_mesa_is_gles3(ctx))
             api_found = GL_TRUE;
 	 break;
+      case EXTRA_API_ES31:
+         api_check = GL_TRUE;
+         if (_mesa_is_gles31(ctx))
+            api_found = GL_TRUE;
+	 break;
       case EXTRA_API_GL:
          api_check = GL_TRUE;
          if (_mesa_is_desktop_gl(ctx))
@@ -1911,6 +1924,7 @@ find_value_indexed(const char *func, GLenum pname, GLuint index, union value *v)
       if (index >= ctx->Const.Program[MESA_SHADER_VERTEX].MaxAttribs)
           goto invalid_value;
       v->value_int = ctx->Array.VAO->VertexBinding[VERT_ATTRIB_GENERIC(index)].Stride;
+      return TYPE_INT;
 
    /* ARB_shader_image_load_store */
    case GL_IMAGE_BINDING_NAME: {
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 41cb2c1..74ff3ba 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -409,6 +409,12 @@ descriptor=[
   [ "SAMPLER_BINDING", "LOC_CUSTOM, TYPE_INT, GL_SAMPLER_BINDING, NO_EXTRA" ],
 ]},
 
+# Enums in OpenGL Core profile and ES 3.1
+{ "apis": ["GL_CORE", "GLES3"], "params": [
+# GL_ARB_draw_indirect / GLES 3.1
+  [ "DRAW_INDIRECT_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_draw_indirect_es31" ],
+]},
+
 # Remaining enums are only in OpenGL
 { "apis": ["GL", "GL_CORE"], "params": [
   [ "ACCUM_RED_BITS", "BUFFER_INT(Visual.accumRedBits), NO_EXTRA" ],
@@ -793,19 +799,20 @@ descriptor=[
   [ "MAX_COMPUTE_UNIFORM_COMPONENTS", "CONST(MAX_COMPUTE_UNIFORM_COMPONENTS), extra_ARB_compute_shader" ],
   [ "MAX_COMPUTE_IMAGE_UNIFORMS", "CONST(MAX_COMPUTE_IMAGE_UNIFORMS), extra_ARB_compute_shader" ],
 
-# GL_ARB_gpu_shader5
-  [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5" ],
-  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
-  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
-  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ],
+# GL_ARB_framebuffer_no_attachments
+  ["MAX_FRAMEBUFFER_WIDTH", "CONTEXT_INT(Const.MaxFramebufferWidth), extra_ARB_framebuffer_no_attachments"],
+  ["MAX_FRAMEBUFFER_HEIGHT", "CONTEXT_INT(Const.MaxFramebufferHeight), extra_ARB_framebuffer_no_attachments"],
+  ["MAX_FRAMEBUFFER_LAYERS", "CONTEXT_INT(Const.MaxFramebufferLayers), extra_ARB_framebuffer_no_attachments"],
+  ["MAX_FRAMEBUFFER_SAMPLES", "CONTEXT_INT(Const.MaxFramebufferSamples), extra_ARB_framebuffer_no_attachments"],
+
+# GL_EXT_polygon_offset_clamp
+  [ "POLYGON_OFFSET_CLAMP_EXT", "CONTEXT_FLOAT(Polygon.OffsetClamp), extra_EXT_polygon_offset_clamp" ],
 ]},
 
 # Enums restricted to OpenGL Core profile
 { "apis": ["GL_CORE"], "params": [
 # GL_ARB_texture_buffer_range
   [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ],
-# GL_ARB_draw_indirect
-  [ "DRAW_INDIRECT_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_draw_indirect" ],
 
 # GL_ARB_viewport_array
   [ "MAX_VIEWPORTS", "CONTEXT_INT(Const.MaxViewports), extra_ARB_viewport_array" ],
@@ -814,8 +821,11 @@ descriptor=[
   [ "LAYER_PROVOKING_VERTEX", "CONTEXT_ENUM(Light.ProvokingVertex), extra_ARB_viewport_array" ],
   [ "VIEWPORT_INDEX_PROVOKING_VERTEX", "CONTEXT_ENUM(Light.ProvokingVertex), extra_ARB_viewport_array" ],
 
-# GL_EXT_polygon_offset_clamp
-  [ "POLYGON_OFFSET_CLAMP_EXT", "CONTEXT_FLOAT(Polygon.OffsetClamp), extra_EXT_polygon_offset_clamp" ],
+# GL_ARB_gpu_shader5
+  [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5" ],
+  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
+  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
+  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ],
 ]}
 
 ]
diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
index 1b2c7f0..72d99ca 100644
--- a/src/mesa/main/getstring.c
+++ b/src/mesa/main/getstring.c
@@ -72,10 +72,18 @@ shading_language_version(struct gl_context *ctx)
       break;
 
    case API_OPENGLES2:
-      return (ctx->Version < 30)
-         ? (const GLubyte *) "OpenGL ES GLSL ES 1.0.16"
-         : (const GLubyte *) "OpenGL ES GLSL ES 3.00";
-
+      switch (ctx->Version) {
+      case 20:
+         return (const GLubyte *) "OpenGL ES GLSL ES 1.0.16";
+      case 30:
+         return (const GLubyte *) "OpenGL ES GLSL ES 3.00";
+      case 31:
+         return (const GLubyte *) "OpenGL ES GLSL ES 3.10";
+      default:
+         _mesa_problem(ctx,
+                       "Invalid OpenGL ES version in shading_language_version()");
+         return (const GLubyte *) 0;
+      }
    case API_OPENGLES:
       /* fall-through */
 
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index 8ced579..ac69fab 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -1200,7 +1200,7 @@ _mesa_is_depth_or_stencil_format(GLenum format)
  * \return GL_TRUE if compressed, GL_FALSE if uncompressed
  */
 GLboolean
-_mesa_is_compressed_format(struct gl_context *ctx, GLenum format)
+_mesa_is_compressed_format(const struct gl_context *ctx, GLenum format)
 {
    switch (format) {
    case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
@@ -1678,6 +1678,10 @@ _mesa_error_check_format_and_type(const struct gl_context *ctx,
       case GL_LUMINANCE:
       case GL_ALPHA:
          return GL_NO_ERROR;
+      case GL_RG:
+      case GL_RED:
+	 if (_mesa_is_gles3(ctx) || ctx->Extensions.ARB_texture_rg)
+            return GL_NO_ERROR;
       default:
          return GL_INVALID_OPERATION;
       }
@@ -2292,8 +2296,18 @@ _mesa_es3_error_check_format_and_type(const struct gl_context *ctx,
          break;
 
       case GL_HALF_FLOAT:
-         if (internalFormat != GL_RG16F)
-            return GL_INVALID_OPERATION;
+      case GL_HALF_FLOAT_OES:
+         switch (internalFormat) {
+            case GL_RG16F:
+               break;
+            case GL_RG:
+               if (ctx->Extensions.ARB_texture_rg &&
+                   ctx->Extensions.OES_texture_half_float)
+                  break;
+            /* fallthrough */
+            default:
+               return GL_INVALID_OPERATION;
+         }
          break;
 
       case GL_FLOAT:
@@ -2301,6 +2315,11 @@ _mesa_es3_error_check_format_and_type(const struct gl_context *ctx,
          case GL_RG16F:
          case GL_RG32F:
             break;
+         case GL_RG:
+            if (ctx->Extensions.ARB_texture_rg &&
+                ctx->Extensions.OES_texture_float)
+               break;
+            /* fallthrough */
          default:
             return GL_INVALID_OPERATION;
          }
@@ -2361,8 +2380,19 @@ _mesa_es3_error_check_format_and_type(const struct gl_context *ctx,
          break;
 
       case GL_HALF_FLOAT:
-         if (internalFormat != GL_R16F)
+      case GL_HALF_FLOAT_OES:
+         switch (internalFormat) {
+         case GL_R16F:
+            break;
+         case GL_RG:
+         case GL_RED:
+            if (ctx->Extensions.ARB_texture_rg &&
+                ctx->Extensions.OES_texture_half_float)
+               break;
+            /* fallthrough */
+         default:
             return GL_INVALID_OPERATION;
+         }
          break;
 
       case GL_FLOAT:
@@ -2370,6 +2400,11 @@ _mesa_es3_error_check_format_and_type(const struct gl_context *ctx,
          case GL_R16F:
          case GL_R32F:
             break;
+         case GL_RED:
+            if (ctx->Extensions.ARB_texture_rg &&
+                ctx->Extensions.OES_texture_float)
+               break;
+            /* fallthrough */
          default:
             return GL_INVALID_OPERATION;
          }
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index e1ecd64..8881cb7 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -96,7 +96,7 @@ extern GLboolean
 _mesa_is_depth_or_stencil_format(GLenum format);
 
 extern GLboolean
-_mesa_is_compressed_format(struct gl_context *ctx, GLenum format);
+_mesa_is_compressed_format(const struct gl_context *ctx, GLenum format);
 
 extern GLenum
 _mesa_base_format_to_integer_format(GLenum format);
diff --git a/src/mesa/main/glheader.h b/src/mesa/main/glheader.h
index 7f7f9a3..a2d98d4 100644
--- a/src/mesa/main/glheader.h
+++ b/src/mesa/main/glheader.h
@@ -135,12 +135,6 @@ typedef void *GLeglImageOES;
 #define GL_SHADER_PROGRAM_MESA 0x9999
 
 
-/**
- * Internal token for geometry programs.
- * Use the value for GL_GEOMETRY_PROGRAM_NV for now.
- */
-#define MESA_GEOMETRY_PROGRAM 0x8c26
-
 /* Several fields of struct gl_config can take these as values.  Since
  * GLX header files may not be available everywhere they need to be used,
  * redefine them here.
diff --git a/src/mesa/main/hash.c b/src/mesa/main/hash.c
index d04cccd..315b5d6 100644
--- a/src/mesa/main/hash.c
+++ b/src/mesa/main/hash.c
@@ -389,34 +389,6 @@ _mesa_HashDeleteAll(struct _mesa_HashTable *table,
 
 
 /**
- * Clone all entries in a hash table, into a new table.
- *
- * \param table  the hash table to clone
- */
-struct _mesa_HashTable *
-_mesa_HashClone(const struct _mesa_HashTable *table)
-{
-   /* cast-away const */
-   struct _mesa_HashTable *table2 = (struct _mesa_HashTable *) table;
-   struct hash_entry *entry;
-   struct _mesa_HashTable *clonetable;
-
-   assert(table);
-   mtx_lock(&table2->Mutex);
-
-   clonetable = _mesa_NewHashTable();
-   assert(clonetable);
-   hash_table_foreach(table->ht, entry) {
-      _mesa_HashInsert(clonetable, (GLint)(uintptr_t)entry->key, entry->data);
-   }
-
-   mtx_unlock(&table2->Mutex);
-
-   return clonetable;
-}
-
-
-/**
  * Walk over all entries in a hash table, calling callback function for each.
  * Note: we use a separate mutex in this function to avoid a recursive
  * locking deadlock (in case the callback calls _mesa_HashRemove()) and to
diff --git a/src/mesa/main/hash.h b/src/mesa/main/hash.h
index e3e8f49..da3b997 100644
--- a/src/mesa/main/hash.h
+++ b/src/mesa/main/hash.h
@@ -59,9 +59,6 @@ _mesa_HashDeleteAll(struct _mesa_HashTable *table,
                     void (*callback)(GLuint key, void *data, void *userData),
                     void *userData);
 
-extern struct _mesa_HashTable *
-_mesa_HashClone(const struct _mesa_HashTable *table);
-
 extern void
 _mesa_HashWalk(const struct _mesa_HashTable *table,
                void (*callback)(GLuint key, void *data, void *userData),
diff --git a/src/mesa/main/imports.h b/src/mesa/main/imports.h
index c4d917e..9ffe3de 100644
--- a/src/mesa/main/imports.h
+++ b/src/mesa/main/imports.h
@@ -230,38 +230,6 @@ static inline int IFLOOR(float f)
 }
 
 
-/** Return (as an integer) ceiling of float */
-static inline int ICEIL(float f)
-{
-#if defined(USE_X86_ASM) && defined(__GNUC__) && defined(__i386__)
-   /*
-    * IEEE ceil for computers that round to nearest or even.
-    * 'f' must be between -4194304 and 4194303.
-    * This ceil operation is done by "(iround(f + .5) + iround(f - .5) + 1) >> 1",
-    * but uses some IEEE specific tricks for better speed.
-    * Contributed by Josh Vanderhoof
-    */
-   int ai, bi;
-   double af, bf;
-   af = (3 << 22) + 0.5 + (double)f;
-   bf = (3 << 22) + 0.5 - (double)f;
-   /* GCC generates an extra fstp/fld without this. */
-   __asm__ ("fstps %0" : "=m" (ai) : "t" (af) : "st");
-   __asm__ ("fstps %0" : "=m" (bi) : "t" (bf) : "st");
-   return (ai - bi + 1) >> 1;
-#else
-   int ai, bi;
-   double af, bf;
-   fi_type u;
-   af = (3 << 22) + 0.5 + (double)f;
-   bf = (3 << 22) + 0.5 - (double)f;
-   u.f = (float) af; ai = u.i;
-   u.f = (float) bf; bi = u.i;
-   return (ai - bi + 1) >> 1;
-#endif
-}
-
-
 /**
  * Is x a power of two?
  */
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index bd84113..481fd5e 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -43,7 +43,6 @@
 #include "glapi/glapi.h"
 #include "math/m_matrix.h"	/* GLmatrix */
 #include "glsl/shader_enums.h"
-#include "util/simple_list.h"	/* struct simple_node */
 #include "main/formats.h"       /* MESA_FORMAT_COUNT */
 
 
@@ -398,7 +397,6 @@ struct gl_config
 {
    GLboolean rgbMode;
    GLboolean floatMode;
-   GLboolean colorIndexMode;  /* XXX is this used anywhere? */
    GLuint doubleBufferMode;
    GLuint stereoMode;
 
@@ -2099,8 +2097,6 @@ struct gl_program
    GLbitfield64 DoubleInputsRead;     /**< Bitmask of which input regs are read  and are doubles */
    GLbitfield64 OutputsWritten; /**< Bitmask of which output regs are written */
    GLbitfield SystemValuesRead;   /**< Bitmask of SYSTEM_VALUE_x inputs used */
-   GLbitfield InputFlags[MAX_PROGRAM_INPUTS];   /**< PROG_PARAM_BIT_x flags */
-   GLbitfield OutputFlags[MAX_PROGRAM_OUTPUTS]; /**< PROG_PARAM_BIT_x flags */
    GLbitfield TexturesUsed[MAX_COMBINED_TEXTURE_IMAGE_UNITS];  /**< TEXTURE_x_BIT bitmask */
    GLbitfield SamplersUsed;   /**< Bitfield of which samplers are used */
    GLbitfield ShadowSamplers; /**< Texture units used for shadow sampling. */
@@ -2275,16 +2271,10 @@ struct gl_vertex_program_state
  */
 struct gl_geometry_program_state
 {
-   GLboolean Enabled;               /**< GL_ARB_GEOMETRY_SHADER4 */
-   GLboolean _Enabled;              /**< Enabled and valid program? */
-   struct gl_geometry_program *Current;  /**< user-bound geometry program */
-
    /** Currently enabled and valid program (including internal programs
     * and compiled shader programs).
     */
    struct gl_geometry_program *_Current;
-
-   GLfloat Parameters[MAX_PROGRAM_ENV_PARAMS][4]; /**< Env params */
 };
 
 /**
@@ -2320,8 +2310,6 @@ struct gl_fragment_program_state
  */
 struct gl_compute_program_state
 {
-   struct gl_compute_program *Current;  /**< user-bound compute program */
-
    /** Currently enabled and valid program (including internal programs
     * and compiled shader programs).
     */
@@ -2733,7 +2721,7 @@ struct gl_shader_program
    } Comp;
 
    /* post-link info: */
-   unsigned NumUserUniformStorage;
+   unsigned NumUniformStorage;
    unsigned NumHiddenUniforms;
    struct gl_uniform_storage *UniformStorage;
 
@@ -2832,6 +2820,8 @@ struct gl_pipeline_object
 
    mtx_t Mutex;
 
+   GLchar *Label;   /**< GL_KHR_debug */
+
    /**
     * Programs used for rendering
     *
@@ -3009,7 +2999,6 @@ struct gl_shared_state
    struct _mesa_HashTable *Programs; /**< All vertex/fragment programs */
    struct gl_vertex_program *DefaultVertexProgram;
    struct gl_fragment_program *DefaultFragmentProgram;
-   struct gl_geometry_program *DefaultGeometryProgram;
    /*@}*/
 
    /* GL_ATI_fragment_shader */
@@ -3151,12 +3140,29 @@ struct gl_framebuffer
     */
    struct gl_config Visual;
 
-   GLuint Width, Height;	/**< size of frame buffer in pixels */
+   /**
+    * Size of frame buffer in pixels. If there are no attachments, then both
+    * of these are 0.
+    */
+   GLuint Width, Height;
 
-   /** \name  Drawing bounds (Intersection of buffer size and scissor box) */
+   /**
+    * In the case that the framebuffer has no attachment (i.e.
+    * GL_ARB_framebuffer_no_attachments) then the geometry of
+    * the framebuffer is specified by the default values.
+    */
+   struct {
+     GLuint Width, Height, Layers, NumSamples;
+     GLboolean FixedSampleLocations;
+   } DefaultGeometry;
+
+   /** \name  Drawing bounds (Intersection of buffer size and scissor box)
+    * The drawing region is given by [_Xmin, _Xmax) x [_Ymin, _Ymax),
+    * (inclusive for _Xmin and _Ymin while exclusive for _Xmax and _Ymax)
+    */
    /*@{*/
-   GLint _Xmin, _Xmax;  /**< inclusive */
-   GLint _Ymin, _Ymax;  /**< exclusive */
+   GLint _Xmin, _Xmax;
+   GLint _Ymin, _Ymax;
    /*@}*/
 
    /** \name  Derived Z buffer stuff */
@@ -3169,6 +3175,22 @@ struct gl_framebuffer
    /** One of the GL_FRAMEBUFFER_(IN)COMPLETE_* tokens */
    GLenum _Status;
 
+   /** Whether any of the Attachment entries has Type != GL_NONE.
+    * NOTE: Width and Height are set to 0 when there are no attachments.
+    * A backend driver supporting the extension
+    * GL_ARB_framebuffer_no_attachments must check the _HasAttachments flag
+    * and, if it is false, use the values in DefaultGeometry to initialize
+    * its viewport, scissor and so on (in particular _Xmin, _Xmax, _Ymin and
+    * _Ymax do NOT take into account _HasAttachments being false). To get the
+    * geometry of the framebuffer, use the helper functions
+    *   _mesa_geometric_width(),
+    *   _mesa_geometric_height(),
+    *   _mesa_geometric_samples() and
+    *   _mesa_geometric_layers(),
+    * which check _HasAttachments.
+    */
+   bool _HasAttachments;
+
    /** Integer color values */
    GLboolean _IntegerColor;
 
@@ -3179,7 +3201,9 @@ struct gl_framebuffer
    /**
     * The maximum number of layers in the framebuffer, or 0 if the framebuffer
     * is not layered.  For cube maps and cube map arrays, each cube face
-    * counts as a layer.
+    * counts as a layer. As with Width and Height, a backend driver
+    * supporting GL_ARB_framebuffer_no_attachments must use DefaultGeometry
+    * when _HasAttachments is false.
     */
    GLuint MaxNumLayers;
 
@@ -3358,6 +3382,14 @@ struct gl_constants
    GLuint MaxRenderbufferSize;   /**< GL_EXT_framebuffer_object */
    GLuint MaxSamples;            /**< GL_ARB_framebuffer_object */
 
+   /**
+    * GL_ARB_framebuffer_no_attachments
+    */
+   GLuint MaxFramebufferWidth;
+   GLuint MaxFramebufferHeight;
+   GLuint MaxFramebufferLayers;
+   GLuint MaxFramebufferSamples;
+
    /** Number of varying vectors between any two shader stages. */
    GLuint MaxVarying;
 
@@ -3635,6 +3667,7 @@ struct gl_extensions
    GLboolean ARB_fragment_program;
    GLboolean ARB_fragment_program_shadow;
    GLboolean ARB_fragment_shader;
+   GLboolean ARB_framebuffer_no_attachments;
    GLboolean ARB_framebuffer_object;
    GLboolean ARB_explicit_attrib_location;
    GLboolean ARB_explicit_uniform_location;
@@ -4422,7 +4455,12 @@ enum _debug
    DEBUG_INCOMPLETE_FBO         = (1 << 3)
 };
 
-
+static inline bool
+_mesa_active_fragment_shader_has_atomic_ops(const struct gl_context *ctx)
+{
+   const struct gl_shader_program *frag = ctx->Shader._CurrentFragmentProgram;
+   return frag != NULL && frag->NumAtomicBuffers > 0;
+}
 
 #ifdef __cplusplus
 }
diff --git a/src/mesa/main/objectlabel.c b/src/mesa/main/objectlabel.c
index aecb5b1..5626054 100644
--- a/src/mesa/main/objectlabel.c
+++ b/src/mesa/main/objectlabel.c
@@ -30,6 +30,7 @@
 #include "enums.h"
 #include "fbobject.h"
 #include "objectlabel.h"
+#include "pipelineobj.h"
 #include "queryobj.h"
 #include "samplerobj.h"
 #include "shaderobj.h"
@@ -214,8 +215,13 @@ get_label_pointer(struct gl_context *ctx, GLenum identifier, GLuint name,
       }
       break;
    case GL_PROGRAM_PIPELINE:
-      /* requires GL 4.2 */
-      goto invalid_enum;
+      {
+         struct gl_pipeline_object *pipe =
+            _mesa_lookup_pipeline_object(ctx, name);
+         if (pipe)
+            labelPtr = &pipe->Label;
+      }
+      break;
    default:
       goto invalid_enum;
    }
diff --git a/src/mesa/main/pipelineobj.c b/src/mesa/main/pipelineobj.c
index 0fefa7d..279ae20 100644
--- a/src/mesa/main/pipelineobj.c
+++ b/src/mesa/main/pipelineobj.c
@@ -65,6 +65,7 @@ _mesa_delete_pipeline_object(struct gl_context *ctx,
 
    _mesa_reference_shader_program(ctx, &obj->ActiveProgram, NULL);
    mtx_destroy(&obj->Mutex);
+   free(obj->Label);
    ralloc_free(obj);
 }
 
@@ -136,8 +137,8 @@ _mesa_free_pipeline_data(struct gl_context *ctx)
  * a non-existent ID.  The spec defines ID 0 as being technically
  * non-existent.
  */
-static inline struct gl_pipeline_object *
-lookup_pipeline_object(struct gl_context *ctx, GLuint id)
+struct gl_pipeline_object *
+_mesa_lookup_pipeline_object(struct gl_context *ctx, GLuint id)
 {
    if (id == 0)
       return NULL;
@@ -225,7 +226,7 @@ _mesa_UseProgramStages(GLuint pipeline, GLbitfield stages, GLuint program)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
    struct gl_shader_program *shProg = NULL;
    GLbitfield any_valid_stages;
 
@@ -337,7 +338,7 @@ _mesa_ActiveShaderProgram(GLuint pipeline, GLuint program)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct gl_shader_program *shProg = NULL;
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    if (program != 0) {
       shProg = _mesa_lookup_shader_program_err(ctx, program,
@@ -399,7 +400,7 @@ _mesa_BindProgramPipeline(GLuint pipeline)
     */
    if (pipeline) {
       /* non-default pipeline object */
-      newObj = lookup_pipeline_object(ctx, pipeline);
+      newObj = _mesa_lookup_pipeline_object(ctx, pipeline);
       if (!newObj) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glBindProgramPipeline(non-gen name)");
@@ -468,7 +469,7 @@ _mesa_DeleteProgramPipelines(GLsizei n, const GLuint *pipelines)
 
    for (i = 0; i < n; i++) {
       struct gl_pipeline_object *obj =
-         lookup_pipeline_object(ctx, pipelines[i]);
+         _mesa_lookup_pipeline_object(ctx, pipelines[i]);
 
       if (obj) {
          assert(obj->Name == pipelines[i]);
@@ -568,7 +569,7 @@ _mesa_IsProgramPipeline(GLuint pipeline)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   struct gl_pipeline_object *obj = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *obj = _mesa_lookup_pipeline_object(ctx, pipeline);
    if (obj == NULL)
       return GL_FALSE;
 
@@ -582,7 +583,7 @@ void GLAPIENTRY
 _mesa_GetProgramPipelineiv(GLuint pipeline, GLenum pname, GLint *params)
 {
    GET_CURRENT_CONTEXT(ctx);
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    /* Are geometry shaders available in this context?
     */
@@ -673,6 +674,38 @@ program_stages_all_active(struct gl_pipeline_object *pipe,
    return status;
 }
 
+static bool
+program_stages_interleaved_illegally(const struct gl_pipeline_object *pipe)
+{
+   /* Program owning the latest non-empty stage seen; NULL before the first. */
+   const struct gl_shader_program *owner = NULL;
+   unsigned stage, later;
+
+   /* The illegal layout is A -> B -> A: program A resumes at a later stage
+    * after a different program B has taken over (empty stages are ignored).
+    */
+   for (stage = 0; stage < MESA_SHADER_STAGES; stage++) {
+      const struct gl_shader_program *prog = pipe->CurrentProgram[stage];
+
+      /* Empty stages and stages continuing the same program change nothing. */
+      if (prog == NULL || prog == owner)
+         continue;
+
+      /* A different program takes over at this stage.  When it displaces an
+       * earlier one (owner != NULL), that earlier program must not be bound
+       * to any of the remaining stages.
+       */
+      for (later = stage + 1; owner && later < MESA_SHADER_STAGES; later++) {
+         if (pipe->CurrentProgram[later] == owner)
+            return true;
+      }
+
+      owner = prog;
+   }
+
+   return false;
+}
+
 extern GLboolean
 _mesa_validate_program_pipeline(struct gl_context* ctx,
                                 struct gl_pipeline_object *pipe,
@@ -721,24 +754,13 @@ _mesa_validate_program_pipeline(struct gl_context* ctx,
     *         - One program object is active for at least two shader stages
     *           and a second program is active for a shader stage between two
     *           stages for which the first program was active."
-    *
-    * Without Tesselation, the only case where this can occur is the geometry
-    * shader between the fragment shader and vertex shader.
     */
-   if (pipe->CurrentProgram[MESA_SHADER_GEOMETRY]
-       && pipe->CurrentProgram[MESA_SHADER_FRAGMENT]
-       && pipe->CurrentProgram[MESA_SHADER_VERTEX]) {
-      if (pipe->CurrentProgram[MESA_SHADER_VERTEX]->Name == pipe->CurrentProgram[MESA_SHADER_FRAGMENT]->Name &&
-          pipe->CurrentProgram[MESA_SHADER_GEOMETRY]->Name != pipe->CurrentProgram[MESA_SHADER_VERTEX]->Name) {
-         pipe->InfoLog =
-            ralloc_asprintf(pipe,
-                            "Program %d is active for geometry stage between "
-                            "two stages for which another program %d is "
-                            "active",
-                            pipe->CurrentProgram[MESA_SHADER_GEOMETRY]->Name,
-                            pipe->CurrentProgram[MESA_SHADER_VERTEX]->Name);
-         goto err;
-      }
+   if (program_stages_interleaved_illegally(pipe)) {
+      pipe->InfoLog =
+         ralloc_strdup(pipe,
+                       "Program is active for multiple shader stages with an "
+                       "intervening stage provided by another program");
+      goto err;
    }
 
    /* Section 2.11.11 (Shader Execution), subheading "Validation," of the
@@ -820,7 +842,7 @@ _mesa_ValidateProgramPipeline(GLuint pipeline)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    if (!pipe) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -838,7 +860,7 @@ _mesa_GetProgramPipelineInfoLog(GLuint pipeline, GLsizei bufSize,
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   struct gl_pipeline_object *pipe = lookup_pipeline_object(ctx, pipeline);
+   struct gl_pipeline_object *pipe = _mesa_lookup_pipeline_object(ctx, pipeline);
 
    if (!pipe) {
       _mesa_error(ctx, GL_INVALID_VALUE,
diff --git a/src/mesa/main/pipelineobj.h b/src/mesa/main/pipelineobj.h
index b57bcb9..6dee775 100644
--- a/src/mesa/main/pipelineobj.h
+++ b/src/mesa/main/pipelineobj.h
@@ -45,6 +45,9 @@ _mesa_init_pipeline(struct gl_context *ctx);
 extern void
 _mesa_free_pipeline_data(struct gl_context *ctx);
 
+extern struct gl_pipeline_object *
+_mesa_lookup_pipeline_object(struct gl_context *ctx, GLuint id);
+
 extern void
 _mesa_reference_pipeline_object_(struct gl_context *ctx,
                                  struct gl_pipeline_object **ptr,
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index b15a132..d857b84 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -220,12 +220,12 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
    case GL_PROGRAM_INPUT:
    case GL_PROGRAM_OUTPUT:
    case GL_UNIFORM:
-   case GL_UNIFORM_BLOCK:
    case GL_TRANSFORM_FEEDBACK_VARYING:
-      /* Validate name syntax for arrays. */
+      /* Validate name syntax for array variables */
       if (!valid_program_resource_index_name(name))
          return GL_INVALID_INDEX;
-
+      /* fall-through */
+   case GL_UNIFORM_BLOCK:
       res = _mesa_program_resource_find_name(shProg, programInterface, name);
       if (!res)
          return GL_INVALID_INDEX;
diff --git a/src/mesa/main/readpix.c b/src/mesa/main/readpix.c
index ed0104c..a3357cd 100644
--- a/src/mesa/main/readpix.c
+++ b/src/mesa/main/readpix.c
@@ -46,15 +46,18 @@
 /**
  * Return true if the conversion L=R+G+B is needed.
  */
-static GLboolean
-need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format)
+GLboolean
+_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format)
 {
    GLenum baseTexFormat = _mesa_get_format_base_format(texFormat);
 
    return (baseTexFormat == GL_RG ||
            baseTexFormat == GL_RGB ||
            baseTexFormat == GL_RGBA) &&
-          (format == GL_LUMINANCE || format == GL_LUMINANCE_ALPHA);
+          (format == GL_LUMINANCE ||
+           format == GL_LUMINANCE_ALPHA ||
+           format == GL_LUMINANCE_INTEGER_EXT ||
+           format == GL_LUMINANCE_ALPHA_INTEGER_EXT);
 }
 
 
@@ -83,7 +86,7 @@ get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat,
    if (uses_blit) {
       /* For blit-based ReadPixels packing, the clamping is done automatically
        * unless the type is float. */
-      if (_mesa_get_clamp_read_color(ctx) &&
+      if (_mesa_get_clamp_read_color(ctx, ctx->ReadBuffer) &&
           (type == GL_FLOAT || type == GL_HALF_FLOAT)) {
          transferOps |= IMAGE_CLAMP_BIT;
       }
@@ -91,7 +94,7 @@ get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat,
    else {
       /* For CPU-based ReadPixels packing, the clamping must always be done
        * for non-float types, */
-      if (_mesa_get_clamp_read_color(ctx) ||
+      if (_mesa_get_clamp_read_color(ctx, ctx->ReadBuffer) ||
           (type != GL_FLOAT && type != GL_HALF_FLOAT)) {
          transferOps |= IMAGE_CLAMP_BIT;
       }
@@ -102,7 +105,7 @@ get_readpixels_transfer_ops(const struct gl_context *ctx, mesa_format texFormat,
     * have any effect anyway.
     */
    if (_mesa_get_format_datatype(texFormat) == GL_UNSIGNED_NORMALIZED &&
-       !need_rgb_to_luminance_conversion(texFormat, format)) {
+       !_mesa_need_rgb_to_luminance_conversion(texFormat, format)) {
       transferOps &= ~IMAGE_CLAMP_BIT;
    }
 
@@ -146,7 +149,7 @@ _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
 
    default:
       /* Color formats. */
-      if (need_rgb_to_luminance_conversion(rb->Format, format)) {
+      if (_mesa_need_rgb_to_luminance_conversion(rb->Format, format)) {
          return GL_TRUE;
       }
 
@@ -418,7 +421,7 @@ read_rgba_pixels( struct gl_context *ctx,
                   const struct gl_pixelstore_attrib *packing )
 {
    GLbitfield transferOps;
-   bool dst_is_integer, dst_is_luminance, needs_rebase;
+   bool dst_is_integer, convert_rgb_to_lum, needs_rebase;
    int dst_stride, src_stride, rb_stride;
    uint32_t dst_format, src_format;
    GLubyte *dst, *map;
@@ -439,10 +442,8 @@ read_rgba_pixels( struct gl_context *ctx,
    dst_is_integer = _mesa_is_enum_format_integer(format);
    dst_stride = _mesa_image_row_stride(packing, width, format, type);
    dst_format = _mesa_format_from_format_and_type(format, type);
-   dst_is_luminance = format == GL_LUMINANCE ||
-                      format == GL_LUMINANCE_ALPHA ||
-                      format == GL_LUMINANCE_INTEGER_EXT ||
-                      format == GL_LUMINANCE_ALPHA_INTEGER_EXT;
+   convert_rgb_to_lum =
+      _mesa_need_rgb_to_luminance_conversion(rb->Format, format);
    dst = (GLubyte *) _mesa_image_address2d(packing, pixels, width, height,
                                            format, type, 0, 0);
 
@@ -490,7 +491,7 @@ read_rgba_pixels( struct gl_context *ctx,
     */
    assert(!transferOps || (transferOps && !dst_is_integer));
 
-   needs_rgba = transferOps || dst_is_luminance;
+   needs_rgba = transferOps || convert_rgb_to_lum;
    rgba = NULL;
    if (needs_rgba) {
       uint32_t rgba_format;
@@ -563,7 +564,7 @@ read_rgba_pixels( struct gl_context *ctx,
     * If the dst format is Luminance, we need to do the conversion by computing
     * L=R+G+B values.
     */
-   if (!dst_is_luminance) {
+   if (!convert_rgb_to_lum) {
       _mesa_format_convert(dst, dst_format, dst_stride,
                            src, src_format, src_stride,
                            width, height,
diff --git a/src/mesa/main/readpix.h b/src/mesa/main/readpix.h
index 4bb35e1..1636dd9 100644
--- a/src/mesa/main/readpix.h
+++ b/src/mesa/main/readpix.h
@@ -37,6 +37,9 @@ extern GLboolean
 _mesa_readpixels_needs_slow_path(const struct gl_context *ctx, GLenum format,
                                  GLenum type, GLboolean uses_blit);
 
+extern GLboolean
+_mesa_need_rgb_to_luminance_conversion(mesa_format texFormat, GLenum format);
+
 extern void
 _mesa_readpixels(struct gl_context *ctx,
                  GLint x, GLint y, GLsizei width, GLsizei height,
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 6e46553..a6246a3 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -28,6 +28,7 @@
  * \author Ian Romanick <ian.d.romanick@intel.com>
  */
 
+#include "main/context.h"
 #include "main/core.h"
 #include "glsl_symbol_table.h"
 #include "ir.h"
@@ -478,12 +479,20 @@ _mesa_GetFragDataLocation(GLuint program, const GLchar *name)
 const char*
 _mesa_program_resource_name(struct gl_program_resource *res)
 {
+   const ir_variable *var;
    switch (res->Type) {
    case GL_UNIFORM_BLOCK:
       return RESOURCE_UBO(res)->Name;
    case GL_TRANSFORM_FEEDBACK_VARYING:
       return RESOURCE_XFB(res)->Name;
    case GL_PROGRAM_INPUT:
+      var = RESOURCE_VAR(res);
+      /* Special case gl_VertexIDMESA -> gl_VertexID. */
+      if (var->data.mode == ir_var_system_value &&
+          var->data.location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) {
+         return "gl_VertexID";
+      }
+   /* fallthrough */
    case GL_PROGRAM_OUTPUT:
       return RESOURCE_VAR(res)->name;
    case GL_UNIFORM:
@@ -538,6 +547,17 @@ struct gl_program_resource *
 _mesa_program_resource_find_name(struct gl_shader_program *shProg,
                                  GLenum programInterface, const char *name)
 {
+   GET_CURRENT_CONTEXT(ctx);
+   const char *full_name = name;
+
+   /* When the context has 'VertexID_is_zero_based' set, gl_VertexID has been
+    * lowered to gl_VertexIDMESA, so search for it under the lowered name.
+    */
+   if (name && ctx->Const.VertexID_is_zero_based) {
+      if (strcmp(name, "gl_VertexID") == 0)
+         full_name = "gl_VertexIDMESA";
+   }
+
    struct gl_program_resource *res = shProg->ProgramResourceList;
    for (unsigned i = 0; i < shProg->NumProgramResourceList; i++, res++) {
       if (res->Type != programInterface)
@@ -562,7 +582,7 @@ _mesa_program_resource_find_name(struct gl_shader_program *shProg,
          break;
       case GL_PROGRAM_INPUT:
       case GL_PROGRAM_OUTPUT:
-         if (array_index_of_resource(res, name) >= 0)
+         if (array_index_of_resource(res, full_name) >= 0)
             return res;
          break;
       default:
@@ -727,6 +747,10 @@ program_resource_location(struct gl_shader_program *shProg,
          return -1;
    }
 
+   /* Built-in variables should report GL_INVALID_INDEX as their location. */
+   if (is_gl_identifier(name))
+      return GL_INVALID_INDEX;
+
    /* VERT_ATTRIB_GENERIC0 and FRAG_RESULT_DATA0 are decremented as these
     * offsets are used internally to differentiate between built-in attributes
     * and user-defined attributes.
@@ -986,8 +1010,9 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
    case GL_ACTIVE_VARIABLES:
       return get_buffer_property(shProg, res, prop, val, caller);
    case GL_REFERENCED_BY_COMPUTE_SHADER:
-      if (!ctx->Extensions.ARB_compute_shader)
+      if (!_mesa_has_compute_shaders(ctx))
          goto invalid_enum;
+      /* fallthrough */
    case GL_REFERENCED_BY_VERTEX_SHADER:
    case GL_REFERENCED_BY_GEOMETRY_SHADER:
    case GL_REFERENCED_BY_FRAGMENT_SHADER:
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index a04b287..a4296ad 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -532,7 +532,7 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
    /* True if geometry shaders (of the form that was adopted into GLSL 1.50
     * and GL 3.2) are available in this context
     */
-   const bool has_core_gs = _mesa_is_desktop_gl(ctx) && ctx->Version >= 32;
+   const bool has_core_gs = _mesa_has_geometry_shaders(ctx);
 
    /* Are uniform buffer objects available in this context?
     */
@@ -569,13 +569,13 @@ get_programiv(struct gl_context *ctx, GLuint program, GLenum pname,
       *params = _mesa_longest_attribute_name_length(shProg);
       return;
    case GL_ACTIVE_UNIFORMS:
-      *params = shProg->NumUserUniformStorage - shProg->NumHiddenUniforms;
+      *params = shProg->NumUniformStorage - shProg->NumHiddenUniforms;
       return;
    case GL_ACTIVE_UNIFORM_MAX_LENGTH: {
       unsigned i;
       GLint max_len = 0;
       const unsigned num_uniforms =
-         shProg->NumUserUniformStorage - shProg->NumHiddenUniforms;
+         shProg->NumUniformStorage - shProg->NumHiddenUniforms;
 
       for (i = 0; i < num_uniforms; i++) {
 	 /* Add one for the terminating NUL character for a non-array, and
diff --git a/src/mesa/main/shaderobj.c b/src/mesa/main/shaderobj.c
index e428960..110a18e 100644
--- a/src/mesa/main/shaderobj.c
+++ b/src/mesa/main/shaderobj.c
@@ -282,10 +282,10 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
    unsigned i;
 
    if (shProg->UniformStorage) {
-      for (i = 0; i < shProg->NumUserUniformStorage; ++i)
+      for (i = 0; i < shProg->NumUniformStorage; ++i)
          _mesa_uniform_detach_all_driver_storage(&shProg->UniformStorage[i]);
       ralloc_free(shProg->UniformStorage);
-      shProg->NumUserUniformStorage = 0;
+      shProg->NumUniformStorage = 0;
       shProg->UniformStorage = NULL;
    }
 
diff --git a/src/mesa/main/shared.c b/src/mesa/main/shared.c
index 0b76cc0..d5ac9f1 100644
--- a/src/mesa/main/shared.c
+++ b/src/mesa/main/shared.c
@@ -313,7 +313,6 @@ free_shared_state(struct gl_context *ctx, struct gl_shared_state *shared)
    _mesa_DeleteHashTable(shared->Programs);
 
    _mesa_reference_vertprog(ctx, &shared->DefaultVertexProgram, NULL);
-   _mesa_reference_geomprog(ctx, &shared->DefaultGeometryProgram, NULL);
    _mesa_reference_fragprog(ctx, &shared->DefaultFragmentProgram, NULL);
 
    _mesa_HashDeleteAll(shared->ATIShaders, delete_fragshader_cb, ctx);
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 99db37b..bede7fe 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -225,7 +225,7 @@ update_program(struct gl_context *ctx)
    if (ctx->GeometryProgram._Current != prevGP) {
       new_state |= _NEW_PROGRAM;
       if (ctx->Driver.BindProgram) {
-         ctx->Driver.BindProgram(ctx, MESA_GEOMETRY_PROGRAM,
+         ctx->Driver.BindProgram(ctx, GL_GEOMETRY_PROGRAM_NV,
                             (struct gl_program *) ctx->GeometryProgram._Current);
       }
    }
@@ -266,15 +266,9 @@ update_program_constants(struct gl_context *ctx)
       }
    }
 
-   if (ctx->GeometryProgram._Current) {
-      const struct gl_program_parameter_list *params =
-         ctx->GeometryProgram._Current->Base.Parameters;
-      /*FIXME: StateFlags is always 0 because we have unnamed constant
-       *       not state changes */
-      if (params /*&& params->StateFlags & ctx->NewState*/) {
-         new_state |= _NEW_PROGRAM_CONSTANTS;
-      }
-   }
+   /* Don't handle geometry shaders here. They don't use any state
+    * constants.
+    */
 
    if (ctx->VertexProgram._Current) {
       const struct gl_program_parameter_list *params =
@@ -389,10 +383,10 @@ _mesa_update_state_locked( struct gl_context *ctx )
       update_frontbit( ctx );
 
    if (new_state & _NEW_BUFFERS)
-      _mesa_update_framebuffer(ctx);
+      _mesa_update_framebuffer(ctx, ctx->ReadBuffer, ctx->DrawBuffer);
 
    if (new_state & (_NEW_SCISSOR | _NEW_BUFFERS | _NEW_VIEWPORT))
-      _mesa_update_draw_buffer_bounds( ctx );
+      _mesa_update_draw_buffer_bounds(ctx, ctx->DrawBuffer);
 
    if (new_state & _NEW_LIGHT)
       _mesa_update_lighting( ctx );
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index ccd0124..800720b 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -68,10 +68,13 @@ struct function {
    int offset;
 };
 
+extern const struct function common_desktop_functions_possible[];
+extern const struct function gl_compatibility_functions_possible[];
 extern const struct function gl_core_functions_possible[];
 extern const struct function gles11_functions_possible[];
 extern const struct function gles2_functions_possible[];
 extern const struct function gles3_functions_possible[];
+extern const struct function gles31_functions_possible[];
 
 class DispatchSanity_test : public ::testing::Test {
 public:
@@ -96,7 +99,7 @@ DispatchSanity_test::SetUp()
    _mesa_init_driver_functions(&driver_functions);
 
    const unsigned size = _glapi_get_dispatch_table_size();
-   nop_table = (_glapi_proc *) _glapi_new_nop_table(size);
+   nop_table = (_glapi_proc *) _mesa_new_nop_table(size);
 }
 
 void
@@ -175,10 +178,19 @@ validate_nops(struct gl_context *ctx, const _glapi_proc *nop_table)
 TEST_F(DispatchSanity_test, GL31_CORE)
 {
    SetUpCtx(API_OPENGL_CORE, 31);
+   validate_functions(&ctx, common_desktop_functions_possible, nop_table);
    validate_functions(&ctx, gl_core_functions_possible, nop_table);
    validate_nops(&ctx, nop_table);
 }
 
+TEST_F(DispatchSanity_test, GL30)
+{
+   SetUpCtx(API_OPENGL_COMPAT, 30);
+   validate_functions(&ctx, common_desktop_functions_possible, nop_table);
+   validate_functions(&ctx, gl_compatibility_functions_possible, nop_table);
+   validate_nops(&ctx, nop_table);
+}
+
 TEST_F(DispatchSanity_test, GLES11)
 {
    SetUpCtx(API_OPENGLES, 11);
@@ -201,7 +213,16 @@ TEST_F(DispatchSanity_test, GLES3)
    validate_nops(&ctx, nop_table);
 }
 
-const struct function gl_core_functions_possible[] = {
+TEST_F(DispatchSanity_test, GLES31)
+{
+   SetUpCtx(API_OPENGLES2, 31);
+   validate_functions(&ctx, gles2_functions_possible, nop_table);
+   validate_functions(&ctx, gles3_functions_possible, nop_table);
+   validate_functions(&ctx, gles31_functions_possible, nop_table);
+   validate_nops(&ctx, nop_table);
+}
+
+const struct function common_desktop_functions_possible[] = {
    { "glCullFace", 10, -1 },
    { "glFrontFace", 10, -1 },
    { "glHint", 10, -1 },
@@ -213,8 +234,8 @@ const struct function gl_core_functions_possible[] = {
    { "glTexParameterfv", 10, -1 },
    { "glTexParameteri", 10, -1 },
    { "glTexParameteriv", 10, -1 },
-   { "glTexImage1D", 10, -1 },
-   { "glTexImage2D", 10, -1 },
+   { "glTexImage1D", 10, _gloffset_TexImage1D },
+   { "glTexImage2D", 10, _gloffset_TexImage2D },
    { "glDrawBuffer", 10, -1 },
    { "glClear", 10, -1 },
    { "glClearColor", 10, -1 },
@@ -482,7 +503,6 @@ const struct function gl_core_functions_possible[] = {
    /* GL 3.1 */
    { "glDrawArraysInstanced", 31, -1 },
    { "glDrawElementsInstanced", 31, -1 },
-   { "glTexBuffer", 31, -1 },
    { "glPrimitiveRestartIndex", 31, -1 },
 
    /* GL_ARB_shader_objects */
@@ -535,12 +555,8 @@ const struct function gl_core_functions_possible[] = {
    { "glGetInteger64i_v", 32, -1 },
    { "glGetBufferParameteri64v", 32, -1 },
    { "glFramebufferTexture", 32, -1 },
-
-   /* GL_ARB_geometry_shader4 */
-   { "glProgramParameteriARB", 32, -1 },
-   { "glFramebufferTextureARB", 32, -1 },
-   { "glFramebufferTextureLayerARB", 32, -1 },
-   { "glFramebufferTextureFaceARB", 32, -1 },
+   { "glProgramParameteri", 32, -1 },
+   { "glFramebufferTextureLayer", 32, -1 },
 
    /* GL 3.3 */
    { "glVertexAttribDivisor", 33, -1 },
@@ -673,34 +689,6 @@ const struct function gl_core_functions_possible[] = {
    { "glVertexAttribP4uiv", 43, -1 },
    { "glDrawArraysIndirect", 43, -1 },
    { "glDrawElementsIndirect", 43, -1 },
-   { "glUniform1d", 40, -1 },
-   { "glUniform2d", 40, -1 },
-   { "glUniform3d", 40, -1 },
-   { "glUniform4d", 40, -1 },
-   { "glUniform1dv", 40, -1 },
-   { "glUniform2dv", 40, -1 },
-   { "glUniform3dv", 40, -1 },
-   { "glUniform4dv", 40, -1 },
-   { "glUniformMatrix2dv", 40, -1 },
-   { "glUniformMatrix3dv", 40, -1 },
-   { "glUniformMatrix4dv", 40, -1 },
-   { "glUniformMatrix2x3dv", 40, -1 },
-   { "glUniformMatrix2x4dv", 40, -1 },
-   { "glUniformMatrix3x2dv", 40, -1 },
-   { "glUniformMatrix3x4dv", 40, -1 },
-   { "glUniformMatrix4x2dv", 40, -1 },
-   { "glUniformMatrix4x3dv", 40, -1 },
-   { "glGetUniformdv", 43, -1 },
-// { "glGetSubroutineUniformLocation", 43, -1 },        // XXX: Add to xml
-// { "glGetSubroutineIndex", 43, -1 },                  // XXX: Add to xml
-// { "glGetActiveSubroutineUniformiv", 43, -1 },        // XXX: Add to xml
-// { "glGetActiveSubroutineUniformName", 43, -1 },      // XXX: Add to xml
-// { "glGetActiveSubroutineName", 43, -1 },             // XXX: Add to xml
-// { "glUniformSubroutinesuiv", 43, -1 },               // XXX: Add to xml
-// { "glGetUniformSubroutineuiv", 43, -1 },             // XXX: Add to xml
-// { "glGetProgramStageiv", 43, -1 },                   // XXX: Add to xml
-// { "glPatchParameteri", 43, -1 },                     // XXX: Add to xml
-// { "glPatchParameterfv", 43, -1 },                    // XXX: Add to xml
    { "glBindTransformFeedback", 43, -1 },
    { "glDeleteTransformFeedbacks", 43, -1 },
    { "glGenTransformFeedbacks", 43, -1 },
@@ -728,12 +716,12 @@ const struct function gl_core_functions_possible[] = {
    { "glGenProgramPipelines", 43, -1 },
    { "glIsProgramPipeline", 43, -1 },
    { "glGetProgramPipelineiv", 43, -1 },
+   { "glProgramUniform1d", 43, -1 },
+   { "glProgramUniform1dv", 43, -1 },
    { "glProgramUniform1i", 43, -1 },
    { "glProgramUniform1iv", 43, -1 },
    { "glProgramUniform1f", 43, -1 },
    { "glProgramUniform1fv", 43, -1 },
-   { "glProgramUniform1d", 40, -1 },
-   { "glProgramUniform1dv", 40, -1 },
    { "glProgramUniform1ui", 43, -1 },
    { "glProgramUniform1uiv", 43, -1 },
    { "glProgramUniform2i", 43, -1 },
@@ -754,50 +742,32 @@ const struct function gl_core_functions_possible[] = {
    { "glProgramUniform3uiv", 43, -1 },
    { "glProgramUniform4i", 43, -1 },
    { "glProgramUniform4iv", 43, -1 },
+   { "glProgramUniform4d", 43, -1 },
+   { "glProgramUniform4dv", 43, -1 },
    { "glProgramUniform4f", 43, -1 },
    { "glProgramUniform4fv", 43, -1 },
-   { "glProgramUniform4d", 40, -1 },
-   { "glProgramUniform4dv", 40, -1 },
    { "glProgramUniform4ui", 43, -1 },
    { "glProgramUniform4uiv", 43, -1 },
+   { "glProgramUniformMatrix2dv", 43, -1 },
    { "glProgramUniformMatrix2fv", 43, -1 },
+   { "glProgramUniformMatrix3dv", 43, -1 },
    { "glProgramUniformMatrix3fv", 43, -1 },
+   { "glProgramUniformMatrix4dv", 43, -1 },
    { "glProgramUniformMatrix4fv", 43, -1 },
-   { "glProgramUniformMatrix2dv", 40, -1 },
-   { "glProgramUniformMatrix3dv", 40, -1 },
-   { "glProgramUniformMatrix4dv", 40, -1 },
+   { "glProgramUniformMatrix2x3dv", 43, -1 },
    { "glProgramUniformMatrix2x3fv", 43, -1 },
+   { "glProgramUniformMatrix3x2dv", 43, -1 },
    { "glProgramUniformMatrix3x2fv", 43, -1 },
+   { "glProgramUniformMatrix2x4dv", 43, -1 },
    { "glProgramUniformMatrix2x4fv", 43, -1 },
+   { "glProgramUniformMatrix4x2dv", 43, -1 },
    { "glProgramUniformMatrix4x2fv", 43, -1 },
+   { "glProgramUniformMatrix3x4dv", 43, -1 },
    { "glProgramUniformMatrix3x4fv", 43, -1 },
+   { "glProgramUniformMatrix4x3dv", 43, -1 },
    { "glProgramUniformMatrix4x3fv", 43, -1 },
-   { "glProgramUniformMatrix2x3dv", 40, -1 },
-   { "glProgramUniformMatrix3x2dv", 40, -1 },
-   { "glProgramUniformMatrix2x4dv", 40, -1 },
-   { "glProgramUniformMatrix4x2dv", 40, -1 },
-   { "glProgramUniformMatrix3x4dv", 40, -1 },
-   { "glProgramUniformMatrix4x3dv", 40, -1 },
    { "glValidateProgramPipeline", 43, -1 },
    { "glGetProgramPipelineInfoLog", 43, -1 },
-   { "glVertexAttribL1d", 41, -1 },
-   { "glVertexAttribL2d", 41, -1 },
-   { "glVertexAttribL3d", 41, -1 },
-   { "glVertexAttribL4d", 41, -1 },
-   { "glVertexAttribL1dv", 41, -1 },
-   { "glVertexAttribL2dv", 41, -1 },
-   { "glVertexAttribL3dv", 41, -1 },
-   { "glVertexAttribL4dv", 41, -1 },
-   { "glVertexAttribLPointer", 41, -1 },
-   { "glGetVertexAttribLdv", 41, -1 },
-   { "glViewportArrayv", 43, -1 },
-   { "glViewportIndexedf", 43, -1 },
-   { "glViewportIndexedfv", 43, -1 },
-   { "glScissorArrayv", 43, -1 },
-   { "glScissorIndexed", 43, -1 },
-   { "glScissorIndexedv", 43, -1 },
-   { "glDepthRangeArrayv", 43, -1 },
-   { "glDepthRangeIndexed", 43, -1 },
    { "glGetFloati_v", 43, -1 },
    { "glGetDoublei_v", 43, -1 },
 // { "glCreateSyncFromCLeventARB", 43, -1 },            // XXX: Add to xml
@@ -840,8 +810,6 @@ const struct function gl_core_functions_possible[] = {
    { "glClearBufferSubData", 43, -1 },
 // { "glClearNamedBufferDataEXT", 43, -1 },             // XXX: Add to xml
 // { "glClearNamedBufferSubDataEXT", 43, -1 },          // XXX: Add to xml
-   { "glDispatchCompute", 43, -1 },
-   { "glDispatchComputeIndirect", 43, -1 },
    { "glCopyImageSubData", 43, -1 },
    { "glTextureView", 43, -1 },
    { "glBindVertexBuffer", 43, -1 },
@@ -853,11 +821,10 @@ const struct function gl_core_functions_possible[] = {
 // { "glVertexArrayBindVertexBufferEXT", 43, -1 },      // XXX: Add to xml
 // { "glVertexArrayVertexAttribFormatEXT", 43, -1 },    // XXX: Add to xml
 // { "glVertexArrayVertexAttribIFormatEXT", 43, -1 },   // XXX: Add to xml
-// { "glVertexArrayVertexAttribLFormatEXT", 43, -1 },   // XXX: Add to xml
 // { "glVertexArrayVertexAttribBindingEXT", 43, -1 },   // XXX: Add to xml
 // { "glVertexArrayVertexBindingDivisorEXT", 43, -1 },  // XXX: Add to xml
-// { "glFramebufferParameteri", 43, -1 },               // XXX: Add to xml
-// { "glGetFramebufferParameteriv", 43, -1 },           // XXX: Add to xml
+   { "glFramebufferParameteri", 43, -1 },
+   { "glGetFramebufferParameteriv", 43, -1 },
 // { "glNamedFramebufferParameteriEXT", 43, -1 },       // XXX: Add to xml
 // { "glGetNamedFramebufferParameterivEXT", 43, -1 },   // XXX: Add to xml
 // { "glGetInternalformati64v", 43, -1 },               // XXX: Add to xml
@@ -876,7 +843,6 @@ const struct function gl_core_functions_possible[] = {
    { "glGetProgramResourceLocation", 43, -1 },
    { "glGetProgramResourceLocationIndex", 43, -1 },
 // { "glShaderStorageBlockBinding", 43, -1 },           // XXX: Add to xml
-   { "glTexBufferRange", 43, -1 },
 // { "glTextureBufferRangeEXT", 43, -1 },               // XXX: Add to xml
    { "glTexStorage2DMultisample", 43, -1 },
    { "glTexStorage3DMultisample", 43, -1 },
@@ -958,6 +924,814 @@ const struct function gl_core_functions_possible[] = {
    /* GL_ARB_clip_control */
    { "glClipControl", 45, -1 },
 
+   /* GL_ARB_compute_shader */
+   { "glDispatchCompute", 43, -1 },
+   { "glDispatchComputeIndirect", 43, -1 },
+
+   /* GL_EXT_polygon_offset_clamp */
+   { "glPolygonOffsetClampEXT", 11, -1 },
+   { NULL, 0, -1 }
+};
+
+const struct function gl_compatibility_functions_possible[] = {
+   { "glBindVertexArrayAPPLE", 10, -1 },
+   { "glGenVertexArraysAPPLE", 10, -1 },
+   { "glBindRenderbufferEXT", 10, -1 },
+   { "glBindFramebufferEXT", 10, -1 },
+   { "glNewList", 10, _gloffset_NewList },
+   { "glEndList", 10, _gloffset_EndList },
+   { "glCallList", 10, _gloffset_CallList },
+   { "glCallLists", 10, _gloffset_CallLists },
+   { "glDeleteLists", 10, _gloffset_DeleteLists },
+   { "glGenLists", 10, _gloffset_GenLists },
+   { "glListBase", 10, _gloffset_ListBase },
+   { "glBegin", 10, _gloffset_Begin },
+   { "glBitmap", 10, _gloffset_Bitmap },
+   { "glColor3b", 10, _gloffset_Color3b },
+   { "glColor3bv", 10, _gloffset_Color3bv },
+   { "glColor3d", 10, _gloffset_Color3d },
+   { "glColor3dv", 10, _gloffset_Color3dv },
+   { "glColor3f", 10, _gloffset_Color3f },
+   { "glColor3fv", 10, _gloffset_Color3fv },
+   { "glColor3i", 10, _gloffset_Color3i },
+   { "glColor3iv", 10, _gloffset_Color3iv },
+   { "glColor3s", 10, _gloffset_Color3s },
+   { "glColor3sv", 10, _gloffset_Color3sv },
+   { "glColor3ub", 10, _gloffset_Color3ub },
+   { "glColor3ubv", 10, _gloffset_Color3ubv },
+   { "glColor3ui", 10, _gloffset_Color3ui },
+   { "glColor3uiv", 10, _gloffset_Color3uiv },
+   { "glColor3us", 10, _gloffset_Color3us },
+   { "glColor3usv", 10, _gloffset_Color3usv },
+   { "glColor4b", 10, _gloffset_Color4b },
+   { "glColor4bv", 10, _gloffset_Color4bv },
+   { "glColor4d", 10, _gloffset_Color4d },
+   { "glColor4dv", 10, _gloffset_Color4dv },
+   { "glColor4f", 10, _gloffset_Color4f },
+   { "glColor4fv", 10, _gloffset_Color4fv },
+   { "glColor4i", 10, _gloffset_Color4i },
+   { "glColor4iv", 10, _gloffset_Color4iv },
+   { "glColor4s", 10, _gloffset_Color4s },
+   { "glColor4sv", 10, _gloffset_Color4sv },
+   { "glColor4ub", 10, _gloffset_Color4ub },
+   { "glColor4ubv", 10, _gloffset_Color4ubv },
+   { "glColor4ui", 10, _gloffset_Color4ui },
+   { "glColor4uiv", 10, _gloffset_Color4uiv },
+   { "glColor4us", 10, _gloffset_Color4us },
+   { "glColor4usv", 10, _gloffset_Color4usv },
+   { "glEdgeFlag", 10, _gloffset_EdgeFlag },
+   { "glEdgeFlagv", 10, _gloffset_EdgeFlagv },
+   { "glEnd", 10, _gloffset_End },
+   { "glIndexd", 10, _gloffset_Indexd },
+   { "glIndexdv", 10, _gloffset_Indexdv },
+   { "glIndexf", 10, _gloffset_Indexf },
+   { "glIndexfv", 10, _gloffset_Indexfv },
+   { "glIndexi", 10, _gloffset_Indexi },
+   { "glIndexiv", 10, _gloffset_Indexiv },
+   { "glIndexs", 10, _gloffset_Indexs },
+   { "glIndexsv", 10, _gloffset_Indexsv },
+   { "glNormal3b", 10, _gloffset_Normal3b },
+   { "glNormal3bv", 10, _gloffset_Normal3bv },
+   { "glNormal3d", 10, _gloffset_Normal3d },
+   { "glNormal3dv", 10, _gloffset_Normal3dv },
+   { "glNormal3f", 10, _gloffset_Normal3f },
+   { "glNormal3fv", 10, _gloffset_Normal3fv },
+   { "glNormal3i", 10, _gloffset_Normal3i },
+   { "glNormal3iv", 10, _gloffset_Normal3iv },
+   { "glNormal3s", 10, _gloffset_Normal3s },
+   { "glNormal3sv", 10, _gloffset_Normal3sv },
+   { "glRasterPos2d", 10, _gloffset_RasterPos2d },
+   { "glRasterPos2dv", 10, _gloffset_RasterPos2dv },
+   { "glRasterPos2f", 10, _gloffset_RasterPos2f },
+   { "glRasterPos2fv", 10, _gloffset_RasterPos2fv },
+   { "glRasterPos2i", 10, _gloffset_RasterPos2i },
+   { "glRasterPos2iv", 10, _gloffset_RasterPos2iv },
+   { "glRasterPos2s", 10, _gloffset_RasterPos2s },
+   { "glRasterPos2sv", 10, _gloffset_RasterPos2sv },
+   { "glRasterPos3d", 10, _gloffset_RasterPos3d },
+   { "glRasterPos3dv", 10, _gloffset_RasterPos3dv },
+   { "glRasterPos3f", 10, _gloffset_RasterPos3f },
+   { "glRasterPos3fv", 10, _gloffset_RasterPos3fv },
+   { "glRasterPos3i", 10, _gloffset_RasterPos3i },
+   { "glRasterPos3iv", 10, _gloffset_RasterPos3iv },
+   { "glRasterPos3s", 10, _gloffset_RasterPos3s },
+   { "glRasterPos3sv", 10, _gloffset_RasterPos3sv },
+   { "glRasterPos4d", 10, _gloffset_RasterPos4d },
+   { "glRasterPos4dv", 10, _gloffset_RasterPos4dv },
+   { "glRasterPos4f", 10, _gloffset_RasterPos4f },
+   { "glRasterPos4fv", 10, _gloffset_RasterPos4fv },
+   { "glRasterPos4i", 10, _gloffset_RasterPos4i },
+   { "glRasterPos4iv", 10, _gloffset_RasterPos4iv },
+   { "glRasterPos4s", 10, _gloffset_RasterPos4s },
+   { "glRasterPos4sv", 10, _gloffset_RasterPos4sv },
+   { "glRectd", 10, _gloffset_Rectd },
+   { "glRectdv", 10, _gloffset_Rectdv },
+   { "glRectf", 10, _gloffset_Rectf },
+   { "glRectfv", 10, _gloffset_Rectfv },
+   { "glRecti", 10, _gloffset_Recti },
+   { "glRectiv", 10, _gloffset_Rectiv },
+   { "glRects", 10, _gloffset_Rects },
+   { "glRectsv", 10, _gloffset_Rectsv },
+   { "glTexCoord1d", 10, _gloffset_TexCoord1d },
+   { "glTexCoord1dv", 10, _gloffset_TexCoord1dv },
+   { "glTexCoord1f", 10, _gloffset_TexCoord1f },
+   { "glTexCoord1fv", 10, _gloffset_TexCoord1fv },
+   { "glTexCoord1i", 10, _gloffset_TexCoord1i },
+   { "glTexCoord1iv", 10, _gloffset_TexCoord1iv },
+   { "glTexCoord1s", 10, _gloffset_TexCoord1s },
+   { "glTexCoord1sv", 10, _gloffset_TexCoord1sv },
+   { "glTexCoord2d", 10, _gloffset_TexCoord2d },
+   { "glTexCoord2dv", 10, _gloffset_TexCoord2dv },
+   { "glTexCoord2f", 10, _gloffset_TexCoord2f },
+   { "glTexCoord2fv", 10, _gloffset_TexCoord2fv },
+   { "glTexCoord2i", 10, _gloffset_TexCoord2i },
+   { "glTexCoord2iv", 10, _gloffset_TexCoord2iv },
+   { "glTexCoord2s", 10, _gloffset_TexCoord2s },
+   { "glTexCoord2sv", 10, _gloffset_TexCoord2sv },
+   { "glTexCoord3d", 10, _gloffset_TexCoord3d },
+   { "glTexCoord3dv", 10, _gloffset_TexCoord3dv },
+   { "glTexCoord3f", 10, _gloffset_TexCoord3f },
+   { "glTexCoord3fv", 10, _gloffset_TexCoord3fv },
+   { "glTexCoord3i", 10, _gloffset_TexCoord3i },
+   { "glTexCoord3iv", 10, _gloffset_TexCoord3iv },
+   { "glTexCoord3s", 10, _gloffset_TexCoord3s },
+   { "glTexCoord3sv", 10, _gloffset_TexCoord3sv },
+   { "glTexCoord4d", 10, _gloffset_TexCoord4d },
+   { "glTexCoord4dv", 10, _gloffset_TexCoord4dv },
+   { "glTexCoord4f", 10, _gloffset_TexCoord4f },
+   { "glTexCoord4fv", 10, _gloffset_TexCoord4fv },
+   { "glTexCoord4i", 10, _gloffset_TexCoord4i },
+   { "glTexCoord4iv", 10, _gloffset_TexCoord4iv },
+   { "glTexCoord4s", 10, _gloffset_TexCoord4s },
+   { "glTexCoord4sv", 10, _gloffset_TexCoord4sv },
+   { "glVertex2d", 10, _gloffset_Vertex2d },
+   { "glVertex2dv", 10, _gloffset_Vertex2dv },
+   { "glVertex2f", 10, _gloffset_Vertex2f },
+   { "glVertex2fv", 10, _gloffset_Vertex2fv },
+   { "glVertex2i", 10, _gloffset_Vertex2i },
+   { "glVertex2iv", 10, _gloffset_Vertex2iv },
+   { "glVertex2s", 10, _gloffset_Vertex2s },
+   { "glVertex2sv", 10, _gloffset_Vertex2sv },
+   { "glVertex3d", 10, _gloffset_Vertex3d },
+   { "glVertex3dv", 10, _gloffset_Vertex3dv },
+   { "glVertex3f", 10, _gloffset_Vertex3f },
+   { "glVertex3fv", 10, _gloffset_Vertex3fv },
+   { "glVertex3i", 10, _gloffset_Vertex3i },
+   { "glVertex3iv", 10, _gloffset_Vertex3iv },
+   { "glVertex3s", 10, _gloffset_Vertex3s },
+   { "glVertex3sv", 10, _gloffset_Vertex3sv },
+   { "glVertex4d", 10, _gloffset_Vertex4d },
+   { "glVertex4dv", 10, _gloffset_Vertex4dv },
+   { "glVertex4f", 10, _gloffset_Vertex4f },
+   { "glVertex4fv", 10, _gloffset_Vertex4fv },
+   { "glVertex4i", 10, _gloffset_Vertex4i },
+   { "glVertex4iv", 10, _gloffset_Vertex4iv },
+   { "glVertex4s", 10, _gloffset_Vertex4s },
+   { "glVertex4sv", 10, _gloffset_Vertex4sv },
+   { "glClipPlane", 10, _gloffset_ClipPlane },
+   { "glColorMaterial", 10, _gloffset_ColorMaterial },
+   { "glFogf", 10, _gloffset_Fogf },
+   { "glFogfv", 10, _gloffset_Fogfv },
+   { "glFogi", 10, _gloffset_Fogi },
+   { "glFogiv", 10, _gloffset_Fogiv },
+   { "glLightf", 10, _gloffset_Lightf },
+   { "glLightfv", 10, _gloffset_Lightfv },
+   { "glLighti", 10, _gloffset_Lighti },
+   { "glLightiv", 10, _gloffset_Lightiv },
+   { "glLightModelf", 10, _gloffset_LightModelf },
+   { "glLightModelfv", 10, _gloffset_LightModelfv },
+   { "glLightModeli", 10, _gloffset_LightModeli },
+   { "glLightModeliv", 10, _gloffset_LightModeliv },
+   { "glLineStipple", 10, _gloffset_LineStipple },
+   { "glMaterialf", 10, _gloffset_Materialf },
+   { "glMaterialfv", 10, _gloffset_Materialfv },
+   { "glMateriali", 10, _gloffset_Materiali },
+   { "glMaterialiv", 10, _gloffset_Materialiv },
+   { "glPolygonStipple", 10, _gloffset_PolygonStipple },
+   { "glShadeModel", 10, _gloffset_ShadeModel },
+   { "glTexEnvf", 10, _gloffset_TexEnvf },
+   { "glTexEnvfv", 10, _gloffset_TexEnvfv },
+   { "glTexEnvi", 10, _gloffset_TexEnvi },
+   { "glTexEnviv", 10, _gloffset_TexEnviv },
+   { "glTexGend", 10, _gloffset_TexGend },
+   { "glTexGendv", 10, _gloffset_TexGendv },
+   { "glTexGenf", 10, _gloffset_TexGenf },
+   { "glTexGenfv", 10, _gloffset_TexGenfv },
+   { "glTexGeni", 10, _gloffset_TexGeni },
+   { "glTexGeniv", 10, _gloffset_TexGeniv },
+   { "glFeedbackBuffer", 10, _gloffset_FeedbackBuffer },
+   { "glSelectBuffer", 10, _gloffset_SelectBuffer },
+   { "glRenderMode", 10, _gloffset_RenderMode },
+   { "glInitNames", 10, _gloffset_InitNames },
+   { "glLoadName", 10, _gloffset_LoadName },
+   { "glPassThrough", 10, _gloffset_PassThrough },
+   { "glPopName", 10, _gloffset_PopName },
+   { "glPushName", 10, _gloffset_PushName },
+   { "glClearAccum", 10, _gloffset_ClearAccum },
+   { "glClearIndex", 10, _gloffset_ClearIndex },
+   { "glIndexMask", 10, _gloffset_IndexMask },
+   { "glAccum", 10, _gloffset_Accum },
+   { "glPopAttrib", 10, _gloffset_PopAttrib },
+   { "glPushAttrib", 10, _gloffset_PushAttrib },
+   { "glMap1d", 10, _gloffset_Map1d },
+   { "glMap1f", 10, _gloffset_Map1f },
+   { "glMap2d", 10, _gloffset_Map2d },
+   { "glMap2f", 10, _gloffset_Map2f },
+   { "glMapGrid1d", 10, _gloffset_MapGrid1d },
+   { "glMapGrid1f", 10, _gloffset_MapGrid1f },
+   { "glMapGrid2d", 10, _gloffset_MapGrid2d },
+   { "glMapGrid2f", 10, _gloffset_MapGrid2f },
+   { "glEvalCoord1d", 10, _gloffset_EvalCoord1d },
+   { "glEvalCoord1dv", 10, _gloffset_EvalCoord1dv },
+   { "glEvalCoord1f", 10, _gloffset_EvalCoord1f },
+   { "glEvalCoord1fv", 10, _gloffset_EvalCoord1fv },
+   { "glEvalCoord2d", 10, _gloffset_EvalCoord2d },
+   { "glEvalCoord2dv", 10, _gloffset_EvalCoord2dv },
+   { "glEvalCoord2f", 10, _gloffset_EvalCoord2f },
+   { "glEvalCoord2fv", 10, _gloffset_EvalCoord2fv },
+   { "glEvalMesh1", 10, _gloffset_EvalMesh1 },
+   { "glEvalPoint1", 10, _gloffset_EvalPoint1 },
+   { "glEvalMesh2", 10, _gloffset_EvalMesh2 },
+   { "glEvalPoint2", 10, _gloffset_EvalPoint2 },
+   { "glAlphaFunc", 10, _gloffset_AlphaFunc },
+   { "glPixelZoom", 10, _gloffset_PixelZoom },
+   { "glPixelTransferf", 10, _gloffset_PixelTransferf },
+   { "glPixelTransferi", 10, _gloffset_PixelTransferi },
+   { "glPixelMapfv", 10, _gloffset_PixelMapfv },
+   { "glPixelMapuiv", 10, _gloffset_PixelMapuiv },
+   { "glPixelMapusv", 10, _gloffset_PixelMapusv },
+   { "glCopyPixels", 10, _gloffset_CopyPixels },
+   { "glDrawPixels", 10, _gloffset_DrawPixels },
+   { "glGetClipPlane", 10, _gloffset_GetClipPlane },
+   { "glGetLightfv", 10, _gloffset_GetLightfv },
+   { "glGetLightiv", 10, _gloffset_GetLightiv },
+   { "glGetMapdv", 10, _gloffset_GetMapdv },
+   { "glGetMapfv", 10, _gloffset_GetMapfv },
+   { "glGetMapiv", 10, _gloffset_GetMapiv },
+   { "glGetMaterialfv", 10, _gloffset_GetMaterialfv },
+   { "glGetMaterialiv", 10, _gloffset_GetMaterialiv },
+   { "glGetPixelMapfv", 10, _gloffset_GetPixelMapfv },
+   { "glGetPixelMapuiv", 10, _gloffset_GetPixelMapuiv },
+   { "glGetPixelMapusv", 10, _gloffset_GetPixelMapusv },
+   { "glGetPolygonStipple", 10, _gloffset_GetPolygonStipple },
+   { "glGetTexEnvfv", 10, _gloffset_GetTexEnvfv },
+   { "glGetTexEnviv", 10, _gloffset_GetTexEnviv },
+   { "glGetTexGendv", 10, _gloffset_GetTexGendv },
+   { "glGetTexGenfv", 10, _gloffset_GetTexGenfv },
+   { "glGetTexGeniv", 10, _gloffset_GetTexGeniv },
+   { "glIsList", 10, _gloffset_IsList },
+   { "glFrustum", 10, _gloffset_Frustum },
+   { "glLoadIdentity", 10, _gloffset_LoadIdentity },
+   { "glLoadMatrixf", 10, _gloffset_LoadMatrixf },
+   { "glLoadMatrixd", 10, _gloffset_LoadMatrixd },
+   { "glMatrixMode", 10, _gloffset_MatrixMode },
+   { "glMultMatrixf", 10, _gloffset_MultMatrixf },
+   { "glMultMatrixd", 10, _gloffset_MultMatrixd },
+   { "glOrtho", 10, _gloffset_Ortho },
+   { "glPopMatrix", 10, _gloffset_PopMatrix },
+   { "glPushMatrix", 10, _gloffset_PushMatrix },
+   { "glRotated", 10, _gloffset_Rotated },
+   { "glRotatef", 10, _gloffset_Rotatef },
+   { "glScaled", 10, _gloffset_Scaled },
+   { "glScalef", 10, _gloffset_Scalef },
+   { "glTranslated", 10, _gloffset_Translated },
+   { "glTranslatef", 10, _gloffset_Translatef },
+   { "glArrayElement", 10, _gloffset_ArrayElement },
+   { "glColorPointer", 10, _gloffset_ColorPointer },
+   { "glDisableClientState", 10, _gloffset_DisableClientState },
+   { "glEdgeFlagPointer", 10, _gloffset_EdgeFlagPointer },
+   { "glEnableClientState", 10, _gloffset_EnableClientState },
+   { "glIndexPointer", 10, _gloffset_IndexPointer },
+   { "glInterleavedArrays", 10, _gloffset_InterleavedArrays },
+   { "glNormalPointer", 10, _gloffset_NormalPointer },
+   { "glTexCoordPointer", 10, _gloffset_TexCoordPointer },
+   { "glVertexPointer", 10, _gloffset_VertexPointer },
+   { "glAreTexturesResident", 10, _gloffset_AreTexturesResident },
+   { "glPrioritizeTextures", 10, _gloffset_PrioritizeTextures },
+   { "glIndexub", 10, _gloffset_Indexub },
+   { "glIndexubv", 10, _gloffset_Indexubv },
+   { "glPopClientAttrib", 10, _gloffset_PopClientAttrib },
+   { "glPushClientAttrib", 10, _gloffset_PushClientAttrib },
+   { "glColorTable", 10, _gloffset_ColorTable },
+   { "glColorTableParameterfv", 10, _gloffset_ColorTableParameterfv },
+   { "glColorTableParameteriv", 10, _gloffset_ColorTableParameteriv },
+   { "glCopyColorTable", 10, _gloffset_CopyColorTable },
+   { "glGetColorTable", 10, _gloffset_GetColorTable },
+   { "glGetColorTableParameterfv", 10, _gloffset_GetColorTableParameterfv },
+   { "glGetColorTableParameteriv", 10, _gloffset_GetColorTableParameteriv },
+   { "glColorSubTable", 10, _gloffset_ColorSubTable },
+   { "glCopyColorSubTable", 10, _gloffset_CopyColorSubTable },
+   { "glConvolutionFilter1D", 10, _gloffset_ConvolutionFilter1D },
+   { "glConvolutionFilter2D", 10, _gloffset_ConvolutionFilter2D },
+   { "glConvolutionParameterf", 10, _gloffset_ConvolutionParameterf },
+   { "glConvolutionParameterfv", 10, _gloffset_ConvolutionParameterfv },
+   { "glConvolutionParameteri", 10, _gloffset_ConvolutionParameteri },
+   { "glConvolutionParameteriv", 10, _gloffset_ConvolutionParameteriv },
+   { "glCopyConvolutionFilter1D", 10, _gloffset_CopyConvolutionFilter1D },
+   { "glCopyConvolutionFilter2D", 10, _gloffset_CopyConvolutionFilter2D },
+   { "glGetConvolutionFilter", 10, _gloffset_GetConvolutionFilter },
+   { "glGetConvolutionParameterfv", 10, _gloffset_GetConvolutionParameterfv },
+   { "glGetConvolutionParameteriv", 10, _gloffset_GetConvolutionParameteriv },
+   { "glGetSeparableFilter", 10, _gloffset_GetSeparableFilter },
+   { "glSeparableFilter2D", 10, _gloffset_SeparableFilter2D },
+   { "glGetHistogram", 10, _gloffset_GetHistogram },
+   { "glGetHistogramParameterfv", 10, _gloffset_GetHistogramParameterfv },
+   { "glGetHistogramParameteriv", 10, _gloffset_GetHistogramParameteriv },
+   { "glGetMinmax", 10, _gloffset_GetMinmax },
+   { "glGetMinmaxParameterfv", 10, _gloffset_GetMinmaxParameterfv },
+   { "glGetMinmaxParameteriv", 10, _gloffset_GetMinmaxParameteriv },
+   { "glHistogram", 10, _gloffset_Histogram },
+   { "glMinmax", 10, _gloffset_Minmax },
+   { "glResetHistogram", 10, _gloffset_ResetHistogram },
+   { "glResetMinmax", 10, _gloffset_ResetMinmax },
+   { "glClientActiveTexture", 10, _gloffset_ClientActiveTexture },
+   { "glMultiTexCoord1d", 10, _gloffset_MultiTexCoord1d },
+   { "glMultiTexCoord1dv", 10, _gloffset_MultiTexCoord1dv },
+   { "glMultiTexCoord1f", 10, _gloffset_MultiTexCoord1fARB },
+   { "glMultiTexCoord1fv", 10, _gloffset_MultiTexCoord1fvARB },
+   { "glMultiTexCoord1i", 10, _gloffset_MultiTexCoord1i },
+   { "glMultiTexCoord1iv", 10, _gloffset_MultiTexCoord1iv },
+   { "glMultiTexCoord1s", 10, _gloffset_MultiTexCoord1s },
+   { "glMultiTexCoord1sv", 10, _gloffset_MultiTexCoord1sv },
+   { "glMultiTexCoord2d", 10, _gloffset_MultiTexCoord2d },
+   { "glMultiTexCoord2dv", 10, _gloffset_MultiTexCoord2dv },
+   { "glMultiTexCoord2f", 10, _gloffset_MultiTexCoord2fARB },
+   { "glMultiTexCoord2fv", 10, _gloffset_MultiTexCoord2fvARB },
+   { "glMultiTexCoord2i", 10, _gloffset_MultiTexCoord2i },
+   { "glMultiTexCoord2iv", 10, _gloffset_MultiTexCoord2iv },
+   { "glMultiTexCoord2s", 10, _gloffset_MultiTexCoord2s },
+   { "glMultiTexCoord2sv", 10, _gloffset_MultiTexCoord2sv },
+   { "glMultiTexCoord3d", 10, _gloffset_MultiTexCoord3d },
+   { "glMultiTexCoord3dv", 10, _gloffset_MultiTexCoord3dv },
+   { "glMultiTexCoord3f", 10, _gloffset_MultiTexCoord3fARB },
+   { "glMultiTexCoord3fv", 10, _gloffset_MultiTexCoord3fvARB },
+   { "glMultiTexCoord3i", 10, _gloffset_MultiTexCoord3i },
+   { "glMultiTexCoord3iv", 10, _gloffset_MultiTexCoord3iv },
+   { "glMultiTexCoord3s", 10, _gloffset_MultiTexCoord3s },
+   { "glMultiTexCoord3sv", 10, _gloffset_MultiTexCoord3sv },
+   { "glMultiTexCoord4d", 10, _gloffset_MultiTexCoord4d },
+   { "glMultiTexCoord4dv", 10, _gloffset_MultiTexCoord4dv },
+   { "glMultiTexCoord4f", 10, _gloffset_MultiTexCoord4fARB },
+   { "glMultiTexCoord4fv", 10, _gloffset_MultiTexCoord4fvARB },
+   { "glMultiTexCoord4i", 10, _gloffset_MultiTexCoord4i },
+   { "glMultiTexCoord4iv", 10, _gloffset_MultiTexCoord4iv },
+   { "glMultiTexCoord4s", 10, _gloffset_MultiTexCoord4s },
+   { "glMultiTexCoord4sv", 10, _gloffset_MultiTexCoord4sv },
+   { "glLoadTransposeMatrixf", 10, -1 },
+   { "glLoadTransposeMatrixd", 10, -1 },
+   { "glMultTransposeMatrixf", 10, -1 },
+   { "glMultTransposeMatrixd", 10, -1 },
+   { "glFogCoordf", 10, -1 },
+   { "glFogCoordfv", 10, -1 },
+   { "glFogCoordd", 10, -1 },
+   { "glFogCoorddv", 10, -1 },
+   { "glFogCoordPointer", 10, -1 },
+   { "glSecondaryColor3b", 10, -1 },
+   { "glSecondaryColor3bv", 10, -1 },
+   { "glSecondaryColor3d", 10, -1 },
+   { "glSecondaryColor3dv", 10, -1 },
+   { "glSecondaryColor3f", 10, -1 },
+   { "glSecondaryColor3fv", 10, -1 },
+   { "glSecondaryColor3i", 10, -1 },
+   { "glSecondaryColor3iv", 10, -1 },
+   { "glSecondaryColor3s", 10, -1 },
+   { "glSecondaryColor3sv", 10, -1 },
+   { "glSecondaryColor3ub", 10, -1 },
+   { "glSecondaryColor3ubv", 10, -1 },
+   { "glSecondaryColor3ui", 10, -1 },
+   { "glSecondaryColor3uiv", 10, -1 },
+   { "glSecondaryColor3us", 10, -1 },
+   { "glSecondaryColor3usv", 10, -1 },
+   { "glSecondaryColorPointer", 10, -1 },
+   { "glWindowPos2d", 10, -1 },
+   { "glWindowPos2dv", 10, -1 },
+   { "glWindowPos2f", 10, -1 },
+   { "glWindowPos2fv", 10, -1 },
+   { "glWindowPos2i", 10, -1 },
+   { "glWindowPos2iv", 10, -1 },
+   { "glWindowPos2s", 10, -1 },
+   { "glWindowPos2sv", 10, -1 },
+   { "glWindowPos3d", 10, -1 },
+   { "glWindowPos3dv", 10, -1 },
+   { "glWindowPos3f", 10, -1 },
+   { "glWindowPos3fv", 10, -1 },
+   { "glWindowPos3i", 10, -1 },
+   { "glWindowPos3iv", 10, -1 },
+   { "glWindowPos3s", 10, -1 },
+   { "glWindowPos3sv", 10, -1 },
+   { "glProgramStringARB", 10, -1 },
+   { "glProgramEnvParameter4dARB", 10, -1 },
+   { "glProgramEnvParameter4dvARB", 10, -1 },
+   { "glProgramEnvParameter4fARB", 10, -1 },
+   { "glProgramEnvParameter4fvARB", 10, -1 },
+   { "glProgramLocalParameter4dARB", 10, -1 },
+   { "glProgramLocalParameter4dvARB", 10, -1 },
+   { "glProgramLocalParameter4fARB", 10, -1 },
+   { "glProgramLocalParameter4fvARB", 10, -1 },
+   { "glGetProgramEnvParameterdvARB", 10, -1 },
+   { "glGetProgramEnvParameterfvARB", 10, -1 },
+   { "glGetProgramLocalParameterdvARB", 10, -1 },
+   { "glGetProgramLocalParameterfvARB", 10, -1 },
+   { "glGetProgramivARB", 10, -1 },
+   { "glGetProgramStringARB", 10, -1 },
+   { "glPolygonOffsetEXT", 10, -1 },
+   { "glColorPointerEXT", 10, -1 },
+   { "glEdgeFlagPointerEXT", 10, -1 },
+   { "glIndexPointerEXT", 10, -1 },
+   { "glNormalPointerEXT", 10, -1 },
+   { "glTexCoordPointerEXT", 10, -1 },
+   { "glVertexPointerEXT", 10, -1 },
+   { "glLockArraysEXT", 10, -1 },
+   { "glUnlockArraysEXT", 10, -1 },
+   { "glWindowPos4dMESA", 10, -1 },
+   { "glWindowPos4dvMESA", 10, -1 },
+   { "glWindowPos4fMESA", 10, -1 },
+   { "glWindowPos4fvMESA", 10, -1 },
+   { "glWindowPos4iMESA", 10, -1 },
+   { "glWindowPos4ivMESA", 10, -1 },
+   { "glWindowPos4sMESA", 10, -1 },
+   { "glWindowPos4svMESA", 10, -1 },
+   { "glBindProgramNV", 10, -1 },
+   { "glDeleteProgramsNV", 10, -1 },
+   { "glGenProgramsNV", 10, -1 },
+   { "glIsProgramNV", 10, -1 },
+   { "glVertexAttrib1sNV", 10, -1 },
+   { "glVertexAttrib1svNV", 10, -1 },
+   { "glVertexAttrib2sNV", 10, -1 },
+   { "glVertexAttrib2svNV", 10, -1 },
+   { "glVertexAttrib3sNV", 10, -1 },
+   { "glVertexAttrib3svNV", 10, -1 },
+   { "glVertexAttrib4sNV", 10, -1 },
+   { "glVertexAttrib4svNV", 10, -1 },
+   { "glVertexAttrib1fNV", 10, -1 },
+   { "glVertexAttrib1fvNV", 10, -1 },
+   { "glVertexAttrib2fNV", 10, -1 },
+   { "glVertexAttrib2fvNV", 10, -1 },
+   { "glVertexAttrib3fNV", 10, -1 },
+   { "glVertexAttrib3fvNV", 10, -1 },
+   { "glVertexAttrib4fNV", 10, -1 },
+   { "glVertexAttrib4fvNV", 10, -1 },
+   { "glVertexAttrib1dNV", 10, -1 },
+   { "glVertexAttrib1dvNV", 10, -1 },
+   { "glVertexAttrib2dNV", 10, -1 },
+   { "glVertexAttrib2dvNV", 10, -1 },
+   { "glVertexAttrib3dNV", 10, -1 },
+   { "glVertexAttrib3dvNV", 10, -1 },
+   { "glVertexAttrib4dNV", 10, -1 },
+   { "glVertexAttrib4dvNV", 10, -1 },
+   { "glVertexAttrib4ubNV", 10, -1 },
+   { "glVertexAttrib4ubvNV", 10, -1 },
+   { "glVertexAttribs1svNV", 10, -1 },
+   { "glVertexAttribs2svNV", 10, -1 },
+   { "glVertexAttribs3svNV", 10, -1 },
+   { "glVertexAttribs4svNV", 10, -1 },
+   { "glVertexAttribs1fvNV", 10, -1 },
+   { "glVertexAttribs2fvNV", 10, -1 },
+   { "glVertexAttribs3fvNV", 10, -1 },
+   { "glVertexAttribs4fvNV", 10, -1 },
+   { "glVertexAttribs1dvNV", 10, -1 },
+   { "glVertexAttribs2dvNV", 10, -1 },
+   { "glVertexAttribs3dvNV", 10, -1 },
+   { "glVertexAttribs4dvNV", 10, -1 },
+   { "glVertexAttribs4ubvNV", 10, -1 },
+   { "glGenFragmentShadersATI", 10, -1 },
+   { "glBindFragmentShaderATI", 10, -1 },
+   { "glDeleteFragmentShaderATI", 10, -1 },
+   { "glBeginFragmentShaderATI", 10, -1 },
+   { "glEndFragmentShaderATI", 10, -1 },
+   { "glPassTexCoordATI", 10, -1 },
+   { "glSampleMapATI", 10, -1 },
+   { "glColorFragmentOp1ATI", 10, -1 },
+   { "glColorFragmentOp2ATI", 10, -1 },
+   { "glColorFragmentOp3ATI", 10, -1 },
+   { "glAlphaFragmentOp1ATI", 10, -1 },
+   { "glAlphaFragmentOp2ATI", 10, -1 },
+   { "glAlphaFragmentOp3ATI", 10, -1 },
+   { "glSetFragmentShaderConstantATI", 10, -1 },
+   { "glActiveStencilFaceEXT", 10, -1 },
+   { "glStencilFuncSeparateATI", 10, -1 },
+   { "glProgramEnvParameters4fvEXT", 10, -1 },
+   { "glProgramLocalParameters4fvEXT", 10, -1 },
+   { "glPrimitiveRestartNV", 10, -1 },
+
+   { NULL, 0, -1 }
+};
+
+const struct function gl_core_functions_possible[] = {
+   /* GL 3.1 */
+   { "glTexBuffer", 31, -1 },
+
+   /* GL 3.2 */
+   { "glFramebufferTexture", 32, -1 },
+
+   /* GL 4.3 */
+   { "glIsRenderbuffer", 43, -1 },
+   { "glBindRenderbuffer", 43, -1 },
+   { "glDeleteRenderbuffers", 43, -1 },
+   { "glGenRenderbuffers", 43, -1 },
+   { "glRenderbufferStorage", 43, -1 },
+   { "glGetRenderbufferParameteriv", 43, -1 },
+   { "glIsFramebuffer", 43, -1 },
+   { "glBindFramebuffer", 43, -1 },
+   { "glDeleteFramebuffers", 43, -1 },
+   { "glGenFramebuffers", 43, -1 },
+   { "glCheckFramebufferStatus", 43, -1 },
+   { "glFramebufferTexture1D", 43, -1 },
+   { "glFramebufferTexture2D", 43, -1 },
+   { "glFramebufferTexture3D", 43, -1 },
+   { "glFramebufferRenderbuffer", 43, -1 },
+   { "glGetFramebufferAttachmentParameteriv", 43, -1 },
+   { "glGenerateMipmap", 43, -1 },
+   { "glBlitFramebuffer", 43, -1 },
+   { "glRenderbufferStorageMultisample", 43, -1 },
+   { "glFramebufferTextureLayer", 43, -1 },
+   { "glMapBufferRange", 43, -1 },
+   { "glFlushMappedBufferRange", 43, -1 },
+   { "glBindVertexArray", 43, -1 },
+   { "glDeleteVertexArrays", 43, -1 },
+   { "glGenVertexArrays", 43, -1 },
+   { "glIsVertexArray", 43, -1 },
+   { "glGetUniformIndices", 43, -1 },
+   { "glGetActiveUniformsiv", 43, -1 },
+   { "glGetActiveUniformName", 43, -1 },
+   { "glGetUniformBlockIndex", 43, -1 },
+   { "glGetActiveUniformBlockiv", 43, -1 },
+   { "glGetActiveUniformBlockName", 43, -1 },
+   { "glUniformBlockBinding", 43, -1 },
+   { "glCopyBufferSubData", 43, -1 },
+   { "glDrawElementsBaseVertex", 43, -1 },
+   { "glDrawRangeElementsBaseVertex", 43, -1 },
+   { "glDrawElementsInstancedBaseVertex", 43, -1 },
+   { "glMultiDrawElementsBaseVertex", 43, -1 },
+   { "glProvokingVertex", 43, -1 },
+   { "glFenceSync", 43, -1 },
+   { "glIsSync", 43, -1 },
+   { "glDeleteSync", 43, -1 },
+   { "glClientWaitSync", 43, -1 },
+   { "glWaitSync", 43, -1 },
+   { "glGetInteger64v", 43, -1 },
+   { "glGetSynciv", 43, -1 },
+   { "glTexImage2DMultisample", 43, -1 },
+   { "glTexImage3DMultisample", 43, -1 },
+   { "glGetMultisamplefv", 43, -1 },
+   { "glSampleMaski", 43, -1 },
+   { "glBlendEquationiARB", 43, -1 },
+   { "glBlendEquationSeparateiARB", 43, -1 },
+   { "glBlendFunciARB", 43, -1 },
+   { "glBlendFuncSeparateiARB", 43, -1 },
+   { "glMinSampleShadingARB", 43, -1 },                 // XXX: Add to xml
+// { "glNamedStringARB", 43, -1 },                      // XXX: Add to xml
+// { "glDeleteNamedStringARB", 43, -1 },                // XXX: Add to xml
+// { "glCompileShaderIncludeARB", 43, -1 },             // XXX: Add to xml
+// { "glIsNamedStringARB", 43, -1 },                    // XXX: Add to xml
+// { "glGetNamedStringARB", 43, -1 },                   // XXX: Add to xml
+// { "glGetNamedStringivARB", 43, -1 },                 // XXX: Add to xml
+   { "glBindFragDataLocationIndexed", 43, -1 },
+   { "glGetFragDataIndex", 43, -1 },
+   { "glGenSamplers", 43, -1 },
+   { "glDeleteSamplers", 43, -1 },
+   { "glIsSampler", 43, -1 },
+   { "glBindSampler", 43, -1 },
+   { "glSamplerParameteri", 43, -1 },
+   { "glSamplerParameteriv", 43, -1 },
+   { "glSamplerParameterf", 43, -1 },
+   { "glSamplerParameterfv", 43, -1 },
+   { "glSamplerParameterIiv", 43, -1 },
+   { "glSamplerParameterIuiv", 43, -1 },
+   { "glGetSamplerParameteriv", 43, -1 },
+   { "glGetSamplerParameterIiv", 43, -1 },
+   { "glGetSamplerParameterfv", 43, -1 },
+   { "glGetSamplerParameterIuiv", 43, -1 },
+   { "glQueryCounter", 43, -1 },
+   { "glGetQueryObjecti64v", 43, -1 },
+   { "glGetQueryObjectui64v", 43, -1 },
+   { "glVertexP2ui", 43, -1 },
+   { "glVertexP2uiv", 43, -1 },
+   { "glVertexP3ui", 43, -1 },
+   { "glVertexP3uiv", 43, -1 },
+   { "glVertexP4ui", 43, -1 },
+   { "glVertexP4uiv", 43, -1 },
+   { "glTexCoordP1ui", 43, -1 },
+   { "glTexCoordP1uiv", 43, -1 },
+   { "glTexCoordP2ui", 43, -1 },
+   { "glTexCoordP2uiv", 43, -1 },
+   { "glTexCoordP3ui", 43, -1 },
+   { "glTexCoordP3uiv", 43, -1 },
+   { "glTexCoordP4ui", 43, -1 },
+   { "glTexCoordP4uiv", 43, -1 },
+   { "glMultiTexCoordP1ui", 43, -1 },
+   { "glMultiTexCoordP1uiv", 43, -1 },
+   { "glMultiTexCoordP2ui", 43, -1 },
+   { "glMultiTexCoordP2uiv", 43, -1 },
+   { "glMultiTexCoordP3ui", 43, -1 },
+   { "glMultiTexCoordP3uiv", 43, -1 },
+   { "glMultiTexCoordP4ui", 43, -1 },
+   { "glMultiTexCoordP4uiv", 43, -1 },
+   { "glNormalP3ui", 43, -1 },
+   { "glNormalP3uiv", 43, -1 },
+   { "glColorP3ui", 43, -1 },
+   { "glColorP3uiv", 43, -1 },
+   { "glColorP4ui", 43, -1 },
+   { "glColorP4uiv", 43, -1 },
+   { "glVertexAttribP1ui", 43, -1 },
+   { "glVertexAttribP1uiv", 43, -1 },
+   { "glVertexAttribP2ui", 43, -1 },
+   { "glVertexAttribP2uiv", 43, -1 },
+   { "glVertexAttribP3ui", 43, -1 },
+   { "glVertexAttribP3uiv", 43, -1 },
+   { "glVertexAttribP4ui", 43, -1 },
+   { "glVertexAttribP4uiv", 43, -1 },
+   { "glDrawArraysIndirect", 43, -1 },
+   { "glDrawElementsIndirect", 43, -1 },
+
+   { "glUniform1d", 40, -1 },
+   { "glUniform2d", 40, -1 },
+   { "glUniform3d", 40, -1 },
+   { "glUniform4d", 40, -1 },
+   { "glUniform1dv", 40, -1 },
+   { "glUniform2dv", 40, -1 },
+   { "glUniform3dv", 40, -1 },
+   { "glUniform4dv", 40, -1 },
+   { "glUniformMatrix2dv", 40, -1 },
+   { "glUniformMatrix3dv", 40, -1 },
+   { "glUniformMatrix4dv", 40, -1 },
+   { "glUniformMatrix2x3dv", 40, -1 },
+   { "glUniformMatrix2x4dv", 40, -1 },
+   { "glUniformMatrix3x2dv", 40, -1 },
+   { "glUniformMatrix3x4dv", 40, -1 },
+   { "glUniformMatrix4x2dv", 40, -1 },
+   { "glUniformMatrix4x3dv", 40, -1 },
+   { "glGetUniformdv", 43, -1 },
+// { "glGetSubroutineUniformLocation", 43, -1 },        // XXX: Add to xml
+// { "glGetSubroutineIndex", 43, -1 },                  // XXX: Add to xml
+// { "glGetActiveSubroutineUniformiv", 43, -1 },        // XXX: Add to xml
+// { "glGetActiveSubroutineUniformName", 43, -1 },      // XXX: Add to xml
+// { "glGetActiveSubroutineName", 43, -1 },             // XXX: Add to xml
+// { "glUniformSubroutinesuiv", 43, -1 },               // XXX: Add to xml
+// { "glGetUniformSubroutineuiv", 43, -1 },             // XXX: Add to xml
+// { "glGetProgramStageiv", 43, -1 },                   // XXX: Add to xml
+// { "glPatchParameteri", 43, -1 },                     // XXX: Add to xml
+// { "glPatchParameterfv", 43, -1 },                    // XXX: Add to xml
+
+   { "glBindTransformFeedback", 43, -1 },
+   { "glDeleteTransformFeedbacks", 43, -1 },
+   { "glGenTransformFeedbacks", 43, -1 },
+   { "glIsTransformFeedback", 43, -1 },
+   { "glPauseTransformFeedback", 43, -1 },
+   { "glResumeTransformFeedback", 43, -1 },
+   { "glDrawTransformFeedback", 43, -1 },
+   { "glDrawTransformFeedbackStream", 43, -1 },
+   { "glBeginQueryIndexed", 43, -1 },
+   { "glEndQueryIndexed", 43, -1 },
+   { "glGetQueryIndexediv", 43, -1 },
+   { "glReleaseShaderCompiler", 43, -1 },
+   { "glShaderBinary", 43, -1 },
+   { "glGetShaderPrecisionFormat", 43, -1 },
+   { "glDepthRangef", 43, -1 },
+   { "glClearDepthf", 43, -1 },
+   { "glGetProgramBinary", 43, -1 },
+   { "glProgramBinary", 43, -1 },
+   { "glProgramParameteri", 43, -1 },
+   { "glUseProgramStages", 43, -1 },
+   { "glActiveShaderProgram", 43, -1 },
+   { "glCreateShaderProgramv", 43, -1 },
+   { "glBindProgramPipeline", 43, -1 },
+   { "glDeleteProgramPipelines", 43, -1 },
+   { "glGenProgramPipelines", 43, -1 },
+   { "glIsProgramPipeline", 43, -1 },
+   { "glGetProgramPipelineiv", 43, -1 },
+   { "glProgramUniform1i", 43, -1 },
+   { "glProgramUniform1iv", 43, -1 },
+   { "glProgramUniform1f", 43, -1 },
+   { "glProgramUniform1fv", 43, -1 },
+   { "glProgramUniform1d", 40, -1 },
+   { "glProgramUniform1dv", 40, -1 },
+   { "glProgramUniform1ui", 43, -1 },
+   { "glProgramUniform1uiv", 43, -1 },
+   { "glProgramUniform2i", 43, -1 },
+   { "glProgramUniform2iv", 43, -1 },
+   { "glProgramUniform2f", 43, -1 },
+   { "glProgramUniform2fv", 43, -1 },
+   { "glProgramUniform2d", 40, -1 },
+   { "glProgramUniform2dv", 40, -1 },
+   { "glProgramUniform2ui", 43, -1 },
+   { "glProgramUniform2uiv", 43, -1 },
+   { "glProgramUniform3i", 43, -1 },
+   { "glProgramUniform3iv", 43, -1 },
+   { "glProgramUniform3f", 43, -1 },
+   { "glProgramUniform3fv", 43, -1 },
+   { "glProgramUniform3d", 40, -1 },
+   { "glProgramUniform3dv", 40, -1 },
+   { "glProgramUniform3ui", 43, -1 },
+   { "glProgramUniform3uiv", 43, -1 },
+   { "glProgramUniform4i", 43, -1 },
+   { "glProgramUniform4iv", 43, -1 },
+   { "glProgramUniform4f", 43, -1 },
+   { "glProgramUniform4fv", 43, -1 },
+   { "glProgramUniform4d", 40, -1 },
+   { "glProgramUniform4dv", 40, -1 },
+   { "glProgramUniform4ui", 43, -1 },
+   { "glProgramUniform4uiv", 43, -1 },
+   { "glProgramUniformMatrix2fv", 43, -1 },
+   { "glProgramUniformMatrix3fv", 43, -1 },
+   { "glProgramUniformMatrix4fv", 43, -1 },
+   { "glProgramUniformMatrix2dv", 40, -1 },
+   { "glProgramUniformMatrix3dv", 40, -1 },
+   { "glProgramUniformMatrix4dv", 40, -1 },
+   { "glProgramUniformMatrix2x3fv", 43, -1 },
+   { "glProgramUniformMatrix3x2fv", 43, -1 },
+   { "glProgramUniformMatrix2x4fv", 43, -1 },
+   { "glProgramUniformMatrix4x2fv", 43, -1 },
+   { "glProgramUniformMatrix3x4fv", 43, -1 },
+   { "glProgramUniformMatrix4x3fv", 43, -1 },
+   { "glProgramUniformMatrix2x3dv", 40, -1 },
+   { "glProgramUniformMatrix3x2dv", 40, -1 },
+   { "glProgramUniformMatrix2x4dv", 40, -1 },
+   { "glProgramUniformMatrix4x2dv", 40, -1 },
+   { "glProgramUniformMatrix3x4dv", 40, -1 },
+   { "glProgramUniformMatrix4x3dv", 40, -1 },
+   { "glValidateProgramPipeline", 43, -1 },
+   { "glGetProgramPipelineInfoLog", 43, -1 },
+
+   { "glVertexAttribL1d", 41, -1 },
+   { "glVertexAttribL2d", 41, -1 },
+   { "glVertexAttribL3d", 41, -1 },
+   { "glVertexAttribL4d", 41, -1 },
+   { "glVertexAttribL1dv", 41, -1 },
+   { "glVertexAttribL2dv", 41, -1 },
+   { "glVertexAttribL3dv", 41, -1 },
+   { "glVertexAttribL4dv", 41, -1 },
+   { "glVertexAttribLPointer", 41, -1 },
+   { "glGetVertexAttribLdv", 41, -1 },
+   { "glViewportArrayv", 43, -1 },
+   { "glViewportIndexedf", 43, -1 },
+   { "glViewportIndexedfv", 43, -1 },
+   { "glScissorArrayv", 43, -1 },
+   { "glScissorIndexed", 43, -1 },
+   { "glScissorIndexedv", 43, -1 },
+   { "glDepthRangeArrayv", 43, -1 },
+   { "glDepthRangeIndexed", 43, -1 },
+
+// { "glCreateSyncFromCLeventARB", 43, -1 },            // XXX: Add to xml
+
+   { "glDrawArraysInstancedBaseInstance", 43, -1 },
+   { "glDrawElementsInstancedBaseInstance", 43, -1 },
+   { "glDrawElementsInstancedBaseVertexBaseInstance", 43, -1 },
+   { "glDrawTransformFeedbackInstanced", 43, -1 },
+   { "glDrawTransformFeedbackStreamInstanced", 43, -1 },
+   { "glGetActiveAtomicCounterBufferiv", 43, -1 },
+   { "glBindImageTexture", 43, -1 },
+   { "glMemoryBarrier", 43, -1 },
+   { "glTexStorage1D", 43, -1 },
+   { "glTexStorage2D", 43, -1 },
+   { "glTexStorage3D", 43, -1 },
+   { "glTextureStorage1DEXT", 43, -1 },
+   { "glTextureStorage2DEXT", 43, -1 },
+   { "glTextureStorage3DEXT", 43, -1 },
+   { "glClearBufferData", 43, -1 },
+   { "glClearBufferSubData", 43, -1 },
+// { "glClearNamedBufferDataEXT", 43, -1 },             // XXX: Add to xml
+// { "glClearNamedBufferSubDataEXT", 43, -1 },          // XXX: Add to xml
+   { "glCopyImageSubData", 43, -1 },
+   { "glTextureView", 43, -1 },
+   { "glBindVertexBuffer", 43, -1 },
+   { "glVertexAttribFormat", 43, -1 },
+   { "glVertexAttribIFormat", 43, -1 },
+   { "glVertexAttribBinding", 43, -1 },
+   { "glVertexBindingDivisor", 43, -1 },
+// { "glVertexArrayBindVertexBufferEXT", 43, -1 },      // XXX: Add to xml
+// { "glVertexArrayVertexAttribFormatEXT", 43, -1 },    // XXX: Add to xml
+// { "glVertexArrayVertexAttribIFormatEXT", 43, -1 },   // XXX: Add to xml
+// { "glVertexArrayVertexAttribLFormatEXT", 43, -1 },   // XXX: Add to xml
+// { "glVertexArrayVertexAttribBindingEXT", 43, -1 },   // XXX: Add to xml
+// { "glVertexArrayVertexBindingDivisorEXT", 43, -1 },  // XXX: Add to xml
+// { "glFramebufferParameteri", 43, -1 },               // XXX: Add to xml
+// { "glGetFramebufferParameteriv", 43, -1 },           // XXX: Add to xml
+// { "glNamedFramebufferParameteriEXT", 43, -1 },       // XXX: Add to xml
+// { "glGetNamedFramebufferParameterivEXT", 43, -1 },   // XXX: Add to xml
+// { "glGetInternalformati64v", 43, -1 },               // XXX: Add to xml
+   { "glInvalidateTexSubImage", 43, -1 },
+   { "glInvalidateTexImage", 43, -1 },
+   { "glInvalidateBufferSubData", 43, -1 },
+   { "glInvalidateBufferData", 43, -1 },
+   { "glInvalidateFramebuffer", 43, -1 },
+   { "glInvalidateSubFramebuffer", 43, -1 },
+   { "glMultiDrawArraysIndirect", 43, -1 },
+   { "glMultiDrawElementsIndirect", 43, -1 },
+   { "glGetProgramInterfaceiv", 43, -1 },
+   { "glGetProgramResourceIndex", 43, -1 },
+   { "glGetProgramResourceName", 43, -1 },
+   { "glGetProgramResourceiv", 43, -1 },
+   { "glGetProgramResourceLocation", 43, -1 },
+   { "glGetProgramResourceLocationIndex", 43, -1 },
+// { "glShaderStorageBlockBinding", 43, -1 },           // XXX: Add to xml
+   { "glTexBufferRange", 43, -1 },
+// { "glTextureBufferRangeEXT", 43, -1 },               // XXX: Add to xml
+   { "glTexStorage2DMultisample", 43, -1 },
+   { "glTexStorage3DMultisample", 43, -1 },
+// { "glTextureStorage2DMultisampleEXT", 43, -1 },      // XXX: Add to xml
+// { "glTextureStorage3DMultisampleEXT", 43, -1 },      // XXX: Add to xml
+
    /* GL_ARB_direct_state_access */
    { "glCreateTransformFeedbacks", 45, -1 },
    { "glTransformFeedbackBufferBase", 45, -1 },
@@ -980,6 +1754,24 @@ const struct function gl_core_functions_possible[] = {
    { "glGetNamedBufferParameteri64v", 45, -1 },
    { "glGetNamedBufferPointerv", 45, -1 },
    { "glGetNamedBufferSubData", 45, -1 },
+   { "glCreateFramebuffers", 45, -1 },
+   { "glNamedFramebufferRenderbuffer", 45, -1 },
+   { "glNamedFramebufferParameteri", 45, -1 },
+   { "glNamedFramebufferTexture", 45, -1 },
+   { "glNamedFramebufferTextureLayer", 45, -1 },
+   { "glNamedFramebufferDrawBuffer", 45, -1 },
+   { "glNamedFramebufferDrawBuffers", 45, -1 },
+   { "glNamedFramebufferReadBuffer", 45, -1 },
+   { "glInvalidateNamedFramebufferSubData", 45, -1 },
+   { "glInvalidateNamedFramebufferData", 45, -1 },
+   { "glClearNamedFramebufferiv", 45, -1 },
+   { "glClearNamedFramebufferuiv", 45, -1 },
+   { "glClearNamedFramebufferfv", 45, -1 },
+   { "glClearNamedFramebufferfi", 45, -1 },
+   { "glBlitNamedFramebuffer", 45, -1 },
+   { "glCheckNamedFramebufferStatus", 45, -1 },
+   { "glGetNamedFramebufferParameteriv", 45, -1 },
+   { "glGetNamedFramebufferAttachmentParameteriv", 45, -1 },
    { "glCreateRenderbuffers", 45, -1 },
    { "glNamedRenderbufferStorage", 45, -1 },
    { "glNamedRenderbufferStorageMultisample", 45, -1 },
@@ -1039,9 +1831,6 @@ const struct function gl_core_functions_possible[] = {
    { "glGetQueryBufferObjecti64v", 45, -1 },
    { "glGetQueryBufferObjectui64v", 45, -1 },
 
-   /* GL_EXT_polygon_offset_clamp */
-   { "glPolygonOffsetClampEXT", 11, -1 },
-
    { NULL, 0, -1 }
 };
 
@@ -1596,3 +2385,88 @@ const struct function gles3_functions_possible[] = {
 
    { NULL, 0, -1 }
 };
+
+const struct function gles31_functions_possible[] = { // every live entry below is { name, 31, -1 }
+   { "glDispatchCompute", 31, -1 },
+   { "glDispatchComputeIndirect", 31, -1 },
+   { "glDrawArraysIndirect", 31, -1 },
+   { "glDrawElementsIndirect", 31, -1 },
+
+   // FINISHME: The two entries below stay disabled because the functions have
+   // FINISHME: not been implemented yet (ARB_framebuffer_no_attachments).
+   // { "glFramebufferParameteri", 31, -1 },
+   // { "glGetFramebufferParameteriv", 31, -1 },
+
+   { "glGetProgramInterfaceiv", 31, -1 },
+   { "glGetProgramResourceIndex", 31, -1 },
+   { "glGetProgramResourceName", 31, -1 },
+   { "glGetProgramResourceiv", 31, -1 },
+   { "glGetProgramResourceLocation", 31, -1 },
+
+   // Deliberately disabled: we check for the aliased EXT versions in GLES 2.
+   // { "glUseProgramStages", 31, -1 },
+   // { "glActiveShaderProgram", 31, -1 },
+   // { "glCreateShaderProgramv", 31, -1 },
+   // { "glBindProgramPipeline", 31, -1 },
+   // { "glDeleteProgramPipelines", 31, -1 },
+   // { "glGenProgramPipelines", 31, -1 },
+   // { "glIsProgramPipeline", 31, -1 },
+   // { "glGetProgramPipelineiv", 31, -1 },
+   // { "glProgramUniform1i", 31, -1 },
+   // { "glProgramUniform2i", 31, -1 },
+   // { "glProgramUniform3i", 31, -1 },
+   // { "glProgramUniform4i", 31, -1 },
+   // { "glProgramUniform1f", 31, -1 },
+   // { "glProgramUniform2f", 31, -1 },
+   // { "glProgramUniform3f", 31, -1 },
+   // { "glProgramUniform4f", 31, -1 },
+   // { "glProgramUniform1iv", 31, -1 },
+   // { "glProgramUniform2iv", 31, -1 },
+   // { "glProgramUniform3iv", 31, -1 },
+   // { "glProgramUniform4iv", 31, -1 },
+   // { "glProgramUniform1fv", 31, -1 },
+   // { "glProgramUniform2fv", 31, -1 },
+   // { "glProgramUniform3fv", 31, -1 },
+   // { "glProgramUniform4fv", 31, -1 },
+   // { "glProgramUniformMatrix2fv", 31, -1 },
+   // { "glProgramUniformMatrix3fv", 31, -1 },
+   // { "glProgramUniformMatrix4fv", 31, -1 },
+   // { "glProgramUniformMatrix2x3fv", 31, -1 },
+   // { "glProgramUniformMatrix3x2fv", 31, -1 },
+   // { "glProgramUniformMatrix2x4fv", 31, -1 },
+   // { "glProgramUniformMatrix4x2fv", 31, -1 },
+   // { "glProgramUniformMatrix3x4fv", 31, -1 },
+   // { "glProgramUniformMatrix4x3fv", 31, -1 },
+   // { "glValidateProgramPipeline", 31, -1 },
+   // { "glGetProgramPipelineInfoLog", 31, -1 },
+
+   // Deliberately disabled: we check for the aliased EXT versions in GLES 3.
+   // { "glProgramUniform1ui", 31, -1 },
+   // { "glProgramUniform2ui", 31, -1 },
+   // { "glProgramUniform3ui", 31, -1 },
+   // { "glProgramUniform4ui", 31, -1 },
+   // { "glProgramUniform1uiv", 31, -1 },
+   // { "glProgramUniform2uiv", 31, -1 },
+   // { "glProgramUniform3uiv", 31, -1 },
+   // { "glProgramUniform4uiv", 31, -1 },
+
+   { "glBindImageTexture", 31, -1 },
+   { "glGetBooleani_v", 31, -1 },
+   { "glMemoryBarrier", 31, -1 },
+
+   // FINISHME: Not implemented yet, so the entry below stays disabled.
+   // { "glMemoryBarrierByRegion", 31, -1 },
+
+   { "glTexStorage2DMultisample", 31, -1 },
+   { "glGetMultisamplefv", 31, -1 },
+   { "glSampleMaski", 31, -1 },
+   { "glGetTexLevelParameteriv", 31, -1 },
+   { "glGetTexLevelParameterfv", 31, -1 },
+   { "glBindVertexBuffer", 31, -1 },
+   { "glVertexAttribFormat", 31, -1 },
+   { "glVertexAttribIFormat", 31, -1 },
+   { "glVertexAttribBinding", 31, -1 },
+   { "glVertexBindingDivisor", 31, -1 },
+
+   { NULL, 0, -1 }, // NULL-name entry; presumably the end-of-table marker, as in the other tables -- confirm
+ };
diff --git a/src/mesa/main/texenv.c b/src/mesa/main/texenv.c
index ec521e6..3edafc0 100644
--- a/src/mesa/main/texenv.c
+++ b/src/mesa/main/texenv.c
@@ -646,7 +646,7 @@ _mesa_GetTexEnvfv( GLenum target, GLenum pname, GLfloat *params )
       if (pname == GL_TEXTURE_ENV_COLOR) {
          if(ctx->NewState & (_NEW_BUFFERS | _NEW_FRAG_CLAMP))
             _mesa_update_state(ctx);
-         if (_mesa_get_clamp_fragment_color(ctx))
+         if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
             COPY_4FV( params, texUnit->EnvColor );
          else
             COPY_4FV( params, texUnit->EnvColorUnclamped );
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 7bc1da7..3d85615 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -222,7 +222,7 @@ _mesa_base_tex_format( struct gl_context *ctx, GLint internalFormat )
       }
    }
 
-   if (ctx->Extensions.ARB_stencil_texturing) {
+   if (ctx->Extensions.ARB_texture_stencil8) {
       switch (internalFormat) {
       case GL_STENCIL_INDEX:
       case GL_STENCIL_INDEX1:
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index b5d42d3..d74134f 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1709,7 +1709,7 @@ get_tex_parameterfv(struct gl_context *ctx,
 
          if (ctx->NewState & (_NEW_BUFFERS | _NEW_FRAG_CLAMP))
             _mesa_update_state_locked(ctx);
-         if (_mesa_get_clamp_fragment_color(ctx)) {
+         if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer)) {
             params[0] = CLAMP(obj->Sampler.BorderColor.f[0], 0.0F, 1.0F);
             params[1] = CLAMP(obj->Sampler.BorderColor.f[1], 0.0F, 1.0F);
             params[2] = CLAMP(obj->Sampler.BorderColor.f[2], 0.0F, 1.0F);
diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c
index cd87a27..6b0aed4 100644
--- a/src/mesa/main/textureview.c
+++ b/src/mesa/main/textureview.c
@@ -167,7 +167,7 @@ static const struct internal_format_class_info s3tc_compatible_internal_formats[
  * \return VIEW_CLASS if internalformat found in table, false otherwise.
  */
 static GLenum
-lookup_view_class(struct gl_context *ctx, GLenum internalformat)
+lookup_view_class(const struct gl_context *ctx, GLenum internalformat)
 {
    GLuint i;
 
@@ -176,9 +176,11 @@ lookup_view_class(struct gl_context *ctx, GLenum internalformat)
          return compatible_internal_formats[i].view_class;
    }
 
-   if (ctx->Extensions.EXT_texture_compression_s3tc && ctx->Extensions.EXT_texture_sRGB) {
+   if (ctx->Extensions.EXT_texture_compression_s3tc &&
+       ctx->Extensions.EXT_texture_sRGB) {
       for (i = 0; i < ARRAY_SIZE(s3tc_compatible_internal_formats); i++) {
-         if (s3tc_compatible_internal_formats[i].internal_format == internalformat)
+         if (s3tc_compatible_internal_formats[i].internal_format
+             == internalformat)
             return s3tc_compatible_internal_formats[i].view_class;
       }
    }
@@ -226,7 +228,8 @@ initialize_texture_fields(struct gl_context *ctx,
                                     0, internalFormat, texFormat);
       }
 
-      _mesa_next_mipmap_level_size(target, 0, levelWidth, levelHeight, levelDepth,
+      _mesa_next_mipmap_level_size(target, 0,
+                                   levelWidth, levelHeight, levelDepth,
                                    &levelWidth, &levelHeight, &levelDepth);
    }
 
@@ -320,8 +323,8 @@ target_valid(struct gl_context *ctx, GLenum origTarget, GLenum newTarget)
  * If an error is found, record it with _mesa_error()
  * \return false if any error, true otherwise.
  */
-GLboolean
-_mesa_texture_view_compatible_format(struct gl_context *ctx,
+bool
+_mesa_texture_view_compatible_format(const struct gl_context *ctx,
                                      GLenum origInternalFormat,
                                      GLenum newInternalFormat)
 {
@@ -334,15 +337,16 @@ _mesa_texture_view_compatible_format(struct gl_context *ctx,
     * or an INVALID_OPERATION error is generated.
     */
    if (origInternalFormat == newInternalFormat)
-      return GL_TRUE;
+      return true;
 
    origViewClass = lookup_view_class(ctx, origInternalFormat);
    newViewClass = lookup_view_class(ctx, newInternalFormat);
    if ((origViewClass == newViewClass) && origViewClass != false)
-      return GL_TRUE;
+      return true;
 
-   return GL_FALSE;
+   return false;
 }
+
 /**
  * Helper function for TexStorage and teximagemultisample to set immutable
  * texture state needed by ARB_texture_view.
@@ -357,17 +361,19 @@ _mesa_set_texture_view_state(struct gl_context *ctx,
    /* Get a reference to what will become this View's base level */
    texImage = _mesa_select_tex_image(texObj, target, 0);
 
-   /* When an immutable texture is created via glTexStorage or glTexImageMultisample,
+   /* When an immutable texture is created via glTexStorage or
+    * glTexImageMultisample,
     * TEXTURE_IMMUTABLE_FORMAT becomes TRUE.
     * TEXTURE_IMMUTABLE_LEVELS and TEXTURE_VIEW_NUM_LEVELS become levels.
     * If the texture target is TEXTURE_1D_ARRAY then
     * TEXTURE_VIEW_NUM_LAYERS becomes height.
     * If the texture target is TEXTURE_2D_ARRAY, TEXTURE_CUBE_MAP_ARRAY,
-    * or TEXTURE_2D_MULTISAMPLE_ARRAY then TEXTURE_VIEW_NUM_LAYERS becomes depth.
+    * or TEXTURE_2D_MULTISAMPLE_ARRAY then TEXTURE_VIEW_NUM_LAYERS becomes
+    * depth.
     * If the texture target is TEXTURE_CUBE_MAP, then
     * TEXTURE_VIEW_NUM_LAYERS becomes 6.
     * For any other texture target, TEXTURE_VIEW_NUM_LAYERS becomes 1.
-    * 
+    *
     * ARB_texture_multisample: Multisample textures do
     * not have multiple image levels.
     */
@@ -401,7 +407,6 @@ _mesa_set_texture_view_state(struct gl_context *ctx,
    case GL_TEXTURE_CUBE_MAP:
       texObj->NumLayers = 6;
       break;
-
    }
 }
 
@@ -435,16 +440,20 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
                   minlevel, numlevels, minlayer, numlayers);
 
    if (origtexture == 0) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(origtexture = %u)", origtexture);
+      _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(origtexture = %u)",
+                  origtexture);
       return;
    }
 
    /* Need original texture information to validate arguments */
    origTexObj = _mesa_lookup_texture(ctx, origtexture);
 
-   /* If <origtexture> is not the name of a texture, INVALID_VALUE is generated. */
+   /* If <origtexture> is not the name of a texture, INVALID_VALUE
+    * is generated.
+    */
    if (!origTexObj) {
-      _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(origtexture = %u)", origtexture);
+      _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(origtexture = %u)",
+                  origtexture);
       return;
    }
 
@@ -452,7 +461,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
     * INVALID_OPERATION is generated.
     */
    if (!origTexObj->Immutable) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(origtexture not immutable)");
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(origtexture not immutable)");
       return;
    }
 
@@ -467,7 +477,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
     */
    texObj = _mesa_lookup_texture(ctx, texture);
    if (texObj == NULL) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(texture = %u non-gen name)", texture);
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(texture = %u non-gen name)", texture);
       return;
    }
 
@@ -475,7 +486,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
     * the error INVALID_OPERATION is generated.
     */
    if (texObj->Target) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(texture = %u already bound)", texture);
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(texture = %u already bound)", texture);
       return;
    }
 
@@ -484,33 +496,35 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
       return; /* error was recorded */
    }
 
-   /* minlevel and minlayer are relative to the view of origtexture
+   /* minlevel and minlayer are relative to the view of origtexture.
     * If minlevel or minlayer is greater than level or layer, respectively,
-    * of origtexture return INVALID_VALUE.
+    * return INVALID_VALUE.
     */
    newViewMinLevel = origTexObj->MinLevel + minlevel;
    newViewMinLayer = origTexObj->MinLayer + minlayer;
    if (newViewMinLevel >= (origTexObj->MinLevel + origTexObj->NumLevels)) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glTextureView(new minlevel (%d) > orig minlevel (%d) + orig numlevels (%d))",
+                  "glTextureView(new minlevel (%d) > orig minlevel (%d)"
+                  " + orig numlevels (%d))",
                   newViewMinLevel, origTexObj->MinLevel, origTexObj->NumLevels);
       return;
    }
 
    if (newViewMinLayer >= (origTexObj->MinLayer + origTexObj->NumLayers)) {
       _mesa_error(ctx, GL_INVALID_VALUE,
-                  "glTextureView(new minlayer (%d) > orig minlayer (%d) + orig numlayers (%d))",
+                  "glTextureView(new minlayer (%d) > orig minlayer (%d)"
+                  " + orig numlayers (%d))",
                   newViewMinLayer, origTexObj->MinLayer, origTexObj->NumLayers);
       return;
    }
 
    if (!_mesa_texture_view_compatible_format(ctx,
-                                             origTexObj->Image[0][0]->InternalFormat,
-                                             internalformat)) {
+                                   origTexObj->Image[0][0]->InternalFormat,
+                                   internalformat)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
-                  "glTextureView(internalformat %s not compatible with origtexture %s)",
-                  _mesa_lookup_enum_by_nr(internalformat),
-                  _mesa_lookup_enum_by_nr(origTexObj->Image[0][0]->InternalFormat));
+          "glTextureView(internalformat %s not compatible with origtexture %s)",
+          _mesa_lookup_enum_by_nr(internalformat),
+          _mesa_lookup_enum_by_nr(origTexObj->Image[0][0]->InternalFormat));
       return;
    }
 
@@ -569,14 +583,16 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
    dimensionsOK = _mesa_legal_texture_dimensions(ctx, target, 0,
                                                  width, height, depth, 0);
    if (!dimensionsOK) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(invalid width or height or depth)");
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(invalid width or height or depth)");
       return;
    }
 
    sizeOK = ctx->Driver.TestProxyTexImage(ctx, target, 0, texFormat,
                                           width, height, depth, 0);
    if (!sizeOK) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(invalid texture size)");
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(invalid texture size)");
       return;
    }
 
@@ -591,17 +607,19 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
    case GL_TEXTURE_RECTANGLE:
    case GL_TEXTURE_2D_MULTISAMPLE:
       if (numlayers != 1) {
-         _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(numlayers %d != 1)", numlayers);
+         _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(numlayers %d != 1)",
+                     numlayers);
          return;
       }
       break;
 
    case GL_TEXTURE_CUBE_MAP:
-      /* If the new texture's target is TEXTURE_CUBE_MAP, the clamped <numlayers>
-       * must be equal to 6.
+      /* If the new texture's target is TEXTURE_CUBE_MAP, the clamped
+       * <numlayers> must be equal to 6.
        */
       if (newViewNumLayers != 6) {
-         _mesa_error(ctx, GL_INVALID_VALUE, "glTextureView(clamped numlayers %d != 6)",
+         _mesa_error(ctx, GL_INVALID_VALUE,
+                     "glTextureView(clamped numlayers %d != 6)",
                      newViewNumLayers);
          return;
       }
@@ -615,7 +633,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
        */
       if ((newViewNumLayers % 6) != 0) {
          _mesa_error(ctx, GL_INVALID_VALUE,
-                     "glTextureView(clamped numlayers %d is not a multiple of 6)",
+                     "glTextureView(clamped numlayers %d is not"
+                     " a multiple of 6)",
                      newViewNumLayers);
          return;
       }
@@ -628,7 +647,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
     */
    if ((target == GL_TEXTURE_CUBE_MAP || target == GL_TEXTURE_CUBE_MAP_ARRAY) &&
        (origTexImage->Width != origTexImage->Height)) {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glTextureView(origtexture width (%d) != height (%d))",
+      _mesa_error(ctx, GL_INVALID_OPERATION,
+                  "glTextureView(origtexture width (%d) != height (%d))",
                   origTexImage->Width, origTexImage->Height);
       return;
    }
@@ -662,7 +682,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
    texObj->ImmutableLevels = origTexObj->ImmutableLevels;
    texObj->Target = target;
 
-   if (ctx->Driver.TextureView != NULL && !ctx->Driver.TextureView(ctx, texObj, origTexObj)) {
+   if (ctx->Driver.TextureView != NULL &&
+       !ctx->Driver.TextureView(ctx, texObj, origTexObj)) {
       return; /* driver recorded error */
    }
 }
diff --git a/src/mesa/main/textureview.h b/src/mesa/main/textureview.h
index 549a13c..59e24b6 100644
--- a/src/mesa/main/textureview.h
+++ b/src/mesa/main/textureview.h
@@ -29,8 +29,8 @@
 #ifndef TEXTUREVIEW_H
 #define TEXTUREVIEW_H
 
-GLboolean
-_mesa_texture_view_compatible_format(struct gl_context *ctx,
+bool
+_mesa_texture_view_compatible_format(const struct gl_context *ctx,
                                      GLenum origInternalFormat,
                                      GLenum newInternalFormat);
 
@@ -41,7 +41,8 @@ _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture,
                   GLuint minlayer, GLuint numlayers);
 
 extern void
-_mesa_set_texture_view_state(struct gl_context *ctx, struct gl_texture_object *texObj,
-                       GLenum target, GLuint levels);
+_mesa_set_texture_view_state(struct gl_context *ctx,
+                             struct gl_texture_object *texObj,
+                             GLenum target, GLuint levels);
 
 #endif /* TEXTUREVIEW_H */
diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index 728bd1b..cab5083 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -237,6 +237,13 @@ validate_uniform_parameters(struct gl_context *ctx,
 
    struct gl_uniform_storage *const uni = shProg->UniformRemapTable[location];
 
+   /* Even though no location is assigned to a built-in uniform and this
+    * function should already have returned NULL, this test makes it explicit
+    * that updating the value of a built-in uniform is not allowed.
+    */
+   if (uni->builtin)
+      return NULL;
+
    if (uni->array_elements == 0) {
       if (count > 1) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
@@ -1028,6 +1035,10 @@ _mesa_get_uniform_location(struct gl_shader_program *shProg,
    if (!found)
       return GL_INVALID_INDEX;
 
+   /* If the uniform is built-in, fail. */
+   if (shProg->UniformStorage[location].builtin)
+      return GL_INVALID_INDEX;
+
    /* If the uniform is an array, fail if the index is out of bounds.
     * (A negative index is caught above.)  This also fails if the uniform
     * is not an array, but the user is trying to index it, because
@@ -1047,7 +1058,7 @@ _mesa_sampler_uniforms_are_valid(const struct gl_shader_program *shProg,
 				 char *errMsg, size_t errMsgLength)
 {
    /* Shader does not have samplers. */
-   if (shProg->NumUserUniformStorage == 0)
+   if (shProg->NumUniformStorage == 0)
       return true;
 
    if (!shProg->SamplersValidated) {
@@ -1087,7 +1098,7 @@ _mesa_sampler_uniforms_pipeline_are_valid(struct gl_pipeline_object *pipeline)
       if (!shProg[idx])
          continue;
 
-      for (unsigned i = 0; i < shProg[idx]->NumUserUniformStorage; i++) {
+      for (unsigned i = 0; i < shProg[idx]->NumUniformStorage; i++) {
          const struct gl_uniform_storage *const storage =
             &shProg[idx]->UniformStorage[i];
          const glsl_type *const t = (storage->type->is_array())
diff --git a/src/mesa/main/uniforms.h b/src/mesa/main/uniforms.h
index 55fa235..bd7b05e 100644
--- a/src/mesa/main/uniforms.h
+++ b/src/mesa/main/uniforms.h
@@ -343,10 +343,6 @@ void GLAPIENTRY
 _mesa_ProgramUniformMatrix4x3dv(GLuint program, GLint location, GLsizei count,
                                 GLboolean transpose, const GLdouble *value);
 
-long
-_mesa_parse_program_resource_name(const GLchar *name,
-                                  const GLchar **out_base_name_end);
-
 unsigned
 _mesa_get_uniform_location(struct gl_shader_program *shProg,
 			   const GLchar *name, unsigned *offset);
diff --git a/src/mesa/main/varray.c b/src/mesa/main/varray.c
index 7389037..ebdd9ea 100644
--- a/src/mesa/main/varray.c
+++ b/src/mesa/main/varray.c
@@ -2309,10 +2309,10 @@ print_array(const char *name, GLint index, const struct gl_client_array *array)
       fprintf(stderr, "  %s[%d]: ", name, index);
    else
       fprintf(stderr, "  %s: ", name);
-   fprintf(stderr, "Ptr=%p, Type=0x%x, Size=%d, ElemSize=%u, Stride=%d, Buffer=%u(Size %lu)\n",
-	  array->Ptr, array->Type, array->Size,
-	  array->_ElementSize, array->StrideB,
-	  array->BufferObj->Name, (unsigned long) array->BufferObj->Size);
+   fprintf(stderr, "Ptr=%p, Type=%s, Size=%d, ElemSize=%u, Stride=%d, Buffer=%u(Size %lu)\n",
+           array->Ptr, _mesa_lookup_enum_by_nr(array->Type), array->Size,
+           array->_ElementSize, array->StrideB, array->BufferObj->Name,
+           (unsigned long) array->BufferObj->Size);
 }
 
 
diff --git a/src/mesa/main/version.c b/src/mesa/main/version.c
index 699a0de..8bc00ac 100644
--- a/src/mesa/main/version.c
+++ b/src/mesa/main/version.c
@@ -51,31 +51,51 @@ check_for_ending(const char *string, const char *ending)
  * fwd_context is only valid if version > 0
  */
 static void
-get_gl_override(int *version, bool *fwd_context, bool *compat_context)
+get_gl_override(gl_api api, int *version, bool *fwd_context,
+                bool *compat_context)
 {
-   const char *env_var = "MESA_GL_VERSION_OVERRIDE";
+   const char *env_var = (api == API_OPENGL_CORE || api == API_OPENGL_COMPAT)
+      ? "MESA_GL_VERSION_OVERRIDE" : "MESA_GLES_VERSION_OVERRIDE";
    const char *version_str;
    int major, minor, n;
-   static int override_version = -1;
-   static bool fc_suffix = false;
-   static bool compat_suffix = false;
+   static struct override_info {
+      int version;
+      bool fc_suffix;
+      bool compat_suffix;
+   } override[] = {
+      { -1, false, false},
+      { -1, false, false},
+      { -1, false, false},
+      { -1, false, false},
+   };
 
-   if (override_version < 0) {
-      override_version = 0;
+   STATIC_ASSERT(ARRAY_SIZE(override) == API_OPENGL_LAST + 1);
+
+   if (api == API_OPENGLES)
+      goto exit;
+
+   if (override[api].version < 0) {
+      override[api].version = 0;
 
       version_str = getenv(env_var);
       if (version_str) {
-         fc_suffix = check_for_ending(version_str, "FC");
-         compat_suffix = check_for_ending(version_str, "COMPAT");
+         override[api].fc_suffix = check_for_ending(version_str, "FC");
+         override[api].compat_suffix = check_for_ending(version_str, "COMPAT");
 
          n = sscanf(version_str, "%u.%u", &major, &minor);
          if (n != 2) {
             fprintf(stderr, "error: invalid value for %s: %s\n",
                     env_var, version_str);
-            override_version = 0;
+            override[api].version = 0;
          } else {
-            override_version = major * 10 + minor;
-            if (override_version < 30 && fc_suffix) {
+            override[api].version = major * 10 + minor;
+
+            /* There is no such thing as a compatibility or forward-compatible
+             * context for the OpenGL ES 2.0 or 3.x APIs.
+             */
+            if ((override[api].version < 30 && override[api].fc_suffix) ||
+                (api == API_OPENGLES2 && (override[api].fc_suffix ||
+                                          override[api].compat_suffix))) {
                fprintf(stderr, "error: invalid value for %s: %s\n",
                        env_var, version_str);
             }
@@ -83,9 +103,10 @@ get_gl_override(int *version, bool *fwd_context, bool *compat_context)
       }
    }
 
-   *version = override_version;
-   *fwd_context = fc_suffix;
-   *compat_context = compat_suffix;
+exit:
+   *version = override[api].version;
+   *fwd_context = override[api].fc_suffix;
+   *compat_context = override[api].compat_suffix;
 }
 
 /**
@@ -130,18 +151,26 @@ _mesa_override_gl_version_contextless(struct gl_constants *consts,
    int version;
    bool fwd_context, compat_context;
 
-   get_gl_override(&version, &fwd_context, &compat_context);
+   get_gl_override(*apiOut, &version, &fwd_context, &compat_context);
 
    if (version > 0) {
       *versionOut = version;
-      if (version >= 30 && fwd_context) {
-         *apiOut = API_OPENGL_CORE;
-         consts->ContextFlags |= GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT;
-      } else if (version >= 31 && !compat_context) {
-         *apiOut = API_OPENGL_CORE;
-      } else {
-         *apiOut = API_OPENGL_COMPAT;
+
+      /* If the API is a desktop API, adjust the context flags.  We may also
+       * need to modify the API depending on the version.  For example, Mesa
+       * does not support a GL 3.3 compatibility profile.
+       */
+      if (*apiOut == API_OPENGL_CORE || *apiOut == API_OPENGL_COMPAT) {
+         if (version >= 30 && fwd_context) {
+            *apiOut = API_OPENGL_CORE;
+            consts->ContextFlags |= GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT;
+         } else if (version >= 31 && !compat_context) {
+            *apiOut = API_OPENGL_CORE;
+         } else {
+            *apiOut = API_OPENGL_COMPAT;
+         }
       }
+
       return true;
    }
    return false;
@@ -157,22 +186,6 @@ _mesa_override_gl_version(struct gl_context *ctx)
 }
 
 /**
- * Returns the gl override value
- *
- * version > 0 indicates there is an override requested
- */
-int
-_mesa_get_gl_version_override(void)
-{
-   int version;
-   bool fwd_context, compat_context;
-
-   get_gl_override(&version, &fwd_context, &compat_context);
-
-   return version;
-}
-
-/**
  * Override the context's GLSL version if the environment variable
  * MESA_GLSL_VERSION_OVERRIDE is set. Valid values for
  * MESA_GLSL_VERSION_OVERRIDE are integers, such as "130".
@@ -433,7 +446,23 @@ compute_version_es2(const struct gl_extensions *extensions)
                          extensions->EXT_texture_snorm &&
                          extensions->NV_primitive_restart &&
                          extensions->OES_depth_texture_cube_map);
-   if (ver_3_0) {
+   const bool ver_3_1 = (ver_3_0 &&
+                         extensions->ARB_arrays_of_arrays &&
+                         extensions->ARB_compute_shader &&
+                         extensions->ARB_draw_indirect &&
+                         false /*extensions->ARB_framebuffer_no_attachments*/ &&
+                         extensions->ARB_shader_atomic_counters &&
+                         extensions->ARB_shader_image_load_store &&
+                         false /*extensions->ARB_shader_image_size*/ &&
+                         false /*extensions->ARB_shader_storage_buffer_object*/ &&
+                         extensions->ARB_shading_language_packing &&
+                         extensions->ARB_stencil_texturing &&
+                         extensions->ARB_gpu_shader5 &&
+                         extensions->EXT_shader_integer_mix);
+
+   if (ver_3_1) {
+      return 31;
+   } else if (ver_3_0) {
       return 30;
    } else if (ver_2_0) {
       return 20;
diff --git a/src/mesa/main/version.h b/src/mesa/main/version.h
index 450a0e3..ee7cb75 100644
--- a/src/mesa/main/version.h
+++ b/src/mesa/main/version.h
@@ -47,7 +47,4 @@ _mesa_override_gl_version(struct gl_context *ctx);
 extern void
 _mesa_override_glsl_version(struct gl_constants *consts);
 
-extern int
-_mesa_get_gl_version_override(void);
-
 #endif /* VERSION_H */
diff --git a/src/mesa/main/vtxfmt.c b/src/mesa/main/vtxfmt.c
index d7ef7e2..81bf4c5 100644
--- a/src/mesa/main/vtxfmt.c
+++ b/src/mesa/main/vtxfmt.c
@@ -207,7 +207,7 @@ install_vtxfmt(struct gl_context *ctx, struct _glapi_table *tab,
       SET_VertexAttribP4uiv(tab, vfmt->VertexAttribP4uiv);
    }
 
-   if (_mesa_is_desktop_gl(ctx)) {
+   if (ctx->API == API_OPENGL_CORE) {
       SET_VertexAttribL1d(tab, vfmt->VertexAttribL1d);
       SET_VertexAttribL2d(tab, vfmt->VertexAttribL2d);
       SET_VertexAttribL3d(tab, vfmt->VertexAttribL3d);
diff --git a/src/glsl/tests/common.c b/src/mesa/program/dummy_errors.c
index d69f54d..d69f54d 100644
--- a/src/glsl/tests/common.c
+++ b/src/mesa/program/dummy_errors.c
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index fceed71..3bffe90 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -262,6 +262,7 @@ public:
    virtual void visit(ir_if *);
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void visit(ir_barrier *);
    /*@}*/
 
    src_reg result;
@@ -405,7 +406,7 @@ ir_to_mesa_visitor::emit_dp(ir_instruction *ir,
 			    dst_reg dst, src_reg src0, src_reg src1,
 			    unsigned elements)
 {
-   static const gl_inst_opcode dot_opcodes[] = {
+   static const enum prog_opcode dot_opcodes[] = {
       OPCODE_DP2, OPCODE_DP3, OPCODE_DP4
    };
 
@@ -2118,6 +2119,12 @@ ir_to_mesa_visitor::visit(ir_end_primitive *)
    assert(!"Geometry shaders not supported.");
 }
 
+void
+ir_to_mesa_visitor::visit(ir_barrier *)
+{
+   unreachable("GLSL barrier() not supported.");
+}
+
 ir_to_mesa_visitor::ir_to_mesa_visitor()
 {
    result.file = PROGRAM_UNDEFINED;
@@ -2407,9 +2414,14 @@ _mesa_associate_uniform_storage(struct gl_context *ctx,
       if (!found)
 	 continue;
 
+      struct gl_uniform_storage *storage =
+         &shader_program->UniformStorage[location];
+
+      /* Do not associate any uniform storage to built-in uniforms */
+      if (storage->builtin)
+         continue;
+
       if (location != last_location) {
-	 struct gl_uniform_storage *storage =
-	    &shader_program->UniformStorage[location];
 	 enum gl_uniform_driver_format format = uniform_native;
 
 	 unsigned columns = 0;
@@ -2722,7 +2734,7 @@ get_mesa_program(struct gl_context *ctx,
       mesa_inst->Opcode = inst->op;
       mesa_inst->CondUpdate = inst->cond_update;
       if (inst->saturate)
-	 mesa_inst->SaturateMode = SATURATE_ZERO_ONE;
+	 mesa_inst->Saturate = GL_TRUE;
       mesa_inst->DstReg.File = inst->dst.file;
       mesa_inst->DstReg.Index = inst->dst.index;
       mesa_inst->DstReg.CondMask = inst->dst.cond_mask;
diff --git a/src/mesa/program/prog_execute.c b/src/mesa/program/prog_execute.c
index 16e8e34..46260b5 100644
--- a/src/mesa/program/prog_execute.c
+++ b/src/mesa/program/prog_execute.c
@@ -397,7 +397,7 @@ store_vector4(const struct prog_instruction *inst,
               struct gl_program_machine *machine, const GLfloat value[4])
 {
    const struct prog_dst_register *dstReg = &(inst->DstReg);
-   const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
+   const GLboolean clamp = inst->Saturate;
    GLuint writeMask = dstReg->WriteMask;
    GLfloat clampedValue[4];
    GLfloat *dst = get_dst_register_pointer(dstReg, machine);
diff --git a/src/mesa/program/prog_instruction.c b/src/mesa/program/prog_instruction.c
index f9ebe4e..21ef353 100644
--- a/src/mesa/program/prog_instruction.c
+++ b/src/mesa/program/prog_instruction.c
@@ -55,7 +55,7 @@ _mesa_init_instructions(struct prog_instruction *inst, GLuint count)
       inst[i].DstReg.CondMask = COND_TR;
       inst[i].DstReg.CondSwizzle = SWIZZLE_NOOP;
 
-      inst[i].SaturateMode = SATURATE_OFF;
+      inst[i].Saturate = GL_FALSE;
       inst[i].Precision = FLOAT32;
    }
 }
@@ -114,7 +114,7 @@ _mesa_free_instructions(struct prog_instruction *inst, GLuint count)
  */
 struct instruction_info
 {
-   gl_inst_opcode Opcode;
+   enum prog_opcode Opcode;
    const char *Name;
    GLuint NumSrcRegs;
    GLuint NumDstRegs;
@@ -198,7 +198,7 @@ static const struct instruction_info InstInfo[MAX_OPCODE] = {
  * Return the number of src registers for the given instruction/opcode.
  */
 GLuint
-_mesa_num_inst_src_regs(gl_inst_opcode opcode)
+_mesa_num_inst_src_regs(enum prog_opcode opcode)
 {
    assert(opcode < MAX_OPCODE);
    assert(opcode == InstInfo[opcode].Opcode);
@@ -211,7 +211,7 @@ _mesa_num_inst_src_regs(gl_inst_opcode opcode)
  * Return the number of dst registers for the given instruction/opcode.
  */
 GLuint
-_mesa_num_inst_dst_regs(gl_inst_opcode opcode)
+_mesa_num_inst_dst_regs(enum prog_opcode opcode)
 {
    assert(opcode < MAX_OPCODE);
    assert(opcode == InstInfo[opcode].Opcode);
@@ -221,7 +221,7 @@ _mesa_num_inst_dst_regs(gl_inst_opcode opcode)
 
 
 GLboolean
-_mesa_is_tex_instruction(gl_inst_opcode opcode)
+_mesa_is_tex_instruction(enum prog_opcode opcode)
 {
    return (opcode == OPCODE_TEX ||
            opcode == OPCODE_TXB ||
@@ -285,7 +285,7 @@ _mesa_check_soa_dependencies(const struct prog_instruction *inst)
  * Return string name for given program opcode.
  */
 const char *
-_mesa_opcode_string(gl_inst_opcode opcode)
+_mesa_opcode_string(enum prog_opcode opcode)
 {
    if (opcode < MAX_OPCODE)
       return InstInfo[opcode].Name;
diff --git a/src/mesa/program/prog_instruction.h b/src/mesa/program/prog_instruction.h
index 96da198..d56f96c 100644
--- a/src/mesa/program/prog_instruction.h
+++ b/src/mesa/program/prog_instruction.h
@@ -118,15 +118,6 @@
 
 
 /**
- * Saturation modes when storing values.
- */
-/*@{*/
-#define SATURATE_OFF            0
-#define SATURATE_ZERO_ONE       1
-/*@}*/
-
-
-/**
  * Per-component negation masks
  */
 /*@{*/
@@ -143,7 +134,7 @@
 /**
  * Program instruction opcodes for vertex, fragment and geometry programs.
  */
-typedef enum prog_opcode {
+enum prog_opcode {
                      /* ARB_vp   ARB_fp   NV_vp   NV_fp     GLSL */
                      /*------------------------------------------*/
    OPCODE_NOP = 0,   /*                                      X   */
@@ -213,7 +204,7 @@ typedef enum prog_opcode {
    OPCODE_TRUNC,     /*                                      X   */
    OPCODE_XPD,       /*   X        X                             */
    MAX_OPCODE
-} gl_inst_opcode;
+};
 
 
 /**
@@ -300,7 +291,7 @@ struct prog_dst_register
  */
 struct prog_instruction
 {
-   gl_inst_opcode Opcode;
+   enum prog_opcode Opcode;
    struct prog_src_register SrcReg[3];
    struct prog_dst_register DstReg;
 
@@ -327,15 +318,12 @@ struct prog_instruction
    GLuint CondDst:1;
 
    /**
-    * Saturate each value of the vectored result to the range [0,1] or the
-    * range [-1,1].  \c SSAT mode (i.e., saturation to the range [-1,1]) is
-    * only available in NV_fragment_program2 mode.
-    * Value is one of the SATURATE_* tokens.
+    * Saturate each value of the vectored result to the range [0,1].
     *
     * \since
     * NV_fragment_program_option, NV_vertex_program3.
     */
-   GLuint SaturateMode:2;
+   GLuint Saturate:1;
 
    /**
     * Per-instruction selectable precision: FLOAT32, FLOAT16, FIXED12.
@@ -368,9 +356,6 @@ struct prog_instruction
     */
    GLint BranchTarget;
 
-   /** for driver use (try to remove someday) */
-   GLint Aux;
-
    /** for debugging purposes */
    const char *Comment;
 };
@@ -394,19 +379,19 @@ extern void
 _mesa_free_instructions(struct prog_instruction *inst, GLuint count);
 
 extern GLuint
-_mesa_num_inst_src_regs(gl_inst_opcode opcode);
+_mesa_num_inst_src_regs(enum prog_opcode opcode);
 
 extern GLuint
-_mesa_num_inst_dst_regs(gl_inst_opcode opcode);
+_mesa_num_inst_dst_regs(enum prog_opcode opcode);
 
 extern GLboolean
-_mesa_is_tex_instruction(gl_inst_opcode opcode);
+_mesa_is_tex_instruction(enum prog_opcode opcode);
 
 extern GLboolean
 _mesa_check_soa_dependencies(const struct prog_instruction *inst);
 
 extern const char *
-_mesa_opcode_string(gl_inst_opcode opcode);
+_mesa_opcode_string(enum prog_opcode opcode);
 
 
 #ifdef __cplusplus
diff --git a/src/mesa/program/prog_optimize.c b/src/mesa/program/prog_optimize.c
index 6d4485a..f9e9035 100644
--- a/src/mesa/program/prog_optimize.c
+++ b/src/mesa/program/prog_optimize.c
@@ -478,7 +478,7 @@ can_upward_mov_be_modifed(const struct prog_instruction *mov)
    return
       can_downward_mov_be_modifed(mov) &&
       mov->DstReg.File == PROGRAM_TEMPORARY &&
-      mov->SaturateMode == SATURATE_OFF;
+      !mov->Saturate;
 }
 
 
@@ -653,7 +653,7 @@ _mesa_merge_mov_into_inst(struct prog_instruction *inst,
    if (mask != (inst->DstReg.WriteMask & mask))
       return GL_FALSE;
 
-   inst->SaturateMode |= mov->SaturateMode;
+   inst->Saturate |= mov->Saturate;
 
    /* Depending on the instruction, we may need to recompute the swizzles.
     * Also, some other instructions (like TEX) are not linear. We will only
diff --git a/src/mesa/program/prog_print.c b/src/mesa/program/prog_print.c
index d588d07..e4faa63 100644
--- a/src/mesa/program/prog_print.c
+++ b/src/mesa/program/prog_print.c
@@ -600,7 +600,7 @@ _mesa_fprint_alu_instruction(FILE *f,
       fprintf(f, ".C");
 
    /* frag prog only */
-   if (inst->SaturateMode == SATURATE_ZERO_ONE)
+   if (inst->Saturate)
       fprintf(f, "_SAT");
 
    fprintf(f, " ");
@@ -658,7 +658,7 @@ _mesa_fprint_instruction_opt(FILE *f,
    switch (inst->Opcode) {
    case OPCODE_SWZ:
       fprintf(f, "SWZ");
-      if (inst->SaturateMode == SATURATE_ZERO_ONE)
+      if (inst->Saturate)
          fprintf(f, "_SAT");
       fprintf(f, " ");
       fprint_dst_reg(f, &inst->DstReg, mode, prog);
@@ -675,7 +675,7 @@ _mesa_fprint_instruction_opt(FILE *f,
    case OPCODE_TXB:
    case OPCODE_TXD:
       fprintf(f, "%s", _mesa_opcode_string(inst->Opcode));
-      if (inst->SaturateMode == SATURATE_ZERO_ONE)
+      if (inst->Saturate)
          fprintf(f, "_SAT");
       fprintf(f, " ");
       fprint_dst_reg(f, &inst->DstReg, mode, prog);
@@ -864,7 +864,7 @@ _mesa_fprint_program_opt(FILE *f,
       else
          fprintf(f, "# Fragment Program/Shader %u\n", prog->Id);
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       fprintf(f, "# Geometry Shader\n");
    }
 
diff --git a/src/mesa/program/prog_statevars.c b/src/mesa/program/prog_statevars.c
index 0c0c87f..bdb335e 100644
--- a/src/mesa/program/prog_statevars.c
+++ b/src/mesa/program/prog_statevars.c
@@ -244,14 +244,14 @@ _mesa_fetch_state(struct gl_context *ctx, const gl_state_index state[],
       {
          /* state[1] is the texture unit */
          const GLuint unit = (GLuint) state[1];
-         if (_mesa_get_clamp_fragment_color(ctx))
+         if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
             COPY_4V(value, ctx->Texture.Unit[unit].EnvColor);
          else
             COPY_4V(value, ctx->Texture.Unit[unit].EnvColorUnclamped);
       }
       return;
    case STATE_FOG_COLOR:
-      if (_mesa_get_clamp_fragment_color(ctx))
+      if (_mesa_get_clamp_fragment_color(ctx, ctx->DrawBuffer))
          COPY_4V(value, ctx->Fog.Color);
       else
          COPY_4V(value, ctx->Fog.ColorUnclamped);
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index 6c5fa51..d54f934 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -47,6 +47,7 @@ struct ptn_compile {
    nir_builder build;
    bool error;
 
+   nir_variable *parameters;
    nir_variable *input_vars[VARYING_SLOT_MAX];
    nir_variable *output_vars[VARYING_SLOT_MAX];
    nir_register **output_regs;
@@ -112,21 +113,6 @@ ptn_get_dest(struct ptn_compile *c, const struct prog_dst_register *prog_dst)
    return dest;
 }
 
-/**
- * Multiply the contents of the ADDR register by 4 to convert from the number
- * of vec4s to the number of floating point components.
- */
-static nir_ssa_def *
-ptn_addr_reg_value(struct ptn_compile *c)
-{
-   nir_builder *b = &c->build;
-   nir_alu_src src;
-   memset(&src, 0, sizeof(src));
-   src.src = nir_src_for_reg(c->addr_reg);
-
-   return nir_imul(b, nir_fmov_alu(b, src, 1), nir_imm_int(b, 4));
-}
-
 static nir_ssa_def *
 ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
 {
@@ -180,27 +166,38 @@ ptn_get_src(struct ptn_compile *c, const struct prog_src_register *prog_src)
          }
          /* FALLTHROUGH */
       case PROGRAM_STATE_VAR: {
-         nir_intrinsic_op load_op =
-            prog_src->RelAddr ? nir_intrinsic_load_uniform_indirect :
-                                nir_intrinsic_load_uniform;
-         nir_intrinsic_instr *load = nir_intrinsic_instr_create(b->shader, load_op);
+         nir_intrinsic_instr *load =
+            nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_var);
          nir_ssa_dest_init(&load->instr, &load->dest, 4, NULL);
          load->num_components = 4;
 
-         /* Multiply src->Index by 4 to scale from # of vec4s to components. */
-         load->const_index[0] = 4 * prog_src->Index;
-         load->const_index[1] = 1;
+         load->variables[0] = nir_deref_var_create(load, c->parameters);
+         nir_deref_array *deref_arr =
+            nir_deref_array_create(load->variables[0]);
+         deref_arr->deref.type = glsl_vec4_type();
+         load->variables[0]->deref.child = &deref_arr->deref;
 
          if (prog_src->RelAddr) {
-            nir_ssa_def *reladdr = ptn_addr_reg_value(c);
+            deref_arr->deref_array_type = nir_deref_array_type_indirect;
+
+            nir_alu_src addr_src = { NIR_SRC_INIT };
+            addr_src.src = nir_src_for_reg(c->addr_reg);
+            nir_ssa_def *reladdr = nir_imov_alu(b, addr_src, 1);
+
             if (prog_src->Index < 0) {
                /* This is a negative offset which should be added to the address
                 * register's value.
                 */
-               reladdr = nir_iadd(b, reladdr, nir_imm_int(b, load->const_index[0]));
-               load->const_index[0] = 0;
+               reladdr = nir_iadd(b, reladdr, nir_imm_int(b, prog_src->Index));
+
+               deref_arr->base_offset = 0;
+            } else {
+               deref_arr->base_offset = prog_src->Index;
             }
-            load->src[0] = nir_src_for_ssa(reladdr);
+            deref_arr->indirect = nir_src_for_ssa(reladdr);
+         } else {
+            deref_arr->deref_array_type = nir_deref_array_type_direct;
+            deref_arr->base_offset = prog_src->Index;
          }
 
          nir_instr_insert_after_cf_list(b->cf_node_list, &load->instr);
@@ -700,7 +697,7 @@ static const nir_op op_trans[MAX_OPCODE] = {
    [OPCODE_ADD] = nir_op_fadd,
    [OPCODE_ARL] = 0,
    [OPCODE_CMP] = 0,
-   [OPCODE_COS] = nir_op_fcos,
+   [OPCODE_COS] = 0,
    [OPCODE_DDX] = nir_op_fddx,
    [OPCODE_DDY] = nir_op_fddy,
    [OPCODE_DP2] = 0,
@@ -709,11 +706,11 @@ static const nir_op op_trans[MAX_OPCODE] = {
    [OPCODE_DPH] = 0,
    [OPCODE_DST] = 0,
    [OPCODE_END] = 0,
-   [OPCODE_EX2] = nir_op_fexp2,
+   [OPCODE_EX2] = 0,
    [OPCODE_EXP] = 0,
    [OPCODE_FLR] = nir_op_ffloor,
    [OPCODE_FRC] = nir_op_ffract,
-   [OPCODE_LG2] = nir_op_flog2,
+   [OPCODE_LG2] = 0,
    [OPCODE_LIT] = 0,
    [OPCODE_LOG] = 0,
    [OPCODE_LRP] = 0,
@@ -722,15 +719,15 @@ static const nir_op op_trans[MAX_OPCODE] = {
    [OPCODE_MIN] = nir_op_fmin,
    [OPCODE_MOV] = nir_op_fmov,
    [OPCODE_MUL] = nir_op_fmul,
-   [OPCODE_POW] = nir_op_fpow,
-   [OPCODE_RCP] = nir_op_frcp,
+   [OPCODE_POW] = 0,
+   [OPCODE_RCP] = 0,
 
-   [OPCODE_RSQ] = nir_op_frsq,
+   [OPCODE_RSQ] = 0,
    [OPCODE_SCS] = 0,
    [OPCODE_SEQ] = 0,
    [OPCODE_SGE] = 0,
    [OPCODE_SGT] = 0,
-   [OPCODE_SIN] = nir_op_fsin,
+   [OPCODE_SIN] = 0,
    [OPCODE_SLE] = 0,
    [OPCODE_SLT] = 0,
    [OPCODE_SNE] = 0,
@@ -767,7 +764,8 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
 
    switch (op) {
    case OPCODE_RSQ:
-      ptn_move_dest(b, dest, nir_frsq(b, ptn_channel(b, src[0], X)));
+      ptn_move_dest(b, dest,
+                    nir_frsq(b, nir_fabs(b, ptn_channel(b, src[0], X))));
       break;
 
    case OPCODE_RCP:
@@ -894,7 +892,7 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
       break;
 
    default:
-      if (op_trans[op] != 0 || op == OPCODE_MOV) {
+      if (op_trans[op] != 0) {
          ptn_alu(b, op_trans[op], dest, src);
       } else {
          fprintf(stderr, "unknown opcode: %s\n", _mesa_opcode_string(op));
@@ -903,8 +901,8 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
       break;
    }
 
-   if (prog_inst->SaturateMode) {
-      assert(prog_inst->SaturateMode == SATURATE_ZERO_ONE);
+   if (prog_inst->Saturate) {
+      /* Saturate is a plain on/off flag; there is no mode to validate. */
       assert(!dest.dest.is_ssa);
       ptn_move_dest(b, dest, nir_fsat(b, ptn_src_for_dest(c, &dest)));
    }
@@ -926,10 +924,23 @@ ptn_add_output_stores(struct ptn_compile *c)
    foreach_list_typed(nir_variable, var, node, &b->shader->outputs) {
       nir_intrinsic_instr *store =
          nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var);
-      store->num_components = 4;
+      store->num_components = glsl_get_vector_elements(var->type);
       store->variables[0] =
          nir_deref_var_create(store, c->output_vars[var->data.location]);
-      store->src[0].reg.reg = c->output_regs[var->data.location];
+
+      if (c->prog->Target == GL_FRAGMENT_PROGRAM_ARB &&
+          var->data.location == FRAG_RESULT_DEPTH) {
+         /* result.depth has this strange convention of being the .z component of
+          * a vec4 with undefined .xyw components.  We resolve it to a scalar, to
+          * match GLSL's gl_FragDepth and the expectations of most backends.
+          */
+         nir_alu_src alu_src = { NIR_SRC_INIT };
+         alu_src.src = nir_src_for_reg(c->output_regs[FRAG_RESULT_DEPTH]);
+         alu_src.swizzle[0] = SWIZZLE_Z;
+         store->src[0] = nir_src_for_ssa(nir_fmov_alu(b, alu_src, 1));
+      } else {
+         store->src[0].reg.reg = c->output_regs[var->data.location];
+      }
       nir_instr_insert_after_cf_list(c->build.cf_node_list, &store->instr);
    }
 }
@@ -1022,7 +1033,10 @@ setup_registers_and_variables(struct ptn_compile *c)
       reg->num_components = 4;
 
       nir_variable *var = rzalloc(shader, nir_variable);
-      var->type = glsl_vec4_type();
+      if (c->prog->Target == GL_FRAGMENT_PROGRAM_ARB && i == FRAG_RESULT_DEPTH)
+         var->type = glsl_float_type();
+      else
+         var->type = glsl_vec4_type();
       var->data.mode = nir_var_shader_out;
       var->name = ralloc_asprintf(var, "out_%d", i);
 
@@ -1057,13 +1071,11 @@ setup_registers_and_variables(struct ptn_compile *c)
    }
    reg->num_components = 1;
    c->addr_reg = reg;
-
-   /* Set the number of uniforms */
-   shader->num_uniforms = 4 * c->prog->Parameters->NumParameters;
 }
 
 struct nir_shader *
-prog_to_nir(const struct gl_program *prog, const nir_shader_compiler_options *options)
+prog_to_nir(const struct gl_program *prog,
+            const nir_shader_compiler_options *options)
 {
    struct ptn_compile *c;
    struct nir_shader *s;
@@ -1076,6 +1088,14 @@ prog_to_nir(const struct gl_program *prog, const nir_shader_compiler_options *op
       goto fail;
    c->prog = prog;
 
+   c->parameters = rzalloc(s, nir_variable);
+   c->parameters->type = glsl_array_type(glsl_vec4_type(),
+                                            prog->Parameters->NumParameters);
+   c->parameters->name = "parameters";
+   c->parameters->data.read_only = true;
+   c->parameters->data.mode = nir_var_uniform;
+   exec_list_push_tail(&s->uniforms, &c->parameters->node);
+
    nir_function *func = nir_function_create(s, "main");
    nir_function_overload *overload = nir_function_overload_create(func);
    nir_function_impl *impl = nir_function_impl_create(overload);
diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c
index fb61f4d..c13e61b 100644
--- a/src/mesa/program/program.c
+++ b/src/mesa/program/program.c
@@ -97,13 +97,6 @@ _mesa_init_program(struct gl_context *ctx)
    assert(ctx->FragmentProgram.Current);
    ctx->FragmentProgram.Cache = _mesa_new_program_cache();
 
-   ctx->GeometryProgram.Enabled = GL_FALSE;
-   /* right now by default we don't have a geometry program */
-   _mesa_reference_geomprog(ctx, &ctx->GeometryProgram.Current,
-                            NULL);
-
-   _mesa_reference_compprog(ctx, &ctx->ComputeProgram.Current, NULL);
-
    /* XXX probably move this stuff */
    ctx->ATIFragmentShader.Enabled = GL_FALSE;
    ctx->ATIFragmentShader.Current = ctx->Shared->DefaultFragmentShader;
@@ -122,8 +115,6 @@ _mesa_free_program_data(struct gl_context *ctx)
    _mesa_delete_program_cache(ctx, ctx->VertexProgram.Cache);
    _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, NULL);
    _mesa_delete_shader_cache(ctx, ctx->FragmentProgram.Cache);
-   _mesa_reference_geomprog(ctx, &ctx->GeometryProgram.Current, NULL);
-   _mesa_reference_compprog(ctx, &ctx->ComputeProgram.Current, NULL);
 
    /* XXX probably move this stuff */
    if (ctx->ATIFragmentShader.Current) {
@@ -153,9 +144,6 @@ _mesa_update_default_objects_program(struct gl_context *ctx)
                             ctx->Shared->DefaultFragmentProgram);
    assert(ctx->FragmentProgram.Current);
 
-   _mesa_reference_geomprog(ctx, &ctx->GeometryProgram.Current,
-                      ctx->Shared->DefaultGeometryProgram);
-
    /* XXX probably move this stuff */
    if (ctx->ATIFragmentShader.Current) {
       ctx->ATIFragmentShader.Current->RefCount--;
@@ -340,7 +328,7 @@ _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id)
                                          CALLOC_STRUCT(gl_fragment_program),
                                          target, id );
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       prog = _mesa_init_geometry_program(ctx,
                                          CALLOC_STRUCT(gl_geometry_program),
                                          target, id);
@@ -426,8 +414,8 @@ _mesa_reference_program_(struct gl_context *ctx,
       else if ((*ptr)->Target == GL_FRAGMENT_PROGRAM_ARB)
          assert(prog->Target == GL_FRAGMENT_PROGRAM_ARB ||
                 prog->Target == GL_FRAGMENT_PROGRAM_NV);
-      else if ((*ptr)->Target == MESA_GEOMETRY_PROGRAM)
-         assert(prog->Target == MESA_GEOMETRY_PROGRAM);
+      else if ((*ptr)->Target == GL_GEOMETRY_PROGRAM_NV)
+         assert(prog->Target == GL_GEOMETRY_PROGRAM_NV);
    }
 #endif
 
@@ -439,7 +427,7 @@ _mesa_reference_program_(struct gl_context *ctx,
       printf("Program %p ID=%u Target=%s  Refcount-- to %d\n",
              *ptr, (*ptr)->Id,
              ((*ptr)->Target == GL_VERTEX_PROGRAM_ARB ? "VP" :
-              ((*ptr)->Target == MESA_GEOMETRY_PROGRAM ? "GP" : "FP")),
+              ((*ptr)->Target == GL_GEOMETRY_PROGRAM_NV ? "GP" : "FP")),
              (*ptr)->RefCount - 1);
 #endif
       assert((*ptr)->RefCount > 0);
@@ -464,7 +452,7 @@ _mesa_reference_program_(struct gl_context *ctx,
       printf("Program %p ID=%u Target=%s  Refcount++ to %d\n",
              prog, prog->Id,
              (prog->Target == GL_VERTEX_PROGRAM_ARB ? "VP" :
-              (prog->Target == MESA_GEOMETRY_PROGRAM ? "GP" : "FP")),
+              (prog->Target == GL_GEOMETRY_PROGRAM_NV ? "GP" : "FP")),
              prog->RefCount);
 #endif
       /*mtx_unlock(&prog->Mutex);*/
@@ -554,7 +542,7 @@ _mesa_clone_program(struct gl_context *ctx, const struct gl_program *prog)
          fpc->PixelCenterInteger = fp->PixelCenterInteger;
       }
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       {
          const struct gl_geometry_program *gp = gl_geometry_program_const(prog);
          struct gl_geometry_program *gpc = gl_geometry_program(clone);
diff --git a/src/mesa/program/program_parse.y b/src/mesa/program/program_parse.y
index 716b83d..635f5d0 100644
--- a/src/mesa/program/program_parse.y
+++ b/src/mesa/program/program_parse.y
@@ -84,7 +84,7 @@ static void asm_instruction_set_operands(struct asm_instruction *inst,
     const struct prog_dst_register *dst, const struct asm_src_register *src0,
     const struct asm_src_register *src1, const struct asm_src_register *src2);
 
-static struct asm_instruction *asm_instruction_ctor(gl_inst_opcode op,
+static struct asm_instruction *asm_instruction_ctor(enum prog_opcode op,
     const struct prog_dst_register *dst, const struct asm_src_register *src0,
     const struct asm_src_register *src1, const struct asm_src_register *src2);
 
@@ -139,7 +139,7 @@ static struct asm_instruction *asm_instruction_copy_ctor(
    gl_state_index state[STATE_LENGTH];
    int negate;
    struct asm_vector vector;
-   gl_inst_opcode opcode;
+   enum prog_opcode opcode;
 
    struct {
       unsigned swz;
@@ -2275,7 +2275,7 @@ asm_instruction_set_operands(struct asm_instruction *inst,
 
 
 struct asm_instruction *
-asm_instruction_ctor(gl_inst_opcode op,
+asm_instruction_ctor(enum prog_opcode op,
 		     const struct prog_dst_register *dst,
 		     const struct asm_src_register *src0,
 		     const struct asm_src_register *src1,
@@ -2308,7 +2308,7 @@ asm_instruction_copy_ctor(const struct prog_instruction *base,
       inst->Base.Opcode = base->Opcode;
       inst->Base.CondUpdate = base->CondUpdate;
       inst->Base.CondDst = base->CondDst;
-      inst->Base.SaturateMode = base->SaturateMode;
+      inst->Base.Saturate = base->Saturate;
       inst->Base.Precision = base->Precision;
 
       asm_instruction_set_operands(inst, dst, src0, src1, src2);
diff --git a/src/mesa/program/program_parse_extra.c b/src/mesa/program/program_parse_extra.c
index a9e3640..32b54af 100644
--- a/src/mesa/program/program_parse_extra.c
+++ b/src/mesa/program/program_parse_extra.c
@@ -40,7 +40,7 @@ _mesa_parse_instruction_suffix(const struct asm_parser_state *state,
 {
    inst->CondUpdate = 0;
    inst->CondDst = 0;
-   inst->SaturateMode = SATURATE_OFF;
+   inst->Saturate = GL_FALSE;
    inst->Precision = FLOAT32;
 
 
@@ -82,7 +82,7 @@ _mesa_parse_instruction_suffix(const struct asm_parser_state *state,
     */
    if (state->mode == ARB_fragment) {
       if (strcmp(suffix, "_SAT") == 0) {
-	 inst->SaturateMode = SATURATE_ZERO_ONE;
+	 inst->Saturate = GL_TRUE;
 	 suffix += 4;
       }
    }
diff --git a/src/mesa/program/programopt.c b/src/mesa/program/programopt.c
index e82c68a..af78150 100644
--- a/src/mesa/program/programopt.c
+++ b/src/mesa/program/programopt.c
@@ -305,7 +305,7 @@ _mesa_append_fog_code(struct gl_context *ctx,
          /* change the instruction to write to colorTemp w/ clamping */
          inst->DstReg.File = PROGRAM_TEMPORARY;
          inst->DstReg.Index = colorTemp;
-         inst->SaturateMode = saturate;
+         inst->Saturate = saturate;
          /* don't break (may be several writes to result.color) */
       }
       inst++;
@@ -331,7 +331,7 @@ _mesa_append_fog_code(struct gl_context *ctx,
       inst->SrcReg[2].File = PROGRAM_STATE_VAR;
       inst->SrcReg[2].Index = fogPRefOpt;
       inst->SrcReg[2].Swizzle = SWIZZLE_YYYY;
-      inst->SaturateMode = SATURATE_ZERO_ONE;
+      inst->Saturate = GL_TRUE;
       inst++;
    }
    else {
@@ -374,7 +374,7 @@ _mesa_append_fog_code(struct gl_context *ctx,
       inst->SrcReg[0].Index = fogFactorTemp;
       inst->SrcReg[0].Negate = NEGATE_XYZW;
       inst->SrcReg[0].Swizzle = SWIZZLE_XXXX;
-      inst->SaturateMode = SATURATE_ZERO_ONE;
+      inst->Saturate = GL_TRUE;
       inst++;
    }
    /* LRP result.color.xyz, fogFactorTemp.xxxx, colorTemp, fogColorRef; */
diff --git a/src/mesa/state_tracker/st_atom_framebuffer.c b/src/mesa/state_tracker/st_atom_framebuffer.c
index b195c55..ae883a2 100644
--- a/src/mesa/state_tracker/st_atom_framebuffer.c
+++ b/src/mesa/state_tracker/st_atom_framebuffer.c
@@ -134,7 +134,10 @@ update_framebuffer_state( struct st_context *st )
    else {
       strb = st_renderbuffer(fb->Attachment[BUFFER_STENCIL].Renderbuffer);
       if (strb) {
-         assert(strb->surface);
+         if (strb->is_rtt) {
+            /* rendering to a GL texture, may have to update surface */
+            st_update_renderbuffer_surface(st, strb);
+         }
          pipe_surface_reference(&framebuffer->zsbuf, strb->surface);
          update_framebuffer_size(framebuffer, strb->surface);
       }
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 629f54f..ad8d262 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -189,7 +189,7 @@ update_gp( struct st_context *st )
    }
 
    stgp = st_geometry_program(st->ctx->GeometryProgram._Current);
-   assert(stgp->Base.Base.Target == MESA_GEOMETRY_PROGRAM);
+   assert(stgp->Base.Base.Target == GL_GEOMETRY_PROGRAM_NV);
 
    memset(&key, 0, sizeof(key));
    key.st = st;
diff --git a/src/mesa/state_tracker/st_cb_bitmap.c b/src/mesa/state_tracker/st_cb_bitmap.c
index 2107ab1..c881e19 100644
--- a/src/mesa/state_tracker/st_cb_bitmap.c
+++ b/src/mesa/state_tracker/st_cb_bitmap.c
@@ -452,6 +452,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_save_fragment_shader(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_shader(cso);
+   cso_save_tessctrl_shader(cso);
+   cso_save_tesseval_shader(cso);
    cso_save_geometry_shader(cso);
    cso_save_vertex_elements(cso);
    cso_save_aux_vertex_buffer_slot(cso);
@@ -466,7 +468,9 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* vertex shader state: position + texcoord pass-through */
    cso_set_vertex_shader_handle(cso, st->bitmap.vs);
 
-   /* geometry shader state: disabled */
+   /* disable other shaders */
+   cso_set_tessctrl_shader_handle(cso, NULL);
+   cso_set_tesseval_shader_handle(cso, NULL);
    cso_set_geometry_shader_handle(cso, NULL);
 
    /* user samplers, plus our bitmap sampler */
@@ -536,6 +540,8 @@ draw_bitmap_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_restore_viewport(cso);
    cso_restore_fragment_shader(cso);
    cso_restore_vertex_shader(cso);
+   cso_restore_tessctrl_shader(cso);
+   cso_restore_tesseval_shader(cso);
    cso_restore_geometry_shader(cso);
    cso_restore_vertex_elements(cso);
    cso_restore_aux_vertex_buffer_slot(cso);
diff --git a/src/mesa/state_tracker/st_cb_blit.c b/src/mesa/state_tracker/st_cb_blit.c
index bbaedd1..6d93718 100644
--- a/src/mesa/state_tracker/st_cb_blit.c
+++ b/src/mesa/state_tracker/st_cb_blit.c
@@ -36,6 +36,7 @@
 
 #include "st_context.h"
 #include "st_texture.h"
+#include "st_cb_bitmap.h"
 #include "st_cb_blit.h"
 #include "st_cb_fbo.h"
 #include "st_atom.h"
@@ -93,6 +94,9 @@ st_BlitFramebuffer(struct gl_context *ctx,
 
    st_validate_state(st);
 
+   /* Make sure bitmap rendering has landed in the framebuffers */
+   st_flush_bitmap_cache(st);
+
    clip.srcX0 = srcX0;
    clip.srcY0 = srcY0;
    clip.srcX1 = srcX1;
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index f10e906..137fac8 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -265,6 +265,8 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
    cso_save_fragment_shader(st->cso_context);
    cso_save_stream_outputs(st->cso_context);
    cso_save_vertex_shader(st->cso_context);
+   cso_save_tessctrl_shader(st->cso_context);
+   cso_save_tesseval_shader(st->cso_context);
    cso_save_geometry_shader(st->cso_context);
    cso_save_vertex_elements(st->cso_context);
    cso_save_aux_vertex_buffer_slot(st->cso_context);
@@ -347,6 +349,8 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
    }
 
    set_fragment_shader(st);
+   cso_set_tessctrl_shader_handle(st->cso_context, NULL);
+   cso_set_tesseval_shader_handle(st->cso_context, NULL);
 
    if (num_layers > 1)
       set_vertex_shader_layered(st);
@@ -371,6 +375,8 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
    cso_restore_viewport(st->cso_context);
    cso_restore_fragment_shader(st->cso_context);
    cso_restore_vertex_shader(st->cso_context);
+   cso_restore_tessctrl_shader(st->cso_context);
+   cso_restore_tesseval_shader(st->cso_context);
    cso_restore_geometry_shader(st->cso_context);
    cso_restore_vertex_elements(st->cso_context);
    cso_restore_aux_vertex_buffer_slot(st->cso_context);
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 3edf31b..a6a98c8 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -693,6 +693,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_save_fragment_shader(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_shader(cso);
+   cso_save_tessctrl_shader(cso);
+   cso_save_tesseval_shader(cso);
    cso_save_geometry_shader(cso);
    cso_save_vertex_elements(cso);
    cso_save_aux_vertex_buffer_slot(cso);
@@ -746,7 +748,9 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    /* vertex shader state: position + texcoord pass-through */
    cso_set_vertex_shader_handle(cso, driver_vp);
 
-   /* geometry shader state: disabled */
+   /* disable other shaders */
+   cso_set_tessctrl_shader_handle(cso, NULL);
+   cso_set_tesseval_shader_handle(cso, NULL);
    cso_set_geometry_shader_handle(cso, NULL);
 
    /* texture sampling state: */
@@ -816,6 +820,8 @@ draw_textured_quad(struct gl_context *ctx, GLint x, GLint y, GLfloat z,
    cso_restore_sampler_views(cso, PIPE_SHADER_FRAGMENT);
    cso_restore_fragment_shader(cso);
    cso_restore_vertex_shader(cso);
+   cso_restore_tessctrl_shader(cso);
+   cso_restore_tesseval_shader(cso);
    cso_restore_geometry_shader(cso);
    cso_restore_vertex_elements(cso);
    cso_restore_aux_vertex_buffer_slot(cso);
diff --git a/src/mesa/state_tracker/st_cb_drawtex.c b/src/mesa/state_tracker/st_cb_drawtex.c
index 1420b96..2af4f6d 100644
--- a/src/mesa/state_tracker/st_cb_drawtex.c
+++ b/src/mesa/state_tracker/st_cb_drawtex.c
@@ -229,6 +229,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
    cso_save_viewport(cso);
    cso_save_stream_outputs(cso);
    cso_save_vertex_shader(cso);
+   cso_save_tessctrl_shader(cso);
+   cso_save_tesseval_shader(cso);
    cso_save_geometry_shader(cso);
    cso_save_vertex_elements(cso);
    cso_save_aux_vertex_buffer_slot(cso);
@@ -238,6 +240,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
                                semantic_names, semantic_indexes);
       cso_set_vertex_shader_handle(cso, vs);
    }
+   cso_set_tessctrl_shader_handle(cso, NULL);
+   cso_set_tesseval_shader_handle(cso, NULL);
    cso_set_geometry_shader_handle(cso, NULL);
 
    for (i = 0; i < numAttribs; i++) {
@@ -279,6 +283,8 @@ st_DrawTex(struct gl_context *ctx, GLfloat x, GLfloat y, GLfloat z,
    /* restore state */
    cso_restore_viewport(cso);
    cso_restore_vertex_shader(cso);
+   cso_restore_tessctrl_shader(cso);
+   cso_restore_tesseval_shader(cso);
    cso_restore_geometry_shader(cso);
    cso_restore_vertex_elements(cso);
    cso_restore_aux_vertex_buffer_slot(cso);
diff --git a/src/mesa/state_tracker/st_cb_fbo.c b/src/mesa/state_tracker/st_cb_fbo.c
index 296ea1e..0399eef 100644
--- a/src/mesa/state_tracker/st_cb_fbo.c
+++ b/src/mesa/state_tracker/st_cb_fbo.c
@@ -842,7 +842,7 @@ void st_init_fbo_functions(struct dd_function_table *functions)
    functions->NewFramebuffer = st_new_framebuffer;
    functions->NewRenderbuffer = st_new_renderbuffer;
    functions->BindFramebuffer = st_bind_framebuffer;
-   functions->FramebufferRenderbuffer = _mesa_framebuffer_renderbuffer;
+   functions->FramebufferRenderbuffer = _mesa_FramebufferRenderbuffer_sw;
    functions->RenderTexture = st_render_texture;
    functions->FinishRenderTexture = st_finish_render_texture;
    functions->ValidateFramebuffer = st_validate_framebuffer;
diff --git a/src/mesa/state_tracker/st_cb_flush.c b/src/mesa/state_tracker/st_cb_flush.c
index ca51eee..82affd2 100644
--- a/src/mesa/state_tracker/st_cb_flush.c
+++ b/src/mesa/state_tracker/st_cb_flush.c
@@ -141,11 +141,44 @@ static void st_glFinish(struct gl_context *ctx)
 }
 
 
-void st_init_flush_functions(struct dd_function_table *functions)
+/**
+ * Query GPU resets observed by this context and report them as a GL enum.
+ * Only installed when the screen has PIPE_CAP_DEVICE_RESET_STATUS_QUERY.
+ * Called via \c dd_function_table::GetGraphicsResetStatus.
+ */
+static GLenum
+st_get_graphics_reset_status(struct gl_context *ctx)
+{
+   struct st_context *st = st_context(ctx);
+   enum pipe_reset_status status;
+
+   status = st->pipe->get_device_reset_status(st->pipe);
+   /* Map the pipe status to its GL counterpart; no reset is GL_NO_ERROR. */
+   switch (status) {
+   case PIPE_NO_RESET:
+      return GL_NO_ERROR;
+   case PIPE_GUILTY_CONTEXT_RESET:
+      return GL_GUILTY_CONTEXT_RESET_ARB;
+   case PIPE_INNOCENT_CONTEXT_RESET:
+      return GL_INNOCENT_CONTEXT_RESET_ARB;
+   case PIPE_UNKNOWN_CONTEXT_RESET:
+      return GL_UNKNOWN_CONTEXT_RESET_ARB;
+   default:
+      assert(0); /* status value not covered by the cases above */
+      return GL_NO_ERROR;
+   }
+}
+
+
+void st_init_flush_functions(struct pipe_screen *screen,
+                             struct dd_function_table *functions)
 {
    functions->Flush = st_glFlush;
    functions->Finish = st_glFinish;
 
+   if (screen->get_param(screen, PIPE_CAP_DEVICE_RESET_STATUS_QUERY))
+      functions->GetGraphicsResetStatus = st_get_graphics_reset_status;
+
    /* Windows opengl32.dll calls glFinish prior to every swapbuffers.
     * This is unnecessary and degrades performance.  Luckily we have some
     * scope to work around this, as the externally-visible behaviour of
diff --git a/src/mesa/state_tracker/st_cb_flush.h b/src/mesa/state_tracker/st_cb_flush.h
index 84ffc63..f92dcd5 100644
--- a/src/mesa/state_tracker/st_cb_flush.h
+++ b/src/mesa/state_tracker/st_cb_flush.h
@@ -37,7 +37,8 @@ struct pipe_fence_handle;
 struct st_context;
 
 extern void
-st_init_flush_functions(struct dd_function_table *functions);
+st_init_flush_functions(struct pipe_screen *screen,
+                        struct dd_function_table *functions);
 
 extern void
 st_flush(struct st_context *st,
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index c382d7d..6aa7d57 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -65,7 +65,7 @@ st_bind_program(struct gl_context *ctx, GLenum target, struct gl_program *prog)
    case GL_FRAGMENT_PROGRAM_ARB:
       st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       st->dirty.st |= ST_NEW_GEOMETRY_PROGRAM;
       break;
    }
@@ -105,7 +105,7 @@ st_new_program(struct gl_context *ctx, GLenum target, GLuint id)
       return _mesa_init_fragment_program(ctx, &prog->Base, target, id);
    }
 
-   case MESA_GEOMETRY_PROGRAM: {
+   case GL_GEOMETRY_PROGRAM_NV: {
       struct st_geometry_program *prog = ST_CALLOC_STRUCT(st_geometry_program);
       return _mesa_init_geometry_program(ctx, &prog->Base, target, id);
    }
@@ -135,7 +135,7 @@ st_delete_program(struct gl_context *ctx, struct gl_program *prog)
             free_glsl_to_tgsi_visitor(stvp->glsl_to_tgsi);
       }
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       {
          struct st_geometry_program *stgp =
             (struct st_geometry_program *) prog;
@@ -198,7 +198,7 @@ st_program_string_notify( struct gl_context *ctx,
       if (st->fp == stfp)
 	 st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
    }
-   else if (target == MESA_GEOMETRY_PROGRAM) {
+   else if (target == GL_GEOMETRY_PROGRAM_NV) {
       struct st_geometry_program *stgp = (struct st_geometry_program *) prog;
 
       st_release_gp_variants(st, stgp);
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index bfb9c84..ed9ed0f 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -321,7 +321,7 @@ struct st_context *st_create_context(gl_api api, struct pipe_context *pipe,
    struct st_context *st;
 
    memset(&funcs, 0, sizeof(funcs));
-   st_init_driver_functions(&funcs);
+   st_init_driver_functions(pipe->screen, &funcs);
 
    ctx = _mesa_create_context(api, visual, shareCtx, &funcs);
    if (!ctx) {
@@ -376,12 +376,6 @@ void st_destroy_context( struct st_context *st )
    }
    pipe_surface_reference(&st->state.framebuffer.zsbuf, NULL);
 
-   pipe->set_index_buffer(pipe, NULL);
-
-   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
-      pipe->set_constant_buffer(pipe, i, 0, NULL);
-   }
-
    _mesa_delete_program_cache(st->ctx, st->pixel_xfer.cache);
 
    _vbo_DestroyContext(st->ctx);
@@ -401,7 +395,8 @@ void st_destroy_context( struct st_context *st )
 }
 
 
-void st_init_driver_functions(struct dd_function_table *functions)
+void st_init_driver_functions(struct pipe_screen *screen,
+                              struct dd_function_table *functions)
 {
    _mesa_init_shader_object_functions(functions);
    _mesa_init_sampler_object_functions(functions);
@@ -429,7 +424,7 @@ void st_init_driver_functions(struct dd_function_table *functions)
    st_init_readpixels_functions(functions);
    st_init_texture_functions(functions);
    st_init_texture_barrier_functions(functions);
-   st_init_flush_functions(functions);
+   st_init_flush_functions(screen, functions);
    st_init_string_functions(functions);
    st_init_viewport_functions(functions);
 
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index 8a9504b..dac5a4b 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -237,7 +237,8 @@ struct st_framebuffer
 };
 
 
-extern void st_init_driver_functions(struct dd_function_table *functions);
+extern void st_init_driver_functions(struct pipe_screen *screen,
+                                     struct dd_function_table *functions);
 
 void st_invalidate_state(struct gl_context * ctx, GLuint new_state);
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 488f6ea..8b43582 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -141,7 +141,7 @@ check_uniforms(struct gl_context *ctx)
       if (shProg[j] == NULL || !shProg[j]->LinkStatus)
 	 continue;
 
-      for (i = 0; i < shProg[j]->NumUserUniformStorage; i++) {
+      for (i = 0; i < shProg[j]->NumUniformStorage; i++) {
          const struct gl_uniform_storage *u = &shProg[j]->UniformStorage[i];
          if (!u->initialized) {
             _mesa_warning(ctx,
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 1fea860..25e30c7 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -57,11 +57,6 @@
                            (1 << PROGRAM_CONSTANT) |     \
                            (1 << PROGRAM_UNIFORM))
 
-/**
- * Maximum number of arrays
- */
-#define MAX_ARRAYS        256
-
 #define MAX_GLSL_TEXTURE_OFFSET 4
 
 class st_src_reg;
@@ -89,6 +84,7 @@ public:
       this->reladdr2 = NULL;
       this->has_index2 = false;
       this->double_reg2 = false;
+      this->array_id = 0;
    }
 
    st_src_reg(gl_register_file file, int index, int type)
@@ -103,6 +99,7 @@ public:
       this->reladdr2 = NULL;
       this->has_index2 = false;
       this->double_reg2 = false;
+      this->array_id = 0;
    }
 
    st_src_reg(gl_register_file file, int index, int type, int index2D)
@@ -117,6 +114,7 @@ public:
       this->reladdr2 = NULL;
       this->has_index2 = false;
       this->double_reg2 = false;
+      this->array_id = 0;
    }
 
    st_src_reg()
@@ -131,6 +129,7 @@ public:
       this->reladdr2 = NULL;
       this->has_index2 = false;
       this->double_reg2 = false;
+      this->array_id = 0;
    }
 
    explicit st_src_reg(st_dst_reg reg);
@@ -150,6 +149,7 @@ public:
     * currently used for input mapping only.
     */
    bool double_reg2;
+   unsigned array_id;
 };
 
 class st_dst_reg {
@@ -162,6 +162,7 @@ public:
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
       this->type = type;
+      this->array_id = 0;
    }
 
    st_dst_reg(gl_register_file file, int writemask, int type)
@@ -172,6 +173,7 @@ public:
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
       this->type = type;
+      this->array_id = 0;
    }
 
    st_dst_reg()
@@ -182,6 +184,7 @@ public:
       this->writemask = 0;
       this->cond_mask = COND_TR;
       this->reladdr = NULL;
+      this->array_id = 0;
    }
 
    explicit st_dst_reg(st_src_reg reg);
@@ -193,6 +196,7 @@ public:
    int type; /** GLSL_TYPE_* from GLSL IR (enum glsl_base_type) */
    /** Register index should be offset by the integer in this reg. */
    st_src_reg *reladdr;
+   unsigned array_id;
 };
 
 st_src_reg::st_src_reg(st_dst_reg reg)
@@ -207,6 +211,7 @@ st_src_reg::st_src_reg(st_dst_reg reg)
    this->reladdr2 = NULL;
    this->has_index2 = false;
    this->double_reg2 = false;
+   this->array_id = reg.array_id;
 }
 
 st_dst_reg::st_dst_reg(st_src_reg reg)
@@ -217,6 +222,7 @@ st_dst_reg::st_dst_reg(st_src_reg reg)
    this->writemask = WRITEMASK_XYZW;
    this->cond_mask = COND_TR;
    this->reladdr = reg.reladdr;
+   this->array_id = reg.array_id;
 }
 
 class glsl_to_tgsi_instruction : public exec_node {
@@ -233,6 +239,7 @@ public:
    st_src_reg sampler; /**< sampler register */
    int sampler_array_size; /**< 1-based size of sampler array, 1 if not array */
    int tex_target; /**< One of TEXTURE_*_INDEX */
+   glsl_base_type tex_type; /**< set to GLSL_TYPE_FLOAT by emit_asm() */
    GLboolean tex_shadow;
 
    st_src_reg tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
@@ -244,8 +251,9 @@ public:
 
 class variable_storage : public exec_node {
 public:
-   variable_storage(ir_variable *var, gl_register_file file, int index)
-      : file(file), index(index), var(var)
+   variable_storage(ir_variable *var, gl_register_file file, int index,
+                    unsigned array_id = 0)
+      : file(file), index(index), var(var), array_id(array_id)
    {
       /* empty */
    }
@@ -253,6 +261,7 @@ public:
    gl_register_file file;
    int index;
    ir_variable *var; /* variable that maps to this, if any */
+   unsigned array_id;
 };
 
 class immediate_storage : public exec_node {
@@ -302,6 +311,15 @@ public:
    st_src_reg return_reg;
 };
 
+static st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
+static st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
+
+struct array_decl {     /* element type of input_arrays[] / output_arrays[] */
+   unsigned mesa_index; /* presumably the array's base Mesa index -- TODO confirm */
+   unsigned array_id;   /* presumably pairs with st_*_reg::array_id -- TODO confirm */
+   unsigned array_size; /* presumably the array's slot count -- TODO confirm */
+};
+
 struct glsl_to_tgsi_visitor : public ir_visitor {
 public:
    glsl_to_tgsi_visitor();
@@ -317,11 +335,19 @@ public:
 
    int next_temp;
 
-   unsigned array_sizes[MAX_ARRAYS];
+   unsigned *array_sizes;    /**< realloc()ed in get_temp(), +32 per grow */
+   unsigned max_num_arrays;  /**< allocated element count of array_sizes */
    unsigned next_array;
 
+   struct array_decl input_arrays[PIPE_MAX_SHADER_INPUTS];
+   unsigned num_input_arrays;
+   struct array_decl output_arrays[PIPE_MAX_SHADER_OUTPUTS];
+   unsigned num_output_arrays;
+
    int num_address_regs;
    int samplers_used;
+   glsl_base_type sampler_types[PIPE_MAX_SAMPLERS];
+   int sampler_targets[PIPE_MAX_SAMPLERS];   /**< One of TGSI_TEXTURE_* */
    bool indirect_addr_consts;
    int wpos_transform_const;
 
@@ -372,6 +398,7 @@ public:
    virtual void visit(ir_if *);
    virtual void visit(ir_emit_vertex *);
    virtual void visit(ir_end_primitive *);
+   virtual void visit(ir_barrier *);
    /*@}*/
 
    st_src_reg result;
@@ -390,31 +417,19 @@ public:
    /** List of glsl_to_tgsi_instruction */
    exec_list instructions;
 
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst, st_src_reg src0);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst, st_dst_reg dst1,
-                                  st_src_reg src0);
+   glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, unsigned op,
+                                      st_dst_reg dst = undef_dst,
+                                      st_src_reg src0 = undef_src,
+                                      st_src_reg src1 = undef_src,
+                                      st_src_reg src2 = undef_src,
+                                      st_src_reg src3 = undef_src);
 
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst, st_src_reg src0, st_src_reg src1);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst,
-                                  st_src_reg src0, st_src_reg src1, st_src_reg src2);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst,
-                                  st_src_reg src0, st_src_reg src1,
-                                  st_src_reg src2, st_src_reg src3);
-
-   glsl_to_tgsi_instruction *emit(ir_instruction *ir, unsigned op,
-                                  st_dst_reg dst, st_dst_reg dst1,
-                                  st_src_reg src0, st_src_reg src1,
-                                  st_src_reg src2, st_src_reg src3);
+   glsl_to_tgsi_instruction *emit_asm(ir_instruction *ir, unsigned op,
+                                      st_dst_reg dst, st_dst_reg dst1,
+                                      st_src_reg src0 = undef_src,
+                                      st_src_reg src1 = undef_src,
+                                      st_src_reg src2 = undef_src,
+                                      st_src_reg src3 = undef_src);
 
    unsigned get_opcode(ir_instruction *ir, unsigned op,
                     st_dst_reg dst,
@@ -468,10 +483,6 @@ public:
    void *mem_ctx;
 };
 
-static st_src_reg undef_src = st_src_reg(PROGRAM_UNDEFINED, 0, GLSL_TYPE_ERROR);
-
-static st_dst_reg undef_dst = st_dst_reg(PROGRAM_UNDEFINED, SWIZZLE_NOOP, GLSL_TYPE_ERROR);
-
 static st_dst_reg address_reg = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 0);
 static st_dst_reg address_reg2 = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 1);
 static st_dst_reg sampler_reladdr = st_dst_reg(PROGRAM_ADDRESS, WRITEMASK_X, GLSL_TYPE_FLOAT, 2);
@@ -526,10 +537,10 @@ num_inst_src_regs(unsigned opcode)
 }
 
 glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_dst_reg dst1,
-                           st_src_reg src0, st_src_reg src1,
-                           st_src_reg src2, st_src_reg src3)
+glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
+                               st_dst_reg dst, st_dst_reg dst1,
+                               st_src_reg src0, st_src_reg src1,
+                               st_src_reg src2, st_src_reg src3)
 {
    glsl_to_tgsi_instruction *inst = new(mem_ctx) glsl_to_tgsi_instruction();
    int num_reladdr = 0, i, j;
@@ -571,6 +582,10 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
    inst->src[3] = src3;
    inst->ir = ir;
    inst->dead_mask = 0;
+   /* Default to float for paths that never set tex_type: the zero value
+    * would be GLSL_TYPE_UINT (0 == UINT), which is likely wrong.
+    */
+   inst->tex_type = GLSL_TYPE_FLOAT;
 
    inst->function = NULL;
 
@@ -716,48 +731,12 @@ glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
 }
 
 glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst,
-                           st_src_reg src0, st_src_reg src1,
-                           st_src_reg src2, st_src_reg src3)
+glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
+                               st_dst_reg dst,
+                               st_src_reg src0, st_src_reg src1,
+                               st_src_reg src2, st_src_reg src3)
 {
-   return emit(ir, op, dst, undef_dst, src0, src1, src2, src3);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_src_reg src0,
-                           st_src_reg src1, st_src_reg src2)
-{
-   return emit(ir, op, dst, undef_dst, src0, src1, src2, undef_src);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_src_reg src0, st_src_reg src1)
-{
-   return emit(ir, op, dst, undef_dst, src0, src1, undef_src, undef_src);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_src_reg src0)
-{
-   assert(dst.writemask != 0);
-   return emit(ir, op, dst, undef_dst, src0, undef_src, undef_src, undef_src);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op,
-                           st_dst_reg dst, st_dst_reg dst1, st_src_reg src0)
-{
-   return emit(ir, op, dst, dst1, src0, undef_src, undef_src, undef_src);
-}
-
-glsl_to_tgsi_instruction *
-glsl_to_tgsi_visitor::emit(ir_instruction *ir, unsigned op)
-{
-   return emit(ir, op, undef_dst, undef_dst, undef_src, undef_src, undef_src, undef_src);
+   return emit_asm(ir, op, dst, undef_dst, src0, src1, src2, src3);
 }
 
 /**
@@ -879,7 +858,7 @@ glsl_to_tgsi_visitor::emit_dp(ir_instruction *ir,
       TGSI_OPCODE_DP2, TGSI_OPCODE_DP3, TGSI_OPCODE_DP4
    };
 
-   return emit(ir, dot_opcodes[elements - 2], dst, src0, src1);
+   return emit_asm(ir, dot_opcodes[elements - 2], dst, src0, src1);
 }
 
 /**
@@ -929,7 +908,7 @@ glsl_to_tgsi_visitor::emit_scalar(ir_instruction *ir, unsigned op,
                                    src1_swiz, src1_swiz);
 
       dst.writemask = this_mask;
-      emit(ir, op, dst, src0, src1);
+      emit_asm(ir, op, dst, src0, src1);
       done_mask |= this_mask;
    }
 }
@@ -958,7 +937,7 @@ glsl_to_tgsi_visitor::emit_arl(ir_instruction *ir,
    if (dst.index >= this->num_address_regs)
       this->num_address_regs = dst.index + 1;
 
-   emit(NULL, op, dst, src0);
+   emit_asm(NULL, op, dst, src0);
 }
 
 int
@@ -1142,6 +1121,12 @@ glsl_to_tgsi_visitor::get_temp(const glsl_type *type)
    if (!options->EmitNoIndirectTemp &&
        (type->is_array() || type->is_matrix())) {
 
+      if (next_array >= max_num_arrays) {
+         max_num_arrays += 32; /* grow capacity in steps of 32 entries */
+         array_sizes = (unsigned*)
+            realloc(array_sizes, sizeof(array_sizes[0]) * max_num_arrays);
+      } /* NOTE(review): realloc() result is unchecked before use below */
+
       src.file = PROGRAM_ARRAY;
       src.index = next_array << 16 | 0x8000;
       array_sizes[next_array] = type_size(type);
@@ -1242,7 +1227,7 @@ glsl_to_tgsi_visitor::visit(ir_variable *ir)
              */
             st_src_reg src(PROGRAM_STATE_VAR, index, GLSL_TYPE_FLOAT);
             src.swizzle = slots[i].swizzle;
-            emit(ir, TGSI_OPCODE_MOV, dst, src);
+            emit_asm(ir, TGSI_OPCODE_MOV, dst, src);
             /* even a float takes up a whole vec4 reg in a struct/array. */
             dst.index++;
          }
@@ -1261,11 +1246,11 @@ glsl_to_tgsi_visitor::visit(ir_variable *ir)
 void
 glsl_to_tgsi_visitor::visit(ir_loop *ir)
 {
-   emit(NULL, TGSI_OPCODE_BGNLOOP);
+   emit_asm(NULL, TGSI_OPCODE_BGNLOOP);
 
    visit_exec_list(&ir->body_instructions, this);
 
-   emit(NULL, TGSI_OPCODE_ENDLOOP);
+   emit_asm(NULL, TGSI_OPCODE_ENDLOOP);
 }
 
 void
@@ -1273,10 +1258,10 @@ glsl_to_tgsi_visitor::visit(ir_loop_jump *ir)
 {
    switch (ir->mode) {
    case ir_loop_jump::jump_break:
-      emit(NULL, TGSI_OPCODE_BRK);
+      emit_asm(NULL, TGSI_OPCODE_BRK);
       break;
    case ir_loop_jump::jump_continue:
-      emit(NULL, TGSI_OPCODE_CONT);
+      emit_asm(NULL, TGSI_OPCODE_CONT);
       break;
    }
 }
@@ -1330,7 +1315,7 @@ glsl_to_tgsi_visitor::try_emit_mad(ir_expression *ir, int mul_operand)
    this->result = get_temp(ir->type);
    result_dst = st_dst_reg(this->result);
    result_dst.writemask = (1 << ir->type->vector_elements) - 1;
-   emit(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
+   emit_asm(ir, TGSI_OPCODE_MAD, result_dst, a, b, c);
 
    return true;
 }
@@ -1370,7 +1355,7 @@ glsl_to_tgsi_visitor::try_emit_mad_for_and_not(ir_expression *ir, int try_operan
    b.negate = ~b.negate;
 
    this->result = get_temp(ir->type);
-   emit(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
+   emit_asm(ir, TGSI_OPCODE_MAD, st_dst_reg(this->result), a, b, a);
 
    return true;
 }
@@ -1388,7 +1373,7 @@ glsl_to_tgsi_visitor::reladdr_to_temp(ir_instruction *ir,
    if (*num_reladdr != 1) {
       st_src_reg temp = get_temp(glsl_type::vec4_type);
 
-      emit(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
+      emit_asm(ir, TGSI_OPCODE_MOV, st_dst_reg(temp), *reg);
       *reg = temp;
    }
 
@@ -1464,7 +1449,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
    switch (ir->operation) {
    case ir_unop_logic_not:
       if (result_dst.type != GLSL_TYPE_FLOAT)
-         emit(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
       else {
          /* Previously 'SEQ dst, src, 0.0' was used for this.  However, many
           * older GPUs implement SEQ using multiple instructions (i915 uses two
@@ -1472,24 +1457,24 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
           * 0.0 and 1.0, 1-x also implements !x.
           */
          op[0].negate = ~op[0].negate;
-         emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], st_src_reg_for_float(1.0));
+         emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], st_src_reg_for_float(1.0));
       }
       break;
    case ir_unop_neg:
       if (result_dst.type == GLSL_TYPE_INT || result_dst.type == GLSL_TYPE_UINT)
-         emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
       else if (result_dst.type == GLSL_TYPE_DOUBLE)
-         emit(ir, TGSI_OPCODE_DNEG, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_DNEG, result_dst, op[0]);
       else {
          op[0].negate = ~op[0].negate;
          result_src = op[0];
       }
       break;
    case ir_unop_abs:
-      emit(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_ABS, result_dst, op[0]);
       break;
    case ir_unop_sign:
-      emit(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_SSG, result_dst, op[0]);
       break;
    case ir_unop_rcp:
       emit_scalar(ir, TGSI_OPCODE_RCP, result_dst, op[0]);
@@ -1513,17 +1498,17 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       break;
    case ir_unop_saturate: {
       glsl_to_tgsi_instruction *inst;
-      inst = emit(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
+      inst = emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
       inst->saturate = true;
       break;
    }
 
    case ir_unop_dFdx:
    case ir_unop_dFdx_coarse:
-      emit(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_DDX, result_dst, op[0]);
       break;
    case ir_unop_dFdx_fine:
-      emit(ir, TGSI_OPCODE_DDX_FINE, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_DDX_FINE, result_dst, op[0]);
       break;
    case ir_unop_dFdy:
    case ir_unop_dFdy_coarse:
@@ -1547,18 +1532,18 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
 
       st_src_reg temp = get_temp(glsl_type::vec4_type);
 
-      emit(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
-      emit(ir, ir->operation == ir_unop_dFdy_fine ?
+      emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(temp), transform_y, op[0]);
+      emit_asm(ir, ir->operation == ir_unop_dFdy_fine ?
            TGSI_OPCODE_DDY_FINE : TGSI_OPCODE_DDY, result_dst, temp);
       break;
    }
 
    case ir_unop_frexp_sig:
-      emit(ir, TGSI_OPCODE_DFRACEXP, result_dst, undef_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_DFRACEXP, result_dst, undef_dst, op[0]);
       break;
 
    case ir_unop_frexp_exp:
-      emit(ir, TGSI_OPCODE_DFRACEXP, undef_dst, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_DFRACEXP, undef_dst, result_dst, op[0]);
       break;
 
    case ir_unop_noise: {
@@ -1568,50 +1553,50 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
        * place to do this is in the GL state tracker, not the poor
        * driver.
        */
-      emit(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
+      emit_asm(ir, TGSI_OPCODE_MOV, result_dst, st_src_reg_for_float(0.5));
       break;
    }
 
    case ir_binop_add:
-      emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
       break;
    case ir_binop_sub:
-      emit(ir, TGSI_OPCODE_SUB, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SUB, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_mul:
-      emit(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
       break;
    case ir_binop_div:
       if (result_dst.type == GLSL_TYPE_FLOAT || result_dst.type == GLSL_TYPE_DOUBLE)
          assert(!"not reached: should be handled by ir_div_to_mul_rcp");
       else
-         emit(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_DIV, result_dst, op[0], op[1]);
       break;
    case ir_binop_mod:
       if (result_dst.type == GLSL_TYPE_FLOAT)
          assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
       else
-         emit(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_MOD, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_less:
-      emit(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[0], op[1]);
       break;
    case ir_binop_greater:
-      emit(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
+      emit_asm(ir, TGSI_OPCODE_SLT, result_dst, op[1], op[0]);
       break;
    case ir_binop_lequal:
-      emit(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
+      emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[1], op[0]);
       break;
    case ir_binop_gequal:
-      emit(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SGE, result_dst, op[0], op[1]);
       break;
    case ir_binop_equal:
-      emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
       break;
    case ir_binop_nequal:
-      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
       break;
    case ir_binop_all_equal:
       /* "==" operator producing a scalar boolean. */
@@ -1625,7 +1610,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
             st_dst_reg temp_dst = st_dst_reg(temp);
             st_src_reg temp1 = st_src_reg(temp), temp2 = st_src_reg(temp);
 
-            emit(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
+            emit_asm(ir, TGSI_OPCODE_SEQ, st_dst_reg(temp), op[0], op[1]);
 
             /* Emit 1-3 AND operations to combine the SEQ results. */
             switch (ir->operands[0]->type->vector_elements) {
@@ -1635,24 +1620,24 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                temp_dst.writemask = WRITEMASK_Y;
                temp1.swizzle = SWIZZLE_YYYY;
                temp2.swizzle = SWIZZLE_ZZZZ;
-               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
                break;
             case 4:
                temp_dst.writemask = WRITEMASK_X;
                temp1.swizzle = SWIZZLE_XXXX;
                temp2.swizzle = SWIZZLE_YYYY;
-               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
                temp_dst.writemask = WRITEMASK_Y;
                temp1.swizzle = SWIZZLE_ZZZZ;
                temp2.swizzle = SWIZZLE_WWWW;
-               emit(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_AND, temp_dst, temp1, temp2);
             }
 
             temp1.swizzle = SWIZZLE_XXXX;
             temp2.swizzle = SWIZZLE_YYYY;
-            emit(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
+            emit_asm(ir, TGSI_OPCODE_AND, result_dst, temp1, temp2);
          } else {
-            emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
+            emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
 
             /* After the dot-product, the value will be an integer on the
              * range [0,4].  Zero becomes 1.0, and positive values become zero.
@@ -1665,10 +1650,10 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
              */
             st_src_reg sge_src = result_src;
             sge_src.negate = ~sge_src.negate;
-            emit(ir, TGSI_OPCODE_SGE, result_dst, sge_src, st_src_reg_for_float(0.0));
+            emit_asm(ir, TGSI_OPCODE_SGE, result_dst, sge_src, st_src_reg_for_float(0.0));
          }
       } else {
-         emit(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SEQ, result_dst, op[0], op[1]);
       }
       break;
    case ir_binop_any_nequal:
@@ -1678,7 +1663,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
          st_src_reg temp = get_temp(native_integers ?
                                     glsl_type::uvec4_type :
                                     glsl_type::vec4_type);
-         emit(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SNE, st_dst_reg(temp), op[0], op[1]);
 
          if (native_integers) {
             st_dst_reg temp_dst = st_dst_reg(temp);
@@ -1692,22 +1677,22 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                temp_dst.writemask = WRITEMASK_Y;
                temp1.swizzle = SWIZZLE_YYYY;
                temp2.swizzle = SWIZZLE_ZZZZ;
-               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
                break;
             case 4:
                temp_dst.writemask = WRITEMASK_X;
                temp1.swizzle = SWIZZLE_XXXX;
                temp2.swizzle = SWIZZLE_YYYY;
-               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
                temp_dst.writemask = WRITEMASK_Y;
                temp1.swizzle = SWIZZLE_ZZZZ;
                temp2.swizzle = SWIZZLE_WWWW;
-               emit(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
+               emit_asm(ir, TGSI_OPCODE_OR, temp_dst, temp1, temp2);
             }
 
             temp1.swizzle = SWIZZLE_XXXX;
             temp2.swizzle = SWIZZLE_YYYY;
-            emit(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
+            emit_asm(ir, TGSI_OPCODE_OR, result_dst, temp1, temp2);
          } else {
             /* After the dot-product, the value will be an integer on the
              * range [0,4].  Zero stays zero, and positive values become 1.0.
@@ -1726,11 +1711,11 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                 */
                st_src_reg slt_src = result_src;
                slt_src.negate = ~slt_src.negate;
-               emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+               emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
             }
          }
       } else {
-         emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
       }
       break;
 
@@ -1763,7 +1748,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                                           GET_SWZ(op0_swizzle, 3),
                                           GET_SWZ(op0_swizzle, 3),
                                           GET_SWZ(op0_swizzle, 3));
-            emit(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
+            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
             accum = st_src_reg(result_dst);
             accum.swizzle = dst_swizzle;
             /* fallthrough */
@@ -1772,7 +1757,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                                           GET_SWZ(op0_swizzle, 2),
                                           GET_SWZ(op0_swizzle, 2),
                                           GET_SWZ(op0_swizzle, 2));
-            emit(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
+            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
             accum = st_src_reg(result_dst);
             accum.swizzle = dst_swizzle;
             /* fallthrough */
@@ -1781,7 +1766,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                                           GET_SWZ(op0_swizzle, 1),
                                           GET_SWZ(op0_swizzle, 1),
                                           GET_SWZ(op0_swizzle, 1));
-            emit(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
+            emit_asm(ir, TGSI_OPCODE_OR, result_dst, accum, op[0]);
             break;
          default:
             assert(!"Unexpected vector size");
@@ -1807,11 +1792,11 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
              */
             st_src_reg slt_src = result_src;
             slt_src.negate = ~slt_src.negate;
-            emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+            emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
          }
          else {
             /* Use SNE 0 if integers are being used as boolean values. */
-            emit(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
+            emit_asm(ir, TGSI_OPCODE_SNE, result_dst, result_src, st_src_reg_for_int(0));
          }
       }
       break;
@@ -1819,9 +1804,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
 
    case ir_binop_logic_xor:
       if (native_integers)
-         emit(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
       else
-         emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_logic_or: {
@@ -1830,13 +1815,13 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
           * instruction.
           */
          assert(native_integers);
-         emit(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
       } else {
          /* After the addition, the value will be an integer on the
           * range [0,2].  Zero stays zero, and positive values become 1.0.
           */
          glsl_to_tgsi_instruction *add =
-            emit(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
+            emit_asm(ir, TGSI_OPCODE_ADD, result_dst, op[0], op[1]);
          if (this->prog->Target == GL_FRAGMENT_PROGRAM_ARB) {
             /* The clamping to [0,1] can be done for free in the fragment
              * shader with a saturate if floats are being used as boolean values.
@@ -1849,7 +1834,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
              */
             st_src_reg slt_src = result_src;
             slt_src.negate = ~slt_src.negate;
-            emit(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
+            emit_asm(ir, TGSI_OPCODE_SLT, result_dst, slt_src, st_src_reg_for_float(0.0));
          }
       }
       break;
@@ -1861,9 +1846,9 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
        * actual AND opcode.
        */
       if (native_integers)
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
       else
-         emit(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_MUL, result_dst, op[0], op[1]);
       break;
 
    case ir_binop_dot:
@@ -1879,10 +1864,10 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       } else {
          /* sqrt(x) = x * rsq(x). */
          emit_scalar(ir, TGSI_OPCODE_RSQ, result_dst, op[0]);
-         emit(ir, TGSI_OPCODE_MUL, result_dst, result_src, op[0]);
+         emit_asm(ir, TGSI_OPCODE_MUL, result_dst, result_src, op[0]);
          /* For incoming channels <= 0, set the result to 0. */
          op[0].negate = ~op[0].negate;
-         emit(ir, TGSI_OPCODE_CMP, result_dst,
+         emit_asm(ir, TGSI_OPCODE_CMP, result_dst,
               op[0], result_src, st_src_reg_for_float(0.0));
       }
       break;
@@ -1891,13 +1876,13 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       break;
    case ir_unop_i2f:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_I2F, result_dst, op[0]);
          break;
       }
       /* fallthrough to next case otherwise */
    case ir_unop_b2f:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_float(1.0));
+         emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_float(1.0));
          break;
       }
       /* fallthrough to next case otherwise */
@@ -1912,7 +1897,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
           * GLSL requires that int(bool) return 1 for true and 0 for false.
           * This conversion is done with AND, but it could be done with NEG.
           */
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_int(1));
+         emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], st_src_reg_for_int(1));
       } else {
          /* Booleans and integers are both stored as floats when native
           * integers are disabled.
@@ -1922,15 +1907,15 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       break;
    case ir_unop_f2i:
       if (native_integers)
-         emit(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_F2I, result_dst, op[0]);
       else
-         emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_f2u:
       if (native_integers)
-         emit(ir, TGSI_OPCODE_F2U, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_F2U, result_dst, op[0]);
       else
-         emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_bitcast_f2i:
       result_src = op[0];
@@ -1946,38 +1931,38 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       result_src.type = GLSL_TYPE_FLOAT;
       break;
    case ir_unop_f2b:
-      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
+      emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
       break;
    case ir_unop_d2b:
-      emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_double(0.0));
+      emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_double(0.0));
       break;
    case ir_unop_i2b:
       if (native_integers)
-         emit(ir, TGSI_OPCODE_INEG, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_USNE, result_dst, op[0], st_src_reg_for_int(0));
       else
-         emit(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
+         emit_asm(ir, TGSI_OPCODE_SNE, result_dst, op[0], st_src_reg_for_float(0.0));
       break;
    case ir_unop_trunc:
-      emit(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_ceil:
-      emit(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_CEIL, result_dst, op[0]);
       break;
    case ir_unop_floor:
-      emit(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_FLR, result_dst, op[0]);
       break;
    case ir_unop_round_even:
-      emit(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_ROUND, result_dst, op[0]);
       break;
    case ir_unop_fract:
-      emit(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_FRC, result_dst, op[0]);
       break;
 
    case ir_binop_min:
-      emit(ir, TGSI_OPCODE_MIN, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_MIN, result_dst, op[0], op[1]);
       break;
    case ir_binop_max:
-      emit(ir, TGSI_OPCODE_MAX, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_MAX, result_dst, op[0], op[1]);
       break;
    case ir_binop_pow:
       emit_scalar(ir, TGSI_OPCODE_POW, result_dst, op[0], op[1]);
@@ -1985,37 +1970,37 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
 
    case ir_unop_bit_not:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_NOT, result_dst, op[0]);
          break;
       }
    case ir_unop_u2f:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
+         emit_asm(ir, TGSI_OPCODE_U2F, result_dst, op[0]);
          break;
       }
    case ir_binop_lshift:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_SHL, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_SHL, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_rshift:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_ISHR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_ISHR, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_and:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_AND, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_xor:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_XOR, result_dst, op[0], op[1]);
          break;
       }
    case ir_binop_bit_or:
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_OR, result_dst, op[0], op[1]);
          break;
       }
 
@@ -2045,7 +2030,7 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
       }
       else {
          /* Relative/variable index into constant buffer */
-         emit(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), op[1],
+         emit_asm(ir, TGSI_OPCODE_USHR, st_dst_reg(index_reg), op[1],
               st_src_reg_for_int(4));
          cbuf.reladdr = ralloc(mem_ctx, st_src_reg);
          memcpy(cbuf.reladdr, &index_reg, sizeof(index_reg));
@@ -2078,88 +2063,88 @@ glsl_to_tgsi_visitor::visit(ir_expression *ir)
                                        const_offset % 16 / 4);
 
       if (ir->type->base_type == GLSL_TYPE_BOOL) {
-         emit(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
+         emit_asm(ir, TGSI_OPCODE_USNE, result_dst, cbuf, st_src_reg_for_int(0));
       } else {
-         emit(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
+         emit_asm(ir, TGSI_OPCODE_MOV, result_dst, cbuf);
       }
       break;
    }
    case ir_triop_lrp:
       /* note: we have to reorder the three args here */
-      emit(ir, TGSI_OPCODE_LRP, result_dst, op[2], op[1], op[0]);
+      emit_asm(ir, TGSI_OPCODE_LRP, result_dst, op[2], op[1], op[0]);
       break;
    case ir_triop_csel:
       if (this->ctx->Const.NativeIntegers)
-         emit(ir, TGSI_OPCODE_UCMP, result_dst, op[0], op[1], op[2]);
+         emit_asm(ir, TGSI_OPCODE_UCMP, result_dst, op[0], op[1], op[2]);
       else {
          op[0].negate = ~op[0].negate;
-         emit(ir, TGSI_OPCODE_CMP, result_dst, op[0], op[1], op[2]);
+         emit_asm(ir, TGSI_OPCODE_CMP, result_dst, op[0], op[1], op[2]);
       }
       break;
    case ir_triop_bitfield_extract:
-      emit(ir, TGSI_OPCODE_IBFE, result_dst, op[0], op[1], op[2]);
+      emit_asm(ir, TGSI_OPCODE_IBFE, result_dst, op[0], op[1], op[2]);
       break;
    case ir_quadop_bitfield_insert:
-      emit(ir, TGSI_OPCODE_BFI, result_dst, op[0], op[1], op[2], op[3]);
+      emit_asm(ir, TGSI_OPCODE_BFI, result_dst, op[0], op[1], op[2], op[3]);
       break;
    case ir_unop_bitfield_reverse:
-      emit(ir, TGSI_OPCODE_BREV, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_BREV, result_dst, op[0]);
       break;
    case ir_unop_bit_count:
-      emit(ir, TGSI_OPCODE_POPC, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_POPC, result_dst, op[0]);
       break;
    case ir_unop_find_msb:
-      emit(ir, TGSI_OPCODE_IMSB, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_IMSB, result_dst, op[0]);
       break;
    case ir_unop_find_lsb:
-      emit(ir, TGSI_OPCODE_LSB, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_LSB, result_dst, op[0]);
       break;
    case ir_binop_imul_high:
-      emit(ir, TGSI_OPCODE_IMUL_HI, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_IMUL_HI, result_dst, op[0], op[1]);
       break;
    case ir_triop_fma:
       /* In theory, MAD is incorrect here. */
       if (have_fma)
-         emit(ir, TGSI_OPCODE_FMA, result_dst, op[0], op[1], op[2]);
+         emit_asm(ir, TGSI_OPCODE_FMA, result_dst, op[0], op[1], op[2]);
       else
-         emit(ir, TGSI_OPCODE_MAD, result_dst, op[0], op[1], op[2]);
+         emit_asm(ir, TGSI_OPCODE_MAD, result_dst, op[0], op[1], op[2]);
       break;
    case ir_unop_interpolate_at_centroid:
-      emit(ir, TGSI_OPCODE_INTERP_CENTROID, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_INTERP_CENTROID, result_dst, op[0]);
       break;
    case ir_binop_interpolate_at_offset:
-      emit(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_INTERP_OFFSET, result_dst, op[0], op[1]);
       break;
    case ir_binop_interpolate_at_sample:
-      emit(ir, TGSI_OPCODE_INTERP_SAMPLE, result_dst, op[0], op[1]);
+      emit_asm(ir, TGSI_OPCODE_INTERP_SAMPLE, result_dst, op[0], op[1]);
       break;
 
    case ir_unop_d2f:
-      emit(ir, TGSI_OPCODE_D2F, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_D2F, result_dst, op[0]);
       break;
    case ir_unop_f2d:
-      emit(ir, TGSI_OPCODE_F2D, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_F2D, result_dst, op[0]);
       break;
    case ir_unop_d2i:
-      emit(ir, TGSI_OPCODE_D2I, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_D2I, result_dst, op[0]);
       break;
    case ir_unop_i2d:
-      emit(ir, TGSI_OPCODE_I2D, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_I2D, result_dst, op[0]);
       break;
    case ir_unop_d2u:
-      emit(ir, TGSI_OPCODE_D2U, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_D2U, result_dst, op[0]);
       break;
    case ir_unop_u2d:
-      emit(ir, TGSI_OPCODE_U2D, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_U2D, result_dst, op[0]);
       break;
    case ir_unop_unpack_double_2x32:
    case ir_unop_pack_double_2x32:
-      emit(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
+      emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
       break;
 
    case ir_binop_ldexp:
       if (ir->operands[0]->type->base_type == GLSL_TYPE_DOUBLE) {
-         emit(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
+         emit_asm(ir, TGSI_OPCODE_DLDEXP, result_dst, op[0], op[1]);
       } else {
          assert(!"Invalid ldexp for non-double opcode in glsl_to_tgsi_visitor::visit()");
       }
@@ -2243,11 +2228,38 @@ glsl_to_tgsi_visitor::visit(ir_swizzle *ir)
    this->result = src;
 }
 
+/* Test if the variable is an array. Note that geometry and
+ * tessellation shader inputs and outputs are always arrays (except
+ * for patch inputs), so only the array element type is considered.
+ * *is_2d is always set; it is true only for arrayed geometry shader inputs.
+ */
+static bool
+is_inout_array(unsigned stage, ir_variable *var, bool *is_2d)
+{
+   const glsl_type *type = var->type;
+
+   *is_2d = false; /* written before any return so callers never read garbage */
+   if ((stage == MESA_SHADER_VERTEX && var->data.mode == ir_var_shader_in) ||
+       (stage == MESA_SHADER_FRAGMENT && var->data.mode == ir_var_shader_out))
+      return false;
+
+   if (stage == MESA_SHADER_GEOMETRY && var->data.mode == ir_var_shader_in) {
+      if (!var->type->is_array())
+         return false; /* a system value probably */
+
+      type = var->type->fields.array;
+      *is_2d = true;
+   }
+
+   return type->is_array() || type->is_matrix();
+}
+
 void
 glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
 {
    variable_storage *entry = find_variable_storage(ir->var);
    ir_variable *var = ir->var;
+   bool is_2d;
 
    if (!entry) {
       switch (var->data.mode) {
@@ -2263,16 +2275,56 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
           * user-defined varyings.
           */
          assert(var->data.location != -1);
-         entry = new(mem_ctx) variable_storage(var,
-                                               PROGRAM_INPUT,
-                                               var->data.location);
+
+         if (is_inout_array(shader->Stage, var, &is_2d)) {
+            struct array_decl *decl = &input_arrays[num_input_arrays];
+
+            decl->mesa_index = var->data.location;
+            decl->array_id = num_input_arrays + 1;
+            if (is_2d)
+               decl->array_size = type_size(var->type->fields.array);
+            else
+               decl->array_size = type_size(var->type);
+            num_input_arrays++;
+
+            entry = new(mem_ctx) variable_storage(var,
+                                                  PROGRAM_INPUT,
+                                                  var->data.location,
+                                                  decl->array_id);
+         }
+         else {
+            entry = new(mem_ctx) variable_storage(var,
+                                                  PROGRAM_INPUT,
+                                                  var->data.location);
+         }
+         this->variables.push_tail(entry);
          break;
       case ir_var_shader_out:
          assert(var->data.location != -1);
-         entry = new(mem_ctx) variable_storage(var,
-                                               PROGRAM_OUTPUT,
-                                               var->data.location
-                                               + var->data.index);
+
+         if (is_inout_array(shader->Stage, var, &is_2d)) {
+            struct array_decl *decl = &output_arrays[num_output_arrays];
+
+            decl->mesa_index = var->data.location;
+            decl->array_id = num_output_arrays + 1;
+            if (is_2d)
+               decl->array_size = type_size(var->type->fields.array);
+            else
+               decl->array_size = type_size(var->type);
+            num_output_arrays++;
+
+            entry = new(mem_ctx) variable_storage(var,
+                                                  PROGRAM_OUTPUT,
+                                                  var->data.location,
+                                                  decl->array_id);
+         }
+         else {
+            entry = new(mem_ctx) variable_storage(var,
+                                                  PROGRAM_OUTPUT,
+                                                  var->data.location
+                                                  + var->data.index);
+         }
+         this->variables.push_tail(entry);
          break;
       case ir_var_system_value:
          entry = new(mem_ctx) variable_storage(var,
@@ -2296,10 +2348,43 @@ glsl_to_tgsi_visitor::visit(ir_dereference_variable *ir)
    }
 
    this->result = st_src_reg(entry->file, entry->index, var->type);
+   this->result.array_id = entry->array_id;
    if (!native_integers)
       this->result.type = GLSL_TYPE_FLOAT;
 }
 
+static void
+shrink_array_declarations(struct array_decl *arrays, unsigned count,
+                          GLbitfield64 usage_mask)
+{
+   unsigned i;
+
+   /* Fix array declarations by removing unused array elements at both ends
+    * of the arrays. For example, mat4[3] where only mat[1] is used.
+    * An array with no used element at all shrinks to size 0.
+    */
+   for (i = 0; i < count; i++) {
+      struct array_decl *decl = &arrays[i];
+
+      /* Shrink the beginning. */
+      while (decl->array_size > 0 &&
+             !(usage_mask & BITFIELD64_BIT(decl->mesa_index))) {
+         decl->mesa_index++;
+         decl->array_size--;
+      }
+
+      /* Shrink the end. Counting with an unsigned index and testing
+       * "j >= 0" would never terminate for a fully unused array, so test
+       * the size itself instead.
+       */
+      while (decl->array_size > 0 &&
+             !(usage_mask & BITFIELD64_BIT(decl->mesa_index +
+                                           decl->array_size - 1))) {
+         decl->array_size--;
+      }
+   }
+}
+
 void
 glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
 {
@@ -2341,7 +2426,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
          index_reg = get_temp(native_integers ?
                               glsl_type::int_type : glsl_type::float_type);
 
-         emit(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
+         emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(index_reg),
               this->result, st_src_reg_for_type(index_reg.type, element_size));
       }
 
@@ -2352,7 +2437,7 @@ glsl_to_tgsi_visitor::visit(ir_dereference_array *ir)
          st_src_reg accum_reg = get_temp(native_integers ?
                                 glsl_type::int_type : glsl_type::float_type);
 
-         emit(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
+         emit_asm(ir, TGSI_OPCODE_ADD, st_dst_reg(accum_reg),
               index_reg, *src.reladdr);
 
          index_reg = accum_reg;
@@ -2589,16 +2674,16 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
       l_src.swizzle = swizzle_for_size(type->vector_elements);
 
       if (native_integers) {
-         emit(ir, TGSI_OPCODE_UCMP, *l, *cond,
+         emit_asm(ir, TGSI_OPCODE_UCMP, *l, *cond,
               cond_swap ? l_src : *r,
               cond_swap ? *r : l_src);
       } else {
-         emit(ir, TGSI_OPCODE_CMP, *l, *cond,
+         emit_asm(ir, TGSI_OPCODE_CMP, *l, *cond,
               cond_swap ? l_src : *r,
               cond_swap ? *r : l_src);
       }
    } else {
-      emit(ir, TGSI_OPCODE_MOV, *l, *r);
+      emit_asm(ir, TGSI_OPCODE_MOV, *l, *r);
    }
    l->index++;
    r->index++;
@@ -2679,7 +2764,7 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
        */
       glsl_to_tgsi_instruction *inst, *new_inst;
       inst = (glsl_to_tgsi_instruction *)this->instructions.get_tail();
-      new_inst = emit(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2]);
+      new_inst = emit_asm(ir, inst->op, l, inst->src[0], inst->src[1], inst->src[2]);
       new_inst->saturate = inst->saturate;
       inst->dead_mask = inst->dst[0].writemask;
    } else {
@@ -2717,7 +2802,7 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
          src = this->result;
 
          for (i = 0; i < (unsigned int)size; i++) {
-            emit(ir, TGSI_OPCODE_MOV, temp, src);
+            emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
 
             src.index++;
             temp.index++;
@@ -2739,7 +2824,7 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
          ir->array_elements[i]->accept(this);
          src = this->result;
          for (int j = 0; j < size; j++) {
-            emit(ir, TGSI_OPCODE_MOV, temp, src);
+            emit_asm(ir, TGSI_OPCODE_MOV, temp, src);
 
             src.index++;
             temp.index++;
@@ -2764,7 +2849,7 @@ glsl_to_tgsi_visitor::visit(ir_constant *ir)
                                   ir->type->vector_elements,
                                   GL_FLOAT,
                                   &src.swizzle);
-         emit(ir, TGSI_OPCODE_MOV, mat_column, src);
+         emit_asm(ir, TGSI_OPCODE_MOV, mat_column, src);
 
          mat_column.index++;
       }
@@ -2889,7 +2974,7 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
          l.cond_mask = COND_TR;
 
          for (i = 0; i < type_size(param->type); i++) {
-            emit(ir, TGSI_OPCODE_MOV, l, r);
+            emit_asm(ir, TGSI_OPCODE_MOV, l, r);
             l.index++;
             r.index++;
          }
@@ -2897,7 +2982,7 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
    }
 
    /* Emit call instruction */
-   call_inst = emit(ir, TGSI_OPCODE_CAL);
+   call_inst = emit_asm(ir, TGSI_OPCODE_CAL);
    call_inst->function = entry;
 
    /* Process out parameters. */
@@ -2922,7 +3007,7 @@ glsl_to_tgsi_visitor::visit(ir_call *ir)
          st_dst_reg l = st_dst_reg(this->result);
 
          for (i = 0; i < type_size(param->type); i++) {
-            emit(ir, TGSI_OPCODE_MOV, l, r);
+            emit_asm(ir, TGSI_OPCODE_MOV, l, r);
             l.index++;
             r.index++;
          }
@@ -2965,7 +3050,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
       coord = get_temp(glsl_type::vec4_type);
       coord_dst = st_dst_reg(coord);
       coord_dst.writemask = (1 << ir->coordinate->type->vector_elements) - 1;
-      emit(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
+      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
    }
 
    if (ir->projector) {
@@ -3074,7 +3159,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
       if (opcode == TGSI_OPCODE_TEX) {
          /* Slot the projector in as the last component of the coord. */
          coord_dst.writemask = WRITEMASK_W;
-         emit(ir, TGSI_OPCODE_MOV, coord_dst, projector);
+         emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, projector);
          coord_dst.writemask = WRITEMASK_XYZW;
          opcode = TGSI_OPCODE_TXP;
       } else {
@@ -3086,7 +3171,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
           * projective divide now.
           */
          coord_dst.writemask = WRITEMASK_W;
-         emit(ir, TGSI_OPCODE_RCP, coord_dst, projector);
+         emit_asm(ir, TGSI_OPCODE_RCP, coord_dst, projector);
 
          /* In the case where we have to project the coordinates "by hand,"
           * the shadow comparator value must also be projected.
@@ -3105,14 +3190,14 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
             assert(!sampler_type->sampler_array);
 
             tmp_dst.writemask = WRITEMASK_Z;
-            emit(ir, TGSI_OPCODE_MOV, tmp_dst, this->result);
+            emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, this->result);
 
             tmp_dst.writemask = WRITEMASK_XY;
-            emit(ir, TGSI_OPCODE_MOV, tmp_dst, coord);
+            emit_asm(ir, TGSI_OPCODE_MOV, tmp_dst, coord);
          }
 
          coord_dst.writemask = WRITEMASK_XYZ;
-         emit(ir, TGSI_OPCODE_MUL, coord_dst, tmp_src, coord_w);
+         emit_asm(ir, TGSI_OPCODE_MUL, coord_dst, tmp_src, coord_w);
 
          coord_dst.writemask = WRITEMASK_XYZW;
          coord.swizzle = SWIZZLE_XYZW;
@@ -3133,7 +3218,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
          cube_sc = get_temp(glsl_type::float_type);
          cube_sc_dst = st_dst_reg(cube_sc);
          cube_sc_dst.writemask = WRITEMASK_X;
-         emit(ir, TGSI_OPCODE_MOV, cube_sc_dst, this->result);
+         emit_asm(ir, TGSI_OPCODE_MOV, cube_sc_dst, this->result);
          cube_sc_dst.writemask = WRITEMASK_X;
       }
       else {
@@ -3144,20 +3229,20 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
          } else {
             coord_dst.writemask = WRITEMASK_Z;
          }
-         emit(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
+         emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, this->result);
          coord_dst.writemask = WRITEMASK_XYZW;
       }
    }
 
    if (ir->op == ir_txf_ms) {
       coord_dst.writemask = WRITEMASK_W;
-      emit(ir, TGSI_OPCODE_MOV, coord_dst, sample_index);
+      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, sample_index);
       coord_dst.writemask = WRITEMASK_XYZW;
    } else if (opcode == TGSI_OPCODE_TXL || opcode == TGSI_OPCODE_TXB ||
        opcode == TGSI_OPCODE_TXF) {
       /* TGSI stores LOD or LOD bias in the last channel of the coords. */
       coord_dst.writemask = WRITEMASK_W;
-      emit(ir, TGSI_OPCODE_MOV, coord_dst, lod_info);
+      emit_asm(ir, TGSI_OPCODE_MOV, coord_dst, lod_info);
       coord_dst.writemask = WRITEMASK_XYZW;
    }
 
@@ -3167,30 +3252,30 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
    }
 
    if (opcode == TGSI_OPCODE_TXD)
-      inst = emit(ir, opcode, result_dst, coord, dx, dy);
+      inst = emit_asm(ir, opcode, result_dst, coord, dx, dy);
    else if (opcode == TGSI_OPCODE_TXQ) {
       if (ir->op == ir_query_levels) {
          /* the level is stored in W */
-         inst = emit(ir, opcode, st_dst_reg(levels_src), lod_info);
+         inst = emit_asm(ir, opcode, st_dst_reg(levels_src), lod_info);
          result_dst.writemask = WRITEMASK_X;
          levels_src.swizzle = SWIZZLE_WWWW;
-         emit(ir, TGSI_OPCODE_MOV, result_dst, levels_src);
+         emit_asm(ir, TGSI_OPCODE_MOV, result_dst, levels_src);
       } else
-         inst = emit(ir, opcode, result_dst, lod_info);
+         inst = emit_asm(ir, opcode, result_dst, lod_info);
    } else if (opcode == TGSI_OPCODE_TXF) {
-      inst = emit(ir, opcode, result_dst, coord);
+      inst = emit_asm(ir, opcode, result_dst, coord);
    } else if (opcode == TGSI_OPCODE_TXL2 || opcode == TGSI_OPCODE_TXB2) {
-      inst = emit(ir, opcode, result_dst, coord, lod_info);
+      inst = emit_asm(ir, opcode, result_dst, coord, lod_info);
    } else if (opcode == TGSI_OPCODE_TEX2) {
-      inst = emit(ir, opcode, result_dst, coord, cube_sc);
+      inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
    } else if (opcode == TGSI_OPCODE_TG4) {
       if (is_cube_array && ir->shadow_comparitor) {
-         inst = emit(ir, opcode, result_dst, coord, cube_sc);
+         inst = emit_asm(ir, opcode, result_dst, coord, cube_sc);
       } else {
-         inst = emit(ir, opcode, result_dst, coord, component);
+         inst = emit_asm(ir, opcode, result_dst, coord, component);
       }
    } else
-      inst = emit(ir, opcode, result_dst, coord);
+      inst = emit_asm(ir, opcode, result_dst, coord);
 
    if (ir->shadow_comparitor)
       inst->tex_shadow = GL_TRUE;
@@ -3246,6 +3331,8 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
       assert(!"Should not get here.");
    }
 
+   inst->tex_type = ir->type->base_type;
+
    this->result = result_src;
 }
 
@@ -3264,13 +3351,13 @@ glsl_to_tgsi_visitor::visit(ir_return *ir)
       l = st_dst_reg(current_function->return_reg);
 
       for (i = 0; i < type_size(current_function->sig->return_type); i++) {
-         emit(ir, TGSI_OPCODE_MOV, l, r);
+         emit_asm(ir, TGSI_OPCODE_MOV, l, r);
          l.index++;
          r.index++;
       }
    }
 
-   emit(ir, TGSI_OPCODE_RET);
+   emit_asm(ir, TGSI_OPCODE_RET);
 }
 
 void
@@ -3283,16 +3370,16 @@ glsl_to_tgsi_visitor::visit(ir_discard *ir)
       /* Convert the bool condition to a float so we can negate. */
       if (native_integers) {
          st_src_reg temp = get_temp(ir->condition->type);
-         emit(ir, TGSI_OPCODE_AND, st_dst_reg(temp),
+         emit_asm(ir, TGSI_OPCODE_AND, st_dst_reg(temp),
               condition, st_src_reg_for_float(1.0));
          condition = temp;
       }
 
       condition.negate = ~condition.negate;
-      emit(ir, TGSI_OPCODE_KILL_IF, undef_dst, condition);
+      emit_asm(ir, TGSI_OPCODE_KILL_IF, undef_dst, condition);
    } else {
       /* unconditional kil */
-      emit(ir, TGSI_OPCODE_KILL);
+      emit_asm(ir, TGSI_OPCODE_KILL);
    }
 }
 
@@ -3307,18 +3394,18 @@ glsl_to_tgsi_visitor::visit(ir_if *ir)
 
    if_opcode = native_integers ? TGSI_OPCODE_UIF : TGSI_OPCODE_IF;
 
-   if_inst = emit(ir->condition, if_opcode, undef_dst, this->result);
+   if_inst = emit_asm(ir->condition, if_opcode, undef_dst, this->result);
 
    this->instructions.push_tail(if_inst);
 
    visit_exec_list(&ir->then_instructions, this);
 
    if (!ir->else_instructions.is_empty()) {
-      emit(ir->condition, TGSI_OPCODE_ELSE);
+      emit_asm(ir->condition, TGSI_OPCODE_ELSE);
       visit_exec_list(&ir->else_instructions, this);
    }
 
-   if_inst = emit(ir->condition, TGSI_OPCODE_ENDIF);
+   if_inst = emit_asm(ir->condition, TGSI_OPCODE_ENDIF);
 }
 
 
@@ -3328,7 +3415,7 @@ glsl_to_tgsi_visitor::visit(ir_emit_vertex *ir)
    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
 
    ir->stream->accept(this);
-   emit(ir, TGSI_OPCODE_EMIT, undef_dst, this->result);
+   emit_asm(ir, TGSI_OPCODE_EMIT, undef_dst, this->result);
 }
 
 void
@@ -3337,14 +3424,24 @@ glsl_to_tgsi_visitor::visit(ir_end_primitive *ir)
    assert(this->prog->Target == GL_GEOMETRY_PROGRAM_NV);
 
    ir->stream->accept(this);
-   emit(ir, TGSI_OPCODE_ENDPRIM, undef_dst, this->result);
+   emit_asm(ir, TGSI_OPCODE_ENDPRIM, undef_dst, this->result);
+}
+
+void
+glsl_to_tgsi_visitor::visit(ir_barrier *ir)
+{
+   unreachable("Not implemented!"); /* no translation for ir_barrier here */
+}
 
 glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
 {
    result.file = PROGRAM_UNDEFINED;
    next_temp = 1;
+   array_sizes = NULL;
+   max_num_arrays = 0;
    next_array = 0;
+   num_input_arrays = 0;
+   num_output_arrays = 0;
    next_signature_id = 1;
    num_immediates = 0;
    current_function = NULL;
@@ -3366,6 +3463,7 @@ glsl_to_tgsi_visitor::glsl_to_tgsi_visitor()
 
 glsl_to_tgsi_visitor::~glsl_to_tgsi_visitor()
 {
+   free(array_sizes); /* released with free(), separately from mem_ctx */
    ralloc_free(mem_ctx);
 }
 
@@ -3387,7 +3485,13 @@ count_resources(glsl_to_tgsi_visitor *v, gl_program *prog)
    foreach_in_list(glsl_to_tgsi_instruction, inst, &v->instructions) {
       if (is_tex_instruction(inst->op)) {
          for (int i = 0; i < inst->sampler_array_size; i++) {
-            v->samplers_used |= 1 << (inst->sampler.index + i);
+            unsigned idx = inst->sampler.index + i;
+            v->samplers_used |= 1 << idx;
+
+            debug_assert(idx < (int)ARRAY_SIZE(v->sampler_types));
+            v->sampler_types[idx] = inst->tex_type;
+            v->sampler_targets[idx] =
+               st_translate_texture_target(inst->tex_target, inst->tex_shadow);
 
             if (inst->tex_shadow) {
                prog->ShadowSamplers |= 1 << (inst->sampler.index + i);
@@ -3734,6 +3838,7 @@ glsl_to_tgsi_visitor::copy_propagate(void)
             inst->src[r].index2D = first->src[0].index2D;
             inst->src[r].has_index2 = first->src[0].has_index2;
             inst->src[r].double_reg2 = first->src[0].double_reg2;
+            inst->src[r].array_id = first->src[0].array_id;
 
             int swizzle = 0;
             for (int i = 0; i < 4; i++) {
@@ -4177,7 +4282,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
    coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type);
    src0 = v->get_temp(glsl_type::vec4_type);
    dst0 = st_dst_reg(src0);
-   inst = v->emit(NULL, TGSI_OPCODE_TEX, dst0, coord);
+   inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord);
    inst->sampler_array_size = 1;
    inst->tex_target = TEXTURE_2D_INDEX;
 
@@ -4201,7 +4306,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
       /* MAD colorTemp, colorTemp, scale, bias; */
       scale = st_src_reg(PROGRAM_STATE_VAR, scale_p, GLSL_TYPE_FLOAT);
       bias = st_src_reg(PROGRAM_STATE_VAR, bias_p, GLSL_TYPE_FLOAT);
-      inst = v->emit(NULL, TGSI_OPCODE_MAD, dst0, src0, scale, bias);
+      inst = v->emit_asm(NULL, TGSI_OPCODE_MAD, dst0, src0, scale, bias);
    }
 
    if (pixel_maps) {
@@ -4209,6 +4314,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
       st_dst_reg temp_dst = st_dst_reg(temp);
 
       assert(st->pixel_xfer.pixelmap_texture);
+      (void) st;
 
       /* With a little effort, we can do four pixel map look-ups with
        * two TEX instructions:
@@ -4216,7 +4322,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
 
       /* TEX temp.rg, colorTemp.rgba, texture[1], 2D; */
       temp_dst.writemask = WRITEMASK_XY; /* write R,G */
-      inst = v->emit(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
+      inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
       inst->sampler.index = 1;
       inst->sampler_array_size = 1;
       inst->tex_target = TEXTURE_2D_INDEX;
@@ -4224,7 +4330,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
       /* TEX temp.ba, colorTemp.baba, texture[1], 2D; */
       src0.swizzle = MAKE_SWIZZLE4(SWIZZLE_Z, SWIZZLE_W, SWIZZLE_Z, SWIZZLE_W);
       temp_dst.writemask = WRITEMASK_ZW; /* write B,A */
-      inst = v->emit(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
+      inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, temp_dst, src0);
       inst->sampler.index = 1;
       inst->sampler_array_size = 1;
       inst->tex_target = TEXTURE_2D_INDEX;
@@ -4233,7 +4339,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
       v->samplers_used |= (1 << 1);
 
       /* MOV colorTemp, temp; */
-      inst = v->emit(NULL, TGSI_OPCODE_MOV, dst0, temp);
+      inst = v->emit_asm(NULL, TGSI_OPCODE_MOV, dst0, temp);
    }
 
    /* Now copy the instructions from the original glsl_to_tgsi_visitor into the
@@ -4256,7 +4362,7 @@ get_pixel_transfer_visitor(struct st_fragment_program *fp,
             prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      newinst = v->emit(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
       newinst->tex_target = inst->tex_target;
       newinst->sampler_array_size = inst->sampler_array_size;
    }
@@ -4306,7 +4412,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
    coord = st_src_reg(PROGRAM_INPUT, VARYING_SLOT_TEX0, glsl_type::vec2_type);
    src0 = v->get_temp(glsl_type::vec4_type);
    dst0 = st_dst_reg(src0);
-   inst = v->emit(NULL, TGSI_OPCODE_TEX, dst0, coord);
+   inst = v->emit_asm(NULL, TGSI_OPCODE_TEX, dst0, coord);
    inst->sampler.index = samplerIndex;
    inst->sampler_array_size = 1;
    inst->tex_target = TEXTURE_2D_INDEX;
@@ -4319,7 +4425,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
    src0.negate = NEGATE_XYZW;
    if (st->bitmap.tex_format == PIPE_FORMAT_L8_UNORM)
       src0.swizzle = SWIZZLE_XXXX;
-   inst = v->emit(NULL, TGSI_OPCODE_KILL_IF, undef_dst, src0);
+   inst = v->emit_asm(NULL, TGSI_OPCODE_KILL_IF, undef_dst, src0);
 
    /* Now copy the instructions from the original glsl_to_tgsi_visitor into the
     * new visitor. */
@@ -4336,7 +4442,7 @@ get_bitmap_visitor(struct st_fragment_program *fp,
             prog->InputsRead |= BITFIELD64_BIT(src_regs[i].index);
       }
 
-      newinst = v->emit(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
+      newinst = v->emit_asm(NULL, inst->op, inst->dst[0], src_regs[0], src_regs[1], src_regs[2]);
       newinst->tex_target = inst->tex_target;
       newinst->sampler_array_size = inst->sampler_array_size;
    }
@@ -4362,7 +4468,8 @@ struct st_translate {
    unsigned temps_size;
    struct ureg_dst *temps;
 
-   struct ureg_dst arrays[MAX_ARRAYS];
+   struct ureg_dst *arrays;
+   unsigned num_temp_arrays;
    struct ureg_src *constants;
    int num_constants;
    struct ureg_src *immediates;
@@ -4373,7 +4480,9 @@ struct st_translate {
    struct ureg_src samplers[PIPE_MAX_SAMPLERS];
    struct ureg_src systemValues[SYSTEM_VALUE_MAX];
    struct tgsi_texture_offset tex_offsets[MAX_GLSL_TEXTURE_OFFSET];
-   unsigned array_sizes[MAX_ARRAYS];
+   unsigned *array_sizes;
+   struct array_decl *input_arrays;
+   struct array_decl *output_arrays;
 
    const GLuint *inputMapping;
    const GLuint *outputMapping;
@@ -4497,9 +4606,8 @@ emit_immediate(struct st_translate *t,
  * Map a glsl_to_tgsi dst register to a TGSI ureg_dst register.
  */
 static struct ureg_dst
-dst_register(struct st_translate *t,
-             gl_register_file file,
-             GLuint index)
+dst_register(struct st_translate *t, gl_register_file file, unsigned index,
+             unsigned array_id)
 {
    unsigned array;
 
@@ -4530,7 +4638,7 @@ dst_register(struct st_translate *t,
    case PROGRAM_ARRAY:
       array = index >> 16;
 
-      assert(array < ARRAY_SIZE(t->arrays));
+      assert(array < t->num_temp_arrays);
 
       if (ureg_dst_is_undef(t->arrays[array]))
          t->arrays[array] = ureg_DECL_array_temporary(
@@ -4540,16 +4648,25 @@ dst_register(struct st_translate *t,
                                    (int)(index & 0xFFFF) - 0x8000);
 
    case PROGRAM_OUTPUT:
-      if (t->procType == TGSI_PROCESSOR_VERTEX)
-         assert(index < VARYING_SLOT_MAX);
-      else if (t->procType == TGSI_PROCESSOR_FRAGMENT)
-         assert(index < FRAG_RESULT_MAX);
-      else
-         assert(index < VARYING_SLOT_MAX);
+      if (!array_id) {
+         if (t->procType == TGSI_PROCESSOR_FRAGMENT)
+            assert(index < FRAG_RESULT_MAX);
+         else
+            assert(index < VARYING_SLOT_MAX);
 
-      assert(t->outputMapping[index] < ARRAY_SIZE(t->outputs));
+         assert(t->outputMapping[index] < ARRAY_SIZE(t->outputs));
+         assert(t->outputs[t->outputMapping[index]].File != TGSI_FILE_NULL);
+         return t->outputs[t->outputMapping[index]];
+      }
+      else {
+         struct array_decl *decl = &t->output_arrays[array_id-1];
+         unsigned mesa_index = decl->mesa_index;
+         int slot = t->outputMapping[mesa_index];
 
-      return t->outputs[t->outputMapping[index]];
+         assert(slot != -1 && t->outputs[slot].File == TGSI_FILE_OUTPUT);
+         assert(t->outputs[slot].ArrayID == array_id);
+         return ureg_dst_array_offset(t->outputs[slot], index - mesa_index);
+      }
 
    case PROGRAM_ADDRESS:
       return t->address[index];
@@ -4575,7 +4692,8 @@ src_register(struct st_translate *t, const st_src_reg *reg)
 
    case PROGRAM_TEMPORARY:
    case PROGRAM_ARRAY:
-      return ureg_src(dst_register(t, reg->file, reg->index));
+   case PROGRAM_OUTPUT:
+      return ureg_src(dst_register(t, reg->file, reg->index, reg->array_id));
 
    case PROGRAM_UNIFORM:
       assert(reg->index >= 0);
@@ -4598,12 +4716,20 @@ src_register(struct st_translate *t, const st_src_reg *reg)
        * map back to the original index and add the offset after
        * mapping. */
       index -= double_reg2;
-      assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
-      return t->inputs[t->inputMapping[index] + double_reg2];
+      if (!reg->array_id) { /* plain input: re-apply the double_reg2 offset */
+         assert(t->inputMapping[index] < ARRAY_SIZE(t->inputs));
+         assert(t->inputs[t->inputMapping[index]].File != TGSI_FILE_NULL);
+         return t->inputs[t->inputMapping[index] + double_reg2];
+      }
+      else {
+         struct array_decl *decl = &t->input_arrays[reg->array_id-1];
+         unsigned mesa_index = decl->mesa_index;
+         int slot = t->inputMapping[mesa_index];
 
-   case PROGRAM_OUTPUT:
-      assert(t->outputMapping[reg->index] < ARRAY_SIZE(t->outputs));
-      return ureg_src(t->outputs[t->outputMapping[reg->index]]); /* not needed? */
+         assert(slot != -1 && t->inputs[slot].File == TGSI_FILE_INPUT);
+         assert(t->inputs[slot].ArrayID == reg->array_id);
+         return ureg_src_array_offset(t->inputs[slot], index - mesa_index);
+      }
 
    case PROGRAM_ADDRESS:
       return ureg_src(t->address[reg->index]);
@@ -4626,9 +4752,8 @@ translate_dst(struct st_translate *t,
               const st_dst_reg *dst_reg,
               bool saturate, bool clamp_color)
 {
-   struct ureg_dst dst = dst_register(t,
-                                      dst_reg->file,
-                                      dst_reg->index);
+   struct ureg_dst dst = dst_register(t, dst_reg->file, dst_reg->index,
+                                      dst_reg->array_id);
 
    if (dst.File == TGSI_FILE_NULL)
       return dst;
@@ -4738,7 +4863,7 @@ translate_tex_offset(struct st_translate *t,
       array = in_offset->index >> 16;
 
       assert(array >= 0);
-      assert(array < (int) ARRAY_SIZE(t->arrays));
+      assert(array < (int)t->num_temp_arrays);
 
       dst = t->arrays[array];
       offset.File = dst.File;
@@ -5060,6 +5185,25 @@ emit_edgeflags(struct st_translate *t)
    ureg_MOV(ureg, edge_dst, edge_src);
 }
 
+/* Look up the array declaration whose mesa_index equals `attr`. */
+static bool
+find_array(unsigned attr, struct array_decl *arrays, unsigned count,
+           unsigned *array_id, unsigned *array_size)
+{
+   for (unsigned n = 0; n != count; n++) {
+      const struct array_decl *entry = arrays + n;
+      if (entry->mesa_index != attr)
+         continue;
+      /* Match: report its id and size through the out-parameters. */
+      *array_id = entry->array_id;
+      *array_size = entry->array_size;
+      assert(*array_size);
+      return true;
+   }
+   /* No match; *array_id and *array_size are left untouched. */
+   return false;
+}
+
 /**
  * Translate intermediate IR (glsl_to_tgsi_instruction) to TGSI format.
  * \param program  the program to translate
@@ -5089,12 +5233,14 @@ st_translate_program(
    const struct gl_program *proginfo,
    GLuint numInputs,
    const GLuint inputMapping[],
+   const GLuint inputSlotToAttr[],
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
    const GLuint interpMode[],
    const GLuint interpLocation[],
    GLuint numOutputs,
    const GLuint outputMapping[],
+   const GLuint outputSlotToAttr[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[],
    boolean passthrough_edgeflags,
@@ -5132,25 +5278,101 @@ st_translate_program(
       goto out;
    }
 
-   memset(t, 0, sizeof *t);
-
    t->procType = procType;
    t->inputMapping = inputMapping;
    t->outputMapping = outputMapping;
    t->ureg = ureg;
+   t->num_temp_arrays = program->next_array;
+   if (t->num_temp_arrays) /* zero-filled table, one slot per temp array */
+      t->arrays = (struct ureg_dst*)
+                  calloc(t->num_temp_arrays, sizeof(t->arrays[0]));
 
    /*
     * Declare input attributes.
     */
-   if (procType == TGSI_PROCESSOR_FRAGMENT) {
+   switch (procType) {
+   case TGSI_PROCESSOR_FRAGMENT:
       for (i = 0; i < numInputs; i++) {
-         t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
-                                                        inputSemanticName[i],
-                                                        inputSemanticIndex[i],
-                                                        interpMode[i], 0,
-                                                        interpLocation[i]);
+         unsigned array_id = 0;
+         unsigned array_size;
+
+         if (find_array(inputSlotToAttr[i], program->input_arrays,
+                        program->num_input_arrays, &array_id, &array_size)) {
+            /* We've found an array. Declare it so. */
+            t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
+                              inputSemanticName[i], inputSemanticIndex[i],
+                              interpMode[i], 0, interpLocation[i],
+                              array_id, array_size);
+            i += array_size - 1;
+         }
+         else {
+            t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
+                              inputSemanticName[i], inputSemanticIndex[i],
+                              interpMode[i], 0, interpLocation[i], 0, 1);
+         }
+      }
+      break;
+   case TGSI_PROCESSOR_GEOMETRY:
+      for (i = 0; i < numInputs; i++) {
+         unsigned array_id = 0;
+         unsigned array_size;
+
+         if (find_array(inputSlotToAttr[i], program->input_arrays,
+                        program->num_input_arrays, &array_id, &array_size)) {
+            /* We've found an array. Declare it so. */
+            t->inputs[i] = ureg_DECL_input(ureg, inputSemanticName[i],
+                                           inputSemanticIndex[i],
+                                           array_id, array_size);
+            i += array_size - 1;
+         }
+         else {
+            t->inputs[i] = ureg_DECL_input(ureg, inputSemanticName[i],
+                                           inputSemanticIndex[i], 0, 1);
+         }
       }
+      break;
+   case TGSI_PROCESSOR_VERTEX:
+      for (i = 0; i < numInputs; i++) {
+         t->inputs[i] = ureg_DECL_vs_input(ureg, i);
+      }
+      break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Declare output attributes.
+    */
+   switch (procType) {
+   case TGSI_PROCESSOR_FRAGMENT:
+      break;
+   case TGSI_PROCESSOR_GEOMETRY:
+   case TGSI_PROCESSOR_VERTEX:
+      for (i = 0; i < numOutputs; i++) {
+         unsigned array_id = 0;
+         unsigned array_size;
+
+         if (find_array(outputSlotToAttr[i], program->output_arrays,
+                        program->num_output_arrays, &array_id, &array_size)) {
+            /* We've found an array. Declare it so. */
+            t->outputs[i] = ureg_DECL_output_array(ureg,
+                                                   outputSemanticName[i],
+                                                   outputSemanticIndex[i],
+                                                   array_id, array_size);
+            i += array_size - 1;
+         }
+         else {
+            t->outputs[i] = ureg_DECL_output(ureg,
+                                             outputSemanticName[i],
+                                             outputSemanticIndex[i]);
+         }
+      }
+      break;
+   default:
+      assert(0);
+   }
+
+   if (procType == TGSI_PROCESSOR_FRAGMENT) {
       if (proginfo->InputsRead & VARYING_BIT_POS) {
           /* Must do this after setting up t->inputs. */
           emit_wpos(st_context(ctx), t, proginfo, ureg,
@@ -5160,9 +5382,6 @@ st_translate_program(
       if (proginfo->InputsRead & VARYING_BIT_FACE)
          emit_face_var(ctx, t);
 
-      /*
-       * Declare output attributes.
-       */
       for (i = 0; i < numOutputs; i++) {
          switch (outputSemanticName[i]) {
          case TGSI_SEMANTIC_POSITION:
@@ -5198,31 +5417,8 @@ st_translate_program(
          }
       }
    }
-   else if (procType == TGSI_PROCESSOR_GEOMETRY) {
-      for (i = 0; i < numInputs; i++) {
-         t->inputs[i] = ureg_DECL_gs_input(ureg,
-                                           i,
-                                           inputSemanticName[i],
-                                           inputSemanticIndex[i]);
-      }
-
+   else if (procType == TGSI_PROCESSOR_VERTEX) {
       for (i = 0; i < numOutputs; i++) {
-         t->outputs[i] = ureg_DECL_output(ureg,
-                                          outputSemanticName[i],
-                                          outputSemanticIndex[i]);
-      }
-   }
-   else {
-      assert(procType == TGSI_PROCESSOR_VERTEX);
-
-      for (i = 0; i < numInputs; i++) {
-         t->inputs[i] = ureg_DECL_vs_input(ureg, i);
-      }
-
-      for (i = 0; i < numOutputs; i++) {
-         t->outputs[i] = ureg_DECL_output(ureg,
-                                          outputSemanticName[i],
-                                          outputSemanticIndex[i]);
          if (outputSemanticName[i] == TGSI_SEMANTIC_FOG) {
             /* force register to contain a fog coordinate in the form (F, 0, 0, 1). */
             ureg_MOV(ureg,
@@ -5277,9 +5473,9 @@ st_translate_program(
       }
    }
 
-   /* Copy over array sizes
-    */
-   memcpy(t->array_sizes, program->array_sizes, sizeof(unsigned) * program->next_array);
+   t->array_sizes = program->array_sizes;
+   t->input_arrays = program->input_arrays;
+   t->output_arrays = program->output_arrays;
 
    /* Emit constants and uniforms.  TGSI uses a single index space for these,
     * so we put all the translated regs in t->constants.
@@ -5355,7 +5551,26 @@ st_translate_program(
    /* texture samplers */
    for (i = 0; i < ctx->Const.Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits; i++) {
       if (program->samplers_used & (1 << i)) {
+         unsigned type;
+
          t->samplers[i] = ureg_DECL_sampler(ureg, i);
+
+         switch (program->sampler_types[i]) {
+         case GLSL_TYPE_INT:
+            type = TGSI_RETURN_TYPE_SINT;
+            break;
+         case GLSL_TYPE_UINT:
+            type = TGSI_RETURN_TYPE_UINT;
+            break;
+         case GLSL_TYPE_FLOAT:
+            type = TGSI_RETURN_TYPE_FLOAT;
+            break;
+         default:
+            unreachable("not reached");
+         }
+
+         ureg_DECL_sampler_view( ureg, i, program->sampler_targets[i],
+                                 type, type, type, type );
       }
    }
 
@@ -5375,6 +5590,7 @@ st_translate_program(
 
 out:
    if (t) {
+      free(t->arrays);
       free(t->temps);
       free(t->insn);
       free(t->labels);
@@ -5470,7 +5686,7 @@ get_mesa_program(struct gl_context *ctx,
          if (!entry->bgn_inst) {
             v->current_function = entry;
 
-            entry->bgn_inst = v->emit(NULL, TGSI_OPCODE_BGNSUB);
+            entry->bgn_inst = v->emit_asm(NULL, TGSI_OPCODE_BGNSUB);
             entry->bgn_inst->function = entry;
 
             visit_exec_list(&entry->sig->body, v);
@@ -5478,10 +5694,10 @@ get_mesa_program(struct gl_context *ctx,
             glsl_to_tgsi_instruction *last;
             last = (glsl_to_tgsi_instruction *)v->instructions.get_tail();
             if (last->op != TGSI_OPCODE_RET)
-               v->emit(NULL, TGSI_OPCODE_RET);
+               v->emit_asm(NULL, TGSI_OPCODE_RET);
 
             glsl_to_tgsi_instruction *end;
-            end = v->emit(NULL, TGSI_OPCODE_ENDSUB);
+            end = v->emit_asm(NULL, TGSI_OPCODE_ENDSUB);
             end->function = entry;
 
             progress = GL_TRUE;
@@ -5513,7 +5729,7 @@ get_mesa_program(struct gl_context *ctx,
    v->renumber_registers();
 
    /* Write the END instruction. */
-   v->emit(NULL, TGSI_OPCODE_END);
+   v->emit_asm(NULL, TGSI_OPCODE_END);
 
    if (ctx->_Shader->Flags & GLSL_DUMP) {
       _mesa_log("\n");
@@ -5528,6 +5744,10 @@ get_mesa_program(struct gl_context *ctx,
    prog->NumInstructions = 0;
 
    do_set_program_inouts(shader->ir, prog, shader->Stage);
+   shrink_array_declarations(v->input_arrays, v->num_input_arrays,
+                             prog->InputsRead);
+   shrink_array_declarations(v->output_arrays, v->num_output_arrays,
+                             prog->OutputsWritten);
    count_resources(v, prog);
 
    /* This must be done before the uniform storage is associated. */
@@ -5549,6 +5769,7 @@ get_mesa_program(struct gl_context *ctx,
     */
    _mesa_associate_uniform_storage(ctx, shader_program, prog->Parameters);
    if (!shader_program->LinkStatus) {
+      free_glsl_to_tgsi_visitor(v);
       return NULL;
    }
 
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.h b/src/mesa/state_tracker/st_glsl_to_tgsi.h
index 2cb80bc..4af747f 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.h
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.h
@@ -43,12 +43,14 @@ enum pipe_error st_translate_program(
    const struct gl_program *proginfo,
    GLuint numInputs,
    const GLuint inputMapping[],
+   const GLuint inputSlotToAttr[],
    const ubyte inputSemanticName[],
    const ubyte inputSemanticIndex[],
    const GLuint interpMode[],
    const GLuint interpLocation[],
    GLuint numOutputs,
    const GLuint outputMapping[],
+   const GLuint outputSlotToAttr[],
    const ubyte outputSemanticName[],
    const ubyte outputSemanticIndex[],
    boolean passthrough_edgeflags,
diff --git a/src/mesa/state_tracker/st_manager.c b/src/mesa/state_tracker/st_manager.c
index 840f76a..a2dee62 100644
--- a/src/mesa/state_tracker/st_manager.c
+++ b/src/mesa/state_tracker/st_manager.c
@@ -680,6 +680,10 @@ st_api_create_context(struct st_api *stapi, struct st_manager *smapi,
 
    if (attribs->flags & ST_CONTEXT_FLAG_FORWARD_COMPATIBLE)
       st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_FORWARD_COMPATIBLE_BIT;
+   if (attribs->flags & ST_CONTEXT_FLAG_ROBUST_ACCESS)
+      st->ctx->Const.ContextFlags |= GL_CONTEXT_FLAG_ROBUST_ACCESS_BIT_ARB;
+   if (attribs->flags & ST_CONTEXT_FLAG_RESET_NOTIFICATION_ENABLED)
+      st->ctx->Const.ResetStrategy = GL_LOSE_CONTEXT_ON_RESET_ARB;
 
    /* need to perform version check */
    if (attribs->major > 1 || attribs->minor > 0) {
@@ -920,8 +924,7 @@ static unsigned get_version(struct pipe_screen *screen,
    struct gl_extensions extensions = {0};
    GLuint version;
 
-   if ((api == API_OPENGL_COMPAT || api == API_OPENGL_CORE) &&
-       _mesa_override_gl_version_contextless(&consts, &api, &version)) {
+   if (_mesa_override_gl_version_contextless(&consts, &api, &version)) {
       return version;
    }
 
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 98d525c..896e239 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -665,7 +665,7 @@ compile_instruction(
    if (num_dst) 
       dst[0] = translate_dst( t, 
                               &inst->DstReg,
-                              inst->SaturateMode,
+                              inst->Saturate,
                               clamp_dst_color_output);
 
    for (i = 0; i < num_src; i++) 
@@ -1095,10 +1095,9 @@ st_translate_mesa_program(
    }
    else if (procType == TGSI_PROCESSOR_GEOMETRY) {
       for (i = 0; i < numInputs; i++) {
-         t->inputs[i] = ureg_DECL_gs_input(ureg,
-                                           i,
-                                           inputSemanticName[i],
-                                           inputSemanticIndex[i]);
+         t->inputs[i] = ureg_DECL_input(ureg,
+                                        inputSemanticName[i],
+                                        inputSemanticIndex[i], 0, 1);
       }
 
       for (i = 0; i < numOutputs; i++) {
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index a9110d3..fa792bc 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -215,6 +215,7 @@ st_prepare_vertex_program(struct gl_context *ctx,
          unsigned slot = stvp->num_outputs++;
 
          stvp->result_to_output[attr] = slot;
+         stvp->output_slot_to_attr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_POS:
@@ -285,7 +286,8 @@ st_prepare_vertex_program(struct gl_context *ctx,
             /* fall through */
          case VARYING_SLOT_VAR0:
          default:
-            assert(attr < VARYING_SLOT_MAX);
+            assert(attr >= VARYING_SLOT_VAR0 ||
+                   (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
             stvp->output_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
             stvp->output_semantic_index[slot] =
                st_get_generic_varying_index(st, attr);
@@ -321,7 +323,7 @@ st_translate_vertex_program(struct st_context *st,
       _mesa_remove_output_reads(&stvp->Base.Base, PROGRAM_OUTPUT);
    }
 
-   ureg = ureg_create( TGSI_PROCESSOR_VERTEX );
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_VERTEX, st->pipe->screen);
    if (ureg == NULL) {
       free(vpv);
       return NULL;
@@ -351,6 +353,7 @@ st_translate_vertex_program(struct st_context *st,
                                    /* inputs */
                                    vpv->num_inputs,
                                    stvp->input_to_index,
+                                   NULL, /* inputSlotToAttr */
                                    NULL, /* input semantic name */
                                    NULL, /* input semantic index */
                                    NULL, /* interp mode */
@@ -358,6 +361,7 @@ st_translate_vertex_program(struct st_context *st,
                                    /* outputs */
                                    num_outputs,
                                    stvp->result_to_output,
+                                   stvp->output_slot_to_attr,
                                    stvp->output_semantic_name,
                                    stvp->output_semantic_index,
                                    key->passthrough_edgeflags,
@@ -482,6 +486,7 @@ st_translate_fragment_program(struct st_context *st,
 
    GLuint outputMapping[FRAG_RESULT_MAX];
    GLuint inputMapping[VARYING_SLOT_MAX];
+   GLuint inputSlotToAttr[VARYING_SLOT_MAX];
    GLuint interpMode[PIPE_MAX_SHADER_INPUTS];  /* XXX size? */
    GLuint interpLocation[PIPE_MAX_SHADER_INPUTS];
    GLuint attr;
@@ -502,6 +507,7 @@ st_translate_fragment_program(struct st_context *st,
       return NULL;
 
    assert(!(key->bitmap && key->drawpixels));
+   memset(inputSlotToAttr, ~0, sizeof(inputSlotToAttr));
 
    if (key->bitmap) {
       /* glBitmap drawing */
@@ -543,6 +549,7 @@ st_translate_fragment_program(struct st_context *st,
          const GLuint slot = fs_num_inputs++;
 
          inputMapping[attr] = slot;
+         inputSlotToAttr[slot] = attr;
          if (stfp->Base.IsCentroid & BITFIELD64_BIT(attr))
             interpLocation[slot] = TGSI_INTERPOLATE_LOC_CENTROID;
          else if (stfp->Base.IsSample & BITFIELD64_BIT(attr))
@@ -657,7 +664,8 @@ st_translate_fragment_program(struct st_context *st,
              * consumed for the TEXi varyings, and we can base the locations of
              * the user varyings on VAR0.  Otherwise, we use TEX0 as base index.
              */
-            assert(attr >= VARYING_SLOT_TEX0);
+            assert(attr >= VARYING_SLOT_VAR0 || attr == VARYING_SLOT_PNTC ||
+                   (attr >= VARYING_SLOT_TEX0 && attr <= VARYING_SLOT_TEX7));
             input_semantic_name[slot] = TGSI_SEMANTIC_GENERIC;
             input_semantic_index[slot] = st_get_generic_varying_index(st, attr);
             if (attr == VARYING_SLOT_PNTC)
@@ -732,7 +740,7 @@ st_translate_fragment_program(struct st_context *st,
       }
    }
 
-   ureg = ureg_create( TGSI_PROCESSOR_FRAGMENT );
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_FRAGMENT, st->pipe->screen);
    if (ureg == NULL) {
       free(variant);
       return NULL;
@@ -778,6 +786,7 @@ st_translate_fragment_program(struct st_context *st,
                            /* inputs */
                            fs_num_inputs,
                            inputMapping,
+                           inputSlotToAttr,
                            input_semantic_name,
                            input_semantic_index,
                            interpMode,
@@ -785,6 +794,7 @@ st_translate_fragment_program(struct st_context *st,
                            /* outputs */
                            fs_num_outputs,
                            outputMapping,
+                           NULL,
                            fs_output_semantic_name,
                            fs_output_semantic_index, FALSE,
                            key->clamp_color );
@@ -867,7 +877,9 @@ st_translate_geometry_program(struct st_context *st,
                               struct st_geometry_program *stgp,
                               const struct st_gp_variant_key *key)
 {
+   GLuint inputSlotToAttr[VARYING_SLOT_MAX];
    GLuint inputMapping[VARYING_SLOT_MAX];
+   GLuint outputSlotToAttr[VARYING_SLOT_MAX];
    GLuint outputMapping[VARYING_SLOT_MAX];
    struct pipe_context *pipe = st->pipe;
    GLuint attr;
@@ -890,13 +902,15 @@ st_translate_geometry_program(struct st_context *st,
    if (!gpv)
       return NULL;
 
-   ureg = ureg_create(TGSI_PROCESSOR_GEOMETRY);
+   ureg = ureg_create_with_screen(TGSI_PROCESSOR_GEOMETRY, st->pipe->screen);
    if (ureg == NULL) {
       free(gpv);
       return NULL;
    }
 
+   memset(inputSlotToAttr, 0, sizeof(inputSlotToAttr));
    memset(inputMapping, 0, sizeof(inputMapping));
+   memset(outputSlotToAttr, 0, sizeof(outputSlotToAttr));
    memset(outputMapping, 0, sizeof(outputMapping));
 
    /*
@@ -907,6 +921,7 @@ st_translate_geometry_program(struct st_context *st,
          const GLuint slot = gs_num_inputs++;
 
          inputMapping[attr] = slot;
+         inputSlotToAttr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_PRIMITIVE_ID:
@@ -985,6 +1000,7 @@ st_translate_geometry_program(struct st_context *st,
          GLuint slot = gs_num_outputs++;
 
          outputMapping[attr] = slot;
+         outputSlotToAttr[slot] = attr;
 
          switch (attr) {
          case VARYING_SLOT_POS:
@@ -1080,6 +1096,7 @@ st_translate_geometry_program(struct st_context *st,
                         /* inputs */
                         gs_num_inputs,
                         inputMapping,
+                        inputSlotToAttr,
                         input_semantic_name,
                         input_semantic_index,
                         NULL,
@@ -1087,6 +1104,7 @@ st_translate_geometry_program(struct st_context *st,
                         /* outputs */
                         gs_num_outputs,
                         outputMapping,
+                        outputSlotToAttr,
                         gs_output_semantic_name,
                         gs_output_semantic_index,
                         FALSE,
@@ -1201,7 +1219,7 @@ destroy_program_variants(struct st_context *st, struct gl_program *program)
          }
       }
       break;
-   case MESA_GEOMETRY_PROGRAM:
+   case GL_GEOMETRY_PROGRAM_NV:
       {
          struct st_geometry_program *stgp =
             (struct st_geometry_program *) program;
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index a2c5606..bb77eb6 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -163,6 +163,7 @@ struct st_vertex_program
 
    /** Maps VARYING_SLOT_x to slot */
    GLuint result_to_output[VARYING_SLOT_MAX];
+   GLuint output_slot_to_attr[VARYING_SLOT_MAX];
    ubyte output_semantic_name[VARYING_SLOT_MAX];
    ubyte output_semantic_index[VARYING_SLOT_MAX];
    GLuint num_outputs;
diff --git a/src/mesa/swrast/s_texrender.c b/src/mesa/swrast/s_texrender.c
index fa853c9..4e41b3b 100644
--- a/src/mesa/swrast/s_texrender.c
+++ b/src/mesa/swrast/s_texrender.c
@@ -72,7 +72,7 @@ update_wrapper(struct gl_context *ctx, struct gl_renderbuffer_attachment *att)
  * \param fb  the framebuffer object the texture is being bound to
  * \param att  the fb attachment point of the texture
  *
- * \sa _mesa_framebuffer_renderbuffer
+ * \sa _mesa_FramebufferRenderbuffer_sw
  */
 void
 _swrast_render_texture(struct gl_context *ctx,
diff --git a/src/mesa/tnl/t_context.c b/src/mesa/tnl/t_context.c
index 5b9dd54..bc77ba8 100644
--- a/src/mesa/tnl/t_context.c
+++ b/src/mesa/tnl/t_context.c
@@ -36,6 +36,7 @@
 #include "math/m_xform.h"
 #include "main/state.h"
 #include "main/viewport.h"
+#include "util/simple_list.h"
 
 #include "tnl.h"
 #include "t_context.h"
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 3ea775c..72b8206 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -1817,9 +1817,12 @@ vbo_initialize_exec_dispatch(const struct gl_context *ctx,
       SET_DrawElementsInstancedBaseVertexBaseInstance(exec, vbo_exec_DrawElementsInstancedBaseVertexBaseInstance);
    }
 
-   if (ctx->API == API_OPENGL_CORE) {
+   if (ctx->API == API_OPENGL_CORE || _mesa_is_gles31(ctx)) {
       SET_DrawArraysIndirect(exec, vbo_exec_DrawArraysIndirect);
       SET_DrawElementsIndirect(exec, vbo_exec_DrawElementsIndirect);
+   }
+
+   if (ctx->API == API_OPENGL_CORE) {
       SET_MultiDrawArraysIndirect(exec, vbo_exec_MultiDrawArraysIndirect);
       SET_MultiDrawElementsIndirect(exec, vbo_exec_MultiDrawElementsIndirect);
    }
diff --git a/src/util/list.h b/src/util/list.h
index 9460347..b98ce59 100644
--- a/src/util/list.h
+++ b/src/util/list.h
@@ -140,6 +140,13 @@ static inline void list_validate(struct list_head *list)
 	     - ((char *)&(sample)->member - (char *)(sample)))
 #endif
 
+#define list_first_entry(ptr, type, member) \
+        LIST_ENTRY(type, (ptr)->next, member) /* via ptr's next link */
+
+#define list_last_entry(ptr, type, member) \
+        LIST_ENTRY(type, (ptr)->prev, member) /* via ptr's prev link */
+
+
 #define LIST_FOR_EACH_ENTRY(pos, head, member)				\
    for (pos = NULL, pos = container_of((head)->next, pos, member);	\
 	&pos->member != (head);						\
diff --git a/src/vulkan/compiler.cpp b/src/vulkan/compiler.cpp
index 19a403a..0ea44ac 100644
--- a/src/vulkan/compiler.cpp
+++ b/src/vulkan/compiler.cpp
@@ -719,10 +719,9 @@ anv_compiler_create(struct anv_device *device)
    compiler->brw->intelScreen = compiler->screen;
    compiler->screen->devinfo = &device->info;
 
-   brw_process_intel_debug_variable(compiler->brw);
+   brw_process_intel_debug_variable(compiler->screen);
 
-   if (device->info.gen >= 8 && !(INTEL_DEBUG & DEBUG_VEC4VS))
-      compiler->brw->scalar_vs = true;
+   compiler->screen->compiler = brw_compiler_create(compiler, &device->info);
 
    ctx = &compiler->brw->ctx;
    _mesa_init_shader_object_functions(&ctx->Driver);
@@ -736,7 +735,6 @@ anv_compiler_create(struct anv_device *device)
    /* Set dd::NewShader */
    brwInitFragProgFuncs(&ctx->Driver);
 
-   compiler->screen->compiler = brw_compiler_create(compiler, &device->info);
    ctx->_Shader = &compiler->pipeline;
 
    compiler->brw->precompile = false;